Ocean Color Science Software

ocssw V2022
SessionUtils.py
import os
import sys
import time
import re
import requests

python2 = sys.version_info.major < 3

# URL parsing utils:

if python2:
    from urlparse import urljoin, urlsplit, urlunsplit
else:  # python 3
    from urllib.parse import urljoin, urlsplit, urlunsplit


def base_url(url):
    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, None, None))


def full_url(url, link):
    """
    Add query to urljoin() results
    ONLY if it's a page
    """
    base = base_url(urljoin(url, link))
    if not is_page(base):
        return base
    else:
        scheme, netloc, path, query, fragment = urlsplit(base)
        query = urlsplit(url).query
        return urlunsplit((scheme, netloc, path, query, None))


def is_page(url):
    """
    Make the dangerous assumption that URLs
    pointing to another web page always end in '/'.
    """
    return base_url(url).endswith('/')
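
# Usage sketch (illustrative, not part of the original file). The URL is
# hypothetical; full_url() keeps the base query only for links that look
# like pages (trailing '/') and drops it for file links:
#
#   >>> url = 'https://example.gov/Ancillary/LUTs/?format=json'
#   >>> is_page(url)
#   True
#   >>> full_url(url, 'subdir/')
#   'https://example.gov/Ancillary/LUTs/subdir/?format=json'
#   >>> full_url(url, 'table.hdf')
#   'https://example.gov/Ancillary/LUTs/table.hdf'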


# General utils:

def retry(func, *args, **kwargs):
    """
    Retry specified function call after a short delay
    """
    from time import sleep
    ntries = kwargs.get('ntries')
    if ntries:
        delay = int(5 + (30. * (1. / (float(ntries) + 1.))))
        if kwargs.get('verbose'):
            print('Sleeping {}s; {} tries left.'.format(delay, ntries - 1))
        sleep(delay)
        kwargs['ntries'] = ntries - 1
    return func(*args, **kwargs)
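
# Sketch of the retry() contract (illustrative, not from the original file):
# the wrapped callable must accept an 'ntries' keyword and stop recursing once
# it reaches 0. SessionUtils.open_url() below uses it roughly like this:
#
#   response = retry(self.open_url, url, ntries=4, get=True)
#   # sleeps int(5 + 30/5) = 11 seconds, then calls
#   # self.open_url(url, ntries=3, get=True)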


def thiscall():
    """
    Get function and arguments for caller
    """
    import inspect
    caller = inspect.stack()[1]
    func = eval(caller[3])  # function object
    args = inspect.getargvalues(caller[0])  # frame
    values = [args.locals[arg] for arg in args.args]
    arglist = dict(zip(args.args, values))  # all as keyword args
    return func, arglist


def set_mtime(filepath, mtime):
    """
    Set modification time for specified file.
    Set access time to "now".
    """
    atime = time.time()
    if python2:
        os.utime(filepath, (atime, mtime))
    else:
        os.utime(filepath, times=(atime, mtime))


# URL content parsing utils:

def getlinks_html(content, regex=''):
    # assumption: the BeautifulSoup 3 package only exists for Python 2;
    # on Python 3 fall back to bs4, which still accepts the legacy
    # parseOnlyThese and findAll spellings
    if python2:
        from BeautifulSoup import BeautifulSoup, SoupStrainer
    else:
        from bs4 import BeautifulSoup, SoupStrainer
    soup = BeautifulSoup(content, parseOnlyThese=SoupStrainer('a'))
    linklist = soup.findAll('a', attrs={'href': re.compile(regex)})
    linklist = [link.get('href') for link in linklist]
    return linklist
95 
96 def getlinks_json(content, regex=''):
97  import json
98  parsed_json = json.loads(content)['rows']
99  linklist = [str(row[0]) for row in parsed_json]
100  if regex != '':
101  import re
102  regex = re.compile(regex)
103  linklist = [link for link in linklist if regex.search(link)]
104  return linklist
105 
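
# Illustrative input for getlinks_json() (sample data, not a real listing):
# the JSON is expected to carry one row per file, with the link or file name
# in the first column.
#
#   >>> sample = '{"rows": [["aerosol_2020.hdf", 1], ["no2_2020.hdf", 2]]}'
#   >>> getlinks_json(sample)
#   ['aerosol_2020.hdf', 'no2_2020.hdf']
#   >>> getlinks_json(sample, regex=r'^no2')
#   ['no2_2020.hdf']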


# requests.Response utils:

def print_response(response):
    if response:
        for key, value in response.headers.items():
            print('{}\t= {}'.format(key, value))
        print(response.status_code, response.reason)


def is_html(response):
    return response and response.ok and ('html' in response.headers['Content-Type'])


def is_json(response):
    return response and response.ok and ('json' in response.headers['Content-Type'])


def url_mtime(response):
    """
    Returns timestamp of remote file as seconds since the epoch.
    """
    try:
        mtime = response.headers['Last-Modified']
        urltime = time.strptime(mtime, "%a, %d %b %Y %H:%M:%S %Z")
        return time.mktime(urltime)
    except Exception as e:
        print('Exception: {:}'.format(e))
        return sys.maxsize
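
# Example of the header format url_mtime() parses (illustrative, not from the
# original file). Note that time.mktime() interprets the parsed struct_time as
# local time, so the value is offset from true GMT by the local UTC offset.
#
#   >>> t = time.strptime('Mon, 06 Jan 2020 12:30:00 GMT',
#   ...                   "%a, %d %b %Y %H:%M:%S %Z")
#   >>> (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour)
#   (2020, 1, 6, 12)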


class SessionUtils:

    def __init__(self, timeout=5, max_tries=5, verbose=False, clobber=False):
        self.timeout = timeout
        self.max_tries = max_tries
        self.verbose = verbose
        self.clobber = clobber
        self.session = requests.Session()

    def open_url(self, url, ntries=None, get=False):
        """
        Return requests.Response object for specified url.
        Retries up to self.max_tries times if server is busy.
        By default, retrieves header only.
        """
        if not ntries:
            ntries = self.max_tries
        response = None

        try:
            if get:
                response = self.session.get(url, timeout=self.timeout)
            else:
                response = self.session.head(url, timeout=self.timeout)
            # if self.verbose:
            #     print('{}\t{}\t{}'.format(
            #         response.status_code, url, response.headers['Content-Type']))

            # redirect as needed
            # TODO: get new url back to caller
            loc = response.headers.get('Location')
            if loc:  # response.is_redirect:
                if self.verbose:
                    print('redirected to {}'.format(loc))
                response = self.open_url(loc)

            # return response if okay
            if response.ok:
                pass

            # retry if server is busy
            elif (response.status_code > 499) and (ntries > 0):
                if self.verbose:
                    print('Server busy; will retry {}'.format(url))
                response = retry(self.open_url, url, ntries=ntries, get=get)

            # give up if too many tries
            elif ntries == 0:
                print('FAILED after {} tries: {}'.format(self.max_tries, url))

            # give up if bad response
            else:
                print('Bad response for {}'.format(url))
                print_response(response)

        except requests.exceptions.Timeout:
            if ntries > 0:
                if self.verbose:
                    print('Server timeout; will retry {}'.format(url))
                response = retry(self.open_url, url, ntries=ntries, get=get)

        except Exception as e:
            print('Exception: {:}'.format(e))

        finally:
            return response

    def needs_download(self, url, filepath, check_times=False, response=None):
        """
        Returns False if filepath is present and size matches remote url;
        True otherwise. Optionally check timestamp as well.
        """

        # only download files
        if is_page(url):
            return False

        if not os.path.isfile(filepath):
            # if self.verbose:
            #     print('Local file not found:', filepath)
            return True

        if not response:
            response = self.open_url(url)
        if not (response and response.ok):
            return False

        # check file size
        diffsize = os.path.getsize(filepath) != int(response.headers['Content-Length'])
        if not check_times:
            return diffsize

        # optionally check timestamp
        else:
            older = os.path.getmtime(filepath) < url_mtime(response)
            return diffsize or older
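
    # Illustrative call pattern for needs_download() (not part of the original
    # file); the URL and local path are hypothetical:
    #
    #   if s.needs_download('https://example.gov/LUTs/table.hdf',
    #                       '/tmp/luts/table.hdf', check_times=True):
    #       s.download_file('https://example.gov/LUTs/table.hdf',
    #                       '/tmp/luts/table.hdf')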

    def download_file(self, url, filepath):
        try:
            r = self.session.get(url, timeout=self.timeout, stream=True)
            with open(filepath, 'wb') as fd:
                for chunk in r.iter_content(chunk_size=512):
                    fd.write(chunk)
            response = self.open_url(url)
            set_mtime(filepath, url_mtime(response))
        except Exception as e:
            print('Exception: {:}'.format(e))

    def list_pageurls(self, url, regex=''):
        """
        Returns a sorted, unique set of links from a given url.
        Optionally specify regex to filter for acceptable files;
        default is to list only links starting with url.
        """
        response = self.open_url(url, get=True)
        if is_html(response):
            linklist = getlinks_html(response.text, regex)
        elif is_json(response):
            linklist = getlinks_json(response.text, regex)
        else:
            return []

        # get full url
        linklist = [full_url(url, link) for link in linklist]

        # if no filter, return only links containing url
        # TODO: skip original url, and urls ending in "/"
        if regex == '':
            linklist = [link for link in linklist if base_url(url) in link]

        # return sorted, unique list
        return sorted(set(linklist))

    def download_allfiles(self, url, dirpath, regex='', check_times=False,
                          response=None, clobber=False, dry_run=False):
        """
        Downloads all available files from a remote url into a local dirpath.
        Default is to download only if local file doesn't match remote size;
        set clobber=True to always download.
        """
        if not response:
            response = self.open_url(url)
        if not (response and response.ok):
            return []

        downloaded = []
        if dry_run and self.verbose:
            print('Dry run:')
        if not os.path.exists(dirpath) and not dry_run:
            os.makedirs(dirpath)

        for link in self.list_pageurls(url, regex=regex):
            f = os.path.basename(link)
            filepath = os.path.join(dirpath, f)
            if clobber or self.needs_download(
                    link, filepath, check_times=check_times):
                if not dry_run:
                    self.download_file(link, filepath)
                downloaded.append(filepath)
                if self.verbose:
                    print('+ ' + f)

        return downloaded

    def spider(self, url, level=0, visited=None):
        """
        Demo crawler
        """
        if visited is None:
            visited = []
        try:
            response = self.open_url(url)
            if response.ok:
                if self.verbose:
                    print('{}\t{}\t{}'.
                          format(level, url, response.headers['Content-Type']))
                else:
                    print(url)
                visited.append(url)

                if is_page(url):
                    for link in self.list_pageurls(url):
                        if (base_url(url) in link) and (link not in visited):
                            visited = self.spider(link, level=level + 1,
                                                  visited=visited)
            else:
                print('spider {} {}:\t{}'.
                      format(response.status_code, response.reason, url))

        except Exception as e:
            print('Exception: {:}'.format(e))

        finally:
            return visited

# end of class SessionUtils


if __name__ == '__main__':
    # parameters
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = 'https://oceandata.sci.gsfc.nasa.gov/Ancillary/LUTs/?format=json'

    # logging
    debug = False  # True
    if debug:
        import logging
        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

    # init session, run crawler
    s = SessionUtils(verbose=True)
    s.spider(url)
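
# Usage sketch (illustrative, not part of the original file): mirror HDF files
# from a hypothetical listing into a local directory, checking sizes (and
# timestamps) before downloading; dry_run=True only reports what would change.
#
#   s = SessionUtils(timeout=10, verbose=True)
#   new_files = s.download_allfiles(
#       'https://example.gov/Ancillary/LUTs/?format=json',
#       '/tmp/luts', regex=r'\.hdf$', check_times=True, dry_run=True)
#   print(new_files)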