Ocean Color Science Software

ocssw V2022
JsonUtils.py
import os
import sys
import time
import json

from seadasutils.ProcUtils import getSession, httpdl

# URL parsing utils:

from urllib.parse import urljoin, urlsplit, urlunsplit  # python 3

def base_url(url):
    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, None, None))


def full_url(url, link):
    """
    Add query to urljoin() results
    ONLY if it's a page
    """
    base = base_url(urljoin(url, link))
    if not is_page(base):
        return base
    else:
        scheme, netloc, path, query, fragment = urlsplit(base)
        query = urlsplit(url).query
        return urlunsplit((scheme, netloc, path, query, None))


def is_page(url):
    """
    Make the dangerous assumption that URLs
    pointing to another web page always end in '/'.
    """
    return base_url(url).endswith('/')

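# Illustrative example for the URL helpers above (hypothetical listing URL):
# following a sub-directory link keeps the original query string, while links
# to plain files do not.
#   full_url('https://example.gov/Ancillary/LUTs/?format=json', 'seadas/')
#       -> 'https://example.gov/Ancillary/LUTs/seadas/?format=json'
#   full_url('https://example.gov/Ancillary/LUTs/?format=json', 'lut.hdf')
#       -> 'https://example.gov/Ancillary/LUTs/lut.hdf'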

# General utils:

def retry(func, *args, **kwargs):
    """
    Retry specified function call after a short delay
    """
    ntries = kwargs.get('ntries')
    if ntries:
        delay = int(5 + (30. * (1. / (float(ntries) + 1.))))
        if kwargs.get('verbose'):
            print('Sleeping {}s; {} tries left.'.format(delay, ntries - 1))
        time.sleep(delay)
        kwargs['ntries'] = ntries - 1
    return func(*args, **kwargs)

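# Note on retry()'s delay schedule (illustrative arithmetic): the wait grows as
# the remaining tries run out, since delay = int(5 + 30 / (ntries + 1)):
#   ntries = 5  ->  int(5 + 30/6) = 10 s
#   ntries = 2  ->  int(5 + 30/3) = 15 s
#   ntries = 1  ->  int(5 + 30/2) = 20 s
# Hypothetical call, assuming fetch() accepts the decremented 'ntries' kwarg
# that retry() forwards to it:
#   result = retry(fetch, url, ntries=3, verbose=1)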

def set_mtime(filepath, mtime):
    """
    Set modification time for specified file.
    Set access time to "now".
    """
    atime = time.time()
    try:
        os.utime(filepath, times=(atime, mtime))  # python 3
    except TypeError:
        os.utime(filepath, (atime, mtime))  # python 2


# Table/link parsing utils:

def linkdict(rows):
    """
    Each link in list is a dictionary describing a remote file:
    link['href'] = URL pointing to file
    link['mtime'] = timestamp as seconds since the epoch
    link['size'] = size in bytes
    """
    keys = ['href', 'mtime', 'size']
    linklist = []
    for row in rows:
        link = dict(list(zip(keys, row)))
        link['mtime'] = link_mtime(link['mtime'])
        linklist.append(link)
    return linklist


def link_mtime(mtime):
    """
    Format remote file timestamp as seconds since the epoch.
    """
    try:
        urltime = time.strptime(mtime, "%Y-%m-%d %H:%M:%S")
        return time.mktime(urltime)
    except ValueError:
        return sys.maxsize


def getlinks_json(content):
    return linkdict(json.loads(content.decode('utf-8'))['rows'])


def needs_download(link, filepath, check_times=False):
    """
    Returns False if filepath is present and size matches remote url;
    True otherwise. Optionally check timestamp as well.
    """

    # only download files
    if is_page(link['href']):
        return False

    # always download missing files
    if not os.path.isfile(filepath):
        return True

    # check file size
    diffsize = os.path.getsize(filepath) != link['size']
    if not check_times:
        return diffsize

    # optionally check timestamp
    else:
        older = os.path.getmtime(filepath) < link['mtime']
        return diffsize or older


# HTTPResponse utils:

def is_json(response):
    return response and ('json' in response.headers.get('Content-Type'))


def ok_status(response):
    return response and (response.status < 400)


class SessionUtils:

    def __init__(self, timeout=5, max_tries=5, verbose=0, clobber=False):
        self.timeout = timeout
        self.max_tries = max_tries
        self.verbose = verbose
        self.clobber = clobber
        self.status = 0

    def download_file(self, url, filepath):
        try:
            parts = urlsplit(url)
            outputdir = os.path.dirname(filepath)
            status = httpdl(parts.netloc, parts.path, localpath=outputdir,
                            timeout=self.timeout, ntries=self.max_tries,
                            verbose=self.verbose)
            if status:
                self.status = 1
                print('Error downloading {}'.format(filepath))
        except Exception as e:
            self.status = 1
            print('Exception: {:}'.format(e))
        return

    def get_links(self, url, regex=''):
        """
        Returns a unique set of links from a given url.
        Optionally specify regex to filter for acceptable files;
        default is to list only links starting with url.
        """
        linklist = []
        session = getSession(verbose=self.verbose, ntries=self.max_tries)
        with session.get(url, stream=True, timeout=self.timeout) as response:
            if is_json(response):
                linklist = getlinks_json(response.content)
            else:
                return []

        # make relative urls fully-qualified
        for link in linklist:
            link['href'] = full_url(url, link['href'])

        # filter for regex
        if regex != '':
            import re
            regex = re.compile(regex)
            linklist = [link for link in linklist if regex.search(link['href'])]
        else:  # if no filter, return only links containing url
            linklist = [link for link in linklist if base_url(url) in link['href']]

        return linklist

    def download_allfiles(self, url, dirpath, regex='', check_times=False,
                          clobber=False, dry_run=False):
        """
        Downloads all available files from a remote url into a local dirpath.
        Default is to download only if local file doesn't match remote size;
        set clobber=True to always download.
        """
        downloaded = []
        if dry_run and self.verbose:
            print('Dry run:')
        if not os.path.exists(dirpath) and not dry_run:
            os.makedirs(dirpath)

        all_links = self.get_links(url, regex=regex)
        for link in all_links:
            f = os.path.basename(link['href'])
            filepath = os.path.join(dirpath, f)
            if clobber or needs_download(link, filepath,
                                         check_times=check_times):
                if not dry_run:
                    self.download_file(link['href'], filepath)
                    set_mtime(filepath, link['mtime'])
                downloaded.append(filepath)
                if self.verbose:
                    print('+ ' + f)

        return downloaded

# end of class SessionUtils


if __name__ == '__main__':
    # parameters
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = 'https://oceandata.sci.gsfc.nasa.gov/Ancillary/LUTs/?format=json'

    sessionUtil = SessionUtils()
    links = sessionUtil.get_links(url)
    print(links)
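A short usage sketch for SessionUtils (not part of the file above): the local directory, the filename pattern, and the seadasutils.JsonUtils import path are assumptions for illustration.

from seadasutils.JsonUtils import SessionUtils  # assumed import path

# Mirror matching LUT files into a local directory, re-downloading only when
# the local size (and, with check_times=True, timestamp) disagrees with the
# remote listing; dry_run=True reports candidates without downloading.
session = SessionUtils(timeout=10, max_tries=3, verbose=1)
lut_url = 'https://oceandata.sci.gsfc.nasa.gov/Ancillary/LUTs/?format=json'
fetched = session.download_allfiles(lut_url, dirpath='./luts',
                                    regex=r'\.hdf$', check_times=True,
                                    dry_run=True)
print(fetched)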