Ocean Color Science Software

ocssw V2022
JsonUtils.py
import os
import re
import sys
import time
import json

from seadasutils.ProcUtils import getSession, httpdl

# URL parsing utils:

from urllib.parse import urljoin, urlsplit, urlunsplit

def base_url(url):
    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, None, None))


def full_url(url, link):
    """
    Add query to urljoin() results
    ONLY if it's a page
    """
    base = base_url(urljoin(url, link))
    if not is_page(base):
        return base
    else:
        scheme, netloc, path, query, fragment = urlsplit(base)
        query = urlsplit(url).query
        return urlunsplit((scheme, netloc, path, query, None))


def is_page(url):
    """
    Make the dangerous assumption that URLs
    pointing to another web page always end in '/'.
    """
    return base_url(url).endswith('/')

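# Illustrative sketch (not part of the original module): full_url() keeps
# the query string of `url` only when the joined link resolves to a page,
# i.e. a path ending in '/'. The example.com URLs are hypothetical.
def _demo_full_url():
    listing = 'https://example.com/data/?format=json'
    assert full_url(listing, 'sub/') == 'https://example.com/data/sub/?format=json'
    assert full_url(listing, 'granule.nc') == 'https://example.com/data/granule.nc'
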
# General utils:

def retry(func, *args, **kwargs):
    """
    Retry specified function call after a short delay
    """
    ntries = kwargs.get('ntries')
    if ntries:
        delay = int(5 + (30. * (1. / (float(ntries) + 1.))))
        if kwargs.get('verbose'):
            print('Sleeping {}s; {} tries left.'.format(delay, ntries - 1))
        time.sleep(delay)
        kwargs['ntries'] = ntries - 1
    return func(*args, **kwargs)

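# Illustrative sketch (not part of the original module): a callable that
# hands itself back to retry() until 'ntries' runs out. retry() sleeps
# roughly 5-35 s, decrements ntries, and re-invokes the function. The
# _flaky_fetch name and its always-failing body are hypothetical.
def _flaky_fetch(url, ntries=3, verbose=1):
    response = None  # stand-in for a request that failed
    if not ok_status(response) and ntries:
        return retry(_flaky_fetch, url, ntries=ntries, verbose=verbose)
    return response
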
def set_mtime(filepath, mtime):
    """
    Set modification time for specified file.
    Set access time to "now".
    """
    atime = time.time()
    os.utime(filepath, times=(atime, mtime))

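# Illustrative sketch (not part of the original module; the filename is
# hypothetical): create a placeholder file, then stamp it with a fixed
# remote timestamp while its access time becomes "now".
def _demo_set_mtime(filepath='granule.tmp'):
    with open(filepath, 'w'):
        pass
    set_mtime(filepath, 1640995200.0)  # 2022-01-01T00:00:00Z as epoch seconds
    assert os.path.getmtime(filepath) == 1640995200.0
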
# Table/link parsing utils:

def linkdict(rows):
    """
    Each link in list is a dictionary describing a remote file:
    link['href'] = URL pointing to file
    link['mtime'] = timestamp as seconds since the epoch
    link['size'] = size in bytes
    """
    keys = ['href', 'mtime', 'size']
    linklist = []
    for row in rows:
        link = dict(zip(keys, row))
        link['mtime'] = link_mtime(link['mtime'])
        linklist.append(link)
    return linklist

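# Illustrative sketch (not part of the original module): each row in the
# JSON listing is an [href, mtime, size] triple; the values here are made up.
def _demo_linkdict():
    rows = [['granule.nc', '2022-01-01 00:00:00', 1024],
            ['subdir/', '2022-01-02 00:00:00', 0]]
    links = linkdict(rows)
    assert links[0]['href'] == 'granule.nc'
    assert links[0]['size'] == 1024
    assert links[0]['mtime'] < links[1]['mtime']
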
def link_mtime(mtime):
    """
    Format remote file timestamp as seconds since the epoch.
    """
    try:
        urltime = time.strptime(mtime, "%Y-%m-%d %H:%M:%S")
        return time.mktime(urltime)
    except ValueError:
        return sys.maxsize

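# Illustrative sketch (not part of the original module): well-formed
# timestamps parse to epoch seconds; anything unparseable maps to the
# sys.maxsize sentinel, so it always compares as "newer" than a local file.
def _demo_link_mtime():
    good = link_mtime('2022-01-01 00:00:00')
    bad = link_mtime('not a timestamp')
    assert bad == sys.maxsize and good < bad
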
def getlinks_json(content):
    return linkdict(json.loads(content.decode('utf-8'))['rows'])

def needs_download(link, filepath, check_times=False):
    """
    Returns False if filepath is present and size matches remote url;
    True otherwise. Optionally check timestamp as well.
    """
    # only download files
    if is_page(link['href']):
        return False

    # always download missing files
    if not os.path.isfile(filepath):
        return True

    # check file size
    diffsize = os.path.getsize(filepath) != link['size']
    if not check_times:
        return diffsize

    # optionally check timestamp
    else:
        older = os.path.getmtime(filepath) < link['mtime']
        return diffsize or older

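# Illustrative sketch (not part of the original module; the link dicts and
# paths are hypothetical): directory pages are never downloaded, and a
# missing local file always is.
def _demo_needs_download():
    page = {'href': 'https://example.com/data/', 'mtime': 0.0, 'size': 0}
    granule = {'href': 'https://example.com/data/granule.nc',
               'mtime': 0.0, 'size': 1024}
    assert not needs_download(page, '/tmp/data')
    assert needs_download(granule, '/no/such/granule.nc')
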
# HTTPResponse utils:
def is_json(response):
    return response and ('json' in response.headers.get('Content-Type', ''))


def ok_status(response):
    return response and (response.status < 400)


class SessionUtils:

    def __init__(self, timeout=5, max_tries=5, verbose=0, clobber=False):
        self.timeout = timeout
        self.max_tries = max_tries
        self.verbose = verbose
        self.clobber = clobber
        self.status = 0

    def download_file(self, url, filepath):
        try:
            parts = urlsplit(url)
            outputdir = os.path.dirname(filepath)
            status = httpdl(parts.netloc, parts.path, localpath=outputdir,
                            timeout=self.timeout, ntries=self.max_tries,
                            verbose=self.verbose)
            if status:
                self.status = 1
                print('Error downloading {}'.format(filepath))
        except Exception as e:
            self.status = 1
            print('Exception: {}'.format(e))
        return

    def get_links(self, url, regex=''):
        """
        Returns a list of links from a given url.
        Optionally specify regex to filter for acceptable files;
        default is to list only links starting with url.
        """
        linklist = []
        session = getSession(verbose=self.verbose, ntries=self.max_tries)
        with session.get(url, stream=True, timeout=self.timeout) as response:
            if is_json(response):
                linklist = getlinks_json(response.content)
            else:
                return []

        # make relative urls fully-qualified
        for link in linklist:
            link['href'] = full_url(url, link['href'])

        # filter for regex
        if regex != '':
            pattern = re.compile(regex)
            linklist = [link for link in linklist if pattern.search(link['href'])]
        else:  # if no filter, return only links containing url
            linklist = [link for link in linklist if base_url(url) in link['href']]

        return linklist

    def download_allfiles(self, url, dirpath, regex='', check_times=False,
                          clobber=False, dry_run=False):
        """
        Downloads all available files from a remote url into a local dirpath.
        Default is to download only if local file doesn't match remote size;
        set clobber=True to always download.
        """
        downloaded = []
        if dry_run and self.verbose:
            print('Dry run:')
        if not os.path.exists(dirpath) and not dry_run:
            os.makedirs(dirpath)

        all_links = self.get_links(url, regex=regex)
        for link in all_links:
            f = os.path.basename(link['href'])
            filepath = os.path.join(dirpath, f)
            if clobber or needs_download(link, filepath,
                                         check_times=check_times):
                if not dry_run:
                    self.download_file(link['href'], filepath)
                    set_mtime(filepath, link['mtime'])
                downloaded.append(filepath)
                if self.verbose:
                    print('+ ' + f)

        return downloaded

# end of class SessionUtils
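
# Usage sketch for SessionUtils.get_links() (not part of the original
# module): fetch the JSON listing and keep only hrefs matching a regex.
# The 'aquarius' filter is a hypothetical example; the LUT listing URL is
# the same one used in __main__ below.
def _demo_get_links():
    session = SessionUtils(verbose=1)
    url = 'https://oceandata.sci.gsfc.nasa.gov/Ancillary/LUTs/?format=json'
    return session.get_links(url, regex=r'aquarius')
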
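# Usage sketch for SessionUtils.download_allfiles() (not part of the
# original module): with dry_run=True it lists what would be fetched into
# ./luts (a hypothetical local directory) without writing any files.
def _demo_download_allfiles():
    session = SessionUtils(verbose=1)
    url = 'https://oceandata.sci.gsfc.nasa.gov/Ancillary/LUTs/?format=json'
    return session.download_allfiles(url, dirpath='./luts', dry_run=True)
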
if __name__ == '__main__':
    # parameters
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = 'https://oceandata.sci.gsfc.nasa.gov/Ancillary/LUTs/?format=json'

    sessionUtil = SessionUtils()
    links = sessionUtil.get_links(url)
    print(links)