Due to the lapse in federal government funding, NASA is not updating this website. We sincerely regret this inconvenience.
NASA Logo
Ocean Color Science Software

ocssw V2022
MetaUtils.py
Go to the documentation of this file.
1 """
2 Routines to parse file metadata.
3 
4 """
5 import tarfile
6 
7 import seadasutils.DictUtils as du
8 import os
9 import re
10 import subprocess
11 import sys
12 
def get_hdf4_content(filename):
    """
    Returns the header content from an HDF 4 file, which is obtained via
    'hdp dumpsds'.

    Returns None (after printing a diagnostic) when the LIB3_BIN environment
    variable is not set or the hdp program cannot be executed.
    """
    # Guard against LIB3_BIN being unset; os.path.join(None, ...) would raise.
    lib3_bin = os.getenv('LIB3_BIN')
    if not lib3_bin:
        print('LIB3_BIN environment variable is not set.')
        return None

    # does executable exist?
    hdp = os.path.join(lib3_bin, 'hdp')
    if not (os.path.isfile(hdp) and os.access(hdp, os.X_OK)):
        print(hdp, "is not executable.")
        return None

    # dump file header; the context manager waits for the child process,
    # avoiding a leaked handle.
    cmd = [hdp, 'dumpsds', '-h', '-s', filename]
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=False) as proc:
        contents = proc.stdout.read().decode("utf-8")
    return contents
def get_hdf5_header_plaintext(filename):
    """
    Returns the header content plain text from an HDF 5 file which is obtained via
    'h5dump -H'.

    Returns None when h5dump is not executable or its output does not look
    like HDF output.
    """
    h5dump = os.path.join(os.getenv('LIB3_BIN'), 'h5dump')
    if not (os.path.isfile(h5dump) and os.access(h5dump, os.X_OK)):
        print(h5dump, "is not executable.")
        return None
    cmd = [h5dump, '-H', filename]
    # communicate() drains both pipes; reading only stdout could deadlock
    # if h5dump fills the stderr pipe buffer.
    stdout_data, _ = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE,
                                      shell=False).communicate()
    content = stdout_data.decode("utf-8")
    if content.find('HDF') != -1:
        return content
    else:
        return None
def get_hdf5_header_xml(filename):
    """
    Returns the header content as XML from an HDF 5 file which is obtained via
    'h5dump -Au'.

    Returns None when h5dump is not executable or its output does not look
    like HDF output.
    """
    h5dump = os.path.join(os.getenv('LIB3_BIN'), 'h5dump')
    if not (os.path.isfile(h5dump) and os.access(h5dump, os.X_OK)):
        print(h5dump, "is not executable.")
        return None

    # dump file header; communicate() drains both pipes -- reading only
    # stdout could deadlock if h5dump fills the stderr pipe buffer.
    cmd = [h5dump, '-A', '-u', filename]
    stdout_data, _ = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE,
                                      shell=False).communicate()
    content = stdout_data.decode("utf-8")
    if content.find('HDF') != -1:
        return content
    else:
        return None
def get_mime_data(filename):
    """
    Returns the mime data for the file named in filename as found by running
    the file command
    """
    cmd = ['file', '--brief', filename]
    raw_output = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                  shell=False).communicate()[0]
    return raw_output.decode("utf-8")
def is_ascii_file(filename):
    """
    Returns True if the given file is an ASCII file, False otherwise.

    Uses the external /usr/bin/file command; exits the program when that
    command cannot be found or executed.
    """
    file_cmd_path = os.path.join(os.sep, 'usr', 'bin', 'file')
    # Guard clause instead of a trailing else; the error message is unchanged.
    if not (os.path.exists(file_cmd_path) and os.access(file_cmd_path, os.X_OK)):
        err_msg = 'Error! Unable to run the file command.'
        sys.exit(err_msg)
    file_cmd = [file_cmd_path, '--brief', filename]
    # communicate() waits for the child, avoiding an unwaited process.
    file_output = subprocess.Popen(file_cmd, shell=False,
                                   stdout=subprocess.PIPE).communicate()[0]
    file_type = file_output.decode("utf-8").strip()
    return file_type.find('ASCII') != -1
def is_hdf4(mime_data):
    """
    Return True when the mime data is from an HDF (version 4) file.
    """
    # Previous docstring incorrectly said "netCDF4/HDF 5"; the pattern below
    # matches version 4.  bool() makes the return match the documented contract.
    return bool(re.search('Hierarchical.*version.4', mime_data))
def is_netcdf4(mime_data):
    """
    Return True when the mime data is from netCDF4/HDF 5 file.
    """
    # bool() makes the return match the documented True/False contract
    # (re.search alone yields a Match object or None).
    return bool(re.search('Hierarchical.*version.5', mime_data))
def is_tar_file(file_path):
    """
    This function is deprecated. Using it is discouraged. Please call
    tarfile.is_tarfile directly.

    Returns a boolean telling if the file is a tar archive file.
    """
    # The old hand-rolled TarFile probe was removed; tarfile.is_tarfile
    # performs the same check.
    return tarfile.is_tarfile(file_path)
def is_metadata_file(mime_data):
    """
    Return True when the mime data is from xml, Landsat L1 file or MSI L1C file.
    """
    # Try each marker in turn; return the first successful match (truthy),
    # or the final failed search result (None) -- same values the original
    # or-chain produced.
    for marker in ('xml', 'LC08_L1', 'manifest.safe'):
        found = re.search(marker, mime_data)
        if found:
            return found
    return found
def dump_metadata(filename):
    """Dump file metadata:
    Call functions to get HDF 4 and HDF 5 header data
    read ASCII header from MERIS N1 files

    Returns a string, a list of header lines, [] on a decode error,
    or None when the file or a needed dump tool is unavailable.
    """
    # does input file exist?
    if not os.path.isfile(filename):
        print("Can't find input file '" + filename + "'.")
        return None

    lib3_bin_dir = os.getenv('LIB3_BIN')
    if not lib3_bin_dir:
        sys.exit('Error! Unable to locate LIB3_BIN environment variable. You may need to run')
    ncdump = os.path.join(lib3_bin_dir, 'ncdump')
    ncdump_hdf = os.path.join(lib3_bin_dir, 'ncdump_hdf')

    mime = get_mime_data(filename)

    # Some HDF 5 files are reported as plain 'data'; try the XML dump first.
    if mime.strip() == 'data':
        content = get_hdf5_header_xml(filename)
        if content:
            return content

    if re.search('Hierarchical.*version.4', mime):
        return get_hdf4_content(filename)
    elif re.search('Hierarchical.*version.5', mime):
        return get_hdf5_header_xml(filename)
    elif re.search('NetCDF Data Format', mime):
        if not (os.path.isfile(ncdump_hdf) and os.access(ncdump_hdf, os.X_OK)):
            print(ncdump_hdf, "is not executable.")
            return None
        cmd = [ncdump_hdf, '-h', filename]
        hdr_content = subprocess.Popen(cmd, shell=False,
                                       stdout=subprocess.PIPE).communicate()
        return hdr_content[0].decode("utf-8").split('\n')
    else:
        # Unknown mime type: inspect the first line of the file itself.
        # with-statements close the file even when decoding raises, fixing
        # the handle leak of the previous open()/readline()/close() pattern.
        try:
            with open(filename, 'r', 1) as fbuffer:
                line1 = fbuffer.readline()

            if re.search("HDF_UserBlock", line1):
                return get_hdf5_header_xml(filename)
            elif line1[0:3] == 'CDF':
                # For NetCDF files, such as some from MERIS
                cmd = [ncdump, '-h', filename]
                ncdump_out = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                              shell=False).stdout
                return ncdump_out.read().decode("utf-8")
            else:
                # Plain-text header (e.g. MERIS N1): collect non-blank lines
                # up to the LAST_LAST_LONG sentinel.
                header = []
                with open(filename, 'r', 100) as fbuffer:
                    line = fbuffer.readline()
                    while line:
                        line = line.strip()
                        if len(line):
                            header.append(line)
                        if re.search('LAST_LAST_LONG', line):
                            break
                        line = fbuffer.readline()
                return header
        except UnicodeDecodeError:
            # Binary content that is not a recognized format.
            return []
def readMetadata(filename):
    """
    Returns a dictionary containing the metadata for the file named by filename.

    The raw header text comes from dump_metadata(); which parsing branch runs
    depends on whether that text is a list of lines or a single string, and on
    sentinel substrings found in the first line / first characters.
    Exits the program when dump_metadata() fails outright.
    """
    # todo: MERIS N1 files?
    text = dump_metadata(filename)
    # Added text == [] & changed exit() to sys.exit() -Matt, Feb. 15, 2012
    # Kept an exit here (instead of making it a return) as already
    # existing programs assume the output from this function is good.
    if text is None or text == '':
        sys.exit("Error! dump_metadata failed.")

    attrs = None

    # extract meaningful parts
    if isinstance(text, list):
        if text == []:
            # dump_metadata hit a decode error; nothing to parse.
            return attrs
        elif re.search('SENTINEL-2 MSI Level-1C User Product', text[0]):
            # Sentinel-2 MSI L1C: platform, start time and instrument all
            # come from the first line.
            attrs = {}
            if re.search('2A', text[0]):
                attrs['platform'] = 'S2A'
            elif re.search('2B', text[0]):
                attrs['platform'] = 'S2B'
            if text[0].find('startTime') != -1:
                line_parts = text[0].split('safe:startTime>')
                line_parts2 = line_parts[1].split('Z<')
                attrs['startTime'] = line_parts2[0].strip()
            attrs['instrument'] = 'MSI'
            attrs['processing_level'] = 'L1B'
            return attrs
        elif re.search('PRODUCT', text[0]):
            # Simple "key=value" listing; quotes around values are stripped.
            attrs = {}
            for line in text:
                (key, value) = str(line).split('=')
                attrs[key] = str(value).strip('"')
            return attrs
        elif text[0][0:4] == 'CWIF':
            return {'Title': 'SeaWiFS Level-0'}
        elif text[0].find('GROUP = L1_METADATA_FILE') != -1:
            # Landsat MTL (pre-Collection 2): collect key/value pairs found
            # inside the PRODUCT_METADATA group only.
            in_metadata_group = False
            attrs = {}
            for line in text:
                if in_metadata_group:
                    if line.find('END_GROUP = PRODUCT_METADATA') != -1:
                        break
                    else:
                        line_parts = line.split('=')
                        attr_key = line_parts[0].strip()
                        attr_val = line_parts[1].strip()
                        attrs[attr_key] = attr_val
                elif line.find('GROUP = PRODUCT_METADATA') != -1:
                    in_metadata_group = True
        elif text[0].find('GROUP = LANDSAT_METADATA_FILE') != -1:
            # Landsat Collection 2 MTL: same scheme, different group names.
            in_metadata_group = False
            attrs = {}
            for line in text:
                if in_metadata_group:
                    if line.find('END_GROUP = LEVEL1_PROCESSING_RECORD') != -1:
                        break
                    else:
                        line_parts = line.split('=')
                        attr_key = line_parts[0].strip()
                        attr_val = line_parts[1].strip()
                        attrs[attr_key] = attr_val
                elif line.find('GROUP = PRODUCT_CONTENTS') != -1:
                    in_metadata_group = True
        elif text[0].find(' = INVENTORYMETADATA') != -1:
            # ECS-style ODL inventory metadata: pull instrument, start date
            # and start time out of their respective OBJECT groups.  The
            # in_*_group flags track which OBJECT we are currently inside.
            # in_metadata_group = False
            in_sensor_group = False
            in_date_group = False
            in_time_group = False
            attrs = {}
            for line in text:
                if in_sensor_group and line.find('VALUE') != -1:
                    line_parts = line.split('=')
                    attr_val = line_parts[1].strip().replace('"', '')
                    attrs['instrument'] = attr_val
                    # instrument is the last item wanted; stop scanning.
                    break
                elif in_date_group and line.find('VALUE') != -1:
                    line_parts = line.split('=')
                    attr_val = line_parts[1].strip().replace('"', '')
                    attrs['startDate'] = attr_val
                    in_date_group = False
                elif in_time_group and line.find('VALUE') != -1:
                    line_parts = line.split('=')
                    attr_val = line_parts[1].strip().replace('"', '')
                    attrs['startTime'] = attr_val
                    in_time_group = False
                elif line.find('SENSORSHORTNAME') != -1 and line.find('END_OBJECT') == -1:
                    in_sensor_group = True
                elif line.find('TIMEOFDAY') != -1 and line.find('END_OBJECT') == -1:
                    in_time_group = True
                elif line.find('CALENDARDATE') != -1 and line.find('END_OBJECT') == -1:
                    in_date_group = True
        elif text[0].find('xml') != -1:
            # Generic XML header: Sentinel-2/3 and MERIS SAFE manifests.
            attrs = {}
            for line in text:
                if line.find('SENTINEL-2 MSI Level-1C User Product') != -1:
                    attrs['instrument'] = 'MSI'
                    attrs['processing_level'] = 'L1B'
                if line.find('safe:startTime>') != -1:
                    line_parts = line.split('safe:startTime>')
                    line_parts2 = line_parts[1].split('<')
                    attrs['startTime'] = line_parts2[0].strip()
                if line.find('stopTime') != -1:
                    line_parts = line.split('>')
                    line_parts2 = line_parts[1].split('<')
                    attrs['stopTime'] = line_parts2[0].strip()
                if line.find('<envisat:productName>ENV_ME_1_') != -1:
                    attrs['platform'] = 'ENVISAT'
                    attrs['instrument'] = 'MERIS'
                    attrs['processing_level'] = 'L1B'
                    return attrs
                if line.find('<sentinel3:productName>S3A_OL_1_ERR') != -1:
                    attrs['platform'] = '3A'
                    attrs['data_type'] = 'ERR'
                    attrs['instrument'] = 'OLCI'
                    attrs['processing_level'] = 'L1B'
                    return attrs
                if line.find('<sentinel3:productName>S3A_OL_1_EFR') != -1:
                    attrs['platform'] = '3A'
                    attrs['data_type'] = 'EFR'
                    attrs['instrument'] = 'OLCI'
                    attrs['processing_level'] = 'L1B'
                    return attrs
                if line.find('<sentinel3:productName>S3B_OL_1_ERR') != -1:
                    attrs['platform'] = '3B'
                    attrs['data_type'] = 'ERR'
                    attrs['instrument'] = 'OLCI'
                    attrs['processing_level'] = 'L1B'
                    return attrs
                if line.find('<sentinel3:productName>S3B_OL_1_EFR') != -1:
                    attrs['platform'] = '3B'
                    attrs['data_type'] = 'EFR'
                    attrs['instrument'] = 'OLCI'
                    attrs['processing_level'] = 'L1B'
                    return attrs
                # NOTE(review): raises KeyError if a line contains '2A'/'2B'
                # before any line set attrs['instrument'] -- verify inputs.
                if line.find('2A') != -1 and attrs['instrument'] == 'MSI':
                    attrs['platform'] = 'S2A'
                    return attrs
                if line.find('2B') != -1 and attrs['instrument'] == 'MSI':
                    attrs['platform'] = 'S2B'
                    return attrs
        else:
            for line in text:
                if line.find('title = ') != -1:
                    if line.find('Daily-OI') != -1:
                        # NOAA supplied SST Ancillary files
                        return {'Title': 'Ancillary', 'Data Type': 'SST'}
    elif isinstance(text, bytes) and (text[0:6] == 'netcdf'):
        # NOTE(review): comparing a bytes slice to a str is always False in
        # Python 3, so this branch appears unreachable -- confirm intent.
        attrs = {}
        lines = text.split('\n')
        for line in lines:
            if line.find('=') != -1:
                fields = line.split('=')
                key = fields[0]
                pos = 0
                # Strip leading non-alphabetic characters from the key.
                # NOTE(review): the bounds check runs after the index access;
                # an all-non-alpha key could raise IndexError -- verify.
                while (not fields[0][pos].isalpha()) and pos < len(fields[0]):
                    key = key[1:]
                    pos += 1
                attrs[key.strip()] = fields[1].strip()
        return attrs
    elif isinstance(text, bytes) and (text[0:4] == 'HDF5'):
        # NOTE(review): same bytes-vs-str comparison issue as above.
        attrs = get_hdf5_attr(text)
        return attrs
    # elif isinstance(text, types.StringType) and text[0:4] == 'HDF5':
    #     attrs = get_hdf5_attr(text)
    elif re.search(r'<\?xml', text) or (text[0:4] == 'HDF5'):
        # if hdf5 file
        attrs = get_xml_attr(text)
    else:
        #if hdf4 file
        file_attr_re = re.compile('File attributes:(.+?)\n',
                                  re.MULTILINE | re.DOTALL)
        file_attr_results = file_attr_re.search(text)
        if file_attr_results != None:
            # When a "Variable" section follows, everything between the two
            # markers is ODL-formatted metadata.
            file_attr_var_re = re.compile('File attributes:(.+?)\nVariable',
                                          re.MULTILINE | re.DOTALL)
            file_attr_var_results = file_attr_var_re.search(text)
            if file_attr_var_results != None:
                allmeta = file_attr_var_results.group(1)
                # remove spaces around "=" to speed future searches
                allmeta = re.sub(r'\s*=\s*', '=', allmeta)
                # parse each file attribute
                attrs = get_odl_attr(allmeta)
            else:
                attrs = get_attr(text)
    # Fall-through return for branches that do not return explicitly.
    return attrs
def get_attr(text):
    """
    :param text: Text containing metadata to be parsed.
    :return: A dictionary containing metadata attributes.
    """
    attrs = {}
    name_re = re.compile(r'^\s*Attr\d+: Name = ')
    value_re = re.compile(r'^\s*Value = ')
    seen_attr = False
    current_name = None
    for row in text.split('\n'):
        if name_re.match(row):
            # New attribute record; remember its name for the Value line.
            seen_attr = True
            current_name = row.split('=')[1].strip()
        elif seen_attr and value_re.match(row):
            value = row.split('=', 1)[1].strip()
            if current_name == 'Input Parameters':
                # Pipe-separated "key=value" pairs become a nested dict.
                parsed = {}
                for item in value.split('|'):
                    pieces = item.split('=')
                    if len(pieces) == 2:
                        parsed[pieces[0].strip()] = pieces[1].strip()
                attrs[current_name] = parsed
            else:
                attrs[current_name] = value
    return attrs
def get_hdf5_attr(header_text):
    """ Returns a Python dictionary containing the file metadata passed from
    header_text. The dictionary keys will the attribute names and the values
    will be the data values for the attributes.

    The input is h5dump-style text; this is a small line-oriented state
    machine tracking ATTRIBUTE and DATA { ... } blocks.
    """
    attributes = {}
    attr_regex = re.compile(r'ATTRIBUTE "')
    # NOTE(review): data_item_regex is compiled but never used below (the
    # simpler pattern in the elif is used instead).
    data_item_regex = re.compile(r'\(\d+(,\d+)?\): ".+"')
    data_open_regex = re.compile(r'DATA \{')
    close_regex = re.compile(r' \}')
    data_lines = header_text.split('\n')
    in_attr = False  # currently inside an ATTRIBUTE block
    in_data = False  # currently inside a DATA { ... } block
    for line in data_lines:
        if attr_regex.search(line):
            in_attr = True
            # Attribute name is the quoted token on the ATTRIBUTE line.
            attr_name = re.search(r'ATTRIBUTE "(.+)"', line).group(1)
            attributes[attr_name] = ''
        elif data_open_regex.search(line):
            in_data = True
        elif in_data:
            if close_regex.search(line):
                in_data = False
            # elif data_item_regex.search(line):
            elif re.search(r'\(\d+\)\:', line):
                # data_name = re.search(r'\(\d+(,\d+)?\): "(.+)"', line).group(2)
                # Because the data fields can start or end with extra spaces
                # both inside and outside the quotation marks, there are
                # multiple calls to .strip().
                the_data = line.split(':')[1].strip().strip('"').strip()
                attributes[attr_name] = the_data
        elif in_attr and close_regex.search(line):
            in_attr = False
    return attributes
def get_odl_attr(metatext):
    """
    get interesting bits from ODL formatted metadata

    Parses each "AttrN: Name=... Type=... Count=... Value=..." record,
    decodes escaped ASCII codes in char attributes, and expands embedded
    ODL metadata blocks via parse_odl().
    """
    attrs = {}
    pattern = r'^\s*Attr\d+: Name=(.+?)\s*Type=(.+?)\s*Count=(.+?)\s*Value=(.+?)$'
    re_attr = re.compile(pattern, re.MULTILINE | re.DOTALL)

    for att in re_attr.finditer(metatext):
        name, dtype, count, value = att.groups()

        if 'char' in dtype:
            # interpret ASCII codes
            value = re.sub(r'\\000', '', value)  # null
            value = re.sub(r'\\011', '\t', value)  # horizontal tab
            value = re.sub(r'\\012', '\n', value)  # newline

        else:
            # add commas between array elements so they'll evaluate correctly
            # NOTE(review): eval() of file-supplied text is a code-execution
            # risk if the metadata is untrusted -- consider int(count) or
            # ast.literal_eval; verify the expected Count formats.
            if eval(count) > 1:
                value = ','.join(value.split())
            # evaluate string to numerical type
            value = set_type(value)

        if 'Metadata.' in name:
            # interpret ODL heirarchy
            value = parse_odl(value)

        # add attribute to dictionary
        attrs[name] = value

    # eliminate redundant info, then return dictionary.
    prune_odl(attrs)
    return attrs
def add_xml_group(group, attr):
    """
    Add xml attributes to attr and descend into nested groups.

    :param group: an xml.etree.ElementTree element whose children are
        'Attribute', 'Group' or 'Dataset' nodes (h5dump -u layout).
    :param attr: dictionary updated in place with Name -> data text.
    """
    for node in group:
        if node.tag == 'Attribute':
            try:
                key = node.attrib['Name']
                val = node.find('Data').find('DataFromFile').text.strip().strip('"')
                attr[key] = val
            except (AttributeError, KeyError):
                # Narrowed from a bare except: skip attributes missing a Name
                # (KeyError) or a Data/DataFromFile/text chain (AttributeError).
                pass
        elif node.tag == 'Group' or node.tag == 'Dataset':
            add_xml_group(node, attr)
def get_xml_attr(metaxml):
    """
    Parse xml formatted metadata.

    Returns a dictionary of attribute name -> value pairs collected from the
    RootGroup element of the given XML text.
    """
    import xml.etree.ElementTree as ET

    attributes = {}
    root_group = ET.fromstring(metaxml).find('RootGroup')
    add_xml_group(root_group, attributes)
    return attributes
def parse_odl(text):
    """Recursively extract ODL groups and objects."""
    # descend into the GROUP/OBJECT hierarchy; the backreferences \1 and \2
    # require each block to close with a matching END_GROUP/END_OBJECT name.
    block_re = re.compile(r"(GROUP|OBJECT)=(.+?)$(.+?)END_\1=\2",
                          re.MULTILINE | re.DOTALL)
    items = {}
    for _keyword, name, body in block_re.findall(text):
        items[name] = parse_odl(body)

    # innermost level: no nested blocks, so interpret lines as key/value pairs
    if not items:
        for line in text.splitlines():
            get_value(line, items)

    return items
def get_value(text, items=None):
    """Interpret text as key/value pairs, if possible."""
    if items is None:
        items = {}
    pieces = text.split('=', 1)
    # Lines without an '=' produce a single piece and are ignored, matching
    # the original's swallowed ValueError on unpacking.
    if len(pieces) == 2:
        key = pieces[0].strip()
        items[key] = set_type(pieces[1].strip())
    return items
547 
def set_type(value):
    """Parse string value into correct type"""
    # NOTE(review): eval() of file-supplied text is a code-execution risk if
    # the metadata is untrusted -- consider ast.literal_eval; note also that
    # exceptions other than the three caught here would propagate.
    try:
        return eval(value)
    except (NameError, SyntaxError, TypeError):
        return value  # leave unchanged anything that can't be evaluated
555 
def prune_odl(metadict):
    # Remove bulky/redundant ODL entries in place, then flatten the nested
    # hierarchy (all helpers come from seadasutils.DictUtils, imported as du).
    du.delete_key(metadict, 'StructMetadata.[0-9]')
    du.delete_key(metadict, '(NUM_VAL|CLASS)')
    du.promote_value(metadict, '.*VALUE')
    du.reassign_keys_in_dict(metadict,
                             'ADDITIONALATTRIBUTENAME', 'INFORMATIONCONTENT')
    du.flatten_dict(metadict)
    return
def add_xml_group(group, attr)
Definition: MetaUtils.py:489
def is_metadata_file(mime_data)
Definition: MetaUtils.py:124
def is_ascii_file(filename)
Definition: MetaUtils.py:78
def get_value(text, items=None)
Definition: MetaUtils.py:536
def set_type(value)
Definition: MetaUtils.py:548
def get_xml_attr(metaxml)
Definition: MetaUtils.py:505
def readMetadata(filename)
Definition: MetaUtils.py:201
def prune_odl(metadict)
Definition: MetaUtils.py:556
list(APPEND LIBS ${NETCDF_LIBRARIES}) find_package(GSL REQUIRED) include_directories($
Definition: CMakeLists.txt:8
def is_tar_file(file_path)
Definition: MetaUtils.py:108
void print(std::ostream &stream, const char *format)
Definition: PrintDebug.hpp:38
def get_hdf4_content(filename)
Definition: MetaUtils.py:13
def get_hdf5_header_xml(filename)
Definition: MetaUtils.py:48
def is_hdf4(mime_data)
Definition: MetaUtils.py:96
def get_hdf5_attr(header_text)
Definition: MetaUtils.py:420
def get_hdf5_header_plaintext(filename)
Definition: MetaUtils.py:30
#define isalpha(c)
def dump_metadata(filename)
Definition: MetaUtils.py:130
def is_netcdf4(mime_data)
Definition: MetaUtils.py:102
def get_mime_data(filename)
Definition: MetaUtils.py:68
def get_odl_attr(metatext)
Definition: MetaUtils.py:454
Definition: aerosol.c:136