NASA Logo
Ocean Color Science Software

ocssw V2022
l1aconvert_octs.py
Go to the documentation of this file.
#! /usr/bin/env python3

# Pyhdf use version 0.10.5.
# Version 0.11.3 has a bug that does not let you read a char dataset.
# The fix was addressed Aug 2023, but the latest pip package (as of Nov 2023)
# is still Jun 25, 2023. If a new version is released and it is > Aug 2023,
# use that one bc the fix should be merged.
# Fix commit: https://github.com/fhs/pyhdf/commit/e8f87adb3f308fc6dc6105aa631e2c659da8f4e6

# stdlib
import argparse
from datetime import datetime, timedelta, timezone
from pathlib import Path

# third-party
import numpy as np
from netCDF4 import Dataset as NC
from pyhdf.SD import SD as HDF
from pyhdf.SD import SDC

__version__ = "1.0 2024-11-01"
22 
23 
26 
# Constants used to open, read and write the files
HDF_READ = SDC.READ
NC_WRITE = "w"
NC_FORMAT = "NETCDF4"

# HDF4 global attributes that must NOT be copied verbatim into the NetCDF
# file.  The start/end year, day and millisec fields are rebuilt as ISO
# dates (time_coverage_start/end) by copyGlobalAttributes(), and the rest
# are either re-derived or deliberately dropped.
SKIP_ATTRIBUTES = (
    "Start Time",  # dont need these because of time_coverage_start and end
    "Start Year",
    "Start Day",
    "Start Millisec",
    "End Time",
    "End Year",
    "End Day",
    "End Millisec",
    "Scene Center Time",
    "Node Crossing Time",
    "Processing Time",
    "Latitude Units",
    "Longitude Units",
    "Scene Center Latitude",
    "Scene Center Longitude",
    "Scene Center Solar Zenith",
    "Upper Left Latitude",
    "Upper Left Longitude",
    "Upper Right Latitude",
    "Upper Right Longitude",
    "Lower Left Latitude",
    "Lower Left Longitude",
    "Lower Right Latitude",
    "Lower Right Longitude",
    "Northernmost Latitude",
    "Southernmost Latitude",
    "Westernmost Longitude",
    "Easternmost Longitude",
    "Start Center Latitude",
    "Start Center Longitude",
    "End Center Latitude",
    "End Center Longitude",
    "Pixels per Scan Line",
    "Number of Scan Lines",
    "Scene Center Scan Line",
    "Processing Log",
    "Filled Scan Lines",
    "FF Missing Frames",
    "Replacement Flag",
    "Processing Control",
)

# Maps an HDF4 storage type (key) to the NumPy type the NetCDF API uses
# when creating a variable (value).  The table only covers the types that
# actually occur in **OCTS** L1A files; python has no switch, so a dict
# lookup stands in for one.
GET_NC_TYPES = {
    SDC.UINT8: np.uint8,      # ubyte
    SDC.FLOAT32: np.float32,  # float
    SDC.FLOAT64: np.float64,  # double
    SDC.INT16: np.int16,      # short
    SDC.UINT16: np.uint16,    # ushort
    SDC.UINT32: np.uint32,    # uint
    SDC.INT32: np.int32,      # int
    SDC.CHAR: 'S1',           # string stored as 8bit char array
}
93 
# Handles to the open input/output files; set in main() and shared by the
# copy functions below so they do not need to pass the files around.
hdfFile = None
ncFile = None

def convertToNetcdfName(hdfname: str):
    """Translate an HDF attribute/dataset name to NetCDF naming.

    ie: "Data Center" becomes "data_center"
    """
    return hdfname.replace(" ", "_").lower()

# Name of whatever dataset the program was trying to copy/do.
# If an error occurs, we'll know what to look at.
errorAt = ""
106 
# copy global attributes from the hdf4 file into new NETCDF file
def copyGlobalAttributes(iFile: str, oFile: str):
    """Copy the HDF4 global attributes into the NetCDF file.

    Attributes listed in SKIP_ATTRIBUTES are not copied verbatim; the
    time-related ones are rebuilt here as ISO 8601 strings
    (node_crossing_time, scene_center_time, time_coverage_start/end).
    A history attribute, date_created and a set of static OBPG metadata
    attributes are added as well.

    iFile/oFile are only used to build the history attribute and the
    product_name (oFile is a pathlib.Path).  Exits the program on error.
    """
    global hdfFile
    global ncFile
    global errorAt

    try:
        print("Copying global attributes")
        globalAttr = hdfFile.attributes()

        for name, val in globalAttr.items():
            errorAt = name

            if (name in SKIP_ATTRIBUTES): continue

            # handle different data types when copying:
            # strings pass through unchanged, numbers are normalized
            # to 32-bit NumPy types
            if (isinstance(val, str)):
                ncFile.setncattr(convertToNetcdfName(name), val)
            else:
                val = np.float32(val) if isinstance(val, float) else np.int32(val)
                ncFile.setncattr(convertToNetcdfName(name), val)
            # NOTE: a second, redundant setncattr of the same name/val was
            # removed here (it rewrote the attribute just written above)

        # convert node crossing time
        # :node_crossing_time = "19961101 15:37:06.301" ;
        errorAt = "node_crossing_time"
        node_crossing_str = globalAttr.get("Node Crossing Time")
        ncd = node_crossing_str[:8]
        nct = node_crossing_str[9:17]
        fsec = node_crossing_str[18:21]
        node_crossing = datetime.strptime(ncd + nct, "%Y%m%d%H:%M:%S")
        ncFile.setncattr("node_crossing_time", str(node_crossing.strftime('%Y-%m-%dT%H:%M:%S.') + fsec[:3] + 'Z'))

        # convert scene center time
        # BUG FIX: this previously sliced node_crossing_str, so
        # scene_center_time silently duplicated node_crossing_time
        errorAt = "scene_center_time"
        scene_center_str = globalAttr.get("Scene Center Time")
        scd = scene_center_str[:8]
        sct = scene_center_str[9:17]
        fsec = scene_center_str[18:21]
        scene_center = datetime.strptime(scd + sct, "%Y%m%d%H:%M:%S")
        ncFile.setncattr("scene_center_time", str(scene_center.strftime('%Y-%m-%dT%H:%M:%S.') + fsec[:3] + 'Z'))

        # make isodate for start time
        errorAt = "time_coverage_start"
        stime_str = globalAttr.get("Start Time")
        scd = stime_str[:8]
        sct = stime_str[9:17]
        fsec = stime_str[18:21]
        stime = datetime.strptime(scd + sct, "%Y%m%d%H:%M:%S")
        ncFile.setncattr("time_coverage_start", str(stime.strftime('%Y-%m-%dT%H:%M:%S.') + fsec[:3] + 'Z'))

        # make isodate for end time
        errorAt = "time_coverage_end"
        etime_str = globalAttr.get("End Time")
        ecd = etime_str[:8]
        ect = etime_str[9:17]
        fsec = etime_str[18:21]
        etime = datetime.strptime(ecd + ect, "%Y%m%d%H:%M:%S")
        ncFile.setncattr("time_coverage_end", str(etime.strftime('%Y-%m-%dT%H:%M:%S.') + fsec[:3] + 'Z'))

        # add converter version
        ncFile.setncattr("l1aconvert_octs_version", __version__)

        # netcdf has history global attribute, so add one here:
        # to convert a file, it's static except for the input and output files.
        # BUG FIX: the command name previously read "l1acovert_seawifs"
        # (copy/paste from the SeaWiFS converter); this is the OCTS converter.
        ncFile.setncattr("history", "{}; python3 l1aconvert_octs {} {}".format(globalAttr.get("Processing Control")[:-1], iFile, oFile))

        # add date_created
        errorAt = "date_created"
        create_date = datetime.now(tz=timezone.utc)
        ncFile.setncattr("date_created", str(create_date.strftime('%Y-%m-%dT%H:%M:%SZ')))

        # some static metadata
        ncFile.setncattr("cdm_data_type", "swath")
        ncFile.setncattr("Conventions", "CF-1.8, ACDD-1.3")
        ncFile.setncattr("creator_email", "data@oceancolor.gsfc.nasa.gov")
        ncFile.setncattr("creator_name", "NASA/GSFC/OBPG")
        ncFile.setncattr("creator_url", "https://oceancolor.gsfc.nasa.gov")
        ncFile.setncattr("institution", "NASA Goddard Space Flight Center, Ocean Biology Processing Group")
        ncFile.setncattr("instrument", "OCTS")
        ncFile.setncattr("keywords_vocabulary", "NASA Global Change Master Directory (GCMD) Science Keywords")
        ncFile.setncattr("license", "https://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-and-information-policy")
        ncFile.setncattr("naming_authority", "gov.nasa.gsfc.oceancolor")
        ncFile.setncattr("platform", "ADEOS")
        ncFile.setncattr("processing_level", "L1A")
        ncFile.setncattr("processing_version", "V2")
        ncFile.setncattr("product_name", "{}".format(oFile.name))
        ncFile.setncattr("project", "Ocean Biology Processing Group")
        ncFile.setncattr("publisher_email", "data@oceancolor.gsfc.nasa.gov")
        ncFile.setncattr("publisher_name", "NASA/GSFC/OB.DAAC")
        ncFile.setncattr("publisher_url", "https://oceancolor.gsfc.nasa.gov")
        ncFile.setncattr("standard_name_vocabulary", "CF Standard Name Table v79")

    except Exception as e:
        # report the reason as well as the attribute being processed
        print(f"-E- Error copying global attributes. Was processing <{errorAt}> from HDF4 when error was caught.")
        print(f"Reason: {e}")
        exit()
    errorAt = ""  # reset
212 
213 
# Given a NetCDF variable, assign the attributes in attrDict.
# attrDict is the attribute dict from the HDF4 dataset; this copies
# it over onto the NetCDF variable.
def assignNcVarAttr(ncVar, attrDict: dict, dataType):
    """Copy HDF4 dataset attributes (long_name, units, valid range)
    onto an already-created NetCDF variable.

    The HDF4 "valid range" becomes valid_min/valid_max, cast to the
    NumPy type matching the variable's NetCDF data type.
    """
    for attr, attrVal in attrDict.items():
        if attr in ("long_name", "long name"):
            ncVar.long_name = attrVal
        if attr == "units":
            # l1a_data deliberately carries no units attribute
            if ncVar.name != 'l1a_data':
                ncVar.units = attrVal

        # valid range is valid_min and valid_max in netcdf
        if attr in ("valid_range", "valid range"):
            validRange = attrVal

            # valid range can be a string tuple like '(1,3)'; slice off
            # the parentheses and split on the comma -- otherwise it is
            # already a list
            if isinstance(validRange, str):
                validRange = attrVal[1:-2].split(",")

            # pick the NumPy caster matching the NetCDF type; anything
            # unrecognized (e.g. the 'S1' char type) falls back to int32,
            # mirroring the original if/elif ladder
            casters = {
                np.float32: np.float32,  # float
                np.float64: np.float64,  # double
                np.uint16: np.uint16,    # ushort
                np.uint32: np.uint32,    # uint
                np.int16: np.int16,      # short
            }
            cast = casters.get(dataType, np.int32)

            # assign it to valid min and max
            ncVar.valid_min = cast(validRange[0])
            ncVar.valid_max = cast(validRange[1])
267 
def getChunking(ncDims):
    """Return a chunk-size list for the given NetCDF dimension names.

    For each dimension name, the preferred chunk size comes from the
    hard-coded table below; if the preferred size would exceed the
    actual dimension length (or the name is unknown), the chunk size
    falls back to 1.  Reads the module-global ncFile for the actual
    dimension lengths.
    """
    global ncFile
    # preferred chunk size per OCTS dimension name
    sizes = {
        'bbt': 5,
        'ndatas': 6,
        'bsamp': 5,
        'sline': 7,
        'nsamp': 400,
        'ndatas12': 72,
        'lines': 32,
        'nrec': 1,
        'ntilt': 3,
        'gains': 4,
        'reft': 1,
        'ntbl': 1,
        'pixels': 128,
        'err': 1,
        'vec': 3,
        'instt': 4,
        'instr': 120,
        'points': 1440,
        'sets': 7,
        'mat': 3,
        'detectors': 10,
        'odatas22': 22,
        'rec': 32,
        'tilts': 1,
        'dets': 1,
        'ninf': 1,
        'rads': 1,
        'blines': 200,
        'coefs': 4,
        'pxls': 7,
        'pref': 128,
        'pairs': 2,
        'flag': 24,
        'sdet': 2,
        'ndatas22': 132,
        'bands': 8,
        'ndatas14': 84,
        'bdatas': 2,
        'odatas': 1,
    }
    chunks = []
    for dim in ncDims:
        dimsize = len(ncFile.dimensions[dim])
        # ROBUSTNESS FIX: unknown dimension names previously raised
        # KeyError; default the preferred size to 1 instead
        preferred = sizes.get(dim, 1)
        chunks.append(preferred if preferred <= dimsize else 1)
    return chunks
319 
# Retrieves the HDF4 dataset and get all the required information to build
# the NetCDF version.
# NOTE(review): the original `def` line was lost in extraction; the name
# is reconstructed from the call in main() -- confirm upstream.
def copyDatasets():
    """Copy every HDF4 science dataset into the NetCDF file as a
    compressed, chunked variable, along with its attributes.

    Exits the program on error, reporting the dataset being copied.
    """
    global hdfFile
    global ncFile
    global errorAt

    # datasets deliberately left out of the NetCDF file
    skipSDS = (
        'ntilts',
        'entry_year',
        'entry_day',
        'ref_year',
        'ref_day',
        'ref_minute',
    )
    try:
        print("Copying datasets/variables...")

        for name in hdfFile.datasets().keys():
            if name in skipSDS:
                continue
            errorAt = name

            # pull everything needed from the HDF4 side
            currSet = hdfFile.select(name)
            hdfDataType = currSet.info()[3]        # index 3 gives the dataset's data type
            hdfDims = currSet.dimensions().keys()  # dimension names
            data = currSet.get()                   # the actual data values
            hdfDatasetAttr = currSet.attributes()  # variable attrs like long_name, units, etc.

            # map the HDF4 type to its NetCDF equivalent, then create
            # the compressed variable and assign the data in one slice
            ncDatatype = GET_NC_TYPES.get(hdfDataType)
            chunks = getChunking(hdfDims)
            newVariable = ncFile.createVariable(convertToNetcdfName(name), ncDatatype, hdfDims, chunksizes=chunks, zlib=True)
            newVariable[:] = data

            # copy the per-variable attributes; newVariable is an object,
            # so the helper mutates it in place
            assignNcVarAttr(newVariable, hdfDatasetAttr, ncDatatype)
            if name == 'msec':
                newVariable.setncattr("units", "milliseconds")
                newVariable.setncattr("comment", "milliseconds since start of day, day boundary crossing indicated if value is smaller than previous scan")

    except Exception as e:
        print(f"-E- Error copying datasets/variables. Error occurred with HDF4 dataset named <{errorAt}>")
        print(f"Reason: {e}")
        exit()
373 
# NOTE(review): the original `def` line for this function was lost in
# extraction; the name below is a reconstruction -- confirm against the
# (also missing) call near the end of main().
def addTimeVariable():
    """Add a CF-compliant 'time' variable (seconds since 1970-1-1).

    Each scan's time is day-of-granule start plus the per-scan 'msec'
    (milliseconds since start of day); a value earlier than the granule
    start means the scan crossed into the next day.  Exits on error.
    """
    global ncFile

    try:
        print("adding CF-compliant time variable...")

        msecs = ncFile['msec'][:]
        timevar = np.zeros(len(msecs), np.float64)
        time_coverage_start = datetime.fromisoformat(ncFile.time_coverage_start[:23])
        day_start = datetime.strptime(time_coverage_start.strftime('%Y%m%d'), '%Y%m%d')
        # time_coverage_start is written by copyGlobalAttributes as a UTC
        # timestamp, so epoch arithmetic below is done in UTC
        epoch = datetime(1970, 1, 1)
        for i, msec in enumerate(msecs):
            seconds = msec / 1000.
            scantime = day_start + timedelta(seconds=seconds)
            if scantime < time_coverage_start:
                scantime = scantime + timedelta(days=1)
            # BUG FIX: strftime('%s') is non-portable (not in the C89 set,
            # absent on Windows) and interprets the naive datetime in the
            # *local* timezone; compute the UTC epoch offset directly.
            timevar[i] = (scantime - epoch).total_seconds()
        chunksize = [32]
        if len(ncFile.dimensions['rec']) < chunksize[0]:
            chunksize[0] = 1
        timeVariable = ncFile.createVariable('time', np.float64, 'rec', chunksizes=chunksize, zlib=True)
        timeVariable[:] = timevar
        timeVariable.setncattr("long_name", "time")
        timeVariable.setncattr("units", "seconds since 1970-1-1")

    except Exception as e:
        print(f"-E- Error occurred with adding time dataset...")
        print(f"Reason: {e}")
        exit()
402 
# Runs at the end of the program, closing both files.
# NOTE(review): the original `def` line was lost in extraction; the name
# is reconstructed from the call in main().
def closeFiles():
    """Close the input HDF4 file and the output NetCDF file."""
    global hdfFile
    global ncFile

    print("Closing HDF4 File...")
    hdfFile.end()
    print("Closing NetCDF File...")
    ncFile.close()
412 
413 
# Given a list of variables, extract the dimension data from it.
# pyhdf lib does not have a way to get only the dims, you need to
# go through the variables and get it.
# NOTE(review): the original `def` line was lost in extraction; the name
# is reconstructed from the call in main().
def setDimensions():
    """Create every dimension the NetCDF file needs.

    Most OCTS dimension sizes are fixed and hard-coded below; the
    data-driven ones (lines, rec, blines, sline, nsamp, sets, pxls,
    bsamp, ...) are pulled from a handful of representative HDF4
    datasets.  Exits on error.
    """
    global hdfFile
    global ncFile
    global errorAt

    errorAt = "Setting Dimensions"
    # a set is used so dims seen on multiple variables are not repeated
    dimSet = set()
    # fixed-size dimensions
    for fixedDim in (
        ("tilts", 1),
        ("bands", 8),
        ("bbt", 5),
        ("ndatas", 6),
        ("ndatas12", 72),
        ("nrec", 1),
        ("ntilt", 3),
        ("gains", 4),
        ("reft", 1),
        ("ntbl", 1),
        ("pixels", 2222),
        ("err", 1),
        ("vec", 3),
        ("instt", 4),
        ("instr", 120),
        ("points", 1440),
        ("mat", 3),
        ("detectors", 10),
        ("odatas22", 22),
        ("dets", 1),
        ("ninf", 1),
        ("rads", 1),
        ("coefs", 4),
        ("pref", 128),
        ("pairs", 2),
        ("flag", 24),
        ("sdet", 2),
        ("ndatas22", 132),
        ("ndatas14", 84),
        ("bdatas", 2),
        ("odatas", 1),
    ):
        dimSet.add(fixedDim)
    try:
        errorAt = "extract dimension details from HDF file"
        # get key, value pair of all the HDF datasets
        hdfDatasets = hdfFile.datasets()
        # variables whose dims cover the sizes not hard-coded above
        varNames = ["l1a_data",
                    "lat",
                    "blk_data",
                    "start_line",
                    "samp_table"]

        for varName in varNames:
            errorAt = f"extract {varName} from HDF file"
            varInfo = hdfDatasets[varName]  # returns something like: (('bands',), (8,), 22, 0)
            # pair each dim name with its value and save into the set
            for dimName, dimVal in zip(varInfo[0], varInfo[1]):
                dimSet.add((convertToNetcdfName(dimName), dimVal))

        # after gathering all the dims, register each one with NetCDF
        for dim in dimSet:
            errorAt = f"set {dim} in the NetCDF file"
            ncFile.createDimension(dim[0], dim[1])

    except:
        print(f"-E- Error copying dimensions. Was trying to {errorAt} when error was caught.")
        exit()
500 
def main():
    """Entry point: parse arguments, open both files and run the
    conversion steps in order."""
    print(f"l1aconvert_octs {__version__}")

    # get global reference to the file variables
    global ncFile
    global hdfFile

    parser = argparse.ArgumentParser(description='''
    Given an HDF4 Scientific Dataset (SD), convert it
    into a NetCDF (NC) file format with the same name.
    If the optional argument [oFile] name is given,
    output as the oFile name.
    ''')
    parser.add_argument("ifile", type=str, help="input HDF4 File")
    parser.add_argument("ofile", nargs='?', type=str, help="Optional output file name; default = input + '.nc'")
    parser.add_argument("--doi", "-d", type=str, default=None, help="DOI string for metadata")

    # when ofile isn't given, default to the input filename + .nc
    args = parser.parse_args()
    fileName = args.ifile
    oFileName = Path(fileName + ".nc" if args.ofile is None else args.ofile)

    print(f"Input File:\t{fileName}")
    print(f"Output File:\t{oFileName}\n")

    # Opening the hdf4 file for reading:
    try:
        hdfFile = HDF(fileName, HDF_READ)
        print(f"Opening file:\t{fileName}")
    except:
        print(f"\n-E- Error opening file named: {fileName}.\n Make sure the filetype is hdf4.\n")
        exit()

    # if opened successfully, create netcdf file to be written into
    ncFile = NC(oFileName, NC_WRITE, NC_FORMAT)
    setDimensions()
    copyGlobalAttributes(fileName, oFileName)
    if args.doi:
        ncFile.setncattr("doi", args.doi)
    copyDatasets()
    # NOTE(review): the scraped source is missing one original line here
    # (line 544) -- presumably the call that adds the CF time variable;
    # confirm against the upstream repository.
    closeFiles()
    print("Conversion of L1A HDF4 to NETCDF successful")

if __name__ == '__main__':
    main()
#define NC
Definition: l1_octs.c:42
def convertToNetcdfName(str hdfname)
void print(std::ostream &stream, const char *format)
Definition: PrintDebug.hpp:38
def getChunking(ncDims)
Definition: aerosol.c:136
def copyGlobalAttributes(str iFile, str oFile)
def assignNcVarAttr(ncVar, dict attrDict, dataType)