acgc.icartt

Read and write ICARTT (ffi1001) format files

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3'''Read and write ICARTT (ffi1001) format files
  4'''
  5
  6import os
  7import pandas as pd
  8
  9
 10def read_icartt( files, usePickle=False, timeIndex=False ):
 11    '''Read ICARTT file or files into a pandas DataFrame
 12    
 13    Parameters
 14    ----------
 15    files : list or str
 16        path to ICARTT file or files that will be read. 
 17        Data within these files will be concatenated, so files should all contain the same variables
 18    usePickle : bool, default=False
 19        if usePickle=True, the data will be written to a pkl file with ".pkl" appended to path
 20        On subsequent read_icartt calls, data will be read from the .pkl file, if it exists
 21    timeIndex : bool, default=False
 22        sets DataFrame index to the time variable from the ICARTT file, rather than a row counter
 23        
 24    Returns
 25    -------
 26    obs : pandas.DataFrame
 27        ICARTT file contents. In addition to column names for the ICARTT variables, 
 28        the DataFrame columns also include 'time' in pandas.DatetimeIndex format and 
 29        'file' that is the ordinal number of the input file that each row was read from
 30    '''
 31
 32    # Files input must be string or list of strings
 33    if isinstance( files, str ):
 34        # Convert to list
 35        files = [files]
 36    elif isinstance( files, list ):
 37        # Do nothing
 38        pass
 39    else:
 40        raise TypeError( 'read_icartt: files must be a filename or list of filenames' )
 41
 42    obsall = []
 43    for n,file in enumerate(files):
 44
 45        # Read from Pickle file, if it exists and is requested
 46        pklfile = file+'.pkl'
 47        if (usePickle and os.path.isfile(pklfile)):
 48            obs = pd.read_pickle(pklfile)
 49
 50        else:
 51
 52            # Ensure file exists
 53            if not os.path.isfile( file ):
 54                raise FileNotFoundError( file+" doesn't exist" )
 55
 56            # Read the ICARTT file
 57            with open( file, 'r', encoding='ascii' ) as f:
 58
 59                # Read the number of header lines
 60                nheader, fmt = f.readline().split(',')
 61                nheader = int(nheader)
 62
 63                # Ensure this is a 
 64                if int(fmt) != 1001:
 65                    raise Exception( 'read_icartt: '+file+' is not an ICARTT (ffi1001) file' )
 66
 67                # Skip 5 lines
 68                for junk in range(5):
 69                    next(f)
 70
 71                # Read date
 72                year, month, day = map( int, f.readline().split(',')[0:3] )
 73
 74                # Skip line
 75                next(f)
 76
 77                # Read name of the time variable
 78                tname, tunit = [s.strip() for s in f.readline().split(',')[0:2]]
 79
 80                # Raise exception if the time unit is not seconds;
 81                # may need to be handled differently below
 82                if (tunit in ['s','seconds','seconds (from midnight UTC)','seconds_past_midnight']):
 83                    # Use unit expected by pandas
 84                    tunit = 's'
 85                else:
 86                    print(tunit)
 87                    raise Exception( 'read_icartt: time expected in seconds (s); '+
 88                                     'unit in file: ',tunit )
 89
 90                # Number of dependent variables
 91                nvar = int( f.readline() )
 92
 93                # Scale factor for dependent variables
 94                scale = [ float(s) for s in f.readline().split(',') ]
 95
 96                # Missing value flag for dependent variables
 97                naflag = [ s.strip() for s in f.readline().split(',') ]
 98
 99                # Dependent variable names from the next nvar lines
100                varnames = [ f.readline().split(',')[0] for v in range(nvar) ]
101
102                # Missing flags for each variable as a dict
103                nadict = { varnames[i]: naflag[i] for i in range(nvar) }
104
105            # Read data
106            obs = pd.read_csv(file, skiprows=nheader-1,
107                              na_values=nadict, skipinitialspace=True)
108
109            # Catch missing data that are not reported as integers
110            #obs[obs==-99999] = np.nan
111
112            # Strip whitespace from column names
113            #obs.columns = obs.columns.str.strip()
114
115            # Apply scale factors
116            for i, s in enumerate(scale):
117                if s != 1:
118                    obs[varnames[i]] *= s
119
120            # Add a time variable in datetime format
121            obs['time'] = pd.DatetimeIndex( pd.Timestamp(year=year,month=month,day=day) +
122                                         pd.TimedeltaIndex(obs[tname],tunit) )
123
124            # Add flight number
125            obs['file'] = n+1
126
127            # Use time variable for index
128            if timeIndex:
129                obs.index = obs.time
130
131            # Save pickle
132            if usePickle:
133                obs.to_pickle(pklfile)
134
135        # Add to list
136        obsall.append(obs)
137
138    # Concatenate all files into one dataframe
139    obs = pd.concat( obsall, sort=True )
140
141    return obs
142
143def _get(obj,name,default_value=None):
144    '''Get value from either attribute or key `name` 
145    
146    Parameters
147    ----------
148    obj : dict or object
149    name : str
150        name of an attribute or key
151    default_value :
152        return value if the attribute or key do not exist
153    
154    Returns
155    -------
156    value :
157        value of attribute or dict key
158    '''
159    try:
160        # Try access as attribute
161        value = getattr(obj,name)
162    except AttributeError:
163        try:
164            # Try access as dict key
165            value = obj[name]
166        except KeyError:
167            # Not found so return default value
168            value = default_value
169    return value
170
def _as_lines(value):
    '''Return comment `value` as a list of single-line strings

    A str is split at line breaks (``list(str)`` would give one entry per character);
    any other iterable is converted element by element.
    '''
    if isinstance(value, str):
        return value.splitlines()
    return [str(item) for item in value]

def write_icartt(filename, df, metadata, **kwargs):
    '''Write an ICARTT ffi1001 file

    The contents of a pandas DataFrame (``df``) are written to a text file in ICARTT format
    using `metadata` to specify which variables are written and provide ICARTT file header.

    ICARTT file format specification document:
    https://www.earthdata.nasa.gov/esdis/esco/standards-and-practices/icartt-file-format

    Parameters
    ----------
    filename : str
        File to be created
    df : pandas.DataFrame
        Data values that will be written
    metadata : dict or obj
        See notes below for the attributes or keys that `metadata` must contain
    **kwargs
        passed to pandas.to_csv

    Raises
    ------
    KeyError
        if required metadata are missing or if variables named in `metadata`
        are not columns of `df`

    Notes
    -----
    `metadata` can be a dict or any object, so long as it contains the following
    attributes or keys. Names are case sensitive and must be spelled as shown.
    - INDEPENDENT_VARIABLE_DEFINITION (dict)
        should have only one key
    - DEPENDENT_VARIABLE_DEFINITION (dict)
        Controls which variables from `df` are written to file
    - measurement_start_date (pandas.Timestamp or datetime.datetime)
        date UTC when measurement collection began
    - PI_NAME (str)
    - PI_CONTACT_INFO (str)
    - ORGANIZATION_NAME (str)
    - SOURCE_DESCRIPTION (str)
    - DM_CONTACT_INFO (str)
    - MISSION_NAME (str)
    - PROJECT_INFO (str)
    - SPECIAL_COMMENTS (str or list of str)
        optional
    - PLATFORM (str)
    - LOCATION (str)
    - ASSOCIATED_DATA (str)
    - INSTRUMENT_INFO (str)
    - DATA_INFO (str)
    - UNCERTAINTY (str)
    - ULOD_FLAG (str)
        commonly '-7777'
    - ULOD_VALUE (str)
    - LLOD_FLAG (str)
        commonly '-8888'
    - LLOD_VALUE (str)
    - STIPULATIONS_ON_USE (str)
    - OTHER_COMMENTS (str)
    - REVISION (str)
    - REVISION_COMMENTS (str or list of str)

    The `INDEPENDENT_VARIABLE_DEFINITION` and `DEPENDENT_VARIABLE_DEFINITION` are dicts
    with entries of the form `{'VariableName':'units, standard name, [optional long name]'}`
    The keys must correspond to columns of `df`.
    `INDEPENDENT_VARIABLE_DEFINITION` should have only one key while
    `DEPENDENT_VARIABLE_DEFINITION` can have many. For example,
    ``metadata.INDEPENDENT_VARIABLE_DEFINITION =
            {'Time_Start':'seconds, time at start of measurement, seconds since midnight UTC'}``
    See Examples below.

    Missing values in `df` are written as -9999 and all scale factors are 1.


    Examples
    --------
    ```
    import pandas as pd
    from acgc import icartt

    df = pd.DataFrame( [[1,0,30],
                        [2,10,29],
                        [3,20,27],
                        [4,30,25]],
                        columns=['Time_Start','Alt','Temp'])

    metadata = dict(
        INDEPENDENT_VARIABLE_DEFINITION =
            {'Time_Start':'seconds, time, measurement time in seconds after takeoff'},
        DEPENDENT_VARIABLE_DEFINITION =
            {'Alt':'m, altitude, altitude above ground level',
             'Temp':'C, temperature, air temperature in Celsius'},
        PI_NAME = 'Jane Doe',
        ORGANIZATION_NAME = 'NASA',
        SOURCE_DESCRIPTION = 'Invented Instrument',
        MISSION_NAME = 'FIREX-AQ',
        SPECIAL_COMMENTS = ['Special comments are optional and can be omitted.',
                        'If used, they should be a list of one or more strings'],
        PI_CONTACT_INFO = 'jdoe@email.com or postal address',
        PLATFORM = 'NASA DC-8',
        LOCATION = 'Boise, ID, USA',
        ASSOCIATED_DATA = 'N/A',
        INSTRUMENT_INFO = 'N/A',
        DATA_INFO = 'N/A',
        UNCERTAINTY = r'10% uncertainty in all values',
        ULOD_FLAG = '-7777',
        ULOD_VALUE = 'N/A',
        LLOD_FLAG = '-8888',
        LLOD_VALUE = 'N/A',
        DM_CONTACT_INFO = 'Alice, data manager, alice@email.com',
        STIPULATIONS_ON_USE = 'FIREX-AQ Data Use Policy',
        PROJECT_INFO = 'FIREX-AQ 2019, https://project.com',
        OTHER_COMMENTS = 'One line of comments',
        REVISION = 'R1',
        REVISION_COMMENTS = ['R0: Initial data',
                            'R1: One string per revision'],
        measurement_start_date = pd.Timestamp('2020-01-30 10:20')
        )

    icartt.write_icartt( 'test.ict', df, metadata )
    ```
    '''

    # Metadata written one per line at the top of the header
    header_keys = ['PI_NAME',
                   'ORGANIZATION_NAME',
                   'SOURCE_DESCRIPTION',
                   'MISSION_NAME']

    # Metadata written as "KEYWORD: value" in the normal comments section
    normal_comments = ['PI_CONTACT_INFO',
                'PLATFORM',
                'LOCATION',
                'ASSOCIATED_DATA',
                'INSTRUMENT_INFO',
                'DATA_INFO',
                'UNCERTAINTY',
                'ULOD_FLAG',
                'ULOD_VALUE',
                'LLOD_FLAG',
                'LLOD_VALUE',
                'DM_CONTACT_INFO',
                'PROJECT_INFO',
                'STIPULATIONS_ON_USE',
                'OTHER_COMMENTS',
                'REVISION',
                'REVISION_COMMENTS']

    # Check required metadata before anything is written,
    # so that a failure does not leave a partial file behind
    required = ['INDEPENDENT_VARIABLE_DEFINITION',
                'DEPENDENT_VARIABLE_DEFINITION',
                'measurement_start_date'] + header_keys + normal_comments
    missingkeys = [k for k in required if _get(metadata, k) is None]
    if missingkeys:
        raise KeyError('Some required metadata are missing: '+str(missingkeys))

    independent = _get(metadata, 'INDEPENDENT_VARIABLE_DEFINITION')
    dependent = _get(metadata, 'DEPENDENT_VARIABLE_DEFINITION')

    # Variables that will be written to file
    ictvars = list(independent) + list(dependent)

    # Variables that are not in the dataframe
    missingvars = set(ictvars) - set(df.columns)

    # Raise an error if there are missing variables
    if missingvars:
        raise KeyError('Some output variables are not in the DataFrame: '+str(missingvars))

    # Coerce to Timestamp
    measurement_start_date = pd.Timestamp( _get(metadata, 'measurement_start_date') )

    # Form the header
    header = [ str(_get(metadata, k)) for k in header_keys ]

    # File volume
    header.append( '1, 1')

    # Date line: measurement date followed by today's date
    header.append( measurement_start_date.strftime('%Y, %m, %d, ') +
                    pd.Timestamp.today().strftime('%Y, %m, %d'))

    # Time interval
    # Time spacing between records, unique values
    independent_variable_name = list(independent)[0]
    dt = df[independent_variable_name].diff(1).dropna().unique()
    if len(dt)==1:
        # Constant time interval, use value
        interval = dt[0]
    else:
        # Time interval is not constant so code as 0
        interval = 0
    header.append( str(interval) )

    # Independent variable
    header.append( f'{independent_variable_name}, {independent[independent_variable_name]}' )

    # Dependent variables
    nvars = len(dependent)
    header.append( str(nvars) )                 # Number of dependent variables
    header.append( ','.join(['1']*nvars) )      # Scale factors for dependent variables
    header.append( ','.join(['-9999']*nvars) )  # Missing data flags for dependent vars.
    for kn, definition in dependent.items():
        header.append( f'{kn}, {definition}' )  # Dependent variable definitions

    # Special comments (optional): a string or a list of several lines
    v = _get( metadata, 'SPECIAL_COMMENTS' )
    special = _as_lines(v) if v else []
    header.append( str(len(special)) )
    header.extend( special )

    # Normal Comments
    nc = []
    for kn in normal_comments:
        v = _get( metadata, kn )
        if kn=='REVISION_COMMENTS':
            # A string or a list of several lines
            nc.extend( _as_lines(v) )
        else:
            nc.append( f'{kn}: {v}' )
    # Variable short names
    nc.append( ', '.join(ictvars))
    # Add normal comments to the header
    header.append( str(len(nc)) )
    header.extend( nc )

    # Write the file
    with open(filename,'w',encoding='ascii') as f:
        f.write(f'{len(header)+1:d}, 1001\n')  # +1 accounts for this line
        for line in header:
            f.write(line+'\n')
        df[ictvars].to_csv( f,
                            index=False,
                            header=False,
                            na_rep='-9999',
                            **kwargs )
def read_icartt(files, usePickle=False, timeIndex=False):
 11def read_icartt( files, usePickle=False, timeIndex=False ):
 12    '''Read ICARTT file or files into a pandas DataFrame
 13    
 14    Parameters
 15    ----------
 16    files : list or str
 17        path to ICARTT file or files that will be read. 
 18        Data within these files will be concatenated, so files should all contain the same variables
 19    usePickle : bool, default=False
 20        if usePickle=True, the data will be written to a pkl file with ".pkl" appended to path
 21        On subsequent read_icartt calls, data will be read from the .pkl file, if it exists
 22    timeIndex : bool, default=False
 23        sets DataFrame index to the time variable from the ICARTT file, rather than a row counter
 24        
 25    Returns
 26    -------
 27    obs : pandas.DataFrame
 28        ICARTT file contents. In addition to column names for the ICARTT variables, 
 29        the DataFrame columns also include 'time' in pandas.DatetimeIndex format and 
 30        'file' that is the ordinal number of the input file that each row was read from
 31    '''
 32
 33    # Files input must be string or list of strings
 34    if isinstance( files, str ):
 35        # Convert to list
 36        files = [files]
 37    elif isinstance( files, list ):
 38        # Do nothing
 39        pass
 40    else:
 41        raise TypeError( 'read_icartt: files must be a filename or list of filenames' )
 42
 43    obsall = []
 44    for n,file in enumerate(files):
 45
 46        # Read from Pickle file, if it exists and is requested
 47        pklfile = file+'.pkl'
 48        if (usePickle and os.path.isfile(pklfile)):
 49            obs = pd.read_pickle(pklfile)
 50
 51        else:
 52
 53            # Ensure file exists
 54            if not os.path.isfile( file ):
 55                raise FileNotFoundError( file+" doesn't exist" )
 56
 57            # Read the ICARTT file
 58            with open( file, 'r', encoding='ascii' ) as f:
 59
 60                # Read the number of header lines
 61                nheader, fmt = f.readline().split(',')
 62                nheader = int(nheader)
 63
 64                # Ensure this is a 
 65                if int(fmt) != 1001:
 66                    raise Exception( 'read_icartt: '+file+' is not an ICARTT (ffi1001) file' )
 67
 68                # Skip 5 lines
 69                for junk in range(5):
 70                    next(f)
 71
 72                # Read date
 73                year, month, day = map( int, f.readline().split(',')[0:3] )
 74
 75                # Skip line
 76                next(f)
 77
 78                # Read name of the time variable
 79                tname, tunit = [s.strip() for s in f.readline().split(',')[0:2]]
 80
 81                # Raise exception if the time unit is not seconds;
 82                # may need to be handled differently below
 83                if (tunit in ['s','seconds','seconds (from midnight UTC)','seconds_past_midnight']):
 84                    # Use unit expected by pandas
 85                    tunit = 's'
 86                else:
 87                    print(tunit)
 88                    raise Exception( 'read_icartt: time expected in seconds (s); '+
 89                                     'unit in file: ',tunit )
 90
 91                # Number of dependent variables
 92                nvar = int( f.readline() )
 93
 94                # Scale factor for dependent variables
 95                scale = [ float(s) for s in f.readline().split(',') ]
 96
 97                # Missing value flag for dependent variables
 98                naflag = [ s.strip() for s in f.readline().split(',') ]
 99
100                # Dependent variable names from the next nvar lines
101                varnames = [ f.readline().split(',')[0] for v in range(nvar) ]
102
103                # Missing flags for each variable as a dict
104                nadict = { varnames[i]: naflag[i] for i in range(nvar) }
105
106            # Read data
107            obs = pd.read_csv(file, skiprows=nheader-1,
108                              na_values=nadict, skipinitialspace=True)
109
110            # Catch missing data that are not reported as integers
111            #obs[obs==-99999] = np.nan
112
113            # Strip whitespace from column names
114            #obs.columns = obs.columns.str.strip()
115
116            # Apply scale factors
117            for i, s in enumerate(scale):
118                if s != 1:
119                    obs[varnames[i]] *= s
120
121            # Add a time variable in datetime format
122            obs['time'] = pd.DatetimeIndex( pd.Timestamp(year=year,month=month,day=day) +
123                                         pd.TimedeltaIndex(obs[tname],tunit) )
124
125            # Add flight number
126            obs['file'] = n+1
127
128            # Use time variable for index
129            if timeIndex:
130                obs.index = obs.time
131
132            # Save pickle
133            if usePickle:
134                obs.to_pickle(pklfile)
135
136        # Add to list
137        obsall.append(obs)
138
139    # Concatenate all files into one dataframe
140    obs = pd.concat( obsall, sort=True )
141
142    return obs

Read ICARTT file or files into a pandas DataFrame

Parameters
  • files (list or str): path to ICARTT file or files that will be read. Data within these files will be concatenated, so files should all contain the same variables
  • usePickle (bool, default=False): if usePickle=True, the data will be written to a pkl file with ".pkl" appended to path. On subsequent read_icartt calls, data will be read from the .pkl file, if it exists
  • timeIndex (bool, default=False): sets DataFrame index to the time variable from the ICARTT file, rather than a row counter
Returns
  • obs (pandas.DataFrame): ICARTT file contents. In addition to column names for the ICARTT variables, the DataFrame columns also include 'time' in pandas.DatetimeIndex format and 'file' that is the ordinal number of the input file that each row was read from
def write_icartt(filename, df, metadata, **kwargs):
def _as_lines(value):
    '''Return comment `value` as a list of single-line strings

    A str is split at line breaks (``list(str)`` would give one entry per character);
    any other iterable is converted element by element.
    '''
    if isinstance(value, str):
        return value.splitlines()
    return [str(item) for item in value]

def write_icartt(filename, df, metadata, **kwargs):
    '''Write an ICARTT ffi1001 file

    The contents of a pandas DataFrame (``df``) are written to a text file in ICARTT format
    using `metadata` to specify which variables are written and provide ICARTT file header.

    ICARTT file format specification document:
    https://www.earthdata.nasa.gov/esdis/esco/standards-and-practices/icartt-file-format

    Parameters
    ----------
    filename : str
        File to be created
    df : pandas.DataFrame
        Data values that will be written
    metadata : dict or obj
        See notes below for the attributes or keys that `metadata` must contain
    **kwargs
        passed to pandas.to_csv

    Raises
    ------
    KeyError
        if required metadata are missing or if variables named in `metadata`
        are not columns of `df`

    Notes
    -----
    `metadata` can be a dict or any object, so long as it contains the following
    attributes or keys. Names are case sensitive and must be spelled as shown.
    - INDEPENDENT_VARIABLE_DEFINITION (dict)
        should have only one key
    - DEPENDENT_VARIABLE_DEFINITION (dict)
        Controls which variables from `df` are written to file
    - measurement_start_date (pandas.Timestamp or datetime.datetime)
        date UTC when measurement collection began
    - PI_NAME (str)
    - PI_CONTACT_INFO (str)
    - ORGANIZATION_NAME (str)
    - SOURCE_DESCRIPTION (str)
    - DM_CONTACT_INFO (str)
    - MISSION_NAME (str)
    - PROJECT_INFO (str)
    - SPECIAL_COMMENTS (str or list of str)
        optional
    - PLATFORM (str)
    - LOCATION (str)
    - ASSOCIATED_DATA (str)
    - INSTRUMENT_INFO (str)
    - DATA_INFO (str)
    - UNCERTAINTY (str)
    - ULOD_FLAG (str)
        commonly '-7777'
    - ULOD_VALUE (str)
    - LLOD_FLAG (str)
        commonly '-8888'
    - LLOD_VALUE (str)
    - STIPULATIONS_ON_USE (str)
    - OTHER_COMMENTS (str)
    - REVISION (str)
    - REVISION_COMMENTS (str or list of str)

    The `INDEPENDENT_VARIABLE_DEFINITION` and `DEPENDENT_VARIABLE_DEFINITION` are dicts
    with entries of the form `{'VariableName':'units, standard name, [optional long name]'}`
    The keys must correspond to columns of `df`.
    `INDEPENDENT_VARIABLE_DEFINITION` should have only one key while
    `DEPENDENT_VARIABLE_DEFINITION` can have many. For example,
    ``metadata.INDEPENDENT_VARIABLE_DEFINITION =
            {'Time_Start':'seconds, time at start of measurement, seconds since midnight UTC'}``
    See Examples below.

    Missing values in `df` are written as -9999 and all scale factors are 1.


    Examples
    --------
    ```
    import pandas as pd
    from acgc import icartt

    df = pd.DataFrame( [[1,0,30],
                        [2,10,29],
                        [3,20,27],
                        [4,30,25]],
                        columns=['Time_Start','Alt','Temp'])

    metadata = dict(
        INDEPENDENT_VARIABLE_DEFINITION =
            {'Time_Start':'seconds, time, measurement time in seconds after takeoff'},
        DEPENDENT_VARIABLE_DEFINITION =
            {'Alt':'m, altitude, altitude above ground level',
             'Temp':'C, temperature, air temperature in Celsius'},
        PI_NAME = 'Jane Doe',
        ORGANIZATION_NAME = 'NASA',
        SOURCE_DESCRIPTION = 'Invented Instrument',
        MISSION_NAME = 'FIREX-AQ',
        SPECIAL_COMMENTS = ['Special comments are optional and can be omitted.',
                        'If used, they should be a list of one or more strings'],
        PI_CONTACT_INFO = 'jdoe@email.com or postal address',
        PLATFORM = 'NASA DC-8',
        LOCATION = 'Boise, ID, USA',
        ASSOCIATED_DATA = 'N/A',
        INSTRUMENT_INFO = 'N/A',
        DATA_INFO = 'N/A',
        UNCERTAINTY = r'10% uncertainty in all values',
        ULOD_FLAG = '-7777',
        ULOD_VALUE = 'N/A',
        LLOD_FLAG = '-8888',
        LLOD_VALUE = 'N/A',
        DM_CONTACT_INFO = 'Alice, data manager, alice@email.com',
        STIPULATIONS_ON_USE = 'FIREX-AQ Data Use Policy',
        PROJECT_INFO = 'FIREX-AQ 2019, https://project.com',
        OTHER_COMMENTS = 'One line of comments',
        REVISION = 'R1',
        REVISION_COMMENTS = ['R0: Initial data',
                            'R1: One string per revision'],
        measurement_start_date = pd.Timestamp('2020-01-30 10:20')
        )

    icartt.write_icartt( 'test.ict', df, metadata )
    ```
    '''

    # Metadata written one per line at the top of the header
    header_keys = ['PI_NAME',
                   'ORGANIZATION_NAME',
                   'SOURCE_DESCRIPTION',
                   'MISSION_NAME']

    # Metadata written as "KEYWORD: value" in the normal comments section
    normal_comments = ['PI_CONTACT_INFO',
                'PLATFORM',
                'LOCATION',
                'ASSOCIATED_DATA',
                'INSTRUMENT_INFO',
                'DATA_INFO',
                'UNCERTAINTY',
                'ULOD_FLAG',
                'ULOD_VALUE',
                'LLOD_FLAG',
                'LLOD_VALUE',
                'DM_CONTACT_INFO',
                'PROJECT_INFO',
                'STIPULATIONS_ON_USE',
                'OTHER_COMMENTS',
                'REVISION',
                'REVISION_COMMENTS']

    # Check required metadata before anything is written,
    # so that a failure does not leave a partial file behind
    required = ['INDEPENDENT_VARIABLE_DEFINITION',
                'DEPENDENT_VARIABLE_DEFINITION',
                'measurement_start_date'] + header_keys + normal_comments
    missingkeys = [k for k in required if _get(metadata, k) is None]
    if missingkeys:
        raise KeyError('Some required metadata are missing: '+str(missingkeys))

    independent = _get(metadata, 'INDEPENDENT_VARIABLE_DEFINITION')
    dependent = _get(metadata, 'DEPENDENT_VARIABLE_DEFINITION')

    # Variables that will be written to file
    ictvars = list(independent) + list(dependent)

    # Variables that are not in the dataframe
    missingvars = set(ictvars) - set(df.columns)

    # Raise an error if there are missing variables
    if missingvars:
        raise KeyError('Some output variables are not in the DataFrame: '+str(missingvars))

    # Coerce to Timestamp
    measurement_start_date = pd.Timestamp( _get(metadata, 'measurement_start_date') )

    # Form the header
    header = [ str(_get(metadata, k)) for k in header_keys ]

    # File volume
    header.append( '1, 1')

    # Date line: measurement date followed by today's date
    header.append( measurement_start_date.strftime('%Y, %m, %d, ') +
                    pd.Timestamp.today().strftime('%Y, %m, %d'))

    # Time interval
    # Time spacing between records, unique values
    independent_variable_name = list(independent)[0]
    dt = df[independent_variable_name].diff(1).dropna().unique()
    if len(dt)==1:
        # Constant time interval, use value
        interval = dt[0]
    else:
        # Time interval is not constant so code as 0
        interval = 0
    header.append( str(interval) )

    # Independent variable
    header.append( f'{independent_variable_name}, {independent[independent_variable_name]}' )

    # Dependent variables
    nvars = len(dependent)
    header.append( str(nvars) )                 # Number of dependent variables
    header.append( ','.join(['1']*nvars) )      # Scale factors for dependent variables
    header.append( ','.join(['-9999']*nvars) )  # Missing data flags for dependent vars.
    for kn, definition in dependent.items():
        header.append( f'{kn}, {definition}' )  # Dependent variable definitions

    # Special comments (optional): a string or a list of several lines
    v = _get( metadata, 'SPECIAL_COMMENTS' )
    special = _as_lines(v) if v else []
    header.append( str(len(special)) )
    header.extend( special )

    # Normal Comments
    nc = []
    for kn in normal_comments:
        v = _get( metadata, kn )
        if kn=='REVISION_COMMENTS':
            # A string or a list of several lines
            nc.extend( _as_lines(v) )
        else:
            nc.append( f'{kn}: {v}' )
    # Variable short names
    nc.append( ', '.join(ictvars))
    # Add normal comments to the header
    header.append( str(len(nc)) )
    header.extend( nc )

    # Write the file
    with open(filename,'w',encoding='ascii') as f:
        f.write(f'{len(header)+1:d}, 1001\n')  # +1 accounts for this line
        for line in header:
            f.write(line+'\n')
        df[ictvars].to_csv( f,
                            index=False,
                            header=False,
                            na_rep='-9999',
                            **kwargs )

Write an ICARTT ffi1001 file

The contents of a pandas DataFrame (df) are written to a text file in ICARTT format using metadata to specify which variables are written and provide ICARTT file header.

ICARTT file format specification document: https://www.earthdata.nasa.gov/esdis/esco/standards-and-practices/icartt-file-format

Parameters
  • filename (str): File to be created
  • df (pandas.DataFrame): Data values that will be written
  • metadata (dict or obj): See notes below for the attributes or keys that metadata must contain
  • **kwargs: passed to pandas.to_csv
Notes

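metadata can be a dict or any object, so long as it contains the following attributes or keys. Note that the code looks up every name except measurement_start_date in upper case (e.g. PI_NAME, INSTRUMENT_INFO), as in the example below, and also reads SOURCE_DESCRIPTION (str):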

  • independent_variable_definition (dict) should have only one key
  • dependent_variable_definition (dict) Controls which variables from df are written to file
  • measurement_start_date (pandas.Timestamp or datetime.datetime) date UTC when measurement collection began
  • pi_name (str)
  • pi_contact_info (str)
  • organization_name (str)
  • dm_contact_info (str)
  • mission_name (str)
  • project_info (str)
  • special_comments (list of str)
  • platform (str)
  • location (str)
  • associated_data (str)
  • instrument_info (str)
  • data_info (str)
  • uncertainty (str)
  • ulod_flag (str) commonly '-7777'
  • ulod_value (str)
  • llod_flag (str) commonly '-8888'
  • llod_value (str)
  • stipulations_on_use (str)
  • other_comments (str)
  • revision (str)
  • revision_comments (list of str)

The independent_variable_definition and dependent_variable_definition are dicts with entries of the form {'VariableName':'units, standard name, [optional long name]'}. The keys must correspond to columns of df. independent_variable_definition should have only one key, while dependent_variable_definition can have many. For example, metadata.INDEPENDENT_VARIABLE_DEFINITION = {'Time_Start':'seconds, time at start of measurement, seconds since midnight UTC'}. See Examples below.

Examples
import pandas as pd
from acgc import icartt

df = pd.DataFrame( [[1,0,30],
                    [2,10,29],
                    [3,20,27],
                    [4,30,25]],
                    columns=['Time_Start','Alt','Temp'])

metadata = dict(
    INDEPENDENT_VARIABLE_DEFINITION = 
        {'Time_Start':'seconds, time, measurement time in seconds after takeoff'},
    DEPENDENT_VARIABLE_DEFINITION = 
        {'Alt':'m, altitude, altitude above ground level',
         'Temp':'C, temperature, air temperature in Celsius'},
    PI_NAME = 'Jane Doe',
    ORGANIZATION_NAME = 'NASA',
    SOURCE_DESCRIPTION = 'Invented Instrument',
    MISSION_NAME = 'FIREX-AQ',
    SPECIAL_COMMENTS = ['Special comments are optional and can be omitted.',
                    'If used, they should be a list of one or more strings'],
    PI_CONTACT_INFO = 'jdoe@email.com or postal address',
    PLATFORM = 'NASA DC-8',
    LOCATION = 'Boise, ID, USA',
    ASSOCIATED_DATA = 'N/A',
    INSTRUMENT_INFO = 'N/A',
    DATA_INFO = 'N/A',
    UNCERTAINTY = r'10% uncertainty in all values',
    ULOD_FLAG = '-7777',
    ULOD_VALUE = 'N/A',
    LLOD_FLAG = '-8888',
    LLOD_VALUE = 'N/A',
    DM_CONTACT_INFO = 'Alice, data manager, alice@email.com',
    STIPULATIONS_ON_USE = 'FIREX-AQ Data Use Policy',
    PROJECT_INFO = 'FIREX-AQ 2019, https://project.com',
    OTHER_COMMENTS = 'One line of comments',
    REVISION = 'R1',
    REVISION_COMMENTS = ['R0: Initial data',
                        'R1: One string per revision'],
    measurement_start_date = pd.Timestamp('2020-01-30 10:20')
    )

icartt.write_icartt( 'test.ict', df, metadata )