Coverage for /Users/Newville/Codes/xraylarch/larch/io/columnfile.py: 49%
365 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-09 10:08 -0600
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-09 10:08 -0600
1#!/usr/bin/env python
2"""
3 Larch column file reader: read_ascii
4"""
5import os
6import sys
7import time
8import string
9from collections import namedtuple
10import numpy as np
11from dateutil.parser import parse as dateparse
12from math import log10
13from larch import Group
14from larch.symboltable import isgroup
15from ..utils import read_textfile
16from .fileutils import fix_varname
17from .xafs_beamlines import guess_beamline
# result record returned by look_for_nans()
nanresult = namedtuple('NanResult', ('file_ok', 'message', 'nan_rows',
                                     'nan_cols', 'inf_rows', 'inf_cols'))

MODNAME = '_io'                  # larch symbol-table module name for this plugin
TINY = 1.e-7                     # small-number tolerance
MAX_FILESIZE = 100*1024*1024     # 100 Mb limit
COMMENTCHARS = '#;%*!$'          # characters that mark a comment/header line
def look_for_nans(path):
    """
    look for Nans and Infs in an ASCII data file

    Arguments:
      path (string): full path to ASCII column file

    Returns:
      NanResult, named tuple with elements

      'file_ok' : bool, whether data is read and contains no Nans or Infs
      'message' : exception message if file cannot be read at all or
                  'has nans', 'has infs' or 'has nans and infs'
      `nan_rows`: list of rows containing Nans
      `nan_cols`: list of columns containing Nans
      `inf_rows`: list of rows containing Infs
      `inf_cols`: list of columns containing Infs
    """
    nan_rows, nan_cols, inf_rows, inf_cols = [], [], [], []
    try:
        dat = read_ascii(path)
    except Exception:
        # bug fix: previously this fell through with `dat` undefined,
        # raising NameError below; report the read failure instead
        etype, emsg, etb = sys.exc_info()
        return nanresult(False, f'could not read file: {emsg}',
                         nan_rows, nan_cols, inf_rows, inf_cols)

    if len(dat.data) < 1:
        return nanresult(False, 'no data in file',
                         nan_rows, nan_cols, inf_rows, inf_cols)

    if np.all(np.isfinite(dat.data)):
        return nanresult(True, 'file ok', nan_rows, nan_cols, inf_rows, inf_cols)

    # data has some non-finite values: classify them.
    # bug fix: msg was initialized to 'unknown', so the "infs only"
    # message below could never be selected
    msg = ''
    nanvals = np.where(np.isnan(dat.data))
    if len(nanvals[0]) > 0:
        msg = 'has nans'
        # dat.data is shaped (ncolumns, nrows), so axis 0 indexes columns
        for icol in nanvals[0]:
            if icol not in nan_cols:
                nan_cols.append(icol)
        for irow in nanvals[1]:
            if irow not in nan_rows:
                nan_rows.append(irow)

    infvals = np.where(np.isinf(dat.data))
    if len(infvals[0]) > 0:
        msg = 'has infs' if len(msg) == 0 else 'has nans and infs'
        for icol in infvals[0]:
            if icol not in inf_cols:
                inf_cols.append(icol)
        for irow in infvals[1]:
            if irow not in inf_rows:
                inf_rows.append(irow)

    return nanresult(False, msg, nan_rows, nan_cols, inf_rows, inf_cols)
def getfloats(txt, allow_times=True):
    """convert a line of numbers into a list of floats,
    as for reading a file with columnar numerical data.

    Arguments
    ---------
      txt (str) : line of text to parse
      allow_times (bool): whether to support time stamps [True]

    Returns
    -------
      list with each entry either a float or None

    Notes
    -----
      The `allow_times` option tries to support common date-time strings
      using the dateutil module, returning a numerical value as the
      Unix timestamp, using
          time.mktime(dateutil.parser.parse(word).timetuple())
    """
    words = [w.strip() for w in txt.replace(',', ' ').split()]
    mktime = time.mktime
    for i, w in enumerate(words):
        val = None
        try:
            val = float(w)
        except ValueError:
            # bug fix: `allow_times` was documented but previously ignored;
            # date-time parsing is now only attempted when it is enabled
            if allow_times:
                try:
                    val = mktime(dateparse(w).timetuple())
                except ValueError:
                    pass
        words[i] = val
    return words
def colname(txt):
    """normalize a column label to a valid, lowercase attribute name"""
    cleaned = txt.strip().lower()
    return fix_varname(cleaned).replace('.', '_')
def lformat(val, length=12):
    """Format a number with fixed-length format, somewhat like '%g' except that

    a) the length of the output string will be the requested length.
    b) positive numbers will have a leading blank.
    c) the precision will be as high as possible.
    d) trailing zeros will not be trimmed.

    The precision will typically be length-7, but may be better than
    that for values with absolute value between 1.e-5 and 1.e8.

    Arguments:
      val       value to be formatted
      length    length of output string

    Returns
    -------
      string of specified length.

    Notes
    ------
     Positive values will have leading blank.
    """
    # decimal exponent of the value; 0 for zero/overflowing inputs
    try:
        expon = int(log10(abs(val)))
    except (OverflowError, ValueError):
        expon = 0

    length = max(length, 7)
    prec = length - 7
    form = 'e'
    if abs(expon) > 99:
        # 3-digit exponent eats one character of precision
        prec -= 1
    elif (0 < expon < prec + 4) or (expon <= 0 and -expon < prec - 1):
        # value fits comfortably in fixed-point notation
        form = 'f'
        prec += 4
        if expon > 0:
            prec -= expon
    fmt = f'{{0: {length}.{prec}{form}}}'
    return fmt.format(val)
def parse_labelline(labelline, header):
    """
    parse the 'label line' for an ASCII file.

    This is meant to handle some special cases of XAFS data collected at a variety of sources

    NOTE(review): not implemented -- currently a no-op placeholder that
    always returns None; callers must not rely on any parsing here.
    """
    pass
def sum_fluor_channels(dgroup, roi, icr=None, ocr=None, ltime=None, label=None,
                       add_data=True, **kws):
    """build summed, deadtime-corrected fluorescence spectrum for a Group

    Arguments
    ---------
    dgroup     data group
    roi        list of array indices for ROI
    icr        None or list of array indices for ICR [None]
    ocr        None or list of array indices for OCR [None]
    ltime      None or list of array indices for LTIME [None]
    label      None or label for the summed, corrected array [None]
    add_data   bool, whether to add label and data to dgroup [True]

    Returns
    -------
    label, ndarray with summed, deadtime-corrected data

    if add_data is True, the ndarray will also be appended to `dgroup.data`,
    and the label will be appended to dgroup.array_labels

    Notes
    ------
    1. The output array will be Sum[ roi*icr/(ocr*ltime) ]
    2. The default label will be like the array label for the 'dtcN' + first ROI
    3. icr, ocr, or ltime can be `None`, '1.0', '-1', or '1' to mean '1.0' or
       arrays of indices for the respective components: must be the same length as roi
    4. an array index of -1 will indicate 'bad channel' and be skipped for ROI
       or set to 1.0 for icr, ocr, or ltime
    5. if the list of arrays in roi, icr, ocr, or ltime are otherwise out-of-range,
       the returned (label, data) will be (None, None)
    """
    nchans = len(roi)
    # sentinel values for icr/ocr/ltime mean "use 1.0 for every channel"
    if icr in ('1.0', -1, 1, None):
        icr = [-1]*nchans
    if ocr in ('1.0', -1, 1, None):
        ocr = [-1]*nchans
    if ltime in ('1.0', -1, 1, None):
        ltime = [-1]*nchans
    if len(ltime) != nchans or len(icr) != nchans or len(ocr) != nchans:
        # bug fix: was `raise Value(...)` -- an undefined name
        raise ValueError("arrays of indices for roi, icr, ocr, and ltime must be the same length")

    narr, npts = dgroup.data.shape
    nused = 0
    total = 0.0   # renamed from `sum` to avoid shadowing the builtin
    olabel = None

    def get_data(arr, idx):
        """return (index, data) for index list `arr` at channel `idx`:
        (-1, 1.0) means 'identity/skip', (None, None) means out of range"""
        iarr = arr[idx]
        if iarr < 0:
            return iarr, 1.0
        if iarr > narr-1:
            return None, None
        return iarr, dgroup.data[iarr, :]

    for pchan in range(nchans):
        iarr, droi = get_data(roi, pchan)
        if isinstance(droi, np.ndarray):
            if olabel is None:
                olabel = dgroup.array_labels[iarr]
        elif iarr is None:
            return (None, None)
        else:   # ROI index of -1 means "skip this channel"
            continue

        iarr, dicr = get_data(icr, pchan)
        if iarr is None:
            return (None, None)

        # note: the original fetched `ocr` twice in a row; once is enough
        iarr, docr = get_data(ocr, pchan)
        if iarr is None:
            return (None, None)

        iarr, dltime = get_data(ltime, pchan)
        if iarr is None:
            return (None, None)

        total += droi*dicr/(docr*dltime)
        nused += 1

    if not isinstance(total, np.ndarray):
        # robustness fix: no valid channel was summed -- previously this
        # crashed on `0.0.reshape(...)` when add_data was True
        return (None, None)

    if label is None:
        if olabel is None:
            olabel = 'ROI'
        label = olabel = f'dtc{nused}_{olabel}'
        n = 1
        while label in dgroup.array_labels:
            n += 1
            label = f'{olabel}_{n}'
    if add_data:
        dgroup.array_labels.append(label)
        dgroup.data = np.append(dgroup.data, total.reshape(1, len(total)), axis=0)
    return (label, total)
def read_ascii(filename, labels=None, simple_labels=False,
               sort=False, sort_column=0):
    """read a column ascii column file, returning a group
    containing the data extracted from the file.

    Arguments:
      filename (str): name of file to read
      labels (list or None) : list of labels to use for array labels [None]
      simple_labels (bool) : whether to force simple column labels (note 1) [False]
      sort (bool) : whether to sort row data (note 2) [False]
      sort_column (int) : column to use for sorting (note 2) [0]

    Returns:
      Group

      A data group containing data read from file, with several attributes:

         | filename     : text name of the file.
         | array_labels : array labels, names of 1-D arrays.
         | data         : 2-dimensional data (ncolumns, nrows) with all data.
         | header       : array of text lines of the header.
         | footer       : array of text lines of the footer (text after the numerical data)
         | attrs        : group of attributes parsed from header lines.

    Notes:
      1. array labels.  If `labels` is `None` (the default), column labels
         and names of 1d arrays will be guessed from the file header.  This often
         means parsing the final header line, but tagged column files from several XAFS
         beamlines will be tried and used if matching.  Column labels may be like 'col1',
         'col2', etc if suitable column labels cannot be guessed.
         These labels will be used as names for the 1-d arrays from each column.
         If `simple_labels` is  `True`, the names 'col1', 'col2' etc will be used
         regardless of the column labels found in the file.

      2. sorting.  Data can be sorted to be in increasing order of any column,
         by giving the column index (starting from 0).

      3. header parsing. If header lines are of the forms of

           | KEY : VAL
           | KEY = VAL

         these will be parsed into a 'attrs' dictionary in the returned group.

    Examples:

        >>> feo_data = read_ascii('feo_rt1.dat')
        >>> show(feo_data)
        == Group ascii_file feo_rt1.dat: 0 methods, 8 attributes ==
        array_labels: ['energy', 'xmu', 'i0']
        attrs: <Group header attributes from feo_rt1.dat>
        data: array<shape=(3, 412), type=dtype('float64')>
        energy: array<shape=(412,), type=dtype('float64')>
        filename: 'feo_rt1.dat'
        header: ['# room temperature FeO', '# data from 20-BM, 2001, as part of NXS school', ... ]
        i0: array<shape=(412,), type=dtype('float64')>
        xmu: array<shape=(412,), type=dtype('float64')>

    See Also:
        read_xdi, write_ascii
    """
    if not os.path.isfile(filename):
        raise OSError("File not found: '%s'" % filename)
    if os.stat(filename).st_size > MAX_FILESIZE:
        raise OSError("File '%s' too big for read_ascii()" % filename)

    text = read_textfile(filename)
    lines = text.split('\n')

    ncol = None
    data, footers, headers = [], [], []

    # scan bottom-to-top: footer, then numerical data, then header
    lines.reverse()
    section = 'FOOTER'

    for line in lines:
        line = line.strip()
        if len(line) < 1:
            continue
        # look for section transitions (going from bottom to top):
        # a line of all numbers starts DATA; a non-numeric line ends it
        if section == 'FOOTER' and None not in getfloats(line):
            section = 'DATA'
        elif section == 'DATA' and None in getfloats(line):
            section = 'HEADER'

        # act on current section:
        if section == 'FOOTER':
            footers.append(line)
        elif section == 'HEADER':
            headers.append(line)
        elif section == 'DATA':
            rowdat = getfloats(line)
            if ncol is None:
                ncol = len(rowdat)
            elif ncol > len(rowdat):
                # short row: pad it with NaNs
                rowdat.extend([np.nan]*(ncol-len(rowdat)))
            elif ncol < len(rowdat):
                # long row: pad all earlier rows with NaNs
                for i in data:
                    i.extend([np.nan]*(len(rowdat)-ncol))
                ncol = len(rowdat)
            data.append(rowdat)

    # reverse header, footer, data, convert to arrays
    footers.reverse()
    headers.reverse()
    data.reverse()
    data = np.array(data).transpose()

    # try to parse attributes from header text
    header_attrs = {}
    for hline in headers:
        hline = hline.strip().replace('\t', ' ')
        if len(hline) < 1:
            continue
        if hline[0] in COMMENTCHARS:
            hline = hline[1:].strip()
        keywds = []
        words = []
        if ':' in hline:   # keywords in  'x: 22'
            words = hline.split(':', 1)
            keywds = words[0].split()
        elif '=' in hline:  # keywords in  'x = 22'
            words = hline.split('=', 1)
            keywds = words[0].split()
        if len(keywds) == 1:
            key = colname(keywds[0])
            if key.startswith('_'):
                key = key[1:]
            if len(words) > 1:
                header_attrs[key] = words[1].strip()

    # note: the original also built an unused `attrs` dict here (dead code)
    path, fname = os.path.split(filename)
    group = Group(name='ascii_file %s' % filename,
                  path=filename,
                  filename=fname,
                  header=headers, data=[], array_labels=[])

    if len(data) == 0:
        return group

    if sort and sort_column >= 0 and sort_column < ncol:
        data = data[:, np.argsort(data[sort_column])]

    group.data = data

    if len(footers) > 0:
        group.footer = footers

    group.attrs = Group(name='header attributes from %s' % filename)
    for key, val in header_attrs.items():
        setattr(group.attrs, key, val)

    if isinstance(labels, str):
        for bchar in ',#@%|:*':
            labels = labels.replace(bchar, '')
        labels = labels.split()

    if labels is None and not simple_labels:
        # try beamline-specific header formats to guess column labels
        bldat = guess_beamline(headers)(headers)
        labels = bldat.get_array_labels()

        if getattr(bldat, 'energy_units', 'eV') != 'eV':
            group.energy_units = bldat.energy_units
        if getattr(bldat, 'energy_column', 1) != 1:
            group.energy_column = bldat.energy_column
        if getattr(bldat, 'mono_dspace', -1) > 0:
            group.mono_dspace = bldat.mono_dspace

    set_array_labels(group, labels=labels, simple_labels=simple_labels)
    return group
def set_array_labels(group, labels=None, simple_labels=False,
                     save_oldarrays=False):
    """set array names for a group from its 2D `data` array.

    Arguments
    ----------
      labels (list of strings or None)  array of labels to use
      simple_labels (bool):   flag to use ('col1', 'col2', ...) [False]
      save_oldarrays (bool):  flag to save old array names [False]

    Returns
    -------
       group with newly named attributes of 1D array data, and
       an updated `array_labels` giving the mapping of `data`
       columns to attribute names.

    Notes
    ------
     1. if `simple_labels=True` it will overwrite any values in `labels`

     2. Array labels must be valid python names. If not enough labels
        are specified, or if name clashes arise, the array names may be
        modified, often by appending an underscore and letter or by using
        ('col1', 'col2', ...) etc.

     3. When `save_oldarrays` is `False` (the default), arrays named in the
        current `group.array_labels` will be erased. Other arrays and
        attributes will not be changed.
    """
    write = sys.stdout.write
    if not hasattr(group, 'data'):
        write("cannot set array labels for group '%s': no `data`\n" % repr(group))
        return

    # clear old arrays, if desired
    oldlabels = getattr(group, 'array_labels', None)
    if oldlabels is not None and not save_oldarrays:
        for attr in oldlabels:
            if hasattr(group, attr):
                delattr(group, attr)

    ncols, nrow = group.data.shape

    ####
    # step 1: determine user-defined labels from input options,
    # generating list `tlabels` of trial labels
    #
    # simple column labels, used as a backup
    clabels = ['col%d' % (i+1) for i in range(ncols)]

    if isinstance(labels, str):
        labels = labels.split()

    # bug fix: work on a copy so a caller-supplied `labels` list is
    # never mutated by the padding in step 2.a below
    tlabels = list(labels) if labels is not None else None

    # if simple column names requested (or no labels given), use them
    if simple_labels or tlabels is None:
        tlabels = clabels[:]

    ####
    # step 2: check input and correct problems
    # 2.a: check for not enough or too many labels
    if len(tlabels) < ncols:
        for i in range(len(tlabels), ncols):
            tlabels.append("col%i" % (i+1))
    elif len(tlabels) > ncols:
        tlabels = tlabels[:ncols]

    # 2.b: check for names that clash with group attributes
    # or that are repeated; append an underscore and letter
    reserved_names = ('data', 'array_labels', 'filename',
                      'attrs', 'header', 'footer')
    extras = string.ascii_lowercase
    labels = []
    for i in range(ncols):
        lname = tlabels[i]
        if lname in reserved_names or lname in labels:
            lname = lname + '_a'
            j = 0
            while lname in labels:
                j += 1
                if j == len(extras):
                    break
                lname = "%s_%s" % (tlabels[i], extras[j])
            if lname in labels:
                # give up: fall back to the simple column name
                lname = clabels[i]
        labels.append(lname)

    ####
    # step 3: assign attribute names, set 'array_labels'
    for i, name in enumerate(labels):
        setattr(group, name, group.data[i])
    group.array_labels = labels
    return group
def write_ascii(filename, *args, commentchar='#', label=None, header=None):
    """
    write a list of items to an ASCII column file

    Arguments:
      filename (str): name of output file
      args (list of arrays): arrays of column data to write.
      commentchar (str) : character for comment ('#')
      label (str or None): array label line (autogenerated)
      header (list of strings): array of strings for header

    Returns:
        None

    Examples:
       >>> write_ascii('myfile', group.energy, group.norm, header=['comment1', 'comment2'])
    """
    ARRAY_MINLEN = 2
    com = commentchar
    if header is None:
        # bug fix: this default banner was previously dead code (the
        # check ran only after header had already been set to [])
        header = ['Output from Larch %s' % time.ctime()]
    else:
        # copy, so the caller's list is not mutated by appends below
        header = list(header)

    arrays = []
    arraylen = None

    for arg in args:
        # NOTE(review): arrays must be strictly longer than ARRAY_MINLEN
        # to be written as columns; shorter items go into the header
        if isinstance(arg, np.ndarray) and len(arg) > ARRAY_MINLEN:
            if arraylen is None:
                arraylen = len(arg)
            else:
                arraylen = min(arraylen, len(arg))
            arrays.append(arg)
        else:
            header.append(repr(arg))

    if arraylen is None:
        raise ValueError("write_ascii() need %i or more elements in arrays." % ARRAY_MINLEN)

    buff = []
    for s in header:
        buff.append('%s %s' % (com, s))
    buff.append('%s---------------------------------' % com)
    if label is None:
        label = (' '*13).join(['col%d' % (i+1) for i in range(len(arrays))])
    buff.append('# %s' % label)

    # write rows only up to the shortest column
    arrays = np.array(arrays)
    for i in range(arraylen):
        w = [" %s" % lformat(val[i], length=14) for val in arrays]
        buff.append(' '.join(w))
    buff.append('')

    with open(filename, 'w', encoding=sys.getdefaultencoding()) as fout:
        fout.write('\n'.join(buff))
    sys.stdout.write("wrote to file '%s'\n" % filename)
def write_group(filename, group, scalars=None, arrays=None,
                arrays_like=None, commentchar='#'):
    """(deprecated) write components of a group to an ASCII column file

    Warning:
       This is pretty minimal and may work poorly for large groups of complex data.
       Use `save_session` instead.
    """
    items = dir(group)
    arrays = [] if arrays is None else arrays
    scalars = [] if scalars is None else scalars

    # length of the reference array, if one was named
    npts = 0
    if arrays_like is not None and arrays_like in items:
        ref = getattr(group, arrays_like)
        if isinstance(ref, np.ndarray):
            npts = len(ref)

    # collect all arrays matching the reference length
    for name in items:
        val = getattr(group, name)
        if (isinstance(val, np.ndarray) and npts != 0
                and npts == len(val) and name not in arrays):
            arrays.append(name)

    header = ["%s = %s" % (s, getattr(group, s)) for s in scalars if s in items]
    label = ' '.join(arrays)
    args = [getattr(group, name) for name in arrays if name in items]

    write_ascii(filename, *args, commentchar=commentchar,
                label=label, header=header)
def read_fdmnes(filename, **kwargs):
    """read [FDMNES](http://fdmnes.neel.cnrs.fr/) ascii files"""
    group = read_ascii(filename, **kwargs)
    group.header_dict = dict(filetype='FDMNES', energy_units='eV')
    for headline in group.header:
        if "E_edge" in headline:
            if headline.startswith("#"):
                headline = headline[1:]
            # header line looks like "<values> = <comma-separated names>"
            vals = [float(v) for v in headline.split(" = ")[0].split(" ") if v]
            vals_names = headline.split(" = ")[1].split(", ")
            group.header_dict.update(dict(zip(vals_names, vals)))
    # bug fix: was f'FDMNES file (unknown)', an f-string with no
    # placeholder; interpolating the filename was evidently intended
    group.name = f'FDMNES file {filename}'
    # shift relative energies to absolute using the edge energy
    group.energy += group.header_dict["E_edge"]
    # fix leading-underscore labels: _arrlabel -> arrlabel
    for ilab, lab in enumerate(group.array_labels):
        if lab.startswith("_"):
            fixlab = lab[1:]
            group.array_labels[ilab] = fixlab
            delattr(group, lab)
            setattr(group, fixlab, group.data[ilab])
    return group
def guess_filereader(path, return_text=False):
    """guess function name to use to read a data file based on the file header

    Arguments
    ---------
    path (str) : file path to be read

    Returns
    -------
    name of function (as a string) to use to read file
    if return_text: text of the read file
    """
    text = read_textfile(path)
    first = text.split('\n')[0].lower()

    # (markers, reader) rules checked in order -- the last match wins,
    # matching the original cascade of independent `if` tests
    rules = ((('epics scan',), 'read_gsescan'),
             (('xdi',), 'read_xdi'),
             (('epics stepscan file',), 'read_gsexdi'),
             (('#s', '#f'), 'read_specfile'),
             (('fdmnes',), 'read_fdmnes'))

    reader = 'read_ascii'
    for markers, name in rules:
        if any(marker in first for marker in markers):
            reader = name

    if return_text:
        return reader, text
    return reader