Coverage for /Users/Newville/Codes/xraylarch/larch/io/columnfile.py: 49%

365 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-11-09 10:08 -0600

1#!/usr/bin/env python 

2""" 

3 Larch column file reader: read_ascii 

4""" 

5import os 

6import sys 

7import time 

8import string 

9from collections import namedtuple 

10import numpy as np 

11from dateutil.parser import parse as dateparse 

12from math import log10 

13from larch import Group 

14from larch.symboltable import isgroup 

15from ..utils import read_textfile 

16from .fileutils import fix_varname 

17from .xafs_beamlines import guess_beamline 

18 

# record returned by look_for_nans(): read status plus row/column indices
# of any Nan or Inf values found in the data
nanresult = namedtuple('NanResult', ('file_ok', 'message', 'nan_rows',
                                     'nan_cols', 'inf_rows', 'inf_cols'))

MODNAME = '_io'                 # larch module name for these functions
TINY = 1.e-7                    # small-number tolerance
MAX_FILESIZE = 100*1024*1024    # 100 Mb limit
COMMENTCHARS = '#;%*!$'         # characters marking a comment/header line

26 

def look_for_nans(path):
    """
    look for Nans and Infs in an ASCII data file

    Arguments:
      path (string): full path to ASCII column file

    Returns:
      NanResult, named tuple with elements

      'file_ok' : bool, whether data is read and contains no Nans or Infs
      'message' : exception message if file cannot be read at all or
                  'has nans', 'has infs' or 'has nans and infs'
      `nan_rows`: list of rows containing Nans
      `nan_cols`: list of columns containing Nans
      `inf_rows`: list of rows containing Infs
      `inf_cols`: list of columns containing Infs
    """
    nan_rows, nan_cols, inf_rows, inf_cols = [], [], [], []
    try:
        dat = read_ascii(path)
    except Exception as exc:
        # file could not be read at all: report the exception message.
        # (previously the exception was swallowed and the undefined `dat`
        # was used below, raising NameError)
        return nanresult(False, str(exc), nan_rows, nan_cols, inf_rows, inf_cols)

    if len(dat.data) < 1:
        return nanresult(False, 'no data in file', nan_rows, nan_cols, inf_rows, inf_cols)

    if np.all(np.isfinite(dat.data)):
        return nanresult(True, 'file ok', nan_rows, nan_cols, inf_rows, inf_cols)

    msg = ''
    nanvals = np.where(np.isnan(dat.data))
    if len(nanvals[0]) > 0:
        msg = 'has nans'
        # data is (ncolumns, nrows): axis 0 indexes columns, axis 1 rows
        for icol in nanvals[0]:
            if icol not in nan_cols:
                nan_cols.append(icol)
        for irow in nanvals[1]:
            if irow not in nan_rows:
                nan_rows.append(irow)

    infvals = np.where(np.isinf(dat.data))
    if len(infvals[0]) > 0:
        # note: msg previously started as 'unknown', making the plain
        # 'has infs' message unreachable for inf-only files
        msg = 'has infs' if len(msg) == 0 else 'has nans and infs'
        for icol in infvals[0]:
            if icol not in inf_cols:
                inf_cols.append(icol)
        for irow in infvals[1]:
            if irow not in inf_rows:
                inf_rows.append(irow)

    if len(msg) == 0:
        msg = 'unknown'
    return nanresult(False, msg, nan_rows, nan_cols, inf_rows, inf_cols)

82 

83 

def getfloats(txt, allow_times=True):
    """convert a line of numbers into a list of floats,
    as for reading a file with columnar numerical data.

    Arguments
    ---------
      txt (str) : line of text to parse
      allow_times (bool): whether to support time stamps [True]

    Returns
    -------
      list with each entry either a float or None

    Notes
    -----
      The `allow_times` will try to support common date-time strings
      using the dateutil module, returning a numerical value as the
      Unix timestamp, using
          time.mktime(dateutil.parser.parse(word).timetuple())
      When `allow_times` is False, non-numeric words are returned as None.
    """
    words = [w.strip() for w in txt.replace(',', ' ').split()]
    mktime = time.mktime
    for i, w in enumerate(words):
        val = None
        try:
            val = float(w)
        except ValueError:
            # previously `allow_times` was ignored and timestamps were
            # always attempted; now only try dateutil when requested
            if allow_times:
                try:
                    val = mktime(dateparse(w).timetuple())
                except ValueError:
                    pass
        words[i] = val
    return words

117 

def colname(txt):
    """normalize a column label to a safe, lowercase variable name,
    mapping '.' to '_'"""
    cleaned = txt.strip().lower()
    return fix_varname(cleaned).replace('.', '_')

120 

121 

def lformat(val, length=12):
    """Format a number to a fixed-width string, somewhat like '%g' except:

      a) the output string has exactly the requested length.
      b) positive numbers carry a leading blank.
      c) the precision is as high as will fit.
      d) trailing zeros are not trimmed.

    The precision is typically length-7, but can be better for values
    with absolute value between 1.e-5 and 1.e8.

    Arguments:
      val      value to be formatted
      length   length of output string

    Returns
    -------
      string of specified length.
    """
    # estimate the decimal exponent; 0.0 (ValueError) and inf
    # (OverflowError) both fall back to exponent 0
    try:
        expon = int(log10(abs(val)))
    except (OverflowError, ValueError):
        expon = 0

    length = max(length, 7)
    form, prec = 'e', length - 7
    if abs(expon) > 99:
        # 3-digit exponents ('e+120') need one more character
        prec -= 1
    elif (0 < expon < prec + 4) or (expon <= 0 and -expon < prec - 1):
        # value fits comfortably in fixed-point notation
        form = 'f'
        prec += 4
        if expon > 0:
            prec -= expon
    return format(val, ' %d.%d%s' % (length, prec, form))

163 

def parse_labelline(labelline, header):
    """parse the 'label line' of an ASCII column file.

    Intended to handle special cases of XAFS data collected at a
    variety of sources.  Currently unimplemented: always returns None.
    """
    return None

172 

173 

def sum_fluor_channels(dgroup, roi, icr=None, ocr=None, ltime=None, label=None,
                       add_data=True, **kws):
    """build summed, deadtime-corrected fluorescence spectrum for a Group

    Arguments
    ---------
     dgroup     data group
     roi        list of array indices for ROI
     icr        None or list of array indices for ICR [None]
     ocr        None or list of array indices for OCR [None]
     ltime      None or list of array indices for LTIME [None]
     label      None or label for the summed, corrected array [None]
     add_data   bool, whether to add label and data to dgroup [True]

    Returns
    -------
     label, ndarray with summed, deadtime-corrected data

     if add_data is True, the ndarray will also be appended to `dgroup.data`,
     and the label will be appended to dgroup.array_labels

    Notes
    ------
      1. The output array will be Sum[ roi*icr/(ocr*ltime) ]
      2. The default label will be like the array label for the 'dtcN' + first ROI
      3. icr, ocr, or ltime can be `None`, '1.0', '-1', or '1' to mean '1.0' or
         arrays of indices for the respective components: must be the same
         length as roi
      4. an array index of -1 will indicate 'bad channel' and be skipped for ROI
         or set to 1.0 for icr, ocr, or ltime
      5. if the list of arrays in roi, icr, ocr, or ltime are otherwise
         out-of-range, or if no channels are usable, the returned
         (label, data) will be (None, None)

    Raises
    ------
      ValueError if icr, ocr, or ltime index lists have a different
      length than roi
    """
    nchans = len(roi)
    # sentinel values meaning "use 1.0 for every channel"
    if icr in ('1.0', -1, 1, None):
        icr = [-1]*nchans
    if ocr in ('1.0', -1, 1, None):
        ocr = [-1]*nchans
    if ltime in ('1.0', -1, 1, None):
        ltime = [-1]*nchans
    if len(ltime) != nchans or len(icr) != nchans or len(ocr) != nchans:
        # was `raise Value(...)`: an undefined name
        raise ValueError("arrays of indices for roi, icr, ocr, and ltime must be the same length")

    narr, npts = dgroup.data.shape
    nused = 0
    total = 0.0   # renamed from `sum`, which shadowed the builtin
    olabel = None

    def get_data(arr, idx):
        # return (index, data) for channel idx: (-1, 1.0) marks a
        # "bad channel" placeholder, (None, None) an out-of-range index
        iarr = arr[idx]
        if iarr < 0:
            return iarr, 1.0
        if iarr > narr-1:
            return None, None
        return iarr, dgroup.data[iarr, :]

    for pchan in range(nchans):
        iarr, droi = get_data(roi, pchan)
        if isinstance(droi, np.ndarray):
            if olabel is None:
                olabel = dgroup.array_labels[iarr]
        elif iarr is None:
            return (None, None)
        else:  # roi index of -1 means "skip this channel"
            continue

        iarr, dicr = get_data(icr, pchan)
        if iarr is None:
            return (None, None)

        # note: the OCR fetch was previously duplicated
        iarr, docr = get_data(ocr, pchan)
        if iarr is None:
            return (None, None)

        iarr, dltime = get_data(ltime, pchan)
        if iarr is None:
            return (None, None)

        total += droi*dicr/(docr*dltime)
        nused += 1

    if nused == 0:
        # no usable channels: total is still the float 0.0, which cannot
        # be reshaped or meaningfully labeled
        return (None, None)

    if label is None:
        if olabel is None:
            olabel = 'ROI'
        label = olabel = f'dtc{nused}_{olabel}'
        n = 1
        while label in dgroup.array_labels:
            n += 1
            label = f'{olabel}_{n}'
    if add_data:
        dgroup.array_labels.append(label)
        dgroup.data = np.append(dgroup.data, total.reshape(1, len(total)), axis=0)
    return (label, total)

269 

270 

271 

def read_ascii(filename, labels=None, simple_labels=False,
               sort=False, sort_column=0):
    """read a column ascii column file, returning a group
    containing the data extracted from the file.

    Arguments:
      filename (str): name of file to read
      labels (list or None): list of labels to use for array labels [None]
      simple_labels (bool): whether to force simple column labels (note 1) [False]
      sort (bool): whether to sort row data (note 2) [False]
      sort_column (int): column to use for sorting (note 2) [0]

    Returns:
      Group

      A data group containing data read from file, with several attributes:

         | filename     : text name of the file.
         | array_labels : array labels, names of 1-D arrays.
         | data         : 2-dimensional data (ncolumns, nrows) with all data.
         | header       : array of text lines of the header.
         | footer       : array of text lines of the footer (text after the numerical data)
         | attrs        : group of attributes parsed from header lines.

    Notes:
      1. array labels.  If `labels` is `None` (the default), column labels
         and names of 1d arrays will be guessed from the file header.  This often
         means parsing the final header line, but tagged column files from several
         XAFS beamlines will be tried and used if matching.  Column labels may be
         like 'col1', 'col2', etc if suitable column labels cannot be guessed.
         These labels will be used as names for the 1-d arrays from each column.
         If `simple_labels` is `True`, the names 'col1', 'col2' etc will be used
         regardless of the column labels found in the file.

      2. sorting.  Data can be sorted to be in increasing order of any column,
         by giving the column index (starting from 0).

      3. header parsing. If header lines are of the forms of

         | KEY : VAL
         | KEY = VAL

         these will be parsed into an 'attrs' group in the returned group.

    Examples:

        >>> feo_data = read_ascii('feo_rt1.dat')
        >>> show(feo_data)
        == Group ascii_file feo_rt1.dat: 0 methods, 8 attributes ==
        array_labels: ['energy', 'xmu', 'i0']
        attrs: <Group header attributes from feo_rt1.dat>
        data: array<shape=(3, 412), type=dtype('float64')>
        energy: array<shape=(412,), type=dtype('float64')>
        filename: 'feo_rt1.dat'
        header: ['# room temperature FeO', '# data from 20-BM, 2001, as part of NXS school', ... ]
        i0: array<shape=(412,), type=dtype('float64')>
        xmu: array<shape=(412,), type=dtype('float64')>

    See Also:
        read_xdi, write_ascii
    """
    if not os.path.isfile(filename):
        raise OSError("File not found: '%s'" % filename)
    if os.stat(filename).st_size > MAX_FILESIZE:
        raise OSError("File '%s' too big for read_ascii()" % filename)

    text = read_textfile(filename)
    lines = text.split('\n')

    ncol = None
    data, footers, headers = [], [], []

    # scan bottom-to-top: footer, then data, then header
    lines.reverse()
    section = 'FOOTER'

    for line in lines:
        line = line.strip()
        if len(line) < 1:
            continue
        # look for section transitions (going from bottom to top):
        # a fully-numeric line ends the footer; a non-numeric line
        # after data starts the header
        if section == 'FOOTER' and None not in getfloats(line):
            section = 'DATA'
        elif section == 'DATA' and None in getfloats(line):
            section = 'HEADER'

        # act on current section:
        if section == 'FOOTER':
            footers.append(line)
        elif section == 'HEADER':
            headers.append(line)
        elif section == 'DATA':
            rowdat = getfloats(line)
            if ncol is None:
                ncol = len(rowdat)
            elif ncol > len(rowdat):
                # pad a short row with nans
                rowdat.extend([np.nan]*(ncol-len(rowdat)))
            elif ncol < len(rowdat):
                # earlier rows were shorter: pad all of them
                for i in data:
                    i.extend([np.nan]*(len(rowdat)-ncol))
                ncol = len(rowdat)
            data.append(rowdat)

    # reverse header, footer, data, convert to arrays
    footers.reverse()
    headers.reverse()
    data.reverse()
    data = np.array(data).transpose()

    # try to parse attributes from header text
    header_attrs = {}
    for hline in headers:
        hline = hline.strip().replace('\t', ' ')
        if len(hline) < 1:
            continue
        if hline[0] in COMMENTCHARS:
            hline = hline[1:].strip()
        keywds = []
        if ':' in hline:    # keywords in 'x: 22'
            words = hline.split(':', 1)
            keywds = words[0].split()
        elif '=' in hline:  # keywords in 'x = 22'
            words = hline.split('=', 1)
            keywds = words[0].split()
        if len(keywds) == 1:
            key = colname(keywds[0])
            if key.startswith('_'):
                key = key[1:]
            if len(words) > 1:
                header_attrs[key] = words[1].strip()

    path, fname = os.path.split(filename)
    group = Group(name='ascii_file %s' % filename,
                  path=filename,
                  filename=fname,
                  header=headers, data=[], array_labels=[])

    if len(data) == 0:
        return group

    if sort and 0 <= sort_column < ncol:
        data = data[:, np.argsort(data[sort_column])]

    group.data = data

    if len(footers) > 0:
        group.footer = footers

    group.attrs = Group(name='header attributes from %s' % filename)
    for key, val in header_attrs.items():
        setattr(group.attrs, key, val)

    if isinstance(labels, str):
        for bchar in ',#@%|:*':
            labels = labels.replace(bchar, '')
        labels = labels.split()

    if labels is None and not simple_labels:
        # try beamline-specific header formats to get labels
        bldat = guess_beamline(headers)(headers)
        labels = bldat.get_array_labels()

        if getattr(bldat, 'energy_units', 'eV') != 'eV':
            group.energy_units = bldat.energy_units
        if getattr(bldat, 'energy_column', 1) != 1:
            group.energy_column = bldat.energy_column
        if getattr(bldat, 'mono_dspace', -1) > 0:
            group.mono_dspace = bldat.mono_dspace

    set_array_labels(group, labels=labels, simple_labels=simple_labels)
    return group

442 

def set_array_labels(group, labels=None, simple_labels=False,
                     save_oldarrays=False):
    """set array names for a group from its 2D `data` array.

    Arguments
    ----------
      group (Group): group with a 2D `data` array, shape (ncolumns, nrows)
      labels (list of strings or None): array of labels to use [None]
      simple_labels (bool): flag to use ('col1', 'col2', ...) [False]
      save_oldarrays (bool): flag to save old array names [False]

    Returns
    -------
      group with newly named attributes of 1D array data, and
      an updated `array_labels` giving the mapping of `data`
      columns to attribute names.

    Notes
    ------
     1. if `simple_labels=True` it will overwrite any values in `labels`

     2. Array labels must be valid python names. If not enough labels
        are specified, or if name clashes arise, the array names may be
        modified, often by appending an underscore and letter or by using
        ('col1', 'col2', ...) etc.

     3. When `save_oldarrays` is `False` (the default), arrays named in the
        current `group.array_labels` will be erased.  Other arrays and
        attributes will not be changed.
    """
    write = sys.stdout.write
    if not hasattr(group, 'data'):
        write("cannot set array labels for group '%s': no `data`\n" % repr(group))
        return

    # clear old arrays, if desired
    oldlabels = getattr(group, 'array_labels', None)
    if oldlabels is not None and not save_oldarrays:
        for attr in oldlabels:
            if hasattr(group, attr):
                delattr(group, attr)

    ncols, nrows = group.data.shape

    ####
    # step 1: determine user-defined labels from input options,
    # generating list `tlabels` of trial labels.
    #
    # simple column labels, used as backup
    clabels = ['col%d' % (i+1) for i in range(ncols)]

    if isinstance(labels, str):
        labels = labels.split()

    tlabels = labels
    # if simple column names requested or none given, use simple column names
    if simple_labels or tlabels is None:
        tlabels = clabels

    ####
    # step 2: check input and correct problems
    # 2.a: check for not enough and too many labels
    if len(tlabels) < ncols:
        for i in range(len(tlabels), ncols):
            tlabels.append("col%i" % (i+1))
    elif len(tlabels) > ncols:
        tlabels = tlabels[:ncols]

    # 2.b: check for names that clash with group attributes
    # or that are repeated; append a letter to disambiguate.
    reserved_names = ('data', 'array_labels', 'filename',
                      'attrs', 'header', 'footer')
    extras = string.ascii_lowercase
    labels = []
    for i in range(ncols):
        lname = tlabels[i]
        if lname in reserved_names or lname in labels:
            lname = lname + '_a'
            j = 0
            while lname in labels:
                j += 1
                if j == len(extras):
                    break
                lname = "%s_%s" % (tlabels[i], extras[j])
        if lname in labels:
            # ran out of letters: fall back to simple column name
            lname = clabels[i]
        labels.append(lname)

    ####
    # step 3: assign attribute names, set 'array_labels'
    for i, name in enumerate(labels):
        setattr(group, name, group.data[i])
    group.array_labels = labels
    return group

540 

541 

def write_ascii(filename, *args, commentchar='#', label=None, header=None):
    """
    write a list of items to an ASCII column file

    Arguments:
      filename (str): name of output file
      args (list of groups): list of arrays (and other items) to write
      commentchar (str) : character for comment ('#')
      label (str or None): array label line (autogenerated if None)
      header (list of strings): array of strings for header;
          if None, a simple 'Output from Larch' line is used

    Returns:
      None

    Examples:
       >>> write_ascii('myfile', group.energy, group.norm, header=['comment1', 'comment2'])
    """
    ARRAY_MINLEN = 2
    com = commentchar
    if header is None:
        # default banner; the original placed this after header was
        # already defaulted to [], making it dead code
        header = ['Output from Larch %s' % time.ctime()]

    arrays = []
    arraylen = None

    for arg in args:
        if isinstance(arg, np.ndarray) and len(arg) > ARRAY_MINLEN:
            if arraylen is None:
                arraylen = len(arg)
            else:
                # truncate output to the shortest array
                arraylen = min(arraylen, len(arg))
            arrays.append(arg)
        else:
            # non-arrays and too-short arrays are recorded in the header
            header.append(repr(arg))

    if arraylen is None:
        raise ValueError("write_ascii() need %i or more elements in arrays." % ARRAY_MINLEN)

    buff = []
    for s in header:
        buff.append('%s %s' % (com, s))
    buff.append('%s---------------------------------' % com)
    if label is None:
        label = (' '*13).join(['col%d' % (i+1) for i in range(len(arrays))])
    # use commentchar here too (was hard-coded '#')
    buff.append('%s %s' % (com, label))

    arrays = np.array(arrays)
    for i in range(arraylen):
        w = [" %s" % lformat(val[i], length=14) for val in arrays]
        buff.append(' '.join(w))
    buff.append('')

    with open(filename, 'w', encoding=sys.getdefaultencoding()) as fout:
        fout.write('\n'.join(buff))
    sys.stdout.write("wrote to file '%s'\n" % filename)

605 

606 

def write_group(filename, group, scalars=None, arrays=None,
                arrays_like=None, commentchar='#'):
    """(deprecated) write components of a group to an ASCII column file

    Warning:
       This is pretty minimal and may work poorly for large groups of
       complex data.  Use `save_session` instead.
    """
    items = dir(group)
    if arrays is None:
        arrays = []
    if scalars is None:
        scalars = []

    # establish the reference array length from `arrays_like`, if given
    npts = 0
    if arrays_like is not None and arrays_like in items:
        ref = getattr(group, arrays_like)
        if isinstance(ref, np.ndarray):
            npts = len(ref)

    # collect all group arrays that match the reference length
    for name in items:
        val = getattr(group, name)
        if isinstance(val, np.ndarray):
            if npts != 0 and npts == len(val) and name not in arrays:
                arrays.append(name)

    # scalar members become 'name = value' header lines
    header = []
    for sname in scalars:
        if sname in items:
            header.append("%s = %s" % (sname, getattr(group, sname)))

    label = ' '.join(arrays)

    args = [getattr(group, name) for name in arrays if name in items]

    write_ascii(filename, *args, commentchar=commentchar,
                label=label, header=header)

652 

def read_fdmnes(filename, **kwargs):
    """read [FDMNES](http://fdmnes.neel.cnrs.fr/) ascii files"""
    group = read_ascii(filename, **kwargs)
    group.header_dict = dict(filetype='FDMNES', energy_units='eV')
    for headline in group.header:
        if "E_edge" in headline:
            if headline.startswith("#"):
                headline = headline[1:]
            # header line is '<values> = <comma-separated names>'
            valpart = headline.split(" = ")[0]
            namepart = headline.split(" = ")[1]
            vals = [float(tok) for tok in valpart.split(" ") if tok]
            names = namepart.split(", ")
            group.header_dict.update(dict(zip(names, vals)))
    group.name = 'FDMNES file (unknown)'
    # FDMNES energies are relative to the edge: shift to absolute energy
    group.energy += group.header_dict["E_edge"]
    # strip leading underscore from array labels: '_arrlabel' -> 'arrlabel'
    for ilab, lab in enumerate(group.array_labels):
        if lab.startswith("_"):
            newlab = lab[1:]
            group.array_labels[ilab] = newlab
            delattr(group, lab)
            setattr(group, newlab, group.data[ilab])
    return group

674 

def guess_filereader(path, return_text=False):
    """guess function name to use to read a data file based on the file header

    Arguments
    ---------
    path (str) : file path to be read
    return_text (bool): whether to also return the text read

    Returns
    -------
    name of function (as a string) to use to read file
    if return_text: text of the read file
    """
    text = read_textfile(path)
    line1 = text.split('\n')[0].lower()

    # checks are ordered: a later match overrides an earlier one
    reader = 'read_ascii'
    for key, rname in (('epics scan', 'read_gsescan'),
                       ('xdi', 'read_xdi'),
                       ('epics stepscan file', 'read_gsexdi')):
        if key in line1:
            reader = rname
    if ("#s" in line1) or ("#f" in line1):
        reader = 'read_specfile'
    if 'fdmnes' in line1:
        reader = 'read_fdmnes'

    if return_text:
        return reader, text
    return reader