Coverage for C: \ Users \ peaco \ OneDrive \ Documents \ GitHub \ mt_metadata \ mt_metadata \ utils \ validators.py: 50%

300 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-10 00:11 -0800

1# -*- coding: utf-8 -*- 

2""" 

3======================= 

4schema 

5======================= 

6 

7Convenience Classes and Functions to deal with the base metadata standards 

8described by the csv file. 

9 

10The hope is that only the csv files will need to be changed as the standards 

11are modified. The attribute dictionaries are stored in ATTRICT 

12 

13Created on Wed Apr 29 11:11:31 2020 

14 

15@author: jpeacock 

16""" 

17import re 

18 

19# ============================================================================= 

20# Imports 

21# ============================================================================= 

22import sys 

23from collections.abc import Iterable 

24 

25import numpy as np 

26from loguru import logger 

27from pydantic import HttpUrl 

28 

29from mt_metadata import ACCEPTED_STYLES, REQUIRED_KEYS 

30from mt_metadata.utils.exceptions import MTSchemaError, MTValidatorError 

31 

32 

33# from mt_metadata.common.comment import Comment 

34 

35 

36# ============================================================================= 

37# validator functions 

38# ============================================================================= 

39 

40 

def validate_doi(value: str | HttpUrl | None) -> HttpUrl | None:
    """
    Validate a DOI and normalize it to a ``doi.org`` URL.

    Strings starting with ``10.`` or ``doi:`` are expanded to
    ``https://doi.org/<doi>`` before being converted to an ``HttpUrl``.
    Any other string is passed to ``HttpUrl`` unchanged.

    Parameters
    ----------
    value : str | HttpUrl | None
        The DOI to validate.  None and the empty string mean "no DOI".

    Returns
    -------
    HttpUrl | None
        The validated DOI as a URL, or None if no DOI was given.

    Raises
    ------
    ValueError
        If ``value`` is not a string, ``HttpUrl`` or None, or if the
        resulting URL does not start with ``https://doi.org/`` or
        ``https://dx.doi.org/``.

    Notes
    -----
    ``HttpUrl(value)`` may raise its own validation error for strings that
    are not well formed URLs.
    """
    if value is None:
        return None

    if isinstance(value, str):
        if value == "":
            return None
        if value.startswith("10."):
            value = f"https://doi.org/{value}"
        elif value.startswith("doi:"):
            # strip only the leading prefix; ``replace`` would also remove
            # any later "doi:" text that is part of the identifier itself.
            value = f"https://doi.org/{value.removeprefix('doi:')}"
        value = HttpUrl(value)
    elif not isinstance(value, HttpUrl):
        # unsupported types used to fall through and fail with an unrelated
        # AttributeError on ``unicode_string``.
        raise ValueError(
            f"Invalid DOI: expected str, HttpUrl or None, not {type(value)}"
        )

    # only the https doi.org resolvers are accepted
    if not value.unicode_string().startswith(
        ("https://doi.org/", "https://dx.doi.org/")
    ):
        raise ValueError(f"Invalid DOI: {value}")

    return value

78 

79 

80# def validate_comments(comments: str | Comment | None) -> Comment | None: 

81# """ 

82# Validate comments string. 

83 

84# Parameters 

85# ---------- 

86# comments : str | None 

87# The comments to validate. 

88 

89# Returns 

90# ------- 

91# str | None 

92# The validated comments string or None if empty. 

93# """ 

94# if isinstance(comments, str): 

95# return Comment(value=comments) # type: ignore 

96# return comments 

97 

98 

def validate_header(header, attribute=False):
    """
    Validate that a header contains exactly the required keys.

    The required keys are taken from ``REQUIRED_KEYS``.  When ``attribute``
    is False the ``'attribute'`` key is excluded from the comparison.

    Parameters
    ----------
    header : list
        list of header names
    attribute : bool, optional
        include 'attribute' in the test or not, by default False

    Returns
    -------
    list
        validated header (the input list, unchanged)

    Raises
    ------
    MTValidatorError
        If ``header`` is not a list or its keys do not match the required
        keys.
    """
    if not isinstance(header, list):
        # f-prefix was missing, so the placeholder was printed literally
        msg = f"input header must be a list, not {type(header)}"
        raise MTValidatorError(msg)

    if attribute:
        if sorted(header) != sorted(REQUIRED_KEYS):
            msg = (
                f"Keys is not correct, must include {REQUIRED_KEYS}"
                + f". Currently has {header}"
            )
            raise MTValidatorError(msg)
    else:
        required_keys = [key for key in REQUIRED_KEYS if key != "attribute"]
        if sorted(header) != sorted(required_keys):
            missing_keys = [x for x in required_keys if x not in header]
            msg = (
                f"Keys is not correct, must include {required_keys}\n"
                + f". Currently has {header}\n"
                + f"Need to add keys: {missing_keys}"
            )
            raise MTValidatorError(msg)
    return header

142 

143 

def validate_name(name):
    """
    Convert a name to the standard form ``{object}.{name_name}``.

    The following conversions are applied:

        * '/' is replaced with '.'
        * if the name contains an upper case letter (A-Z), an '_' is
          inserted before each upper case letter (except directly after
          a '.') and the name is converted to lower case

    Parameters
    ----------
    name : str
        name to validate

    Returns
    -------
    str
        validated name

    Raises
    ------
    MTValidatorError
        If ``name`` is not a string or starts with a number.
    """
    if not isinstance(name, str):
        msg = f"Attribute name must be a string, not {type(name)}"
        raise MTValidatorError(msg)

    if re.match(r"^[0-9]", name):
        msg = f"Attribute name cannot start with a number, {name}"
        raise MTValidatorError(msg)

    name = name.replace("/", ".")

    if re.search(r"[A-Z]", name):
        # split in front of every upper case letter: "timePeriod" ->
        # ["time", "Period"] -> "time_Period"
        name = "_".join(re.findall(r".[^A-Z]*", name))
        # a category separator directly followed by an upper case letter
        # must not gain an underscore: "a._B" -> "a.B"
        name = name.replace("._", ".")
        name = name.lower()

    return name

191 

192 

193def validate_station_name(name: str | int | float) -> str: 

194 """ 

195 validate station name to conform to general standards 

196 

197 - must be a string 

198 - must only contain letters, numbers, and underscores 

199 

200 Parameters 

201 ---------- 

202 name : str | int | float 

203 The station name to validate 

204 

205 Returns 

206 ------- 

207 str 

208 The validated station name 

209 

210 Raises 

211 ------ 

212 MTValidatorError 

213 If name is not a string or contains invalid characters 

214 """ 

215 name = str(name).strip() 

216 original = str(name) 

217 

218 # Replace spaces with underscores 

219 name = name.replace(" ", "_").replace("-", "_") 

220 

221 # Test if string contains only letters, numbers, and underscores 

222 if not re.match(r"^[a-zA-Z0-9_]+$", name): 

223 msg = f"Station name '{original}' contains invalid characters. Only letters, numbers, and underscores are allowed." 

224 raise MTValidatorError(msg) 

225 

226 return name 

227 

228 

def validate_attribute(name):
    """
    Convert an attribute name to the standard form ``{object}.{name_name}``.

    The following conversions are applied:

        * '/' is replaced with '.'
        * if the name contains an upper case letter (A-Z), an '_' is
          inserted before each upper case letter (except directly after
          a '.') and the name is converted to lower case

    Parameters
    ----------
    name : str
        attribute name to validate

    Returns
    -------
    str
        validated attribute name

    Raises
    ------
    MTValidatorError
        If ``name`` is not a string or starts with a number.
    """
    if not isinstance(name, str):
        msg = f"Attribute name must be a string, not {type(name)}"
        raise MTValidatorError(msg)

    if re.match(r"^[0-9]", name):
        msg = f"Attribute name cannot start with a number, {name}"
        raise MTValidatorError(msg)

    name = name.replace("/", ".")

    if re.search(r"[A-Z]", name):
        # split in front of every upper case letter, then rejoin with '_'
        name = "_".join(re.findall(r".[^A-Z]*", name))
        # do not leave an underscore directly after a category separator
        name = name.replace("._", ".")
        name = name.lower()

    return name

276 

277 

def validate_required(value):
    """
    Validate required, must be True or False

    Parameters
    ----------
    value : str or bool
        required value; strings are compared case-insensitively and
        surrounding whitespace is ignored

    Returns
    -------
    bool
        validated required value

    Raises
    ------
    MTValidatorError
        If ``value`` is a string other than 'true'/'false' or is neither a
        string nor a bool.
    """
    if isinstance(value, bool):
        return value

    if isinstance(value, str):
        normalized = value.strip().lower()
        if normalized == "false":
            return False
        if normalized == "true":
            return True
        # f-prefix was missing, so the offending value was never reported
        msg = f"Required value must be True or False, not {value}"
        raise MTValidatorError(msg)

    msg = f"Required value must be True or False, not {type(value)}"
    raise MTValidatorError(msg)

306 

307 

def validate_type(value):
    """
    Map a python type or type description onto a standard type name.

    The input is searched, case-insensitively and in this order, for the
    keywords ``int``, ``float``, ``str``, ``bool``, ``list``/``array``,
    ``dict`` and ``object``; the first hit decides the result.  A string
    containing ``h5py_reference`` is returned as given (with any
    ``<class``/``>`` text removed).

    Parameters
    ----------
    value : type or str
        required type

    Returns
    -------
    str
        one of 'integer', 'float', 'string', 'boolean', 'list', 'dict',
        'object', or the h5py reference string

    Raises
    ------
    MTValidatorError
        If ``value`` is neither a type nor a string, or no keyword matches.
    """
    if isinstance(value, type):
        value = str(value)

    if not isinstance(value, str):
        msg = (
            "'type' must be type [ int | float | str | bool | list | dict "
            f"| object ] or string. Not {value}"
        )
        raise MTValidatorError(msg)

    # drop the "<class ...>" decoration left by str(type)
    value = value.replace("<class", "").replace(">", "")
    lowered = value.lower()

    # order matters: the first keyword found wins
    keyword_names = (
        ("int", "integer"),
        ("float", "float"),
        ("str", "string"),
        ("bool", "boolean"),
        ("list", "list"),
        ("array", "list"),
        ("dict", "dict"),
        ("object", "object"),
    )
    for keyword, type_name in keyword_names:
        if keyword in lowered:
            return type_name

    if "h5py_reference" in lowered:
        return value

    msg = (
        "'type' must be type [ int | float | str | bool | list | dict "
        f"| object ]. Not {value}"
    )
    raise MTValidatorError(msg)

363 

364 

def validate_units(value):
    """
    Validate units

    ..todo:: make a list of acceptable unit names

    Parameters
    ----------
    value : str or None
        unit value to be validated

    Returns
    -------
    str or None
        lower case units, or None if the input is None or one of
        'none', 'empty', '' (case-insensitive)

    Raises
    ------
    MTValidatorError
        If ``value`` is neither a string nor None.
    """
    if value is None:
        return None

    if not isinstance(value, str):
        msg = f"'units' must be a string or None, not {type(value)}"
        raise MTValidatorError(msg)

    lowered = value.lower()
    return None if lowered in ("none", "empty", "") else lowered

391 

392 

def validate_style(value):
    """
    Validate string style

    Parameters
    ----------
    value : str or None
        style to be validated

    Returns
    -------
    str
        lower case style; 'name' if the input is None

    Raises
    ------
    MTValidatorError
        If ``value`` is not a string or its lower case form is not in
        ``ACCEPTED_STYLES``.
    """
    # None falls back to the generic name style
    if value is None:
        return "name"

    if isinstance(value, str):
        style = value.lower()
        if style in ACCEPTED_STYLES:
            return style
        msg = f"style {value} unknown, must be in {ACCEPTED_STYLES}"
        raise MTValidatorError(msg)

    msg = f"'value' must be a string. Not {type(value)}"
    raise MTValidatorError(msg)

422 

423 

def validate_description(description):
    """
    Check that the description is a string.

    Parameters
    ----------
    description : str
        detailed description of an attribute

    Returns
    -------
    str
        the description, unchanged

    Raises
    ------
    MTValidatorError
        If ``description`` is not a string.
    """
    if isinstance(description, str):
        return description

    msg = f"Description must be a string, not {type(description)}"
    raise MTValidatorError(msg)

443 

444 

def validate_options(options):
    """
    Turn options into a list of strings.

    Parameters
    ----------
    options : str, list, tuple, float, int or bool
        A string is treated as a '|' separated list, optionally wrapped in
        square brackets; entries equal to 'none' (case-insensitive) or
        empty are dropped.  Lists and tuples are converted element by
        element and a single number or bool becomes a one element list.

    Returns
    -------
    list of str
        validated options

    Raises
    ------
    MTValidatorError
        If ``options`` is of any other type.
    """
    if isinstance(options, str):
        entries = options.replace("[", "").replace("]", "").split("|")
        # strip before filtering so padded entries such as " none " or " "
        # are dropped as well
        stripped = (entry.strip() for entry in entries)
        return [entry for entry in stripped if entry.lower() not in ("none", "")]

    if isinstance(options, (list, tuple)):
        return [str(option) for option in options]

    if isinstance(options, (float, int, bool)):
        return [f"{options}"]

    # f-prefix was missing, so the type was never reported
    msg = f"Option type not understood {type(options)}"
    raise MTValidatorError(msg)

476 

477 

def validate_alias(alias):
    """
    Turn alias names into a list of strings.

    Parameters
    ----------
    alias : str, list, tuple, float, int or bool
        A string is treated as a '|' separated list, optionally wrapped in
        square brackets; entries equal to 'none' (case-insensitive) or
        empty are dropped.  Lists and tuples are converted element by
        element and a single number or bool becomes a one element list.

    Returns
    -------
    list of str
        validated alias names

    Raises
    ------
    MTValidatorError
        If ``alias`` is of any other type.
    """
    if isinstance(alias, str):
        entries = alias.replace("[", "").replace("]", "").split("|")
        # strip before filtering so padded entries such as " none " or " "
        # are dropped as well
        stripped = (entry.strip() for entry in entries)
        return [entry for entry in stripped if entry.lower() not in ("none", "")]

    if isinstance(alias, (list, tuple)):
        return [str(option) for option in alias]

    if isinstance(alias, (float, int, bool)):
        return [f"{alias}"]

    msg = f"Alias type not understood {alias}"
    raise MTValidatorError(msg)

510 

511 

def validate_example(example):
    """
    Validate example values

    Parameters
    ----------
    example : object
        example value; anything that is not a string is formatted as one

    Returns
    -------
    str
        the example as a string
    """
    return example if isinstance(example, str) else f"{example}"

529 

530 

def validate_default(value_dict):
    """
    Pick the default value for an attribute description.

    Parameters
    ----------
    value_dict : dict
        attribute description.  The keys 'required' and 'style' are always
        read; 'default', 'type' and 'options' are read for required
        attributes.

    Returns
    -------
    object
        * not required: the epoch string for date/time styles, else None
        * required with a default: the default coerced by
          ``validate_value_type``
        * required without a default: ``[]`` for list styles, the epoch
          string for date/time styles, the first option (or None if the
          options are empty or contain 'other') for controlled styles,
          otherwise ``0`` / ``'none'`` / ``False`` for numeric / string /
          boolean types and None for any other type
    """
    epoch = "1980-01-01T00:00:00+00:00"
    style = value_dict["style"] or ""
    is_date_style = "date" in style or "time" in style

    if not value_dict["required"]:
        return epoch if is_date_style else None

    default = value_dict["default"]
    if default is not None:
        return validate_value_type(default, value_dict["type"], value_dict["style"])

    if "list" in style:
        return []
    if is_date_style:
        return epoch
    if "controlled" in style:
        options = value_dict["options"]
        # an empty option list has no first entry to fall back on
        if not options or "other" in options:
            return None
        return options[0]

    v_type = value_dict["type"]
    if v_type in ("integer", "float", int, float):
        return 0
    if v_type in ("string", str):
        return "none"
    # validate_type reports booleans as "boolean"; "bool" is kept for
    # callers that pass the short name
    if v_type in ("boolean", "bool", bool):
        return False
    # h5py_reference and any other type (list, dict, object, ...) have no
    # natural default; previously unlisted types left the result unbound
    return None

577 

578 

def _type_mismatch_message(value, v_type):
    """Build the error text used when ``value`` cannot be coerced to ``v_type``."""
    return f"value={value} must be {v_type} not {type(value)}"


def validate_value_type(value, v_type, style=None):
    """
    Coerce a value to the type given by the standards.

    Parameters
    ----------
    value : object
        value to validate
    v_type : type or str or None
        expected type, either a python type or a type name understood by
        ``validate_type``.  If None the value is returned unchanged and a
        warning is logged.
    style : str, optional
        attribute style, by default None.  If it contains 'list' a string
        value is split into a list on ',', ' ' or ';' first.

    Returns
    -------
    object
        the value converted to ``v_type``.  None is returned for the
        scalars None, 'None', 'none' and 'unknown', and for the strings
        'none', 'nan' and '' when an integer is expected.

    Raises
    ------
    MTSchemaError
        If the value cannot be converted to ``v_type``.
    """
    # metadata objects are skipped, their components are validated
    # separately
    if "metadata" in str(type(value)):
        return value

    # scalar "empty" markers map to None; sequences are handled below
    if not isinstance(value, (list, tuple, np.ndarray)):
        if value in [None, "None", "none", "unknown"]:
            return None

    # hack to get around h5py reference types, in the future will need
    # a more robust test.
    if v_type == "h5py_reference":
        return value

    # nothing to validate against if the value type is not defined
    if v_type is None:
        msg = (
            "standards data type is unknown, if you want to "
            + "propogate this attribute using to_dict, to_json or "
            + "to_series, you need to add attribute description using "
            + "class function add_base_attribute."
        )
        logger.warning(msg)
        return value

    # translate a type name into the python type
    if isinstance(v_type, str):
        type_dict = {
            "string": str,
            "integer": int,
            "float": float,
            "boolean": bool,
            "list": list,
            "dict": dict,
            "object": object,
        }
        v_type = type_dict[validate_type(v_type)]

    # for list styles split a string into its entries
    if style:
        if "list" in style and isinstance(value, str):
            delimiter = " "
            if value.count(",") > 0:
                delimiter = ","
            elif value.strip().count(" ") > 0:
                delimiter = " "
            elif value.count(";") > 0:
                delimiter = ";"
            value = value.replace("[", "").replace("]", "").split(delimiter)
            value = [ss.strip() for ss in value]

    if isinstance(value, v_type):
        return value

    # a string is parsed into the requested type
    if isinstance(value, str):
        if v_type is int:
            if value.lower() in ["none", "nan", ""]:
                return None
            try:
                return int(value)
            except ValueError as error:
                raise MTSchemaError(_type_mismatch_message(value, v_type)) from error
        if v_type is float:
            try:
                return float(value)
            except ValueError as error:
                raise MTSchemaError(_type_mismatch_message(value, v_type)) from error
        if v_type is bool:
            if value.lower() in ["false", "0"]:
                return False
            if value.lower() in ["true", "1"]:
                return True
            raise MTSchemaError(_type_mismatch_message(value, v_type))
        if v_type is str:
            return value
        # no string conversion is defined for any other target type; the
        # result is None, as it always has been
        return None

    # an integer is converted to the requested numeric or string type
    if isinstance(value, (int, np.int_, np.int64, np.int32, np.int16, np.int8)):
        if v_type is float:
            return float(value)
        if v_type is str:
            return "{0:.0f}".format(value)
        return int(value)

    # a float is converted to the requested numeric or string type
    if isinstance(value, (float, np.float16, np.float32, np.float64)):
        if v_type is int:
            return int(value)
        if v_type is str:
            return f"{value}"
        return float(value)

    # the entries of an iterable are converted one by one
    if isinstance(value, Iterable):
        if v_type is str:
            if isinstance(value, np.ndarray):
                value = value.astype(np.str_)
            value = [f"{v}".replace("'", "").replace('"', "") for v in value]
        elif v_type is int:
            value = [int(float(v)) for v in value]
        elif v_type is float:
            value = [float(v) for v in value]
        elif v_type is bool:
            value_list = []
            for v in value:
                if v in [True, "true", "True", "TRUE", 1, "1"]:
                    value_list.append(True)
                elif v in [False, "false", "False", "FALSE", 0, "0"]:
                    value_list.append(False)
                # entries matching neither list are dropped
            value = value_list
        return value

    if isinstance(value, np.bool_):
        return bool(value)

    raise MTSchemaError(_type_mismatch_message(value, v_type))

721 

722 

def validate_value_dict(value_dict):
    """
    Validate an input value dictionary

    Must be of the form:
        {'type': str, 'required': True, 'style': 'name', 'units': units}

    Each key except 'default' is passed through the module level function
    ``validate_<key>``; 'default' is validated last because it depends on
    the other validated entries.  The dictionary is updated in place.

    Parameters
    ----------
    value_dict : dict
        attribute description to validate

    Returns
    -------
    dict
        the validated dictionary (the same object that was passed in).  A
        logger instance is returned unchanged.

    Raises
    ------
    MTValidatorError
        If the input is not a dictionary.
    KeyError
        If no ``validate_<key>`` function exists for a key.
    """
    if not isinstance(value_dict, dict):
        if isinstance(value_dict, type(logger)):
            return value_dict
        msg = f"Input must be a dictionary, not {type(value_dict)}"
        raise MTValidatorError(msg)

    header = validate_header(list(value_dict.keys()))
    module = sys.modules[__name__]

    # loop over validating functions in this module
    for key in header:
        if key == "default":
            continue
        # a missing attribute raises AttributeError, not KeyError, so the
        # lookup is translated here and the validator runs outside the try
        try:
            validator = getattr(module, f"validate_{key}")
        except AttributeError as error:
            raise KeyError(
                f"Could not find {key} for validator {__name__}"
            ) from error
        value_dict[key] = validator(value_dict[key])

    # need to validate the default value after all other keys have been validated
    value_dict["default"] = validate_default(value_dict)

    return value_dict