Coverage for src/file_tree/template.py: 87%
726 statements
« prev ^ index » next coverage.py v7.6.9, created at 2024-12-17 13:27 +0000
« prev ^ index » next coverage.py v7.6.9, created at 2024-12-17 13:27 +0000
1"""Define Placeholders and Template interface."""
2import itertools
3import os
4import re
5import string
6from collections import defaultdict
7from collections.abc import MutableMapping
8from functools import cmp_to_key, lru_cache
9from glob import glob
10from fnmatch import fnmatch
11from itertools import chain, combinations, product
12from pathlib import Path
13from typing import (
14 Any,
15 Collection,
16 Dict,
17 FrozenSet,
18 Generator,
19 Iterable,
20 Iterator,
21 List,
22 Optional,
23 Sequence,
24 Set,
25 Tuple,
26)
28import numpy as np
29import pandas as pd
30import xarray
31from parse import compile, extract_format
def is_singular(value):
    """Tell whether `value` is a single value rather than a collection of options.

    Strings always count as a single value, even though they can be iterated over.
    Any other object is singular exactly when `iter` refuses it with a TypeError.
    """
    if isinstance(value, str):
        return True
    try:
        iter(value)
    except TypeError:
        # not iterable, so there is only one value
        return True
    return False
class Placeholders(MutableMapping):
    """Dictionary-like object containing the placeholder values.

    It understands about sub-trees
    (i.e., if "<sub_tree>/<placeholder>" does not exist it will return "<placeholder>" instead).

    Linked placeholders are stored together: `mapping` contains a single entry
    keyed by the frozenset of the linked names, whose value is a dictionary from
    each name to its values. `linkages` maps every linked name back to that frozenset.
    """

    def __init__(self, *args, **kwargs):
        """Create a new Placeholders as any dictionary."""
        # plain keys -> value; frozenset keys -> {name: values} for linked placeholders
        self.mapping = {}
        # linked placeholder name -> frozenset of all names it is linked with
        self.linkages: Dict[str, FrozenSet[str]] = {}
        self.update(dict(*args, **kwargs))

    def copy(self) -> "Placeholders":
        """Create copy of placeholder values.

        The dictionaries holding linked placeholder values are copied as well,
        so that re-linking or updating the copy never alters the original.
        """
        p = Placeholders()
        p.mapping = {
            key: dict(value) if isinstance(key, frozenset) else value
            for key, value in self.mapping.items()
        }
        p.linkages = dict(self.linkages)
        return p

    def __getitem__(self, key: str):
        """Get placeholder values respecting sub-tree placeholders.

        Raises:
            KeyError: if neither the key nor any of its parents has a value.
        """
        actual_key = self.find_key(key)
        if actual_key is None:
            raise KeyError(f"No parameter value available for {key}")
        if actual_key in self.linkages:
            # linked values live in the dictionary stored under the frozenset of names
            return self.mapping[self.linkages[actual_key]][actual_key]
        return self.mapping[actual_key]

    def __delitem__(self, key):
        """Delete placeholder values represented by key.

        A tuple or frozenset key removes the whole group of linked placeholders.
        """
        if isinstance(key, tuple):
            key = frozenset(key)
        del self.mapping[key]
        if isinstance(key, frozenset):
            for k in key:
                del self.linkages[k]

    def __setitem__(self, key, value):
        """Overwrite placeholder value, adjusting linked placeholders if needed.

        - A tuple key creates linked placeholders; `value` should contain one
          equally long sequence per name in the tuple.
        - A frozenset key stores `value` (a dict from name to values) as linked placeholders.
        - A key that is currently linked restricts/extends the linked values (see `_set_linked_member`).
        - Any other key is stored as is.

        Raises:
            ValueError: if the linked values are inconsistent with the key or each other,
                or if a name is already linked to placeholders outside of `key`.
        """
        if isinstance(key, tuple):  # create linked placeholders
            if len(key) != len(value):
                raise ValueError(
                    f"Attempting to set linked placeholders for {key}, "
                    + f"but {value} has a different number of elements than {key}"
                )
            if any(len(value[0]) != len(v) for v in value):
                raise ValueError(
                    f"Attempting to set linked placeholders for {key}, "
                    + f"but not all elements in {value} have the same length."
                )
            value = dict(zip(key, value))
            key = frozenset(key)

        if isinstance(key, frozenset):
            self._set_linked(key, value)
        elif key in self.linkages:
            self._set_linked_member(key, value)
        else:
            self.mapping[key] = value

    def _set_linked(self, key, value):
        """Store `value` (dict from name to values) as the linked placeholders `key` (frozenset)."""
        assert isinstance(value, dict)
        for k in list(key):
            if k in self.linkages:
                unmatched_keys = [
                    unmatched for unmatched in self.linkages[k] if unmatched not in key
                ]
                if len(unmatched_keys) > 0:
                    raise ValueError(
                        f"Attempting to set linked placeholders for {key}, "
                        + f"but {k} is already linked to {unmatched_keys}."
                    )
        self.mapping[key] = value
        for k in list(key):
            # the individual value and any previous (smaller) link are superseded
            if k in self.mapping:
                del self.mapping[k]
            if k in self.linkages:
                if self.linkages[k] in self.mapping and self.linkages[k] != key:
                    del self.mapping[self.linkages[k]]
                del self.linkages[k]
            self.linkages[k] = key

    def _set_linked_member(self, key, value):
        """Set a new `value` for `key`, which is currently linked to other placeholders."""
        old_values = self.mapping[self.linkages[key]]
        if is_singular(value):
            nvalue = old_values[key].count(value)
            self.unlink(*old_values.keys())
            if nvalue == 0:
                # new value does not match any linked one: other placeholders lose their value
                for skey in old_values:
                    del self.mapping[skey]
                self.mapping[key] = value
            elif nvalue == 1:
                # single match: all placeholders become singular
                idx = old_values[key].index(value)
                for skey in old_values:
                    self.mapping[skey] = old_values[skey][idx]
            else:
                # multiple matches: the other placeholders remain linked to each other
                idx = [i for i, v in enumerate(old_values[key]) if v == value]
                for skey in old_values:
                    if key == skey:
                        self.mapping[key] = value
                    else:
                        self.mapping[skey] = tuple(old_values[skey][i] for i in idx)
                self.link(*[skey for skey in old_values if skey != key])
        else:
            # Integer entries select existing rows;
            # 1-tuples mark new values without a counterpart in the other placeholders.
            idx = []
            for new_v in value:
                nfound = 0
                for i, old_v in enumerate(old_values[key]):
                    if old_v == new_v and i not in idx:
                        idx.append(i)
                        nfound += 1
                if nfound == 0:
                    idx.append((new_v,))
            # Build a fresh dictionary rather than editing `old_values` in place,
            # as that dictionary might be shared with other Placeholders objects.
            self.mapping[self.linkages[key]] = {
                skey: tuple(
                    old_values[skey][i]
                    if isinstance(i, int)
                    else (i[0] if skey == key else None)
                    for i in idx
                )
                for skey in old_values
            }

    def __iter__(self):
        """Iterate over all placeholder keys that actually have values."""
        for key in self.mapping:
            if self.mapping[key] is not None:
                yield key

    def __len__(self):
        """Return number of keys in the mapping that have a value."""
        return sum(1 for value in self.mapping.values() if value is not None)

    def __repr__(self):
        """Text representation of placeholder values."""
        return f"Placeholders({self.mapping})"

    def find_key(self, key: str) -> Optional[str]:
        """Find the actual key containing the value.

        Will look for:

            - not None value for the key itself
            - not None value for any parent (i.e, for key "A/B", will look for "B" as well)
            - otherwise will return None

        Args:
            key (str): placeholder name

        Returns:
            None if no value for the key is available, otherwise the key used to index the value
        """
        if not isinstance(key, str):
            # sequence of names referring to linked placeholders
            key = frozenset(key)
        elif key in self.linkages:
            return key
        if self.mapping.get(key, None) is not None:
            return key
        elif "/" in key:
            _, *parts = key.split("/")
            new_key = "/".join(parts)
            return self.find_key(new_key)
        else:
            return None

    def missing_keys(self, all_keys: Collection[str], top_level=True) -> Set[str]:
        """Identify any placeholder keys in `all_keys` that are not defined.

        If `top_level` is True (default), any sub-tree information is removed from the missing keys.
        """
        not_defined = {key for key in all_keys if self.find_key(key) is None}
        if not top_level:
            return not_defined
        return {key.split("/")[-1] for key in not_defined}

    def split(self) -> Tuple["Placeholders", "Placeholders"]:
        """Split all placeholders into those with a single value or those with multiple values.

        Placeholders are considered to have multiple values if they are equivalent to 1D-arrays (lists, tuples, 1D ndarray, etc.).
        Anything else is considered a single value (string, int, float, etc.).
        Linked placeholders always end up with the multi-valued placeholders.

        Returns:
            Tuple with two dictionaries:

                1. placeholders with single values
                2. placeholders with multiple values
        """
        single_placeholders = Placeholders()
        multi_placeholders = Placeholders()
        for name, value in self.mapping.items():
            if isinstance(name, frozenset) or not is_singular(value):
                multi_placeholders[name] = value
            else:
                single_placeholders[name] = value
        return single_placeholders, multi_placeholders

    def iter_over(self, keys) -> Generator["Placeholders", None, None]:
        """Iterate over the placeholder names.

        Args:
            keys (Sequence[str]): sequence of placeholder names to iterate over

        Raises:
            KeyError: Raised if any of the provided `keys` does not have any value.

        Yields:
            yield Placeholders object, where each of the listed keys only has a single possible value
        """
        found_keys = [self.find_key(key) for key in keys]
        unfilled = {orig for orig, found in zip(keys, found_keys) if found is None}
        if len(unfilled) > 0:
            raise KeyError(f"Can not iterate over undefined placeholders: {unfilled}")
        # linked placeholders are represented by the frozenset of all linked names
        actual_keys = [
            self.linkages.get(found, key) for key, found in zip(keys, found_keys)
        ]

        unique_keys = []
        iter_values = {}
        for key in actual_keys:
            if key not in unique_keys:
                if isinstance(key, frozenset):  # linked placeholder
                    unique_keys.append(key)
                    iter_values[key] = [
                        {k: self[k][idx] for k in key}
                        for idx in range(len(self[next(iter(key))]))
                    ]
                elif not is_singular(self[key]):  # iterable placeholder
                    unique_keys.append(key)
                    iter_values[key] = self[key]

        for values in product(*[iter_values[k] for k in unique_keys]):
            new_vars = Placeholders(self)
            for key, value in zip(unique_keys, values):
                if isinstance(key, frozenset):
                    del new_vars[key]  # break the placeholders link
                    new_vars.update(value)
                else:
                    new_vars[key] = value
            yield new_vars

    def link(self, *keys):
        """
        Link the placeholders represented by `keys`.

        When iterating over linked placeholders the i-th tree
        will contain the i-th element from all linked placeholders,
        instead of the tree containing all possible combinations of placeholder values.

        This can be thought of using `zip` for linked variables and
        `itertools.product` for unlinked ones.
        """
        actual_keys = set()
        for key in keys:
            if key in self.linkages:
                # drag along anything this placeholder was already linked to
                actual_keys.update(self.linkages[key])
            else:
                actual_keys.add(key)
        self[frozenset(actual_keys)] = {key: self[key] for key in actual_keys}

    def unlink(self, *keys):
        """
        Unlink the placeholders represented by `keys`.

        See :meth:`link` for how linking affects the iteration
        through placeholders with multiple values.

        Raises a ValueError if the placeholders are not actually linked.
        """
        if keys not in self:
            raise ValueError(f"{keys} were not linked, so cannot unlink them")
        new_vars = {k: self[k] for k in keys}
        del self[keys]
        self.update(new_vars)

    def to_string(self):
        """Describe the placeholders as text with one `<name> = <value(s)>` line per placeholder.

        Placeholders without a value are skipped.
        Each group of linked placeholders adds a final `&LINK <names>` line.
        """
        lines = []
        all_keys = sorted(
            [
                *self.linkages.keys(),
                *[k for k in self.mapping.keys() if not isinstance(k, frozenset)],
            ]
        )
        for key in all_keys:
            # `get` rather than indexing: indexing raises for placeholders set to None
            value = self.get(key, None)
            if value is None:
                continue
            if np.array(value).ndim == 1:
                lines.append(f"{key} = {', '.join([str(v) for v in value])}")
            else:
                lines.append(f"{key} = {value}")
        for key in self.mapping.keys():
            if isinstance(key, frozenset):
                lines.append(f"&LINK {', '.join(sorted(key))}")
        return "\n".join(lines)
class MyDataArray:
    """Light-weight stand-in for xarray.DataArray used internally.

    The values are kept as a plain array plus a list of `(name, values)` coordinates
    for as long as possible, because real DataArray objects are slow for small arrays.
    """

    def __init__(self, data, coords=None):
        """Wrap `data`.

        Without `coords` the data has to be a real xarray.DataArray;
        with `coords` the data and coordinates are stored as given.
        """
        self.as_xarray = coords is None
        if not self.as_xarray:
            self.data = data
            self.coords = coords
            return
        assert isinstance(data, xarray.DataArray)
        self.data_array = data

    def map(self, func) -> "MyDataArray":
        """Return a new array with `func` applied to every element."""
        if self.as_xarray:
            mapped = xarray.apply_ufunc(func, self.data_array, vectorize=True)
            return MyDataArray(mapped)
        results = [func(element) for element in self.data.flat]
        reshaped = np.array(results).reshape(self.data.shape)
        return MyDataArray(reshaped, self.coords)

    def to_xarray(
        self,
    ) -> xarray.DataArray:
        """Return the content as a real xarray.DataArray (building it if needed)."""
        if self.as_xarray:
            return self.data_array
        indices = [_to_index(name, values) for name, values in self.coords]
        return xarray.DataArray(self.data, indices)

    @staticmethod
    def concat(parts, new_index) -> "MyDataArray":
        """Stack `parts` along a new leading dimension described by `new_index`.

        `new_index` is a `(name, values)` pair.
        The result is only backed by xarray if one of the parts already is
        or if the coordinates of the parts do not line up.
        """
        if len(parts) == 0:
            return MyDataArray(np.array([]), [])

        reference = parts[0]

        def coords_differ(part):
            # different number of dimensions or any dimension with a different name
            if len(part.coords) != len(reference.coords):
                return True
            return any(
                np.all(name1 != name2)
                for (name1, _), (name2, _) in zip(part.coords, reference.coords)
            )

        needs_xarray = any(p.as_xarray for p in parts) or any(
            coords_differ(p) for p in parts
        )
        if needs_xarray:
            combined = xarray.concat(
                [p.to_xarray() for p in parts], _to_index(*new_index)
            )
            return MyDataArray(combined)

        stacked = np.stack([p.data for p in parts], axis=0)
        return MyDataArray(stacked, [new_index, *reference.coords])
def _to_index(name, values):
    """Build the pandas index describing one dimension of a MyDataArray.

    A string `name` gives a plain `pd.Index` with that name.
    Any other `name` is treated as a collection of names: the result is a tuple with
    the sorted names joined by "-" and a `pd.MultiIndex` built from the `values` tuples.
    """
    if not isinstance(name, str):
        joined_name = "-".join(sorted(name))
        return (joined_name, pd.MultiIndex.from_tuples(values, names=name))
    return pd.Index(values, name=name)
class Template:
    """Represents a single template in the FileTree.

    A template consists of a `unique_part`, which is appended to the path of
    the (optional) `parent` template.
    """

    def __init__(self, parent: Optional["Template"], unique_part: str):
        """Create a new child template in `parent` directory with `unique_part` filename."""
        self.parent = parent
        self.unique_part = unique_part

    @property
    def as_path(self) -> Path:
        """Return the full path with no placeholders filled in."""
        if self.parent is None:
            return Path(self.unique_part)
        return self.parent.as_path.joinpath(self.unique_part)

    @property
    def as_string(self):
        """Return the full path as a string with no placeholders filled in."""
        if self.parent is None:
            return str(self.unique_part)
        return os.path.join(self.parent.as_string, str(self.unique_part))

    def __str__(self):
        """Return string representation of template."""
        return f"Template({self.as_string})"

    def children(self, templates: Iterable["Template"]) -> List["Template"]:
        """Find children from a sequence of templates.

        Any `DuplicateTemplate` is searched for children as well.

        Args:
            templates: sequence of possible child templates.

        Returns:
            list of children templates sorted by their `unique_part`
        """
        res = []

        def add_if_child(possible_child):
            if isinstance(possible_child, DuplicateTemplate):
                for t in possible_child.templates:
                    add_if_child(t)
            elif possible_child.parent is self and possible_child not in res:
                res.append(possible_child)

        for t in templates:
            add_if_child(t)
        return sorted(res, key=lambda t: t.unique_part)

    def as_multi_line(
        self, other_templates: Dict["Template", Set[str]], indentation=4
    ) -> str:
        """Generate a string describing this and any child templates.

        Args:
            other_templates (Dict[Template, Set[str]]):
                templates including all the child templates and itself.
            indentation (int, optional):
                number of spaces to use as indentation. Defaults to 4.

        Returns:
            str: multi-line string that can be processed by :meth:`file_tree.FileTree.read`
        """
        result = self._as_multi_line_helper(other_templates, indentation)

        is_top_level = "" in other_templates[self]
        if not is_top_level and self.parent is None:
            # parent-less template that is not the top-level one gets marked with "!"
            return "!" + result
        else:
            return result

    def _as_multi_line_helper(
        self,
        other_templates: Dict["Template", Set[str]],
        indentation=4,
        _current_indentation=0,
    ) -> str:
        """Recursively build the multi-line description (see :meth:`as_multi_line`).

        Children without children of their own are listed before those with children.
        """
        leaves = []
        branches = []
        # `children` already returns the templates sorted by their unique part
        for t in self.children(other_templates.keys()):
            if len(t.children(other_templates.keys())) == 0:
                leaves.append(t)
            else:
                branches.append(t)

        is_top_level = "" in other_templates[self]
        if is_top_level:
            base_line = "."
            assert _current_indentation == 0 and self.parent is None
            # children of the top-level template are not indented
            _current_indentation = -indentation
        else:
            base_line = _current_indentation * " " + self.unique_part

        all_keys = set(other_templates[self])
        if is_top_level and all_keys == {""}:
            lines = []
        elif len(all_keys) == 1 and list(all_keys)[0] == self.guess_key():
            # the key matches the default one, so there is no need to list it
            lines = [base_line]
        else:
            if is_top_level:
                all_keys.remove("")
            lines = [base_line + f' ({",".join(sorted(all_keys))})']

        already_done = set()
        for t in leaves + branches:
            if t not in already_done:
                lines.append(
                    t._as_multi_line_helper(
                        other_templates, indentation, indentation + _current_indentation
                    )
                )
                already_done.add(t)
        return "\n".join(lines)

    @property
    def _parts(
        self,
    ):
        """Return the parsed version of the full template string."""
        return TemplateParts.parse(self.as_string)

    def placeholders(self, valid=None) -> List[str]:
        """Return a list of the placeholder names.

        Args:
            valid: Collection of valid placeholder names.
                An error is raised if any other placeholder is detected.
                By default all placeholder names are fine.

        Returns:
            List[str]: placeholder names in order that they appear in the template
        """
        return self._parts.ordered_placeholders(valid)

    def format_single(
        self, placeholders: Placeholders, check=True, keep_optionals=False, glob=True
    ) -> str:
        """Format the template with the placeholders filled in.

        Only placeholders with a single value are considered.

        Args:
            placeholders (Placeholders): values to fill into the placeholder
            check (bool): raise an error for any missing placeholders if set to True (default)
            keep_optionals: if True keep optional parameters that have not been set (will cause the check to fail)
            glob: setting for pattern matching (passed on to `pattern_match`)

        Raises:
            KeyError: if `check` is True and any placeholder is missing

        Returns:
            str: filled in template
        """
        single_placeholders, _ = placeholders.split()
        template = self._parts.fill_single_placeholders(single_placeholders)
        if not keep_optionals:
            template = template.remove_optionals()
        if check:
            unfilled = template.required_placeholders()
            if len(unfilled) > 0:
                raise KeyError(f"Missing placeholder values for {unfilled}")
        return pattern_match(str(template), glob)

    def format_mult(
        self, placeholders: Placeholders, check=False, filter=False, matches=None, glob=False
    ) -> xarray.DataArray:
        """Replace placeholders in template with the provided placeholder values.

        Args:
            placeholders: mapping from placeholder names to single or multiple values
            check: raise an error for any missing placeholders if set to True
            filter: filter out non-existing files if set to True
            matches: Optional pre-generated list of any matches to the template.
            glob: keyword determining the pattern matching behaviour (passed on to `pattern_match`)

        Raises:
            KeyError: if `check` is True and any placeholder is missing

        Returns:
            xarray.DataArray: array with possible resolved paths.
                If `filter` is set to True the non-existent paths are replaced by empty strings
        """
        parts = self._parts
        resolved = parts.resolve(placeholders)
        if check:
            # An xarray-backed MyDataArray keeps its values in `data_array` rather than `data`
            if getattr(resolved, "as_xarray", False):
                all_templates = resolved.data_array.values
            else:
                all_templates = resolved.data
            for template in all_templates.flatten():
                unfilled = template.required_placeholders()
                if len(unfilled) > 0:
                    raise KeyError(f"Missing placeholder values for {unfilled}")

        def _match_single(t):
            try:
                return pattern_match(str(t), glob)
            except FileNotFoundError:
                if filter:
                    return ""
                raise

        paths = resolved.map(_match_single)
        if not filter:
            return paths.to_xarray()

        placeholder_dict = dict(placeholders)
        path_matches = [
            str(
                parts.fill_single_placeholders(
                    Placeholders({**placeholder_dict, **match})
                ).remove_optionals()
            )
            for match in (
                self.all_matches(placeholders) if matches is None else matches
            )
        ]

        def _keep_if_matched(p):
            # keep a path if it agrees with any match (compared as a pattern for globs)
            if any(
                (fnmatch(p, m) if is_glob_pattern(m) else p == m)
                for m in path_matches
            ):
                return p
            return ""

        return paths.map(_keep_if_matched).to_xarray()

    def optional_placeholders(
        self,
    ) -> Set[str]:
        """Find all placeholders that are only within optional blocks (i.e., they do not require a value).

        Returns:
            Set[str]: names of optional placeholders
        """
        return self._parts.optional_placeholders()

    def required_placeholders(
        self,
    ) -> Set[str]:
        """Find all placeholders that are outside of optional blocks (i.e., they do require a value).

        Returns:
            Set[str]: names of required placeholders
        """
        return self._parts.required_placeholders()

    def guess_key(
        self,
    ) -> str:
        """Propose a short name for the template.

        The proposed short name is created by:

            - taking the basename (i.e., last component) of the path
            - removing the first '.' and everything beyond (to remove the extension)

        .. warning::

            If there are multiple dots within the path's basename,
            this might remove far more than just the extension.

        Returns:
            str: proposed short name for this template (used if user does not provide one)
        """
        parts = self.as_path.parts
        if len(parts) == 0:
            return ""
        else:
            return parts[-1].split(".")[0]

    def add_precursor(self, text) -> "Template":
        """Return a new Template with any placeholder names in the unique part now preceded by `text`.

        Used for adding sub-trees
        """
        parts = TemplateParts.parse(self.unique_part).parts
        updated = "".join([str(p.add_precursor(text)) for p in parts])
        return Template(self.parent, updated)

    def get_all_placeholders(
        self, placeholders: Placeholders, link=None, return_matches=False
    ) -> Placeholders:
        """Fill placeholders with possible values based on what is available on disk.

        Args:
            placeholders: New values for undefined placeholders in template.
            link: template keys that should be linked together in the output.
            return_matches: if True, also returns any matches to the template, which can be passed on to `format_mult`.

        Returns:
            Set of placeholders updated based on files existing on disk that match this template.
            If `return_matches` is True a tuple of these placeholders and the matches is returned instead.
        """
        if link is None:
            link = []
        elif len(link) > 0 and isinstance(link[0], str):
            link = [link]
        # link is now a sequence of sequence of strings

        all_to_link = [name for single in link for name in single]
        template_keys = {
            *self.optional_placeholders(),
            *self.required_placeholders(),
        }

        undefined = set()
        placeholder_with_linked = placeholders.copy()
        for name in all_to_link:
            if placeholder_with_linked.find_key(name) is None:
                placeholder_with_linked[name] = ""
                undefined.add(name)
        undefined.update(placeholders.missing_keys(template_keys))

        matches = self.all_matches(placeholders, undefined)

        # collect the values found on disk for each undefined (and unlinked) placeholder
        found_values = defaultdict(set)
        for match in matches:
            for name, value in match.items():
                if placeholders.find_key(name) is None and name not in all_to_link:
                    found_values[name].add(value)

        def cmp(item1, item2):
            # sort None before any actual value
            if item1 is None:
                return -1
            if item2 is None:
                return 1
            if item1 < item2:
                return -1
            if item1 > item2:
                return 1
            return 0

        res = Placeholders(
            {k: sorted(v, key=cmp_to_key(cmp)) for k, v in found_values.items()}
        )
        for to_link in link:
            # unique combinations of the linked values, transposed to one sequence per placeholder
            res[tuple(to_link)] = list(zip(*sorted(
                {tuple(Placeholders(match).get(key, None) for key in to_link) for match in matches}
            )))
        if return_matches:
            return (res, matches)
        return res

    def all_matches(self, placeholders: Placeholders, keys_to_fill: Collection[str]=None) -> List[Dict[str, Any]]:
        """Return a sequence of all possible variable values for `keys_to_fill` matching existing files on disk.

        Only variable values matching existing placeholder values (in `placeholders`) are returned
        (undefined placeholders are unconstrained).
        By default `keys_to_fill` contains all placeholders in the template without a value in `placeholders`.
        """
        if keys_to_fill is None:
            keys_to_fill = placeholders.missing_keys({
                *self.required_placeholders(),
                *self.optional_placeholders(),
            })

        single_vars, multi_vars = placeholders.split()
        res = []

        def check_name_with_edit(match, name):
            # Check the matched value against the defined placeholder values.
            # If `name` itself is not known, `match` is edited in place to use the parent name instead.
            value = match[name]
            if name in single_vars and single_vars.find_key(name) == name:
                return value == single_vars[name]
            if name in multi_vars and multi_vars.find_key(name) == name:
                return value in multi_vars[name]
            if name in keys_to_fill:
                return True
            del match[name]
            _, *parts = name.split('/')
            parent_name = '/'.join(parts)
            if parent_name in match:
                return match[parent_name] == value
            match[parent_name] = value
            return check_name_with_edit(match, parent_name)

        for match in self._parts.all_matches():
            if not all(
                check_name_with_edit(match, name) for name in list(match.keys())
            ):
                continue
            res.append(match)
        return res

    def rich_line(self, all_keys):
        """Produce a line for rendering using rich.

        Args:
            all_keys: mapping from templates to the keys referring to them.
                This mapping is not altered.

        Returns:
            the unique part of the template followed by any keys (between brackets)
            that differ from the default key. All keys are marked up in cyan.
        """
        # work on a copy, so that the caller's collection of keys is left untouched
        keys = list(all_keys[self])
        base = self.guess_key()
        unique_part = str(self.unique_part)
        if base in keys:
            keys.remove(base)
            unique_part = str.replace(unique_part, base, f"[cyan]{base}[/cyan]")
        if len(keys) == 0:
            return unique_part
        return (
            unique_part
            + " ("
            + ", ".join("[cyan]" + key + "[/cyan]" for key in keys)
            + ")"
        )
class DuplicateTemplate:
    """Container for all the templates that a single key refers to."""

    def __init__(self, *templates: Template):
        """Store the initial set of conflicting templates."""
        self._templates = [*templates]

    def add_template(self, template: Template):
        """Register one more template for the same key."""
        self._templates.append(template)

    @property
    def templates(
        self,
    ):
        """Return the conflicting templates as a tuple (in the order they were added)."""
        return tuple(self._templates)
def extract_placeholders(template, filename, known_vars=None):
    """
    Determine the placeholder values that turn `template` into `filename`.

    :param template: template (as a string) matching the given filename
    :param filename: filename
    :param known_vars: already known placeholders
    :return: dictionary from placeholder names to string representations
        (unused placeholders set to None)
    """
    parsed_template = TemplateParts.parse(template)
    return parsed_template.extract_placeholders(filename, known_vars)
class Part:
    """
    Base class for the individual pieces making up a template.

    The defaults defined here describe a piece without any placeholders.
    3 subclasses are defined:

    - :class:`Literal`:
        piece of text
    - :class:`Required`:
        required placeholder to fill in
        (between curly brackets)
    - :class:`OptionalPart`:
        part of text containing optional placeholders
        (between square brackets)
    """

    def fill_single_placeholders(
        self, placeholders: Placeholders, ignore_type=False
    ) -> Sequence["Part"]:
        """Return the parts left after filling in `placeholders` (by default just this part)."""
        return (self,)

    def optional_placeholders(
        self,
    ) -> Set[str]:
        """Return the names of placeholders in optional parts (by default none)."""
        return set()

    def required_placeholders(
        self,
    ) -> Set[str]:
        """Return the names of required placeholders (by default none)."""
        return set()

    def contains_optionals(self, placeholders: Set["Part"] = None):
        """Return whether this part contains the optional placeholders (by default it does not)."""
        return False

    def append_placeholders(self, placeholders: List[str], valid=None):
        """Add the placeholders of this part in order to the given list (by default nothing is added)."""
        return None

    def add_precursor(self, text: str) -> "Part":
        """Return this part with `text` prepended to any placeholder names."""
        return self

    def for_defined(self, placeholder_names: Set[str]) -> List["Part"]:
        """Return the parts to use when the placeholders in `placeholder_names` are defined.

        Optional parts, whose placeholders are not in `placeholder_names`, get removed.
        """
        return [self]

    def remove_precursors(self, placeholders=None):
        """Return this part with the precursor removed from any placeholder key."""
        return self
class Literal(Part):
    """Plain text within a template (i.e., a part without any placeholders)."""

    def __init__(self, text: str):
        """
        Create the literal part from the text it represents.

        :param text: part of the template
        """
        self.text = text

    def __str__(self):
        """Return the text of this part unaltered."""
        return self.text

    def __eq__(self, other):
        """Compare the text with that of another `Literal`."""
        if isinstance(other, Literal):
            return self.text == other.text
        return NotImplemented
class Required(Part):
    """Placeholder part of template that requires a value."""

    def __init__(self, var_name, var_formatting=None):
        """
        Create required part of template (between curly brackets).

        Required placeholder part of template is defined by placeholder name and its format

        :param var_name: name of placeholder
        :param var_formatting: how to format the placeholder (None or empty for no formatting)
        """
        self.var_name = var_name
        self.var_formatting = var_formatting

    def __str__(self):
        """Return this part of the template as a string."""
        if self.var_formatting is None or len(self.var_formatting) == 0:
            return "{" + self.var_name + "}"
        else:
            return "{" + self.var_name + ":" + self.var_formatting + "}"

    def fill_single_placeholders(self, placeholders: Placeholders, ignore_type=False):
        """Fill placeholder values into template obeying typing.

        :param placeholders: mapping with the placeholder values
        :param ignore_type: if True the formatting of the placeholder is ignored
        :return: sequence of parts; just this part if the placeholder has no value.
        """
        value = placeholders.get(self.var_name, None)
        if value is None:
            return (self,)

        # the formatting is optional (see `__init__`), so treat None as no formatting
        formatting = "" if self.var_formatting is None else self.var_formatting
        if not ignore_type and len(formatting) > 0:
            # convert the value to the type expected by the format specification
            format_type = extract_format(formatting, [])["type"]
            if format_type in list(r"dnbox"):
                value = int(value)
            elif format_type in list(r"f%eg"):
                value = float(value)
            elif format_type in ["t" + ft for ft in "iegachs"] and isinstance(
                value, str
            ):
                from dateutil import parser

                # `parser` is the module; the parsing function is `parser.parse`
                value = parser.parse(value)

        # the filled in value might contain placeholders itself
        res = TemplateParts.parse(format(value, "" if ignore_type else formatting))
        if len(res.parts) == 1:
            return res.parts
        return res.fill_single_placeholders(
            placeholders, ignore_type=ignore_type
        ).parts

    def required_placeholders(
        self,
    ):
        """Return variable names."""
        return {self.var_name}

    def append_placeholders(self, placeholders, valid=None):
        """Add placeholder name to list of placeholders in template.

        Raises a ValueError if `valid` is given and does not contain the placeholder name.
        """
        if valid is not None and self.var_name not in valid:
            raise ValueError(f"Placeholder {self.var_name} is not defined")
        placeholders.append(self.var_name)

    def add_precursor(self, text: str) -> "Required":
        """Prepend any placeholder names by `text`."""
        return Required(text + self.var_name, self.var_formatting)

    def remove_precursors(self, placeholders=None):
        """Remove precursor from placeholder key.

        Without `placeholders` only the part of the name after the last "/" is kept.
        Otherwise, the name is replaced by the key actually holding the value in `placeholders`
        (or kept as is if there is no such key).
        """
        if placeholders is None:
            new_name = self.var_name.split("/")[-1]
        else:
            key = placeholders.find_key(self.var_name)
            new_name = self.var_name if key is None else key
        return Required(new_name, self.var_formatting)

    def __eq__(self, other):
        """Check whether `other` placeholder matches this one."""
        if not isinstance(other, Required):
            return NotImplemented
        return (
            self.var_name == other.var_name
            and self.var_formatting == other.var_formatting
        )
class OptionalPart(Part):
    """Optional part of a template (i.e., between square brackets)."""

    def __init__(self, sub_template: "TemplateParts"):
        """
        Create optional part of template (between square brackets).

        Optional part can contain literal and required parts

        :param sub_template: part of the template within square brackets
        """
        self.sub_template = sub_template

    def __str__(self):
        """Return string representation of optional part."""
        return "[" + str(self.sub_template) + "]"

    def fill_single_placeholders(self, placeholders: Placeholders, ignore_type=False):
        """Fill placeholders into text within optional part.

        Once all placeholders have been filled in, the part becomes a `Literal`.
        """
        new_opt = self.sub_template.fill_single_placeholders(
            placeholders, ignore_type=ignore_type
        )
        if len(new_opt.required_placeholders()) == 0:
            return (Literal(str(new_opt)),)
        return (OptionalPart(new_opt),)

    def optional_placeholders(self):
        """Return set of any placeholders in the optional part of the template."""
        return self.sub_template.required_placeholders()

    def contains_optionals(self, placeholders=None):
        """Check if this optional part contains any of the placeholders listed in `placeholders`.

        If `placeholders` is None, check whether this part contains any placeholder at all.
        """
        optionals = self.optional_placeholders()
        if placeholders is None:
            return len(optionals) > 0
        return len(optionals.intersection(placeholders)) > 0

    def append_placeholders(self, placeholders, valid=None):
        """Add any placeholders in the optional part to `placeholders` list.

        Nothing is added if the optional part contains a placeholder not listed in `valid`.
        """
        try:
            placeholders.extend(self.sub_template.ordered_placeholders(valid=valid))
        except ValueError:
            # invalid placeholders are tolerated here, because this part is optional
            pass

    def add_precursor(self, text: str) -> "OptionalPart":
        """Prepend precursor `text` to any placeholders in the optional part."""
        return OptionalPart(
            TemplateParts([p.add_precursor(text) for p in self.sub_template.parts])
        )

    def for_defined(self, placeholder_names: Set[str]) -> List["Part"]:
        """
        Return the template string assuming the placeholders in `placeholder_names` are defined.

        Removes any optional parts, whose placeholders are not in `placeholder_names`.
        """
        if len(self.optional_placeholders().difference(placeholder_names)) > 0:
            return []
        return list(self.sub_template.parts)

    def remove_precursors(self, placeholders=None):
        """Remove precursor from placeholder key."""
        return OptionalPart(self.sub_template.remove_precursors(placeholders))

    def __eq__(self, other):
        """Check whether two optional parts match."""
        if not isinstance(other, OptionalPart):
            return NotImplemented
        return self.sub_template == other.sub_template
class TemplateParts:
    """Representation of full template as sequence of `Part` objects."""

    # Non-greedy matches of "[...]" (optional part) and "{...}" (placeholder).
    # The capturing group makes `re.split` keep the matched text in its output.
    optional_re = re.compile(r"(\[.*?\])")
    requires_re = re.compile(r"(\{.*?\})")

    def __init__(self, parts: Sequence["Part"]):
        """Create new TemplateParts based on sequence.

        :param parts: sequence of parts; stored as a tuple
        :raises ValueError: if a string is passed (use `TemplateParts.parse` for that)
        """
        if isinstance(parts, str):
            raise ValueError(
                "Input to Template should be a sequence of parts; "
                + "did you mean to call `TemplateParts.parse` instead?"
            )
        self.parts = tuple(parts)

    @staticmethod
    def _split_required(text: str) -> Tuple[str, str]:
        """Split a braced placeholder such as "{name:fmt}" into (name, fmt).

        Only the first colon separates the name from the format, so that
        formats which contain colons themselves (e.g. "{time:%H:%M}") are kept
        intact. The format is an empty string if there is no colon.
        """
        var_name, _, var_type = text[1:-1].partition(":")
        return var_name, var_type

    @staticmethod
    @lru_cache(1000)
    def parse(text: str) -> "TemplateParts":
        """Parse a template string into its constituent parts.

        Text between square brackets becomes an `OptionalPart` (parsed
        recursively), text between curly brackets becomes a `Required`
        placeholder, and everything else becomes a `Literal`.
        Results are cached per template string.

        Args:
            text: template as string.

        Raises:
            ValueError: raised if unmatched (e.g. nested) square brackets are found.

        Returns:
            TemplateParts: object that contains the parts of the template
        """
        parts: List["Part"] = []
        for optional_parts in TemplateParts.optional_re.split(text):
            if (
                len(optional_parts) > 0
                and optional_parts[0] == "["
                and optional_parts[-1] == "]"
            ):
                inner = optional_parts[1:-1]
                if "[" in inner or "]" in inner:
                    raise ValueError(
                        f"Can not parse {text}, because unmatching square brackets were found"
                    )
                parts.append(OptionalPart(TemplateParts.parse(inner)))
            else:
                for required_parts in TemplateParts.requires_re.split(optional_parts):
                    if (
                        len(required_parts) > 0
                        and required_parts[0] == "{"
                        and required_parts[-1] == "}"
                    ):
                        var_name, var_type = TemplateParts._split_required(
                            required_parts
                        )
                        parts.append(Required(var_name, var_type))
                    else:
                        parts.append(Literal(required_parts))
        return TemplateParts(parts)

    def __str__(self):
        """Return the template as a string (normalised with `os.path.normpath`)."""
        return os.path.normpath("".join([str(p) for p in self.parts]))

    def optional_placeholders(
        self,
    ) -> Set[str]:
        """Set of optional placeholders.

        Placeholders that also appear as required are not included.
        """
        if len(self.parts) == 0:
            return set()
        optionals = set.union(*[p.optional_placeholders() for p in self.parts])
        return optionals.difference(self.required_placeholders())

    def required_placeholders(
        self,
    ) -> Set[str]:
        """Set of required placeholders."""
        if len(self.parts) == 0:
            return set()
        return set.union(*[p.required_placeholders() for p in self.parts])

    def ordered_placeholders(self, valid=None) -> List[str]:
        """Sequence of all placeholders in order (can contain duplicates)."""
        ordered_vars: List[str] = []
        for p in self.parts:
            p.append_placeholders(ordered_vars, valid=valid)
        return ordered_vars

    def fill_known(
        self, placeholders: "Placeholders", ignore_type=False
    ) -> "MyDataArray":
        """Fill in the known placeholders.

        Any optional parts, where all placeholders have been filled
        will be automatically replaced.

        :param placeholders: placeholder values; split into singular and
            multi-valued placeholders using `placeholders.split()`
        :param ignore_type: passed on to the parts when filling in values
        :return: array of templates
        """
        single, multi = placeholders.split()
        return self.remove_precursors(placeholders)._fill_known_helper(
            single, multi, ignore_type=ignore_type
        )

    def _fill_known_helper(
        self, single: "Placeholders", multi: "Placeholders", ignore_type=False
    ) -> "MyDataArray":
        """Do work for `fill_known`.

        Fills in the singular values and then recurses over the values of the
        first placeholder in the template that is found in `multi`.
        """
        new_template = self.fill_single_placeholders(single, ignore_type=ignore_type)
        for name in new_template.ordered_placeholders():
            use_name = multi.find_key(name)
            if use_name is None:
                continue
            new_multi = multi.copy()
            if use_name in multi.linkages:
                # linked placeholders are iterated over together
                values = multi[multi.linkages[use_name]]
                keys = tuple(sorted(values.keys()))
                index = (keys, zip(*[values[k] for k in keys]))
                del new_multi[new_multi.linkages[use_name]]
            else:
                values = {use_name: list(multi[name])}
                index = (use_name, values[use_name])
                del new_multi[use_name]

            parts = []
            new_single = single.copy()
            for idx in range(len(values[use_name])):
                new_vals = {n: v[idx] for n, v in values.items()}
                new_single.mapping.update(new_vals)
                parts.append(
                    new_template._fill_known_helper(
                        new_single, new_multi, ignore_type=ignore_type
                    )
                )

            # Only the first multi-valued placeholder is handled at this level;
            # any remaining ones are handled by the recursive calls above.
            return MyDataArray.concat(parts, index)
        return MyDataArray(np.array(new_template), [])

    def fill_single_placeholders(
        self, placeholders: "Placeholders", ignore_type=False
    ) -> "TemplateParts":
        """
        Fill in placeholders with singular values.

        Assumes that all placeholders are in fact singular.
        """
        res = [
            p.fill_single_placeholders(placeholders, ignore_type=ignore_type)
            for p in self.parts
        ]
        # each part returns a sequence of new parts, which are flattened here
        return TemplateParts(list(chain(*res)))

    def remove_optionals(self, optionals=None) -> "TemplateParts":
        """
        Remove any optionals containing the provided placeholders.

        By default all optionals are removed.
        """
        return TemplateParts(
            [p for p in self.parts if not p.contains_optionals(optionals)]
        )

    def all_matches(
        self,
    ) -> List[Dict[str, Any]]:
        """Find all potential matches to existing templates.

        Returns a list with the possible combination of values for the placeholders.
        Optional placeholders that were not used in a match are set to None.
        """
        required = self.required_placeholders()
        optional = self.optional_placeholders()
        matches = []
        already_globbed = {}
        for defined_optionals in [
            c for n in range(len(optional) + 1) for c in combinations(optional, n)
        ]:
            glob_placeholders = Placeholders(
                **{req: "*" for req in required},
                **{opt: "*" for opt in defined_optionals},
            )
            new_glob = str(
                self.fill_single_placeholders(
                    glob_placeholders, ignore_type=True
                ).remove_optionals()
            )
            while "**" in new_glob:
                new_glob = new_glob.replace("**", "*")
            # Different sets of optionals can lead to the same glob pattern,
            # so the result from the filesystem is cached.
            if new_glob not in already_globbed:
                already_globbed[new_glob] = glob(new_glob)

            res = []
            defined = required.union(defined_optionals)
            for p in self.parts:
                res.extend(p.for_defined(defined))
            parser = TemplateParts(res).get_parser()
            for fn in already_globbed[new_glob]:
                try:
                    placeholders = parser(fn)
                except ValueError:
                    # filename matches the glob, but not the actual template
                    continue
                for var_name in optional:
                    if var_name not in placeholders:
                        placeholders[var_name] = None
                matches.append(placeholders)
        return matches

    def resolve(self, placeholders, ignore_type=False) -> "MyDataArray":
        """
        Resolve the template given a set of placeholders.

        :param placeholders: mapping of placeholder names to values
        :param ignore_type: if True, ignore the type formatting when
            filling in placeholders
        :return: array of templates from which the remaining optional parts
            have been removed
        """
        return self.fill_known(placeholders, ignore_type=ignore_type).map(
            lambda t: t.remove_optionals()
        )

    def optional_subsets(
        self,
    ) -> Iterator["TemplateParts"]:
        """Yield template sub-sets with every combination of optional placeholders."""
        optionals = self.optional_placeholders()
        for n_optional in range(len(optionals) + 1):
            for exclude_optional in itertools.combinations(optionals, n_optional):
                yield self.remove_optionals(exclude_optional)

    def extract_placeholders(self, filename, known_vars=None):
        """
        Extract the placeholder values from the filename.

        :param filename: filename
        :param known_vars: already known placeholders
        :return: dictionary from placeholder names to string representations
            (unused placeholders set to None)
        :raises ValueError: if the filename does not match the template
        :raises KeyError: if there are multiple equally good ways to
            match the filename
        """
        # NOTE(review): `fill_known` is annotated as returning a `MyDataArray`
        # and calls `placeholders.split()` on its input, which the plain
        # dictionary passed to it below does not provide. This method looks
        # like it predates those changes; confirm whether it is still in use.
        if known_vars is not None:
            template = self.fill_known(known_vars)
        else:
            template = self
        while "//" in filename:
            filename = filename.replace("//", "/")

        required = template.required_placeholders()
        optional = template.optional_placeholders()
        results = []
        for to_fill in template.optional_subsets():
            sub_re = str(
                to_fill.fill_known(
                    {var: r"(\S+)" for var in required.union(optional)},
                )
            )
            while "//" in sub_re:
                sub_re = sub_re.replace("//", "/")
            sub_re = sub_re.replace(".", r"\.")
            match = re.match(sub_re, filename)
            if match is None:
                continue

            extracted_value = {}
            ordered_vars = to_fill.ordered_placeholders()
            assert len(ordered_vars) == len(match.groups())

            # a placeholder used multiple times should have the same value every time
            failed = False
            for var, value in zip(ordered_vars, match.groups()):
                if var in extracted_value:
                    if value != extracted_value[var]:
                        failed = True
                        break
                else:
                    extracted_value[var] = value
            if failed or any("/" in value for value in extracted_value.values()):
                continue
            for name in template.optional_placeholders():
                if name not in extracted_value:
                    extracted_value[name] = None
            if known_vars is not None:
                extracted_value.update(known_vars)
            results.append(extracted_value)
        if len(results) == 0:
            raise ValueError("{} did not match {}".format(filename, template))

        def score(placeholders):
            """
            Assign score to possible reconstructions of the placeholder values.

            The highest score is given to the set of placeholders that:

                1. has used the largest amount of optional placeholders
                2. has the shortest text within the placeholders (only used if equal at 1)
            """
            number_used = len([v for v in placeholders.values() if v is not None])
            length_hint = sum([len(v) for v in placeholders.values() if v is not None])
            return number_used * 1000 - length_hint

        best = max(results, key=score)
        for var in results:
            if best != var and score(best) == score(var):
                raise KeyError(
                    "Multiple equivalent ways found to parse {} using {}".format(
                        filename, template
                    )
                )
        return best

    def get_parser(self):
        """Create function that will parse a filename based on this template.

        The returned function maps a filename to a dictionary of placeholder
        values and raises ValueError if the filename does not match.

        :raises ValueError: if the template still contains optional parts
        """
        if any(isinstance(p, OptionalPart) for p in self.parts):
            raise ValueError(
                "Can not parse filename when there are optional parts in the template"
            )
        # Placeholder names are temporarily replaced by three-letter names
        # in the format string; `reverse` maps those back to the original names.
        mapping = {
            old_key: "".join(new_key)
            for old_key, new_key in zip(
                self.required_placeholders(),
                itertools.product(*[string.ascii_letters] * 3),
            )
        }
        reverse = {new_key: old_key for old_key, new_key in mapping.items()}
        cleaned = str(TemplateParts(
            [
                Required(mapping[p.var_name], p.var_formatting)
                if isinstance(p, Required)
                else p
                for p in self.parts
            ]
        )).replace("?", "{:1}")

        if is_glob_pattern(cleaned):
            nreplace = cleaned.count("*")
            parsers = []

            # Each "*" is replaced by either nothing or an anonymous field,
            # with one parser for every combination of those choices.
            for replace_with in product(*([["", "{}"]] * nreplace)):
                this_string = cleaned
                for r in replace_with:
                    this_string = this_string.replace("*", r, 1)
                parsers.append(compile(this_string, case_sensitive=True).parse)

            def parser(filename):
                """Return the result of the first parser that matches (or None)."""
                for p in parsers:
                    result = p(filename)
                    if result is not None:
                        return result
                return None
        else:
            parser = compile(cleaned, case_sensitive=True).parse

        def parse_filename(filename):
            """Parse filename based on template."""
            result = parser(filename)
            if result is None:
                raise ValueError(
                    f"template string ({str(self)}) does not mach filename ({filename})"
                )
            named = result.named
            if any(isinstance(value, str) and "/" in value for value in named.values()):
                raise ValueError("Placeholder can not span directories")
            return {reverse[key]: value for key, value in named.items()}

        return parse_filename

    def remove_precursors(self, placeholders=None):
        """Replace keys to those existing in the placeholders.

        If no placeholders provided all precursors are removed.
        """
        return TemplateParts([p.remove_precursors(placeholders) for p in self.parts])

    def __eq__(self, other):
        """Check whether other template matches this one."""
        if not isinstance(other, TemplateParts):
            return NotImplemented
        return (len(self.parts) == len(other.parts)) and all(
            p1 == p2 for p1, p2 in zip(self.parts, other.parts)
        )
def is_glob_pattern(path):
    """
    Check whether `path` contains a glob wildcard.

    Only `*` and `?` are checked; square brackets are not.
    """
    return "*" in path or "?" in path


def pattern_match(path, glob_cmd):
    """
    Apply glob-like pattern matching to given `path`.

    The `path` will be returned directly if `path` does not contain any `*` or `?` or `glob_cmd` is False.
    Otherwise pattern matching using the python `glob` library is used.

    :param path: filename, which might contain `*` or `?` wildcards
    :param glob_cmd: how to select a file from the matches, one of:

        - False: return `path` unaltered
        - True or "default": return the match; raises an error if there are multiple
        - "first" or "last": return the first or last match after sorting
        - callable: called with the sorted list of matches; should return a single path
    :return: path of the selected file
    :raises FileNotFoundError: if no file matches the pattern, if multiple
        files match with `glob_cmd` set to True or "default", or if a callable
        `glob_cmd` fails on zero or multiple matches
    :raises ValueError: if `glob_cmd` is not a valid option or a callable
        `glob_cmd` does not return a string
    """
    if not (glob_cmd and is_glob_pattern(path)):
        return path

    matches = sorted(glob(path))
    if callable(glob_cmd):
        try:
            res = glob_cmd(matches)
        except Exception as err:
            # Give a more helpful error when the callable most likely failed
            # because of the number of matches; otherwise re-raise as is.
            if len(matches) == 0:
                raise FileNotFoundError(f"No file was found to match pattern `{path}`. The `FileTree.glob` function raised the underlying error.") from err
            if len(matches) > 1:
                raise FileNotFoundError(f"Multiple ({len(matches)}) files were found to match pattern `{path}`. The `FileTree.glob` function raised the underlying error.") from err
            raise
        if not isinstance(res, str):
            raise ValueError(f"The `FileTree.glob` function should return a single path as a string, not `{res}`.")
        return res

    if len(matches) == 0:
        raise FileNotFoundError(f"No file was found to match pattern `{path}`. Set `FileTree.glob` to False to return the pattern rather than a file matching the pattern.")
    if glob_cmd in (True, "default"):
        if len(matches) > 1:
            raise FileNotFoundError(f"Multiple ({len(matches)}) files were found to match pattern `{path}`. Set `FileTree.glob` to False to return the pattern rather than a file matching the pattern. You can also set it to 'first' or 'last' to get the first or last match.")
        return matches[0]
    if glob_cmd == "first":
        return matches[0]
    if glob_cmd == "last":
        return matches[-1]
    # This message was missing its `f` prefix and printed the literal text "{glob_cmd}".
    raise ValueError(f"`FileTree.glob` should be set to callable or one of `False`, `True`, 'default', 'first', or 'last'. Invalid value of `{glob_cmd}` given.")