Coverage for src/hdmf/term_set.py: 97%
143 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-04 02:57 +0000
1import glob
2import os
3from collections import namedtuple
4from .utils import docval
5import warnings
6import numpy as np
7from .data_utils import append_data, extend_data
class TermSet:
    """
    Class for implementing term sets from ontologies and other resources used to define the
    meaning and/or identity of terms.

    :ivar term_schema_path: The path to the LinkML YAML enumeration schema
    :ivar sources: The prefixes for the ontologies used in the TermSet
    :ivar view: SchemaView of the term set schema
    :ivar schemasheets_folder: The path to the folder containing the LinkML TSV files
    :ivar expanded_termset_path: The path to the schema with the expanded enumerations
    """
    def __init__(self,
                 term_schema_path: str = None,
                 schemasheets_folder: str = None,
                 dynamic: bool = False
                 ):
        """
        :param term_schema_path: The path to the LinkML YAML enumeration schema
        :param schemasheets_folder: The path to the folder containing the LinkML TSV files
        :param dynamic: Boolean parameter denoting whether the schema uses Dynamic Enumerations

        :raises ValueError: If linkml-runtime is not installed, or if both a schemasheets
            folder and a schema path are provided.
        """
        try:
            from linkml_runtime.utils.schemaview import SchemaView
        except ImportError:
            msg = "Install linkml_runtime"
            raise ValueError(msg)

        self.term_schema_path = term_schema_path
        self.schemasheets_folder = schemasheets_folder

        if self.schemasheets_folder is not None:
            if self.term_schema_path is not None:
                msg = "Cannot have both a path to a Schemasheets folder and a TermSet schema."
                raise ValueError(msg)
            # generate a YAML schema from the TSV files so SchemaView can load it
            self.term_schema_path = self.__schemasheets_convert()

        self.view = SchemaView(self.term_schema_path)
        self.expanded_termset_path = None
        if dynamic:
            # reset view to now include the dynamically populated termset
            self.expanded_termset_path = self.__enum_expander()
            self.view = SchemaView(self.expanded_termset_path)

        self.sources = self.view.schema.prefixes

    def __repr__(self):
        re = "class: %s\n" % str(self.__class__)
        re += "term_schema_path: %s\n" % self.term_schema_path
        return re

    def __perm_value_key_info(self, perm_values_dict: dict, key: str):
        """
        Private method to retrieve the id, description, and the meaning.

        :param perm_values_dict: Mapping of permissible-value keys to their LinkML entries.
        :param key: The permissible-value key (term) to look up.
        :raises KeyError: If ``key`` is not in ``perm_values_dict``.
        """
        prefix_dict = self.view.schema.prefixes
        info_tuple = namedtuple("Term_Info", ["id", "description", "meaning"])
        description = perm_values_dict[key]['description']
        enum_meaning = perm_values_dict[key]['meaning']

        # split once on the first ':' to separate the ontology prefix from the local id
        prefix, _, local_id = enum_meaning.partition(':')
        prefix_obj = prefix_dict[prefix]
        prefix_reference = prefix_obj['prefix_reference']

        # combine prefix_reference and local id to make the full term uri
        meaning = prefix_reference + local_id

        return info_tuple(enum_meaning, description, meaning)

    @docval({'name': 'term', 'type': str, 'doc': "term to be validated"})
    def validate(self, **kwargs):
        """
        Validate term in dataset towards a termset.

        :return: True if the term is in the term set, False otherwise.
        """
        term = kwargs['term']
        try:
            self[term]
            return True
        except ValueError:
            return False

    @property
    def view_set(self):
        """
        Property method to return a view of all terms in the LinkML YAML Schema.
        """
        # fetch the enums once instead of traversing the SchemaView twice
        all_enums = self.view.all_enums()
        enumeration = list(all_enums)[0]
        perm_values_dict = all_enums[enumeration].permissible_values
        return {key: self.__perm_value_key_info(perm_values_dict=perm_values_dict, key=key)
                for key in perm_values_dict}

    def __getitem__(self, term):
        """
        Method to retrieve a term and term information (LinkML description and LinkML meaning) from the set of terms.

        :raises ValueError: If the term is not in the schema.
        """
        # fetch the enums once instead of traversing the SchemaView twice
        all_enums = self.view.all_enums()
        enumeration = list(all_enums)[0]
        perm_values_dict = all_enums[enumeration].permissible_values
        try:
            return self.__perm_value_key_info(perm_values_dict=perm_values_dict, key=term)
        except KeyError:
            msg = 'Term not in schema'
            # suppress the internal KeyError so callers see a clean ValueError
            raise ValueError(msg) from None

    def __schemasheets_convert(self):
        """
        Method that will generate a schema from a directory of TSV files using SchemaMaker.

        This method returns a path to the new schema to be viewed via SchemaView.

        :raises ValueError: If schemasheets is not installed.
        """
        try:
            import yaml
            from linkml_runtime.utils.schema_as_dict import schema_as_dict
            from schemasheets.schemamaker import SchemaMaker
        except ImportError:  # pragma: no cover
            msg = "Install schemasheets."
            raise ValueError(msg)
        schema_maker = SchemaMaker()
        tsv_file_paths = glob.glob(self.schemasheets_folder + "/*.tsv")
        schema = schema_maker.create_schema(tsv_file_paths)
        schema_dict = schema_as_dict(schema)
        # write the generated schema next to the TSV sources, named after the schema itself
        schemasheet_schema_path = os.path.join(self.schemasheets_folder, f"{schema_dict['name']}.yaml")

        with open(schemasheet_schema_path, "w") as f:
            yaml.dump(schema_dict, f)

        return schemasheet_schema_path

    def __enum_expander(self):
        """
        Method that will generate a new schema with the enumerations from the LinkML source.
        This new schema will be stored in the same directory as the original schema with
        the Dynamic Enumerations.

        This method returns a path to the new schema to be viewed via SchemaView.

        :raises ValueError: If oaklib is not installed.
        """
        try:
            # oaklib emits DeprecationWarnings on import; silence them for callers
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=DeprecationWarning)
                from oaklib.utilities.subsets.value_set_expander import ValueSetExpander
        except ImportError:  # pragma: no cover
            msg = 'Install oaklib.'
            raise ValueError(msg)
        expander = ValueSetExpander()
        # TODO: linkml should raise a warning if the schema does not have dynamic enums
        enum = list(self.view.all_enums())
        schema_dir = os.path.dirname(self.term_schema_path)
        file_name = os.path.basename(self.term_schema_path)
        output_path = os.path.join(schema_dir, f"expanded_{file_name}")
        expander.expand_in_place(self.term_schema_path, enum, output_path)

        return output_path
class TermSetWrapper:
    """
    This class allows any HDF5 dataset or attribute to have a TermSet.
    """
    @docval({'name': 'termset',
             'type': TermSet,
             'doc': 'The TermSet to be used.'},
            {'name': 'value',
             'type': (list, np.ndarray, dict, str, tuple),
             'doc': 'The target item that is wrapped, either data or attribute.'},
            )
    def __init__(self, **kwargs):
        """Wrap ``value`` and validate it against ``termset`` immediately."""
        self.__value = kwargs['value']
        self.__termset = kwargs['termset']
        self.__validate()

    def __validate(self):
        """
        Validate all wrapped items against the TermSet.

        :raises ValueError: If any wrapped item is not in the term set.
        """
        # check if list, tuple, array
        if isinstance(self.__value, (list, np.ndarray, tuple)):  # TODO: Future ticket on DataIO support
            values = self.__value
        else:
            # create list if none of those -> mostly for attributes
            values = [self.__value]
        # iteratively validate, collecting every invalid term for one error message
        bad_values = [term for term in values if not self.__termset.validate(term=term)]
        if bad_values:
            msg = ('"%s" is not in the term set.' % ', '.join([str(value) for value in bad_values]))
            raise ValueError(msg)

    @property
    def value(self):
        # The wrapped data or attribute.
        return self.__value

    @property
    def termset(self):
        # The TermSet used for validation.
        return self.__termset

    @property
    def dtype(self):
        # Delegate to the wrapped value's dtype (e.g. for numpy arrays).
        return getattr(self.__value, 'dtype')

    def __getattr__(self, val):
        """
        This method is to get attributes that are not defined in init.
        This is when dealing with data and numpy arrays.
        """
        return getattr(self.__value, val)

    def __getitem__(self, val):
        """
        This is used when we want to index items.
        """
        return self.__value[val]

    # uncomment when DataChunkIterator objects can be wrapped by TermSet
    # def __next__(self):
    #     """
    #     Return the next item of a wrapped iterator.
    #     """
    #     return self.__value.__next__()
    #
    def __len__(self):
        """Return the length of the wrapped value."""
        return len(self.__value)

    def __iter__(self):
        """
        We want to make sure our wrapped items are still iterable.
        """
        return self.__value.__iter__()

    def append(self, arg):
        """
        This append resolves the wrapper to use the append of the container using
        the wrapper.

        :raises ValueError: If ``arg`` is not in the term set.
        """
        if self.termset.validate(term=arg):
            self.__value = append_data(self.__value, arg)
        else:
            msg = ('"%s" is not in the term set.' % arg)
            raise ValueError(msg)

    def extend(self, arg):
        """
        This extend resolves the wrapper to use the extend of the container using
        the wrapper.

        :raises ValueError: If any item in ``arg`` is not in the term set.
        """
        # collect every invalid item so the error reports them all at once
        bad_data = [item for item in arg if not self.termset.validate(term=item)]

        if not bad_data:
            self.__value = extend_data(self.__value, arg)
        else:
            msg = ('"%s" is not in the term set.' % ', '.join([str(item) for item in bad_data]))
            raise ValueError(msg)