Coverage for src/hdmf/term_set.py: 97% (143 statements)
coverage.py v7.3.2, created at 2023-10-04 02:57 +0000

import glob
import os
from collections import namedtuple
from .utils import docval
import warnings
import numpy as np
from .data_utils import append_data, extend_data


class TermSet:
    """
    Class for implementing term sets from ontologies and other resources used to define the
    meaning and/or identity of terms.

    :ivar term_schema_path: The path to the LinkML YAML enumeration schema
    :ivar sources: The prefixes for the ontologies used in the TermSet
    :ivar view: SchemaView of the term set schema
    :ivar schemasheets_folder: The path to the folder containing the LinkML TSV files
    :ivar expanded_termset_path: The path to the schema with the expanded enumerations
    """
    def __init__(self,
                 term_schema_path: str = None,
                 schemasheets_folder: str = None,
                 dynamic: bool = False
                 ):
        """
        :param term_schema_path: The path to the LinkML YAML enumeration schema
        :param schemasheets_folder: The path to the folder containing the LinkML TSV files
        :param dynamic: Boolean parameter denoting whether the schema uses Dynamic Enumerations
        """
        try:
            from linkml_runtime.utils.schemaview import SchemaView
        except ImportError:
            msg = "Install linkml_runtime"
            raise ValueError(msg)

        self.term_schema_path = term_schema_path
        self.schemasheets_folder = schemasheets_folder

        if self.schemasheets_folder is not None:
            if self.term_schema_path is not None:
                msg = "Cannot have both a path to a Schemasheets folder and a TermSet schema."
                raise ValueError(msg)
            else:
                self.term_schema_path = self.__schemasheets_convert()
                self.view = SchemaView(self.term_schema_path)
        else:
            self.view = SchemaView(self.term_schema_path)
        self.expanded_termset_path = None
        if dynamic:
            # reset view to now include the dynamically populated termset
            self.expanded_termset_path = self.__enum_expander()
            self.view = SchemaView(self.expanded_termset_path)

        self.sources = self.view.schema.prefixes

    def __repr__(self):
        re = "class: %s\n" % str(self.__class__)
        re += "term_schema_path: %s\n" % self.term_schema_path
        return re

    def __perm_value_key_info(self, perm_values_dict: dict, key: str):
        """
        Private method to retrieve the id, description, and the meaning.
        """
        prefix_dict = self.view.schema.prefixes
        info_tuple = namedtuple("Term_Info", ["id", "description", "meaning"])
        description = perm_values_dict[key]['description']
        enum_meaning = perm_values_dict[key]['meaning']

        # filter for prefixes
        marker = ':'
        prefix = enum_meaning.split(marker, 1)[0]
        id = enum_meaning.split(marker, 1)[1]
        prefix_obj = prefix_dict[prefix]
        prefix_reference = prefix_obj['prefix_reference']

        # combine prefix and prefix_reference to make full term uri
        meaning = prefix_reference + id

        return info_tuple(enum_meaning, description, meaning)

    @docval({'name': 'term', 'type': str, 'doc': "term to be validated"})
    def validate(self, **kwargs):
        """
        Validate a term against the term set.
        """
        term = kwargs['term']
        try:
            self[term]
            return True
        except ValueError:
            return False

    @property
    def view_set(self):
        """
        Property method to return a view of all terms in the LinkML YAML Schema.
        """
        enumeration = list(self.view.all_enums())[0]

        perm_values_dict = self.view.all_enums()[enumeration].permissible_values
        enum_dict = {}
        for perm_value_key in perm_values_dict.keys():
            enum_dict[perm_value_key] = self.__perm_value_key_info(perm_values_dict=perm_values_dict,
                                                                   key=perm_value_key)

        return enum_dict

    def __getitem__(self, term):
        """
        Method to retrieve a term and term information (LinkML description and LinkML meaning) from the set of terms.
        """
        enumeration = list(self.view.all_enums())[0]
        perm_values_dict = self.view.all_enums()[enumeration].permissible_values

        try:
            term_info = self.__perm_value_key_info(perm_values_dict=perm_values_dict, key=term)
            return term_info

        except KeyError:
            msg = 'Term not in schema'
            raise ValueError(msg)

    def __schemasheets_convert(self):
        """
        Method that will generate a schema from a directory of TSV files using SchemaMaker.

        This method returns a path to the new schema to be viewed via SchemaView.
        """
        try:
            import yaml
            from linkml_runtime.utils.schema_as_dict import schema_as_dict
            from schemasheets.schemamaker import SchemaMaker
        except ImportError:  # pragma: no cover
            msg = "Install schemasheets."
            raise ValueError(msg)
        schema_maker = SchemaMaker()
        tsv_file_paths = glob.glob(self.schemasheets_folder + "/*.tsv")
        schema = schema_maker.create_schema(tsv_file_paths)
        schema_dict = schema_as_dict(schema)
        schemasheet_schema_path = os.path.join(self.schemasheets_folder, f"{schema_dict['name']}.yaml")

        with open(schemasheet_schema_path, "w") as f:
            yaml.dump(schema_dict, f)

        return schemasheet_schema_path

    def __enum_expander(self):
        """
        Method that will generate a new schema with the enumerations from the LinkML source.
        This new schema will be stored in the same directory as the original schema with
        the Dynamic Enumerations.

        This method returns a path to the new schema to be viewed via SchemaView.
        """
        try:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=DeprecationWarning)
                from oaklib.utilities.subsets.value_set_expander import ValueSetExpander
        except ImportError:  # pragma: no cover
            msg = 'Install oaklib.'
            raise ValueError(msg)
        expander = ValueSetExpander()
        # TODO: linkml should raise a warning if the schema does not have dynamic enums
        enum = list(self.view.all_enums())
        schema_dir = os.path.dirname(self.term_schema_path)
        file_name = os.path.basename(self.term_schema_path)
        output_path = os.path.join(schema_dir, f"expanded_{file_name}")
        expander.expand_in_place(self.term_schema_path, enum, output_path)

        return output_path

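# Usage sketch (illustrative only, not part of the module): constructing a TermSet
# and validating terms. The schema paths and term values below are hypothetical
# placeholders, not files or terms shipped with HDMF.
#
#     terms = TermSet(term_schema_path="path/to/example_termset.yaml")
#     terms.validate(term="Homo sapiens")   # True if the term is a permissible value
#     terms["Homo sapiens"]                 # Term_Info(id=..., description=..., meaning=...)
#     all_terms = terms.view_set            # dict mapping each term to its Term_Info
#
#     # Schemasheets and dynamic-enumeration variants (paths are placeholders):
#     terms = TermSet(schemasheets_folder="path/to/tsv_dir")
#     terms = TermSet(term_schema_path="path/to/dynamic_termset.yaml", dynamic=True)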

class TermSetWrapper:
    """
    This class allows any HDF5 dataset or attribute to have a TermSet.
    """
    @docval({'name': 'termset',
             'type': TermSet,
             'doc': 'The TermSet to be used.'},
            {'name': 'value',
             'type': (list, np.ndarray, dict, str, tuple),
             'doc': 'The target item that is wrapped, either data or attribute.'},
            )
    def __init__(self, **kwargs):
        self.__value = kwargs['value']
        self.__termset = kwargs['termset']
        self.__validate()

    def __validate(self):
        # check if list, tuple, array
        if isinstance(self.__value, (list, np.ndarray, tuple)):  # TODO: Future ticket on DataIO support
            values = self.__value
        # create list if none of those -> mostly for attributes
        else:
            values = [self.__value]
        # iteratively validate
        bad_values = []
        for term in values:
            validation = self.__termset.validate(term=term)
            if not validation:
                bad_values.append(term)
        if len(bad_values) != 0:
            msg = ('"%s" is not in the term set.' % ', '.join([str(value) for value in bad_values]))
            raise ValueError(msg)

    @property
    def value(self):
        return self.__value

    @property
    def termset(self):
        return self.__termset

    @property
    def dtype(self):
        return self.__getattr__('dtype')

    def __getattr__(self, val):
        """
        This method gets attributes that are not defined in __init__ by
        delegating to the wrapped value, e.g., for data and numpy arrays.
        """
        return getattr(self.__value, val)

    def __getitem__(self, val):
        """
        This is used when we want to index items.
        """
        return self.__value[val]

    # uncomment when DataChunkIterator objects can be wrapped by TermSet
    # def __next__(self):
    #     """
    #     Return the next item of a wrapped iterator.
    #     """
    #     return self.__value.__next__()
    #
    def __len__(self):
        return len(self.__value)

    def __iter__(self):
        """
        We want to make sure our wrapped items are still iterable.
        """
        return self.__value.__iter__()

    def append(self, arg):
        """
        This append resolves the wrapper to use the append of the container
        being wrapped, validating the new term first.
        """
        if self.termset.validate(term=arg):
            self.__value = append_data(self.__value, arg)
        else:
            msg = ('"%s" is not in the term set.' % arg)
            raise ValueError(msg)

    def extend(self, arg):
        """
        This extend resolves the wrapper to use the extend of the container
        being wrapped, validating each new term first.
        """
        bad_data = []
        for item in arg:
            if not self.termset.validate(term=item):
                bad_data.append(item)

        if len(bad_data) == 0:
            self.__value = extend_data(self.__value, arg)
        else:
            msg = ('"%s" is not in the term set.' % ', '.join([str(item) for item in bad_data]))
            raise ValueError(msg)
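
# Usage sketch (illustrative only, not part of the module): wrapping data with a
# TermSet so that writes are validated. The schema path and terms below are
# hypothetical placeholders.
#
#     terms = TermSet(term_schema_path="path/to/example_termset.yaml")
#     data = TermSetWrapper(value=["Homo sapiens"], termset=terms)
#     data.append("Mus musculus")      # validated before append_data is called
#     data.extend(["Ursus arctos"])    # each item validated; ValueError lists failures
#     data[0]; len(data)               # indexing and len pass through to the wrapped value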