Coverage for src/trapi_predict_kit/utils.py: 95%

110 statements  

coverage.py v7.2.7, created at 2023-07-25 21:14 +0200

import logging
import uuid
from datetime import datetime
from itertools import zip_longest
from typing import List

import requests
from rdflib import RDF, Graph, Literal, Namespace, URIRef
from rdflib.namespace import DC, RDFS, XSD

from trapi_predict_kit.config import settings

## Instantiate logging utility
log = logging.getLogger("uvicorn.error")
log.propagate = False
log_level = logging.getLevelName(settings.LOG_LEVEL)
log.setLevel(log_level)
console_handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s %(levelname)s: [%(module)s:%(funcName)s] %(message)s")
console_handler.setFormatter(formatter)
log.addHandler(console_handler)


def resolve_entities(label: str) -> dict:
    """Use the Translator SRI Name Resolution API to get candidate Translator IDs (CURIEs) for a label"""
    resp = requests.post(
        "https://name-resolution-sri.renci.org/lookup",
        params={"string": label, "limit": 3},
        timeout=settings.TIMEOUT,
    )
    return resp.json()
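
# Illustrative usage of `resolve_entities` (not part of the original module;
# assumes the public SRI endpoint is reachable):
#   >>> resolve_entities("diabetes")
#   JSON payload of candidate CURIEs matching the label (at most 3 here)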


def normalize_id_to_translator(ids_list: List[str]) -> dict:
    """Use the Translator SRI NodeNormalization API to get the preferred Translator ID
    for each ID in a list https://nodenormalization-sri.renci.org/docs
    """
    converted_ids_obj = {}
    resolve_curies = requests.get(
        "https://nodenormalization-sri.renci.org/get_normalized_nodes",
        params={"curie": ids_list},
        timeout=settings.TIMEOUT,
    )
    # Map each input CURIE to the identifier preferred by the Translator
    resp = resolve_curies.json()
    for converted_id, translator_ids in resp.items():
        try:
            pref_id = translator_ids["id"]["identifier"]
            log.info(f"{converted_id} > {pref_id}")
            converted_ids_obj[converted_id] = pref_id
        except Exception:
            log.error(f"❌️ {converted_id} > {translator_ids}")
    return converted_ids_obj
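
# Illustrative usage of `normalize_id_to_translator` (assumes network access;
# the exact preferred IDs depend on the live NodeNormalization service):
#   >>> normalize_id_to_translator(["OMIM:104300", "DRUGBANK:DB00394"])
#   {"OMIM:104300": "MONDO:...", ...}  # each input CURIE mapped to its preferred ID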


def get_entity_types(entity):
    """Use the Translator SRI NodeNormalization API to get the Biolink types of an entity
    https://nodenormalization-sri.renci.org/docs
    """
    resolve_curies = requests.get(
        "https://nodenormalization-sri.renci.org/get_normalized_nodes",
        params={"curie": [entity]},
        timeout=settings.TIMEOUT,
    )
    resp = resolve_curies.json()
    if entity in resp:  # coverage: this condition was never false in the test run (partial branch)
        return resp[entity]["type"]
    return []
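
# Illustrative usage of `get_entity_types` (live API; output shape only, the
# actual types come from the service):
#   >>> get_entity_types("MONDO:0005148")
#   ["biolink:Disease", ...]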


def get_entities_labels(entity_list):
    """Send the list of node IDs to the Translator NodeNormalization API to get their labels
    See API: https://nodenormalization-sri.renci.org/apidocs/#/Interfaces/get_get_normalized_nodes
    and example notebook: https://github.com/TranslatorIIPrototypes/NodeNormalization/blob/master/documentation/NodeNormalization.ipynb
    """
    # TODO: add the preferred identifier CURIE to our answer also?
    label_results = {}
    entity_list = list(entity_list)
    for chunk in split_list(entity_list, 300):
        try:
            get_label_result = requests.get(
                "https://nodenormalization-sri.renci.org/get_normalized_nodes",
                params={"curie": chunk},
                timeout=settings.TIMEOUT,
            )
            label_results.update(get_label_result.json())
        except Exception as e:
            # Catch if the call to the API fails (API not available)
            log.warning(
                f"Error getting entities labels from the NodeNormalization API ({e}), it might be down: https://nodenormalization-sri.renci.org/docs"
            )
    return label_results
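
# Illustrative usage of `get_entities_labels` (the response shape below is an
# assumption based on the API docs linked above, not verified here):
#   >>> labels = get_entities_labels(["OMIM:246300", "DRUGBANK:DB00394"])
#   >>> labels["OMIM:246300"]["id"]["label"]  # hypothetical access pattern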


def split_list(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # The last chunk is padded with `fillvalue` up to length n; requests skips
    # None values when encoding list query parameters, so the padding is
    # harmless for the NodeNormalization calls above.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
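
# Deterministic example of `split_list` (the standard itertools "grouper" recipe):
#   >>> list(split_list("ABCDEFG", 3))
#   [('A', 'B', 'C'), ('D', 'E', 'F'), ('G', None, None)]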


OPENPREDICT_GRAPH = "https://w3id.org/openpredict/graph"
OPENPREDICT_NAMESPACE = "https://w3id.org/openpredict/"
BIOLINK = Namespace("https://w3id.org/biolink/vocab/")

OWL = Namespace("http://www.w3.org/2002/07/owl#")
SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")
SCHEMA = Namespace("http://schema.org/")
DCAT = Namespace("http://www.w3.org/ns/dcat#")
PROV = Namespace("http://www.w3.org/ns/prov#")
MLS = Namespace("http://www.w3.org/ns/mls#")
OPENPREDICT = Namespace("https://w3id.org/openpredict/")
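
# Note: rdflib's `Namespace` builds URIRefs by item access, e.g. MLS["Run"] is
# URIRef("http://www.w3.org/ns/mls#Run"); the constants above are reused when
# building the run metadata graph in `get_run_metadata` below.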


def get_run_metadata(scores, model_features, hyper_params, run_id=None):
    """Generate RDF metadata for a classifier run, based on the OpenPredict model:
    https://github.com/fair-workflows/openpredict/blob/master/data/rdf/results_disjoint_lr.nq
    The caller can save the returned graph, e.g. in data/openpredict-metadata.ttl

    :param scores: Dict of evaluation scores for the run (e.g. precision, recall)
    :param model_features: List of features in the model
    :param hyper_params: Dict of hyperparameters and their settings
    :param run_id: Optional run ID, a UUID is generated if not provided
    :return: RDF Graph with the run metadata
    """
    g = Graph()
    g.bind("mls", MLS)
    g.bind("prov", PROV)
    g.bind("dc", DC)
    g.bind("openpredict", OPENPREDICT)

    if not run_id:
        # Generate a UUID for the run ID (uuid1 is time-based)
        run_id = str(uuid.uuid1())

    run_uri = URIRef(OPENPREDICT_NAMESPACE + "run/" + run_id)
    run_prop_prefix = OPENPREDICT_NAMESPACE + "run/" + run_id + "/"
    evaluation_uri = URIRef(run_prop_prefix + "ModelEvaluation")
    # The same for all runs:
    implementation_uri = URIRef(OPENPREDICT_NAMESPACE + "implementation/OpenPredict")

    # Add Run metadata
    g.add((run_uri, RDF.type, MLS["Run"]))
    g.add((run_uri, DC.identifier, Literal(run_id)))
    g.add((run_uri, PROV["generatedAtTime"], Literal(datetime.now(), datatype=XSD.dateTime)))
    g.add((run_uri, MLS["realizes"], OPENPREDICT["LogisticRegression"]))
    g.add((run_uri, MLS["executes"], implementation_uri))
    g.add((run_uri, MLS["hasOutput"], evaluation_uri))
    g.add((run_uri, MLS["hasOutput"], URIRef(run_prop_prefix + "Model")))

    # Add Model, should we point it to the generated model?
    g.add((URIRef(run_prop_prefix + "Model"), RDF.type, MLS["Model"]))

    # Add implementation metadata
    g.add((OPENPREDICT["LogisticRegression"], RDF.type, MLS["Algorithm"]))
    g.add((implementation_uri, RDF.type, MLS["Implementation"]))
    g.add((implementation_uri, MLS["implements"], OPENPREDICT["LogisticRegression"]))

    # Add HyperParameters and their settings to the implementation
    for hyperparam, hyperparam_setting in hyper_params.items():
        hyperparam_uri = URIRef(OPENPREDICT_NAMESPACE + "HyperParameter/" + hyperparam)
        g.add((implementation_uri, MLS["hasHyperParameter"], hyperparam_uri))
        g.add((hyperparam_uri, RDF.type, MLS["HyperParameter"]))
        g.add((hyperparam_uri, RDFS.label, Literal(hyperparam)))

        hyperparam_setting_uri = URIRef(OPENPREDICT_NAMESPACE + "HyperParameterSetting/" + hyperparam)
        g.add((implementation_uri, MLS["hasHyperParameter"], hyperparam_setting_uri))
        g.add((hyperparam_setting_uri, RDF.type, MLS["HyperParameterSetting"]))
        g.add((hyperparam_setting_uri, MLS["specifiedBy"], hyperparam_uri))
        g.add((hyperparam_setting_uri, MLS["hasValue"], Literal(hyperparam_setting)))
        g.add((run_uri, MLS["hasInput"], hyperparam_setting_uri))

    # TODO: improve how we retrieve features
    for feature in model_features:
        feature_uri = URIRef(
            OPENPREDICT_NAMESPACE + "feature/" + feature.replace(" ", "_").replace("(", "").replace(")", "")
        )
        g.add((feature_uri, RDF.type, MLS["Feature"]))
        g.add((feature_uri, DC.identifier, Literal(feature)))
        g.add((run_uri, MLS["hasInput"], feature_uri))

    # TODO: these 2 triples are for the PLEX ontology
    g.add((evaluation_uri, RDF.type, PROV["Entity"]))
    g.add((evaluation_uri, PROV["wasGeneratedBy"], run_uri))

    # Add scores as EvaluationMeasures
    g.add((evaluation_uri, RDF.type, MLS["ModelEvaluation"]))
    for key, value in scores.items():
        key_uri = URIRef(run_prop_prefix + key)
        g.add((evaluation_uri, MLS["specifiedBy"], key_uri))
        g.add((key_uri, RDF.type, MLS["EvaluationMeasure"]))
        g.add((key_uri, RDFS.label, Literal(key)))
        g.add((key_uri, MLS["hasValue"], Literal(value, datatype=XSD.double)))
        # TODO: Example 1 puts hasValue directly in the ModelEvaluation,
        # but that prevents providing multiple values for 1 evaluation
        # http://ml-schema.github.io/documentation/ML%20Schema.html#overview

    return g
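
# Illustrative usage of `get_run_metadata` (hypothetical values; persisting the
# graph is left to the caller, e.g. with rdflib's `serialize`):
#   >>> g = get_run_metadata(
#   ...     scores={"precision": 0.85, "recall": 0.80},
#   ...     model_features=["feature1", "feature2"],
#   ...     hyper_params={"penalty": "l2", "max_iter": 100},
#   ... )
#   >>> g.serialize("data/openpredict-metadata.ttl", format="turtle")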