Coverage for src/trapi_predict_kit/utils.py: 95%
110 statements
coverage.py v7.2.7, created at 2023-07-25 21:14 +0200

import logging
import uuid
from datetime import datetime
from itertools import zip_longest
from typing import List

import requests
from rdflib import RDF, Graph, Literal, Namespace, URIRef
from rdflib.namespace import DC, RDFS, XSD

from trapi_predict_kit.config import settings

## Instantiate logging utility
log = logging.getLogger("uvicorn.error")
log.propagate = False
log_level = logging.getLevelName(settings.LOG_LEVEL)
log.setLevel(log_level)
console_handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s %(levelname)s: [%(module)s:%(funcName)s] %(message)s")
console_handler.setFormatter(formatter)
log.addHandler(console_handler)
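
# A sample log line produced by this formatter (hypothetical timestamp and message):
#   2023-07-25 21:14:00,123 INFO: [utils:normalize_id_to_translator] MONDO:0005148 > MONDO:0005148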


def resolve_entities(label: str) -> dict:
    """Use the Translator SRI Name Resolution API to look up the CURIEs matching a free-text label"""
    resp = requests.post(
        "https://name-resolution-sri.renci.org/lookup",
        params={"string": label, "limit": 3},
        timeout=settings.TIMEOUT,
    )
    return resp.json()
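
# A minimal usage sketch (not part of the module). The response shape is an
# assumption: the SRI lookup endpoint returns candidate CURIEs for the label,
# at most 3 here given limit=3 above.
#
#   matches = resolve_entities("insulin")
#   for curie in matches:
#       print(curie)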


def normalize_id_to_translator(ids_list: List[str]) -> dict:
    """Use the Translator SRI NodeNormalization API to get the preferred Translator ID
    for each ID in the given list https://nodenormalization-sri.renci.org/docs
    """
    converted_ids_obj = {}
    resolve_curies = requests.get(
        "https://nodenormalization-sri.renci.org/get_normalized_nodes",
        params={"curie": ids_list},
        timeout=settings.TIMEOUT,
    )
    # Map each provided ID to the preferred identifier returned by the API
    resp = resolve_curies.json()
    for converted_id, translator_ids in resp.items():
        try:
            pref_id = translator_ids["id"]["identifier"]
            log.info(f"{converted_id} > {pref_id}")
            converted_ids_obj[converted_id] = pref_id
        except Exception:
            log.error(f"❌️ {converted_id} > {translator_ids}")
    return converted_ids_obj
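
# A minimal usage sketch (hypothetical CURIEs): IDs the API cannot resolve are
# logged as errors and omitted from the returned mapping.
#
#   preferred = normalize_id_to_translator(["MONDO:0005148", "DRUGBANK:DB00001"])
#   # preferred maps each input CURIE to its preferred Translator identifier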


def get_entity_types(entity):
    """Use the Translator SRI NodeNormalization API to get the Biolink types of an entity
    https://nodenormalization-sri.renci.org/docs
    """
    resolve_curies = requests.get(
        "https://nodenormalization-sri.renci.org/get_normalized_nodes",
        params={"curie": [entity]},
        timeout=settings.TIMEOUT,
    )
    # Return the types if the entity was resolved, an empty list otherwise
    resp = resolve_curies.json()
    if entity in resp:
        return resp[entity]["type"]
    return []
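
# A minimal usage sketch (hypothetical CURIE): NodeNormalization returns the
# list of Biolink categories known for the entity.
#
#   types = get_entity_types("MONDO:0005148")
#   # e.g. ["biolink:Disease", "biolink:DiseaseOrPhenotypicFeature", ...]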


def get_entities_labels(entity_list):
    """Send the list of node IDs to the Translator NodeNormalization API to get their labels
    See API: https://nodenormalization-sri.renci.org/apidocs/#/Interfaces/get_get_normalized_nodes
    and example notebook: https://github.com/TranslatorIIPrototypes/NodeNormalization/blob/master/documentation/NodeNormalization.ipynb
    """
    # TODO: add the preferred identifier CURIE to our answer also?
    label_results = {}
    entity_list = list(entity_list)
    # Query the API in chunks of 300 IDs to keep the request URL size reasonable
    for chunk in split_list(entity_list, 300):
        try:
            get_label_result = requests.get(
                "https://nodenormalization-sri.renci.org/get_normalized_nodes",
                params={"curie": chunk},
                timeout=settings.TIMEOUT,
            )
            label_results.update(get_label_result.json())
        except Exception as e:
            # Catch if the call to the API fails (API not available)
            log.warning(
                f"Error getting entities labels from the NodeNormalization API ({e}), it might be down: https://nodenormalization-sri.renci.org/docs"
            )
    return label_results
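
# A minimal usage sketch (hypothetical CURIE): each resolved ID maps to its
# normalized node object; the label is nested under ["id"]["label"].
#
#   results = get_entities_labels(["MONDO:0005148"])
#   label = results["MONDO:0005148"]["id"]["label"]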


def split_list(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks, padding the last chunk with fillvalue"
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
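
# A short sketch of the chunking behavior (the itertools "grouper" recipe):
#
#   list(split_list([1, 2, 3, 4, 5], 2))
#   # [(1, 2), (3, 4), (5, None)]
#
# The trailing None padding is harmless in get_entities_labels above, since
# requests skips None values when encoding list parameters.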


OPENPREDICT_GRAPH = "https://w3id.org/openpredict/graph"
OPENPREDICT_NAMESPACE = "https://w3id.org/openpredict/"
BIOLINK = Namespace("https://w3id.org/biolink/vocab/")

OWL = Namespace("http://www.w3.org/2002/07/owl#")
SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")
SCHEMA = Namespace("http://schema.org/")
DCAT = Namespace("http://www.w3.org/ns/dcat#")
PROV = Namespace("http://www.w3.org/ns/prov#")
MLS = Namespace("http://www.w3.org/ns/mls#")
OPENPREDICT = Namespace("https://w3id.org/openpredict/")
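
# How these rdflib Namespace constants are used below: indexing a Namespace
# yields a URIRef in that namespace.
#
#   MLS["Run"]              # URIRef("http://www.w3.org/ns/mls#Run")
#   PROV["wasGeneratedBy"]  # URIRef("http://www.w3.org/ns/prov#wasGeneratedBy")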


def get_run_metadata(scores, model_features, hyper_params, run_id=None):
    """Generate RDF metadata for a classifier run, based on the OpenPredict model:
    https://github.com/fair-workflows/openpredict/blob/master/data/rdf/results_disjoint_lr.nq

    :param scores: Dict of evaluation scores for the run
    :param model_features: List of features in the model
    :param hyper_params: Dict of hyperparameters and their settings
    :param run_id: Optional ID for the run, generated if not provided
    :return: The rdflib Graph with the run metadata
    """
    g = Graph()
    g.bind("mls", MLS)
    g.bind("prov", PROV)
    g.bind("dc", DC)
    g.bind("openpredict", OPENPREDICT)

    if not run_id:
        # Generate a UUID for the run ID (uuid1 is time-based)
        run_id = str(uuid.uuid1())

    run_uri = URIRef(OPENPREDICT_NAMESPACE + "run/" + run_id)
    run_prop_prefix = OPENPREDICT_NAMESPACE + run_id + "/"
    evaluation_uri = URIRef(OPENPREDICT_NAMESPACE + "run/" + run_id + "/ModelEvaluation")
    # The same for all runs:
    implementation_uri = URIRef(OPENPREDICT_NAMESPACE + "implementation/OpenPredict")

    # Add Run metadata
    g.add((run_uri, RDF.type, MLS["Run"]))
    g.add((run_uri, DC.identifier, Literal(run_id)))
    g.add((run_uri, PROV["generatedAtTime"], Literal(datetime.now(), datatype=XSD.dateTime)))
    g.add((run_uri, MLS["realizes"], OPENPREDICT["LogisticRegression"]))
    g.add((run_uri, MLS["executes"], implementation_uri))
    g.add((run_uri, MLS["hasOutput"], evaluation_uri))
    g.add((run_uri, MLS["hasOutput"], URIRef(run_prop_prefix + "Model")))

    # Add the Model, should we point it to the generated model?
    g.add((URIRef(run_prop_prefix + "Model"), RDF.type, MLS["Model"]))

    # Add implementation metadata
    g.add((OPENPREDICT["LogisticRegression"], RDF.type, MLS["Algorithm"]))
    g.add((implementation_uri, RDF.type, MLS["Implementation"]))
    g.add((implementation_uri, MLS["implements"], OPENPREDICT["LogisticRegression"]))

    # Add HyperParameters and their settings to the implementation
    for hyperparam, hyperparam_setting in hyper_params.items():
        hyperparam_uri = URIRef(OPENPREDICT_NAMESPACE + "HyperParameter/" + hyperparam)
        g.add((implementation_uri, MLS["hasHyperParameter"], hyperparam_uri))
        g.add((hyperparam_uri, RDF.type, MLS["HyperParameter"]))
        g.add((hyperparam_uri, RDFS.label, Literal(hyperparam)))

        hyperparam_setting_uri = URIRef(OPENPREDICT_NAMESPACE + "HyperParameterSetting/" + hyperparam)
        g.add((implementation_uri, MLS["hasHyperParameter"], hyperparam_setting_uri))
        g.add((hyperparam_setting_uri, RDF.type, MLS["HyperParameterSetting"]))
        g.add((hyperparam_setting_uri, MLS["specifiedBy"], hyperparam_uri))
        g.add((hyperparam_setting_uri, MLS["hasValue"], Literal(hyperparam_setting)))
        g.add((run_uri, MLS["hasInput"], hyperparam_setting_uri))

    # TODO: improve how we retrieve features
    for feature in model_features:
        feature_uri = URIRef(
            OPENPREDICT_NAMESPACE + "feature/" + feature.replace(" ", "_").replace("(", "").replace(")", "")
        )
        g.add((feature_uri, RDF.type, MLS["Feature"]))
        g.add((feature_uri, DC.identifier, Literal(feature)))
        g.add((run_uri, MLS["hasInput"], feature_uri))

    # TODO: those 2 triples are for the PLEX ontology
    g.add((evaluation_uri, RDF.type, PROV["Entity"]))
    g.add((evaluation_uri, PROV["wasGeneratedBy"], run_uri))

    # Add scores as EvaluationMeasures
    g.add((evaluation_uri, RDF.type, MLS["ModelEvaluation"]))
    for key in scores:
        key_uri = URIRef(run_prop_prefix + key)
        g.add((evaluation_uri, MLS["specifiedBy"], key_uri))
        g.add((key_uri, RDF.type, MLS["EvaluationMeasure"]))
        g.add((key_uri, RDFS.label, Literal(key)))
        g.add((key_uri, MLS["hasValue"], Literal(scores[key], datatype=XSD.double)))
    # TODO: Example 1 in the ML Schema docs puts hasValue directly in the ModelEvaluation,
    # but that prevents providing multiple values for one evaluation
    # http://ml-schema.github.io/documentation/ML%20Schema.html#overview

    return g
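
# A minimal usage sketch (hypothetical scores, features, and hyperparameters):
# the returned rdflib Graph can be serialized to any RDF format.
#
#   g = get_run_metadata(
#       scores={"accuracy": 0.87, "f1": 0.82},
#       model_features=["feature one", "feature (two)"],
#       hyper_params={"penalty": "l2", "max_iter": 1000},
#   )
#   print(g.serialize(format="turtle"))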