Coverage for src / pythinfer / infer.py: 0%

140 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-11-26 21:27 +0000

1#!/usr/bin/env python3 

2"""Script to merge TTL files and execute inference.""" 

3 

4import logging 

5from collections import defaultdict 

6from collections.abc import Callable 

7from pathlib import Path 

8 

9from owlrl import DeductiveClosure 

10from owlrl.OWLRL import OWLRL_Semantics 

11from rdflib import OWL, RDF, RDFS, BNode, Dataset, Graph, IdentifiedNode, Literal, Node 

12from rdflib.query import ResultRow 

13 

14from pythinfer.inout import Project, Query, load_sparql_inference_queries 

15from pythinfer.merge import IRI_EXTERNAL_INFERENCES, IRI_FULL_INFERENCES 

16from pythinfer.rdflibplus import DatasetView 

17 

18DEF_MAX_REASONING_ROUNDS = 5 

19SCRIPT_DIR = Path(__file__).parent 

20logger = logging.getLogger(__name__) 

21info = logger.info 

22dbg = debug = logger.debug 

23 

24 

def apply_manual_sparql_inference(g: Graph, queries: list[Query]) -> Graph:
    """Run a list of SPARQL CONSTRUCT queries over a graph and collect results.

    Args:
        g: RDF graph the inference queries are executed against
        queries: SPARQL CONSTRUCT queries, each producing inferred triples

    Returns:
        Graph: a fresh graph holding only the triples produced by the queries

    """
    info(" Have %d queries for inference", len(queries))

    inferred = Graph()

    for q in queries:
        dbg(" executing query '%s' (%d characters)", q.name, len(q))
        for result in g.query(q.content):
            # CONSTRUCT queries yield triples; a ResultRow or bool means the
            # query was actually a SELECT/ASK, which we refuse to process.
            if isinstance(result, (ResultRow, bool)):
                raise TypeError(
                    f"Non-triple result ({type(result)}) from CONSTRUCT query "
                    f"'{q.source}'"
                )
            inferred.add(result)

    return inferred

52 

53 

def apply_owlrl_inference(graph: Graph, destination_graph: Graph) -> None:
    """Apply OWL2 RL inference rules using the Owlrl library.

    NB: The destination graph must have the *same store* as the input graph. This
    is required by the `owlrl` library for some reason.

    This function performs complete OWL 2 RL reasoning, which includes:
    - RDFS inference (subclass, subproperty, domain, range)
    - OWL inference (inverse properties, symmetric/transitive properties,
      property chains, equivalence, disjointness, etc.)

    Syntactically invalid inferred triples (see `filterset_invalid_triples`)
    are removed from `destination_graph` before returning.

    Args:
        graph: RDF graph to apply reasoning to
        destination_graph: Graph the inferred triples are written into. Required,
            and must share its store with `graph` (see NB above).

    Returns:
        None
        (inferred triples are added to destination_graph in place)

    """
    ntriples_orig = len(graph)
    info(
        " Applying OWL inference from `%s` into `%s`",
        graph.identifier,
        destination_graph.identifier,
    )
    # Apply OWL 2 RL reasoning - this will add inferred triples to destination_graph
    DeductiveClosure(OWLRL_Semantics).expand(graph, destination_graph)  # pyright: ignore[reportUnknownMemberType]
    ntriples_inferred = len(destination_graph)

    # The reasoner can emit triples that are not valid RDF (e.g. literal
    # subjects, cf. OWL-RL issue #50); strip those immediately.
    nremoved, _ = filter_triples(destination_graph, filterset_invalid_triples)

    info(" Original triples: %d", ntriples_orig)
    info(" Inferences, raw: %d", ntriples_inferred)
    info(" Invalid inferences: %d", nremoved)

89 

90 

###
# The following are triple filter functions. Some are per-triple (they ignore the graph)
# and some are graph-based (they use the full graph to determine whether to remove a
# triple).
# All must accept Node, Node, Node as the first 3 arguments because that is what rdflib
# provides when iterating over a graph.
# The package does not prevent use of types that are invalid in RDF.
# The 4th argument is the full Graph.
# All must return True if the triple is to be removed.
###
# Shared signature of every filter predicate: (subject, predicate, object, graph) -> remove?
_FilterFunction = Callable[[Node, Node, Node, Graph], bool]

102 

103 

104# Per-triple filter functions (4th argument is ignored) 

def _subject_is_literal(s: Node, p: Node, o: Node, g: Graph) -> bool:  # noqa: ARG001
    """Flag triples whose subject is a Literal — not legal RDF.

    The OWL-RL reasoner can produce such triples; likely related to at least
    this: https://github.com/RDFLib/OWL-RL/issues/50
    """
    return isinstance(s, Literal)

111 

112 

def _object_is_empty_string(s: Node, p: Node, o: Node, g: Graph) -> bool:  # noqa: ARG001
    """Flag literal objects that are empty strings (better modelled as absent)."""
    return isinstance(o, Literal) and not str(o)

116 

117 

def _redundant_reflexives(s: Node, p: Node, o: Node, g: Graph) -> bool:  # noqa: ARG001
    """Flag reflexive statements that carry no information, such as `x sameAs x`."""
    if s != o:
        return False
    # Only these predicates are trivially true when subject == object.
    return p in {
        OWL.sameAs,
        OWL.equivalentClass,
        OWL.equivalentProperty,
        RDFS.subClassOf,
        RDFS.subPropertyOf,
    }

130 

131 

def _redundant_thing_declarations(s: Node, p: Node, o: Node, g: Graph) -> bool:  # noqa: ARG001
    """Flag useless declarations that `s` is an owl:Thing or a subclass of it."""
    if o != OWL.Thing:
        return False
    # Everything is an owl:Thing, so these statements add nothing.
    return p in {RDF.type, RDFS.subClassOf, RDFS.domain, RDFS.range}

137 

138 

def _redundant_nothing_subclass(s: Node, p: Node, o: Node, g: Graph) -> bool:  # noqa: ARG001
    """Flag useless declarations that owl:Nothing is a subclass of something."""
    # owl:Nothing is a subclass of every class by definition.
    return p == RDFS.subClassOf and s == OWL.Nothing

142 

143 

144### 

145# The following are Graph-based filter functions. 

146# They must accept the full graph as well in order to determine whether to 

147# remove a given triple. 

148# As above, must return True if the triple is to be removed. 

149### 

150 

151 

def _undeclared_blank_nodes(s: Node, p: Node, o: Node, g: Graph) -> bool:  # noqa: ARG001
    """Flag schema triples whose blank-node object is never described in the graph."""
    schema_predicates = (
        RDF.type,
        RDFS.subClassOf,
        RDFS.subPropertyOf,
        RDFS.domain,
        RDFS.range,
    )
    if not (isinstance(o, BNode) and p in schema_predicates):
        return False
    # Removable only when the blank node never occurs as a subject anywhere.
    return next(iter(g.triples((o, None, None))), None) is None

164 

165 

# Filterset for invalid RDF triples, which are logically but not syntactically valid.
# This can occur when the reasoner encounters malformed data or makes invalid
# inferences.
filterset_invalid_triples: list[_FilterFunction] = [_subject_is_literal]

# Filterset for unwanted triples that bloat the graph but are not invalid.
filterset_unwanted_triples: list[_FilterFunction] = [
    _object_is_empty_string,
    _redundant_reflexives,
    _redundant_thing_declarations,
    _redundant_nothing_subclass,
    _undeclared_blank_nodes,
]

# Combined filterset: invalid triples first, then merely unwanted ones.
filterset_all: list[_FilterFunction] = [
    *filterset_invalid_triples,
    *filterset_unwanted_triples,
]

182 

183 

def filter_triples(
    graph: Graph, filter_functions: list[_FilterFunction]
) -> tuple[int, dict[_FilterFunction, int]]:
    """Remove triples matched by any of the given filter functions.

    ***NB: graph is modified in place.***

    The per-filter counts may overlap: one triple can be flagged by several
    filters, so the number of triples actually removed is *less than or equal
    to* the sum of the per-filter counts.

    This deliberately returns counts rather than the graph itself, to make the
    in-place mutation explicit.

    Args:
        graph (Graph): The RDF graph to validate and clean.
        filter_functions (list[Callable[[Triple, Graph], bool]]): List of functions that
            take a triple and a Graph and return True if the triple should be removed.

    Returns: tuple of:
        int: number of triples actually removed
        dict[Callable, int]: number of triples identified for removal by each filter

    """
    norig = len(graph)
    # Collect first, remove afterwards — never mutate a graph mid-iteration.
    to_remove: list[tuple[Node, Node, Node]] = []
    removal_counts: defaultdict[_FilterFunction, int] = defaultdict(int)
    for triple in graph:
        for fn in filter_functions:
            if fn(*triple, graph):
                to_remove.append(triple)
                removal_counts[fn] += 1

    info(
        "%d triples identified for removal by %d filters:",
        sum(removal_counts.values()),
        len(filter_functions),
    )
    if to_remove:
        for fn, n_flagged in removal_counts.items():
            info(" - %d triples identified by %s", n_flagged, fn.__name__)
        for triple in to_remove:
            graph.remove(triple)

    nremoved = norig - len(graph)
    if nremoved > 0:
        info("%d triples removed from graph", nremoved)
    return nremoved, removal_counts

233 

234 

def _generate_external_inferences(
    ds: Dataset, external_graph_ids: list[IdentifiedNode]
) -> Graph:
    """Generate inferences from external vocabularies only (step 2).

    These form the "noise floor" of inferences contributed purely by external
    vocabularies (OWL, RDFS, SKOS, ...), to be subtracted from the full
    inferences later.

    Args:
        ds: Dataset containing all graphs.
        external_graph_ids: List of graph identifiers that are external.

    Returns:
        Graph containing external inferences.

    """
    info("Step 2: Generating external inferences (baseline from external vocabs)...")

    # View restricted to the external vocabularies (empty view if there are none).
    external_view = DatasetView(ds, external_graph_ids)

    # Workaround for owlrl bug #76: copy to temp dataset with triples in default graph
    # https://github.com/RDFLib/OWL-RL/issues/76
    info(" Creating temporary dataset to work around owlrl named graph bug...")
    temp_ds = external_view.collapse()
    info(" Temporary dataset created with %d triples in default graph", len(temp_ds))

    # owlrl requires source and destination to share a store, so the inference
    # graph is created inside the temporary dataset first.
    temp_inferences = temp_ds.graph(IRI_EXTERNAL_INFERENCES)
    apply_owlrl_inference(temp_ds, temp_inferences)

    # Copy the results back into the real dataset under the same graph IRI.
    g_external_inferences = ds.graph(IRI_EXTERNAL_INFERENCES)
    for triple in temp_inferences:
        g_external_inferences.add(triple)
    info(" External inferences generated: %d triples", len(g_external_inferences))
    return g_external_inferences

272 

273 

def _run_inference_iteration(
    ds: Dataset,
    g_full_inferences: Graph,
    sparql_queries: list[Query],
    iteration: int,
) -> tuple[int, int]:
    """Run one iteration of inference (steps 3-4).

    Args:
        ds: Dataset containing all graphs.
        g_full_inferences: Graph the new inferences accumulate into.
        sparql_queries: SPARQL CONSTRUCT queries implementing the heuristics.
        iteration: Current iteration number (for logging only).

    Returns:
        Tuple of (triples_added_owl, triples_added_sparql).

    """
    info("--- Iteration %d ---", iteration)

    # Step 3: OWL-RL over the whole dataset, accumulating into g_full_inferences.
    info(" Step 3: Running OWL-RL inference over current state...")
    n_before = len(g_full_inferences)
    apply_owlrl_inference(ds, g_full_inferences)
    added_owl = len(g_full_inferences) - n_before
    info(" OWL-RL added %d new inferences", added_owl)

    # Step 4: SPARQL CONSTRUCT heuristics (skipped when none are configured).
    if not sparql_queries:
        info(" Step 4: No SPARQL heuristics to run")
        return added_owl, 0

    info(" Step 4: Running %d SPARQL heuristics...", len(sparql_queries))
    n_before = len(g_full_inferences)
    # The dataset already contains the step-3 inferences at this point, so the
    # heuristics see the fully expanded state.
    for triple in apply_manual_sparql_inference(ds, sparql_queries):
        g_full_inferences.add(triple)
    added_sparql = len(g_full_inferences) - n_before
    info(" SPARQL heuristics added %d new inferences", added_sparql)

    return added_owl, added_sparql

321 

322 

def run_inference_backend(
    ds: Dataset,
    external_graph_ids: list[IdentifiedNode],
    project: Project,
    max_iterations: int = DEF_MAX_REASONING_ROUNDS,
    *,
    include_unwanted_triples: bool = False,
) -> list[IdentifiedNode]:
    """Run inference backend on merged graph using OWL-RL semantics.

    Implements the inference process described in README.md:
    1. Load and merge (already done - ds contains merged data)
    2. Generate external inferences (do once - baseline noise from external vocabs)
    3. Generate full inferences over current state
    4. Run heuristics (SPARQL CONSTRUCT queries)
    5. Repeat steps 3-4 until convergence or max iterations
    6. Subtract external data and inferences
    7. Subtract unwanted inferences

    Dataset is updated in-place with inferred triples:
    - Graph IRI_FULL_INFERENCES: inferred triples over all data and vocabs
    - Graph IRI_EXTERNAL_INFERENCES: inferred triples over external vocab only

    Args:
        ds: Dataset containing data and vocabulary graphs.
        external_graph_ids: List of graph identifiers that are external (ephemeral).
        project: The project configuration to use (includes backend and other settings).
        max_iterations: Maximum number of inference iterations (default 5).
        include_unwanted_triples: If True, do not filter unwanted triples.

    Returns:
        List of all external graph identifiers (input external_graph_ids plus
        IRI_EXTERNAL_INFERENCES).

    Raises:
        NotImplementedError: If backend is not 'owlrl'.

    """
    if project.owl_backend != "owlrl":
        msg = f"Unsupported inference backend: {project.owl_backend}. Only 'owlrl' is currently supported."
        raise NotImplementedError(msg)

    sparql_queries = load_sparql_inference_queries(project.paths_sparql_inference or [])

    # Step 2: Generate external inferences (once - this is the "noise floor")
    g_external_inferences = _generate_external_inferences(ds, external_graph_ids)

    # Steps 3-5: Iterate full inferences + heuristics until convergence
    info(
        "Steps 3-5: Iterating full inferences + heuristics (max %d iterations)...",
        max_iterations,
    )

    g_full_inferences = ds.graph(IRI_FULL_INFERENCES)
    iteration = 0
    # Convergence is detected on the size of the *whole* dataset, so triples
    # added anywhere (not only in the inference graph) keep the loop running.
    previous_triple_count = len(ds)

    while iteration < max_iterations:
        iteration += 1

        triples_added_owl, triples_added_sparql = _run_inference_iteration(
            ds, g_full_inferences, sparql_queries, iteration
        )

        # Check for convergence
        current_triple_count = len(ds)
        new_triples_this_iteration = current_triple_count - previous_triple_count

        info(
            " Total new triples this iteration: %d (OWL: %d, SPARQL: %d)",
            new_triples_this_iteration,
            triples_added_owl,
            triples_added_sparql,
        )

        if new_triples_this_iteration == 0:
            info(" Convergence reached - no new triples generated")
            break

        previous_triple_count = current_triple_count

    if iteration >= max_iterations:
        info(" Maximum iterations (%d) reached", max_iterations)

    info("Total inferences after iteration: %d triples", len(g_full_inferences))

    # Step 6: Subtract external inferences from full inferences
    # This is actually unnecessary if we are only exporting internal graphs later,
    # because the inference engine is expected not to add inferences that already exist.
    # This has been verified with `owlrl`, but not other backends yet.

    # As a matter of diagnostic, if we are in DEBUG mode, we check how many triples
    # would be removed here.
    info("Step 6: external inference subtraction implicit because of named graphs.")
    if logger.isEnabledFor(logging.DEBUG):
        dbg("%d external inferences", len(g_external_inferences))

        triples_overlapping = sum(
            1 for s, p, o in g_external_inferences if (s, p, o) in g_full_inferences
        )

        dbg(" %d of these exist in full", triples_overlapping)

        # Debug-only sanity check of the invariant stated above (reasoner does
        # not duplicate pre-existing triples); stripped under `python -O`.
        assert triples_overlapping == 0  # noqa: S101

    if not include_unwanted_triples:
        # Step 7: Subtract unwanted inferences
        info("Step 7: Filtering unwanted inferences...")
        filter_triples(g_full_inferences, filterset_all)

    info("Final inference graph: %d triples", len(g_full_inferences))

    # Return all external graph IDs (originals plus external inferences)
    return [
        *external_graph_ids,
        IRI_EXTERNAL_INFERENCES,
    ]