Coverage for src / pythinfer / infer.py: 0%
140 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-26 21:27 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-26 21:27 +0000
1#!/usr/bin/env python3
2"""Script to merge TTL files and execute inference."""
4import logging
5from collections import defaultdict
6from collections.abc import Callable
7from pathlib import Path
9from owlrl import DeductiveClosure
10from owlrl.OWLRL import OWLRL_Semantics
11from rdflib import OWL, RDF, RDFS, BNode, Dataset, Graph, IdentifiedNode, Literal, Node
12from rdflib.query import ResultRow
14from pythinfer.inout import Project, Query, load_sparql_inference_queries
15from pythinfer.merge import IRI_EXTERNAL_INFERENCES, IRI_FULL_INFERENCES
16from pythinfer.rdflibplus import DatasetView
# Default cap on reasoning rounds when iterating inference to convergence.
DEF_MAX_REASONING_ROUNDS = 5
# Directory containing this module.
SCRIPT_DIR = Path(__file__).parent
# Module-level logger plus short aliases used throughout this file.
logger = logging.getLogger(__name__)
info = logger.info
dbg = debug = logger.debug
def apply_manual_sparql_inference(g: Graph, queries: list[Query]) -> Graph:
    """Apply manual SPARQL-based inference rules to the graph.

    Args:
        g: RDF graph to apply inference to
        queries: list of SPARQL CONSTRUCT queries that perform the inferences

    Returns:
        Graph: New graph with only the inferred triples

    Raises:
        TypeError: If a query yields non-triple results, i.e. it is not a
            CONSTRUCT query.

    """
    info(" Have %d queries for inference", len(queries))

    g_infer = Graph()

    # Apply each construct query to the graph
    for query in queries:
        # BUG FIX: measure the query text (`query.content`), not the Query
        # object itself — the log message reports a character count.
        dbg(" executing query '%s' (%d characters)", query.name, len(query.content))
        results = g.query(query.content)
        for row in results:
            # CONSTRUCT queries yield triples; a ResultRow (SELECT) or bool
            # (ASK) here means the query file is not a CONSTRUCT query.
            if isinstance(row, (ResultRow, bool)):
                msg = f"Non-triple result ({type(row)}) from CONSTRUCT query "
                msg += f"'{query.source}'"
                raise TypeError(msg)
            # Add each inferred triple to the graph
            g_infer.add(row)

    return g_infer
def apply_owlrl_inference(graph: Graph, destination_graph: Graph) -> None:
    """Apply OWL2 RL inference rules using the Owlrl library.

    NB: The destination graph must have the *same store* as the input graph.
    This is required by the `owlrl` library for some reason.

    This function performs complete OWL 2 RL reasoning, which includes:
    - RDFS inference (subclass, subproperty, domain, range)
    - OWL inference (inverse properties, symmetric/transitive properties,
      property chains, equivalence, disjointness, etc.)

    Args:
        graph: RDF graph to apply reasoning to
        destination_graph: Optional graph to store the inferred triples

    Returns:
        None
        (inferred triples are added to destination_graph)

    """
    n_input = len(graph)
    info(
        " Applying OWL inference from `%s` into `%s`",
        graph.identifier,
        destination_graph.identifier,
    )
    # Expand the OWL 2 RL deductive closure; inferred triples land in
    # destination_graph.
    DeductiveClosure(OWLRL_Semantics).expand(graph, destination_graph)  # pyright: ignore[reportUnknownMemberType]
    n_raw = len(destination_graph)

    # Strip out inferences that are not valid RDF (e.g. literal subjects).
    n_dropped, _ = filter_triples(destination_graph, filterset_invalid_triples)

    info(" Original triples: %d", n_input)
    info(" Inferences, raw: %d", n_raw)
    info(" Invalid inferences: %d", n_dropped)
91###
92# The following are triple filter functions. Some are per-triple (they ignore the graph)
93# and some are graph-based (they use the full graph to determine whether to remove a
94# triple).
95# All must except Node,Node,Node as the first 3 arguments because that is what rdflib
96# provides when iterating over a graph.
97# The package does not prevent use of types that are invalid in RDF.
98# The 4th argument is the full Graph.
99# All must return True if the triple is to be removed.
100###
101_FilterFunction = Callable[[Node, Node, Node, Graph], bool]
# Per-triple filter functions (4th argument is ignored)
def _subject_is_literal(s: Node, p: Node, o: Node, g: Graph) -> bool:  # noqa: ARG001
    """Identify when the subject is a Literal, which is invalid in RDF.

    Likely related to at least this: https://github.com/RDFLib/OWL-RL/issues/50
    """
    # RDF forbids literals in the subject position.
    return isinstance(s, Literal)
def _object_is_empty_string(s: Node, p: Node, o: Node, g: Graph) -> bool:  # noqa: ARG001
    """Empty strings would usually be better represented as missing values."""
    # A Literal whose string form is empty ("") is flagged for removal.
    return isinstance(o, Literal) and not str(o)
def _redundant_reflexives(s: Node, p: Node, o: Node, g: Graph) -> bool:
    """Reflexive statements that are redundant and useless, such as sameAs."""
    if s != o:
        return False
    # Predicates for which a reflexive statement (s == o) carries no
    # information.
    redundant_when_reflexive = {
        OWL.sameAs,
        OWL.equivalentClass,
        OWL.equivalentProperty,
        RDFS.subClassOf,
        RDFS.subPropertyOf,
    }
    return p in redundant_when_reflexive
def _redundant_thing_declarations(s: Node, p: Node, o: Node, g: Graph) -> bool:  # noqa: ARG001
    """Identify useless declarations that `s` is a owl:Thing or a subclass of it."""
    if o != OWL.Thing:
        return False
    # Everything is an owl:Thing, so these statements add nothing.
    return p in {RDF.type, RDFS.subClassOf, RDFS.domain, RDFS.range}
def _redundant_nothing_subclass(s: Node, p: Node, o: Node, g: Graph) -> bool:  # noqa: ARG001
    """Identify useless declarations owl:Nothing is a subclass of something."""
    # owl:Nothing is a subclass of every class by definition.
    return s == OWL.Nothing and p == RDFS.subClassOf
144###
145# The following are Graph-based filter functions.
146# They must accept the full graph as well in order to determine whether to
147# remove a given triple.
148# As above, must return True if the triple is to be removed.
149###
152def _undeclared_blank_nodes(s: Node, p: Node, o: Node, g: Graph) -> bool:
153 """Identify triples with blank nodes that are not declared in the graph."""
154 if isinstance(o, BNode) and p in (
155 RDF.type,
156 RDFS.subClassOf,
157 RDFS.subPropertyOf,
158 RDFS.domain,
159 RDFS.range,
160 ):
161 # Check if there is any usage of this blank node as a subject in the graph
162 return not any(g.triples((o, None, None)))
163 return False
# Filterset for RDF triples that are not valid RDF (e.g. a Literal in the
# subject position). This can occur when the reasoner encounters malformed
# data or makes invalid inferences.
filterset_invalid_triples: list[_FilterFunction] = [_subject_is_literal]

# Filterset for unwanted triples that bloat the graph but are not invalid.
filterset_unwanted_triples: list[_FilterFunction] = [
    _object_is_empty_string,
    _redundant_reflexives,
    _redundant_thing_declarations,
    _redundant_nothing_subclass,
    _undeclared_blank_nodes,
]

# Combined filterset
filterset_all: list[_FilterFunction] = [
    *filterset_invalid_triples,
    *filterset_unwanted_triples,
]
def filter_triples(
    graph: Graph, filter_functions: list[_FilterFunction]
) -> tuple[int, dict[_FilterFunction, int]]:
    """Filter triples from the graph using the provided filter functions.

    ***NB: graph is modified in place.***

    Also note that the counts of triples to remove may overlap, as a triple may be
    identified for removal by multiple filter functions. Therefore, the number of
    triples actually removed will be *less than or equal to* the sum of the counts.

    Note that this deliberately does not return the graph to make clear that graph is
    modified in place.

    Args:
        graph (Graph): The RDF graph to validate and clean.
        filter_functions (list[Callable[[Triple, Graph], bool]]): List of functions that
            take a triple and a Graph and return True if the triple should be removed.

    Returns: tuple of:
        int: number of triples actually removed
        dict[Callable, int]: number of triples identified for removal by each filter

    """
    count_before = len(graph)
    # Collect matches first — removing triples while iterating is unsafe.
    flagged: list[tuple[Node, Node, Node]] = []
    hits_per_filter: defaultdict[_FilterFunction, int] = defaultdict(int)
    for triple in graph:
        for fn in filter_functions:
            if fn(*triple, graph):
                flagged.append(triple)
                hits_per_filter[fn] += 1

    info(
        "%d triples identified for removal by %d filters:",
        sum(hits_per_filter.values()),
        len(filter_functions),
    )
    if flagged:
        for fn, n_hits in hits_per_filter.items():
            info(" - %d triples identified by %s", n_hits, fn.__name__)
        for triple in flagged:
            graph.remove(triple)

    nremoved = count_before - len(graph)
    if nremoved > 0:
        info("%d triples removed from graph", nremoved)
    return nremoved, hits_per_filter
def _generate_external_inferences(
    ds: Dataset, external_graph_ids: list[IdentifiedNode]
) -> Graph:
    """Generate inferences from external vocabularies only (step 2).

    This creates the "noise floor" of inferences that come from external
    vocabularies like OWL, RDFS, SKOS, etc. These will be subtracted later.

    Args:
        ds: Dataset containing all graphs.
        external_graph_ids: List of graph identifiers that are external.

    Returns:
        Graph containing external inferences.

    """
    info("Step 2: Generating external inferences (baseline from external vocabs)...")

    # Restrict the dataset to the external vocabularies (may be empty).
    external_view = DatasetView(ds, external_graph_ids)

    # Workaround for owlrl bug #76: copy to temp dataset with triples in default graph
    # https://github.com/RDFLib/OWL-RL/issues/76
    info(" Creating temporary dataset to work around owlrl named graph bug...")
    temp_ds = external_view.collapse()
    info(" Temporary dataset created with %d triples in default graph", len(temp_ds))

    # The inference target must share the temp dataset's store (owlrl needs this).
    temp_inferences = temp_ds.graph(IRI_EXTERNAL_INFERENCES)

    apply_owlrl_inference(temp_ds, temp_inferences)

    # Copy the inferred triples back into the real dataset.
    g_external_inferences = ds.graph(IRI_EXTERNAL_INFERENCES)
    for triple in temp_inferences:
        g_external_inferences.add(triple)

    info(" External inferences generated: %d triples", len(g_external_inferences))
    return g_external_inferences
def _run_inference_iteration(
    ds: Dataset,
    g_full_inferences: Graph,
    sparql_queries: list[Query],
    iteration: int,
) -> tuple[int, int]:
    """Run one iteration of inference (steps 3-4).

    Args:
        ds: Dataset containing all graphs.
        g_full_inferences: Graph to accumulate inferences into.
        sparql_queries: List of SPARQL CONSTRUCT queries for heuristics.
        iteration: Current iteration number (for logging).

    Returns:
        Tuple of (triples_added_owl, triples_added_sparql).

    """
    info("--- Iteration %d ---", iteration)

    # Step 3: Generate full inferences over current state
    info(" Step 3: Running OWL-RL inference over current state...")
    size_before_owl = len(g_full_inferences)
    apply_owlrl_inference(ds, g_full_inferences)
    owl_added = len(g_full_inferences) - size_before_owl
    info(" OWL-RL added %d new inferences", owl_added)

    # Step 4: Run heuristics (SPARQL CONSTRUCT queries)
    if not sparql_queries:
        info(" Step 4: No SPARQL heuristics to run")
        return owl_added, 0

    info(" Step 4: Running %d SPARQL heuristics...", len(sparql_queries))
    size_before_sparql = len(g_full_inferences)

    # Apply SPARQL constructs over the entire dataset, which now includes the
    # full inferences produced in step 3.
    heuristic_results = apply_manual_sparql_inference(ds, sparql_queries)

    # Accumulate the heuristic results into the full-inferences graph.
    for triple in heuristic_results:
        g_full_inferences.add(triple)

    sparql_added = len(g_full_inferences) - size_before_sparql
    info(" SPARQL heuristics added %d new inferences", sparql_added)

    return owl_added, sparql_added
def run_inference_backend(
    ds: Dataset,
    external_graph_ids: list[IdentifiedNode],
    project: Project,
    max_iterations: int = DEF_MAX_REASONING_ROUNDS,
    *,
    include_unwanted_triples: bool = False,
) -> list[IdentifiedNode]:
    """Run inference backend on merged graph using OWL-RL semantics.

    Implements the inference process described in README.md:
    1. Load and merge (already done - ds contains merged data)
    2. Generate external inferences (do once - baseline noise from external vocabs)
    3. Generate full inferences over current state
    4. Run heuristics (SPARQL CONSTRUCT queries)
    5. Repeat steps 3-4 until convergence or max iterations
    6. Subtract external data and inferences
    7. Subtract unwanted inferences

    Dataset is updated in-place with inferred triples:
    - Graph IRI_FULL_INFERENCES: inferred triples over all data and vocabs
    - Graph IRI_EXTERNAL_INFERENCES: inferred triples over external vocab only

    Args:
        ds: Dataset containing data and vocabulary graphs.
        external_graph_ids: List of graph identifiers that are external (ephemeral).
        project: The project configuration to use (includes backend and other settings).
        max_iterations: Maximum number of inference iterations (default 5).
        include_unwanted_triples: If True, do not filter unwanted triples.

    Returns:
        List of all external graph identifiers (input external_graph_ids plus
        IRI_EXTERNAL_INFERENCES).

    Raises:
        NotImplementedError: If backend is not 'owlrl'.
            (BUG FIX: docstring previously claimed ValueError, but the code
            raises NotImplementedError.)

    """
    if project.owl_backend != "owlrl":
        msg = f"Unsupported inference backend: {project.owl_backend}. Only 'owlrl' is currently supported."
        raise NotImplementedError(msg)

    sparql_queries = load_sparql_inference_queries(project.paths_sparql_inference or [])

    # Step 2: Generate external inferences (once - this is the "noise floor")
    g_external_inferences = _generate_external_inferences(ds, external_graph_ids)

    # Steps 3-5: Iterate full inferences + heuristics until convergence
    info(
        "Steps 3-5: Iterating full inferences + heuristics (max %d iterations)...",
        max_iterations,
    )

    g_full_inferences = ds.graph(IRI_FULL_INFERENCES)
    iteration = 0
    previous_triple_count = len(ds)  # Count triples in entire dataset

    while iteration < max_iterations:
        iteration += 1

        triples_added_owl, triples_added_sparql = _run_inference_iteration(
            ds, g_full_inferences, sparql_queries, iteration
        )

        # Check for convergence by comparing whole-dataset triple counts.
        current_triple_count = len(ds)
        new_triples_this_iteration = current_triple_count - previous_triple_count

        info(
            " Total new triples this iteration: %d (OWL: %d, SPARQL: %d)",
            new_triples_this_iteration,
            triples_added_owl,
            triples_added_sparql,
        )

        if new_triples_this_iteration == 0:
            info(" Convergence reached - no new triples generated")
            break

        previous_triple_count = current_triple_count
    else:
        # BUG FIX: only reached when the loop exhausted its budget without a
        # `break`; previously this message was also logged when convergence
        # happened on the final allowed iteration, which was misleading.
        info(" Maximum iterations (%d) reached", max_iterations)

    info("Total inferences after iteration: %d triples", len(g_full_inferences))

    # Step 6: Subtract external inferences from full inferences
    # This is actually unnecessary if we are only exporting internal graphs later,
    # because the inference engine is expected not to add inferences that already exist.
    # This has been verified with `owlrl`, but not other backends yet.

    # As a matter of diagnostic, if we are in DEBUG mode, we check how many triples
    # would be removed here.
    info("Step 6: external inference subtraction implicit because of named graphs.")
    if logger.isEnabledFor(logging.DEBUG):
        dbg("%d external inferences", len(g_external_inferences))

        triples_overlapping = sum(
            1 for s, p, o in g_external_inferences if (s, p, o) in g_full_inferences
        )

        dbg(" %d of these exist in full", triples_overlapping)

        # Diagnostic invariant only (DEBUG mode); not input validation.
        assert triples_overlapping == 0  # noqa: S101

    if not include_unwanted_triples:
        # Step 7: Subtract unwanted inferences
        info("Step 7: Filtering unwanted inferences...")
        filter_triples(g_full_inferences, filterset_all)

    info("Final inference graph: %d triples", len(g_full_inferences))

    # Return all external graph IDs (originals plus external inferences)
    return [
        *external_graph_ids,
        IRI_EXTERNAL_INFERENCES,
    ]