Coverage for src / pythinfer / merge.py: 0%
22 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-26 21:27 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-26 21:27 +0000
1"""Merge RDF graphs from config, preserving named graph URIs for each input file."""
3import logging
5from rdflib import Dataset, IdentifiedNode, URIRef
7from pythinfer.inout import Project
# Identifiers built from bare (relative) IRI strings.
# NOTE(review): presumably these name the graphs that hold inference results
# (external-only vs. full) -- they are not used in this part of the file, so
# confirm against the callers.
IRI_EXTERNAL_INFERENCES: URIRef = URIRef("inferences_external")  # type: ignore[bad-assignment]
IRI_FULL_INFERENCES: URIRef = URIRef("inferences_full")  # type: ignore[bad-assignment]

# Module-level logger plus shorthand aliases for its info/debug methods.
logger = logging.getLogger(__name__)
info = logger.info
dbg = debug = logger.debug
# NB: below, only the file *name* is used as the named graph identifier.
# This assumes input files have unique names even when they live in different
# directories, which is unlikely to hold in general: inputs that share a name
# are loaded into the same named graph.
def merge_graphs(
    cfg: Project,
) -> tuple[Dataset, list[IdentifiedNode]]:
    """Merge all project inputs into one Dataset, one named graph per file name.

    Inputs are loaded in this order: external vocabularies
    (``cfg.paths_vocab_ext``), internal vocabularies (``cfg.paths_vocab_int``),
    then data files (``cfg.paths_data``). Every file is parsed as Turtle into
    the named graph identified by the file's *name* (``src.name``), not its
    full path.

    Because only the name is used, inputs that share a name are loaded into
    the same named graph. In particular, a non-external file whose name matches
    an external vocabulary file ends up in a graph that is reported as
    external. That behaviour is kept for backward compatibility, but each
    collision is now reported with a warning instead of passing silently.

    Args:
        cfg: Project configuration supplying the three path collections above.
            Each entry must expose ``.name`` and be accepted by ``Graph.parse``
            (presumably ``pathlib.Path`` objects -- confirm against ``Project``).

    Returns:
        Tuple of (merged Dataset, list of external graph identifiers). The
        list has one entry per external vocabulary file, in load order.
    """
    merged = Dataset()
    external_graph_ids: list[IdentifiedNode] = []
    # File name -> first source that claimed the named graph of that name.
    claimed_by: dict = {}

    def load(src) -> IdentifiedNode:
        """Parse *src* as Turtle into the graph named after its file name."""
        if src.name in claimed_by:
            logger.warning(
                "Named graph %r is shared by %s and %s; "
                "their triples are merged into one graph",
                src.name,
                claimed_by[src.name],
                src,
            )
        else:
            claimed_by[src.name] = src
        g = merged.graph(src.name)
        g.parse(src, format="turtle")
        return g.identifier

    # Load external vocabulary files (ephemeral - used for inference only);
    # their graph identifiers are recorded so callers can tell them apart.
    for src in cfg.paths_vocab_ext:
        external_graph_ids.append(load(src))

    # Load internal vocabulary files
    for src in cfg.paths_vocab_int:
        load(src)

    # Load data files
    for src in cfg.paths_data:
        load(src)

    return merged, external_graph_ids