Coverage for src / pythinfer / merge.py: 0%

22 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-11-26 21:27 +0000

1"""Merge RDF graphs from config, preserving named graph URIs for each input file.""" 

2 

3import logging 

4 

5from rdflib import Dataset, IdentifiedNode, URIRef 

6 

7from pythinfer.inout import Project 

8 

# Module-level logger, plus short aliases for the info and debug levels.
logger = logging.getLogger(__name__)
info = logger.info
debug = logger.debug
dbg = debug

# Relative IRIs built from the literal names "inferences_external" and
# "inferences_full".
# NOTE(review): presumably identifiers of named graphs that hold inferred
# triples; they are not used in this part of the file - confirm at call sites.
IRI_EXTERNAL_INFERENCES: URIRef = URIRef("inferences_external")  # type: ignore[bad-assignment]
IRI_FULL_INFERENCES: URIRef = URIRef("inferences_full")  # type: ignore[bad-assignment]

15 

16 

17# NB: in the below we are using the file *name* only as the named graph identifier. 

18# This assumes that input files have unique names even if in different directories, 

19# which is likely an invalid assumption... 

20 

21 

def merge_graphs(
    cfg: Project,
) -> tuple[Dataset, list[IdentifiedNode]]:
    """Merge graphs: preserve a named graph for each input file.

    Every input file is parsed as Turtle into a named graph of a single
    Dataset. The graph is identified by the file's bare name (``src.name``).
    Files are loaded in this order: external vocabularies, internal
    vocabularies, data.

    External vocabulary files are tracked separately for filtering during
    export: the identifier of each graph they are loaded into is collected
    and returned alongside the Dataset.

    Because only the file name is used as identifier, two inputs that share a
    name (e.g. in different directories) are loaded into the *same* named
    graph. That is still permitted, so identifiers and results are unchanged,
    but each reuse is now reported with a warning instead of passing silently.

    Args:
        cfg: Project configuration providing ``paths_vocab_ext``,
            ``paths_vocab_int`` and ``paths_data``.

    Returns:
        Tuple of (merged Dataset, list of external graph identifiers).

    """
    merged = Dataset()
    external_graph_ids: list[IdentifiedNode] = []

    # Graph name -> first source loaded under that name, for collision reports.
    first_source_by_name: dict[str, object] = {}

    # (paths, is_external) in load order. Only external vocabulary graphs are
    # recorded in ``external_graph_ids``.
    source_groups = (
        (cfg.paths_vocab_ext, True),
        (cfg.paths_vocab_int, False),
        (cfg.paths_data, False),
    )

    for paths, is_external in source_groups:
        for src in paths:
            graph_name = src.name
            if graph_name in first_source_by_name:
                # Same identifier means same named graph. If the earlier file
                # was an external vocabulary, this file's triples end up in a
                # graph that is listed as external.
                logger.warning(
                    "Named graph %r is shared by %s and %s; "
                    "their triples are merged into one graph",
                    graph_name,
                    first_source_by_name[graph_name],
                    src,
                )
            else:
                first_source_by_name[graph_name] = src

            g = merged.graph(graph_name)
            g.parse(src, format="turtle")
            if is_external:
                external_graph_ids.append(g.identifier)

    return merged, external_graph_ids