Coverage for src / pythinfer / inout.py: 78%

72 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-11-26 21:27 +0000

1"""Input/output utilities for pythinfer package.""" 

2 

3from collections.abc import Sequence 

4from dataclasses import dataclass 

5from pathlib import Path 

6 

7import yaml 

8 

9 

10@dataclass 

11class Project: 

12 """Represents a pythinfer project configuration. 

13 

14 Attributes: 

15 name: Name of the project. 

16 path_self: Path to the project config file itself. 

17 paths_data: List of paths to data files. [Must be > 1] 

18 paths_vocab_int: List of paths to internal vocabulary files. [Optional] 

19 paths_vocab_ext: List of paths to external vocabulary files. [Optional] 

20 

21 """ 

22 

23 name: str 

24 path_self: Path 

25 paths_data: list[Path] 

26 paths_vocab_int: list[Path] 

27 paths_vocab_ext: list[Path] 

28 owl_backend: str | None = None 

29 paths_sparql_inference: list[Path] | None = None 

30 

31 @staticmethod 

32 def from_yaml(config_path: Path | str) -> "Project": 

33 """Load project configuration from a YAML file.""" 

34 _config_path = Path(config_path) 

35 with _config_path.open() as f: 

36 cfg = yaml.safe_load(f) 

37 

38 # TODO(robert): handle path patterns. 

39 # TODO(robert): validate paths exist. 

40 return Project( 

41 name=cfg.get("name", _config_path.stem), 

42 path_self=_config_path, 

43 paths_vocab_ext=[Path(p) for p in cfg.get("external_vocabs", [])], 

44 paths_vocab_int=[Path(p) for p in cfg.get("internal_vocabs", [])], 

45 paths_data=[Path(p) for p in cfg["data"]], 

46 ) 

47 

48 

PROJECT_FILE_NAME = "pythinfer.yaml"
# Upper bound on parent-directory hops during discovery, to bound the search.
MAX_DISCOVERY_SEARCH_DEPTH = 10


def discover_project(start_path: Path, _current_depth: int = 0) -> Path:
    """Discover a pythinfer project by searching for a config file.

    Walks upward through parent directories until a config file is found or:
    1. The root directory is reached.
    2. A maximum search depth is reached (to bound the search).
    3. The `$HOME` directory is reached.

    Args:
        start_path: Path to start searching from.
        _current_depth: Starting search depth (kept for backward
            compatibility; counts toward MAX_DISCOVERY_SEARCH_DEPTH).

    Returns:
        Path to the discovered project config file

    Raises:
        FileNotFoundError if search reaches limit without discovering a
        project.

    """
    msg = f"Search limit hit before finding project config (`{PROJECT_FILE_NAME}`)"
    home_path = Path.home().resolve()
    current_path = start_path.resolve()
    depth = _current_depth

    while True:
        config_path = current_path / PROJECT_FILE_NAME

        # Positive case: config file found. Use is_file() rather than
        # exists() so a *directory* named like the config is not matched.
        if config_path.is_file():
            return config_path

        # Negative cases: check search limits before ascending.
        if current_path.parent == current_path:
            raise FileNotFoundError(msg + ": reached root directory")
        if depth >= MAX_DISCOVERY_SEARCH_DEPTH:
            raise FileNotFoundError(
                msg + f": reached maximum search depth ({depth})"
            )
        if current_path == home_path:
            raise FileNotFoundError(msg + ": reached `$HOME` directory")

        # Ascend to the parent directory and continue the search.
        current_path = current_path.parent
        depth += 1

93 

94 

def load_project(config_path: Path | None) -> Project:
    """Load a pythinfer project specification from a YAML file.

    The config file can either be specified directly, or discovered by
    searching upward from the current working directory.

    Args:
        config_path: Path to the config file, or None to trigger discovery.

    Returns:
        Project: The loaded project configuration.

    """
    resolved = config_path if config_path else discover_project(Path.cwd())
    return Project.from_yaml(resolved)

106 

107 

@dataclass
class Query:
    """A query string wrapped with its origin, more meaningful than a bare str."""

    source: Path  # file the query text was read from
    content: str  # raw query text; should use Template or t-string

    def __len__(self) -> int:
        """Return the number of characters in the query text."""
        return len(self.content)

    def __str__(self) -> str:
        """Return the raw query text."""
        return self.content

    @property
    def name(self) -> str:
        """Name of the query, taken from the stem of the source path."""
        return self.source.stem

127 

128 

def load_sparql_inference_queries(query_files: Sequence[Path]) -> list[Query]:
    """Load SPARQL inference queries from files.

    Args:
        query_files: Paths to files containing one SPARQL query each.

    Returns:
        list[Query]: One Query per input file, in input order.

    """
    # read_text() is equivalent to open()+read() and closes the file for us.
    return [
        Query(source=query_file, content=query_file.read_text())
        for query_file in query_files
    ]

142 

143 

def create_project(
    scan_directory: Path | None = None,
    output_path: Path | str = PROJECT_FILE_NAME,
) -> Path:
    """Create a new pythinfer.yaml project file by scanning directory for RDF files.

    Scans the specified directory (or current working directory) for RDF
    files (with .ttl or .rdf extensions) and creates a pythinfer.yaml
    configuration file listing them.

    Args:
        scan_directory: Directory to scan for RDF files. If None, uses
            current working directory.
        output_path: Path where the project file should be created.

    Returns:
        Path to the created project configuration file.

    """
    scan_root = (scan_directory or Path.cwd()).resolve()
    target = Path(output_path)

    # Ensure output directory exists
    target.parent.mkdir(parents=True, exist_ok=True)

    # Recursively collect RDF files, skipping anything under a 'derived'
    # directory; keep paths relative to the scan root and sort them so the
    # generated file is deterministic.
    rdf_files = sorted(
        found.relative_to(scan_root)
        for pattern in ("*.ttl", "*.rdf")
        for found in scan_root.rglob(pattern)
        if "derived" not in found.parts
    )

    # Create project configuration
    project_config = {
        "name": scan_root.name,
        "data": [str(f) for f in rdf_files],
    }

    # Write to YAML file
    with target.open("w") as f:
        yaml.dump(project_config, f, default_flow_style=False)

    return target