Coverage for src / pythinfer / inout.py: 78%
72 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-26 21:27 +0000
1"""Input/output utilities for pythinfer package."""
3from collections.abc import Sequence
4from dataclasses import dataclass
5from pathlib import Path
7import yaml
@dataclass
class Project:
    """Represents a pythinfer project configuration.

    Attributes:
        name: Name of the project.
        path_self: Path to the project config file itself.
        paths_data: List of paths to data files. [Must be > 1]
        paths_vocab_int: List of paths to internal vocabulary files. [Optional]
        paths_vocab_ext: List of paths to external vocabulary files. [Optional]
        owl_backend: Optional identifier of an OWL backend. [Optional]
        paths_sparql_inference: Optional list of SPARQL inference query
            files. [Optional]

    """

    name: str
    path_self: Path
    paths_data: list[Path]
    paths_vocab_int: list[Path]
    paths_vocab_ext: list[Path]
    owl_backend: str | None = None
    paths_sparql_inference: list[Path] | None = None

    @classmethod
    def from_yaml(cls, config_path: Path | str) -> "Project":
        """Load project configuration from a YAML file.

        Args:
            config_path: Path to the YAML configuration file.

        Returns:
            Project built from the file's `name`, `data`, `internal_vocabs`,
            and `external_vocabs` keys.

        Raises:
            KeyError: If the required `data` key is missing (or the file is
                empty).
            OSError: If the config file cannot be opened.

        """
        _config_path = Path(config_path)
        with _config_path.open() as f:
            # An empty YAML file parses to None; normalize to an empty
            # mapping so the required `data` key fails with a clear KeyError
            # instead of an AttributeError on None.
            cfg = yaml.safe_load(f) or {}
        # TODO(robert): handle path patterns.
        # TODO(robert): validate paths exist.
        # NOTE(review): `owl_backend` and `paths_sparql_inference` are never
        # read from the config here — confirm whether they should be.
        return cls(
            name=cfg.get("name", _config_path.stem),
            path_self=_config_path,
            paths_vocab_ext=[Path(p) for p in cfg.get("external_vocabs", [])],
            paths_vocab_int=[Path(p) for p in cfg.get("internal_vocabs", [])],
            paths_data=[Path(p) for p in cfg["data"]],
        )
# Name of the config file that marks a pythinfer project root.
PROJECT_FILE_NAME = "pythinfer.yaml"
# Hard cap on how many parent directories discovery will climb.
MAX_DISCOVERY_SEARCH_DEPTH = 10


def discover_project(start_path: Path, _current_depth: int = 0) -> Path:
    """Discover a pythinfer project by searching for a config file.

    Walks up through parent directories until a config file is found or:
    1. The root directory is reached.
    2. A maximum search depth is reached (to avoid unbounded searching).
    3. The `$HOME` directory is reached.

    Args:
        start_path: Path to start searching from.
        _current_depth: Current search depth (used internally).

    Returns:
        Path to the discovered project config file

    Raises:
        FileNotFoundError if search reaches limit without discovering a project.

    """
    msg = f"Search limit hit before finding project config (`{PROJECT_FILE_NAME}`)"
    home_path = Path.home().resolve()
    current_path = start_path.resolve()
    depth = _current_depth

    while True:
        # Positive case: config file found at this level.
        candidate = current_path / PROJECT_FILE_NAME
        if candidate.exists():
            return candidate

        # Negative cases, checked in order: filesystem root, depth cap, $HOME.
        if current_path.parent == current_path:
            raise FileNotFoundError(msg + ": reached root directory")
        if depth >= MAX_DISCOVERY_SEARCH_DEPTH:
            raise FileNotFoundError(
                msg + f": reached maximum search depth ({depth})"
            )
        if current_path == home_path:
            raise FileNotFoundError(msg + ": reached `$HOME` directory")

        # Step up to the parent directory and continue.
        current_path = current_path.parent.resolve()
        depth += 1
def load_project(config_path: Path | None) -> Project:
    """Load a pythinfer project specification from a YAML file.

    The config file can either be specified directly, or discovered by searching.

    Args:
        config_path: Path to the config file, or None to trigger discovery.

    """
    if not config_path:
        # No explicit path: search upward from the current working directory.
        config_path = discover_project(Path.cwd())
    return Project.from_yaml(config_path)
@dataclass
class Query:
    """A SPARQL query string paired with the file it was read from."""

    source: Path  # File the query text came from.
    content: str  # Should use Template or t-string

    @property
    def name(self) -> str:
        """Return the stem of the source path as the 'name' of the query."""
        return self.source.stem

    def __str__(self) -> str:
        """Return the raw query text."""
        return self.content

    def __len__(self) -> int:
        """Return the number of characters in the query text."""
        return len(self.content)
def load_sparql_inference_queries(query_files: Sequence[Path]) -> list[Query]:
    """Load SPARQL inference queries from files.

    Args:
        query_files: Paths of query files to read.

    Returns:
        list[Query]: One Query per input file, in the given order.

    """
    queries: list[Query] = []
    for query_file in query_files:
        with query_file.open() as f:
            queries.append(Query(source=query_file, content=f.read()))
    return queries
def create_project(
    scan_directory: Path | None = None,
    output_path: Path | str = PROJECT_FILE_NAME,
) -> Path:
    """Create a new pythinfer.yaml project file by scanning directory for RDF files.

    Scans the specified directory (or current working directory) for RDF files
    (with .ttl or .rdf extensions) and creates a pythinfer.yaml configuration
    file listing them.

    Args:
        scan_directory: Directory to scan for RDF files. If None, uses current
            working directory.
        output_path: Path where the project file should be created.

    Returns:
        Path to the created project configuration file.

    """
    _scan_dir = (scan_directory or Path.cwd()).resolve()
    _output_path = Path(output_path)

    # Ensure output directory exists
    _output_path.parent.mkdir(parents=True, exist_ok=True)

    # Find all RDF files recursively, storing paths relative to the scan
    # directory and skipping anything under a 'derived' directory.
    rdf_files: list[Path] = [
        rdf_file.relative_to(_scan_dir)
        for rdf_ext in ("*.ttl", "*.rdf")
        for rdf_file in _scan_dir.rglob(rdf_ext)
        if "derived" not in rdf_file.parts
    ]

    # Sort for consistent output
    rdf_files.sort()

    # Minimal project configuration; other keys can be added by hand later.
    project_config = {
        "name": _scan_dir.name,
        "data": [str(f) for f in rdf_files],
    }

    # Write to YAML file
    with _output_path.open("w") as f:
        yaml.dump(project_config, f, default_flow_style=False)

    return _output_path