lmcat.lmcat
```python
import argparse
import io
import json

# from dataclasses import dataclass, field
from pathlib import Path
import sys

from lmcat.processing_pipeline import ProcessingPipeline


# Handle Python 3.11+ vs older Python for TOML parsing
try:
    import tomllib
except ImportError:
    try:
        import tomli as tomllib  # type: ignore
    except ImportError:
        tomllib = None  # type: ignore[assignment]

import igittigitt  # noqa: E402

from muutils.json_serialize import (
    SerializableDataclass,
    serializable_dataclass,
    serializable_field,
)
from muutils.misc import shorten_numerical_to_str  # noqa: E402


from lmcat.file_stats import FileStats, TokenizerWrapper, TreeEntry, TOKENIZERS_PRESENT
from lmcat.processing_pipeline import OnMultipleProcessors


@serializable_dataclass(kw_only=True)
class LMCatConfig(SerializableDataclass):
    """Configuration dataclass for lmcat

    # Parameters:
    - `content_divider: str` (default "``````")
    - `tree_only: bool` (default False)
    - `ignore_patterns: list[str]`, `ignore_patterns_files: list[Path]`
    - `plugins_file: Path | None`, `allow_plugins: bool`
    - `glob_process: dict[str, str]`, `decider_process: dict[str, str]`,
      `on_multiple_processors: OnMultipleProcessors`
    - `tokenizer: str` (default "gpt2" if `tokenizers` is installed, else "whitespace-split")
    - `tree_divider: str`, `tree_file_divider: str`, `tree_indent: str`
    """

    content_divider: str = serializable_field(default="``````")
    tree_only: bool = serializable_field(default=False)

    # ignoring
    ignore_patterns: list[str] = serializable_field(default_factory=list)
    ignore_patterns_files: list[Path] = serializable_field(
        default_factory=lambda: [Path(".gitignore"), Path(".lmignore")],
        serialization_fn=lambda x: [p.as_posix() for p in x],
        deserialize_fn=lambda x: [Path(p) for p in x],
    )

    # this file will be imported, and any functions in it decorated with one of
    # the `register_*` decorators will be added to the functions available to
    # the processing pipeline.
    # --allow-plugins is a command-line-only option and must be set for this to work
    plugins_file: Path | None = serializable_field(
        default=None,
        serialization_fn=lambda x: x.as_posix() if x else None,
        deserialize_fn=lambda x: Path(x) if x else None,
    )
    allow_plugins: bool = serializable_field(
        default=False,
        deserialize_fn=lambda x: False,  # this can only be overridden through the command line
    )

    # processing pipeline
    glob_process: dict[str, str] = serializable_field(default_factory=dict)
    decider_process: dict[str, str] = serializable_field(default_factory=dict)
    on_multiple_processors: OnMultipleProcessors = serializable_field(
        default="except",
        assert_type=False,
    )

    # tokenization
    tokenizer: str = serializable_field(
        default="gpt2" if TOKENIZERS_PRESENT else "whitespace-split"
    )
    "Tokenizer used to count tokens in the output. Defaults to `gpt2`; the value is passed to `tokenizers.Tokenizer.from_pretrained()`. Requesting a pretrained tokenizer without `tokenizers` installed raises an exception; the `whitespace-split` fallback avoids this when `tokenizers` is not installed."

    # tree formatting
    tree_divider: str = serializable_field(default="│ ")
    tree_file_divider: str = serializable_field(default="├── ")
    tree_indent: str = serializable_field(default=" ")

    def get_tokenizer_obj(self) -> TokenizerWrapper:
        """Get the tokenizer object"""
        return TokenizerWrapper(self.tokenizer)

    def get_processing_pipeline(self) -> ProcessingPipeline:
        """Get the processing pipeline object"""
        plugins_file: Path | None = self.plugins_file if self.allow_plugins else None
        return ProcessingPipeline(
            plugins_file=plugins_file,
            decider_process_keys=self.decider_process,
            glob_process_keys=self.glob_process,
            on_multiple_processors=self.on_multiple_processors,
        )

    @classmethod
    def read(cls, root_dir: Path) -> "LMCatConfig":
        """Attempt to read config from pyproject.toml, lmcat.toml, or lmcat.json."""
        pyproject_path: Path = root_dir / "pyproject.toml"
        lmcat_toml_path: Path = root_dir / "lmcat.toml"
        lmcat_json_path: Path = root_dir / "lmcat.json"

        if (
            sum(
                int(p.is_file())
                for p in (pyproject_path, lmcat_toml_path, lmcat_json_path)
            )
            > 1
        ):
            raise ValueError(
                "Multiple configuration files found. Please only use one of pyproject.toml, lmcat.toml, or lmcat.json."
            )

        # Try pyproject.toml first
        if tomllib is not None and pyproject_path.is_file():
            with pyproject_path.open("rb") as f:
                pyproject_data = tomllib.load(f)
            if "tool" in pyproject_data and "lmcat" in pyproject_data["tool"]:
                return cls.load(pyproject_data["tool"]["lmcat"])

        # Then try lmcat.toml
        if tomllib is not None and lmcat_toml_path.is_file():
            with lmcat_toml_path.open("rb") as f:
                toml_data = tomllib.load(f)
            return cls.load(toml_data)

        # Finally try lmcat.json
        if lmcat_json_path.is_file():
            with lmcat_json_path.open("r", encoding="utf-8") as f:
                json_data = json.load(f)
            return cls.load(json_data)

        # Fallback to defaults
        return cls()


class IgnoreHandler:
    """Handles all ignore pattern matching using igittigitt"""

    def __init__(self, root_dir: Path, config: LMCatConfig):
        self.root_dir: Path = root_dir
        self.config: LMCatConfig = config

        # set up parser
        self.parser: igittigitt.IgnoreParser = igittigitt.IgnoreParser()

        # first from the files
        for ignore_file in self.config.ignore_patterns_files:
            self.parser.parse_rule_files(self.root_dir, filename=ignore_file.name)

        # then from the config itself
        for pattern in self.config.ignore_patterns:
            self.parser.add_rule(pattern=pattern, base_path=self.root_dir)

    def is_ignored(self, path: Path) -> bool:
        """Check if a path should be ignored"""
        # Always ignore the gitignore/lmignore files themselves
        if path.name in {".gitignore", ".lmignore"}:
            return True

        # Use igittigitt's matching
        return self.parser.match(path)


def sorted_entries(directory: Path) -> list[Path]:
    """Return directory contents sorted: directories first, then files"""
    subdirs: list[Path] = sorted(
        [p for p in directory.iterdir() if p.is_dir()], key=lambda x: x.name
    )
    files: list[Path] = sorted(
        [p for p in directory.iterdir() if p.is_file()], key=lambda x: x.name
    )
    return subdirs + files


def walk_dir(
    directory: Path,
    ignore_handler: IgnoreHandler,
    config: LMCatConfig,
    tokenizer: TokenizerWrapper,
    prefix: str = "",
) -> tuple[list[TreeEntry], list[Path]]:
    """Recursively walk a directory, building tree lines and collecting file paths"""
    tree_output: list[TreeEntry] = []
    collected_files: list[Path] = []

    entries: list[Path] = sorted_entries(directory)
    for i, entry in enumerate(entries):
        if ignore_handler.is_ignored(entry):
            continue

        is_last: bool = i == len(entries) - 1
        connector: str = (
            config.tree_file_divider
            if not is_last
            else config.tree_file_divider.replace("├", "└")
        )

        if entry.is_dir():
            tree_output.append(TreeEntry(f"{prefix}{connector}{entry.name}", None))
            extension: str = config.tree_divider if not is_last else config.tree_indent
            sub_output: list[TreeEntry]
            sub_files: list[Path]
            sub_output, sub_files = walk_dir(
                directory=entry,
                ignore_handler=ignore_handler,
                config=config,
                tokenizer=tokenizer,
                prefix=prefix + extension,
            )
            tree_output.extend(sub_output)
            collected_files.extend(sub_files)
        else:
            stats: FileStats = FileStats.from_file(entry, tokenizer)
            tree_output.append(TreeEntry(f"{prefix}{connector}{entry.name}", stats))
            collected_files.append(entry)

    return tree_output, collected_files


def format_tree_with_stats(
    entries: list[TreeEntry], show_tokens: bool = False
) -> list[str]:
    """Format tree entries with aligned statistics

    # Parameters:
    - `entries : list[TreeEntry]`
        List of tree entries with optional stats
    - `show_tokens : bool`
        Whether to show token counts

    # Returns:
    - `list[str]`
        Formatted tree lines with aligned stats
    """
    # Find max widths for alignment
    max_line_len: int = max(len(entry.line) for entry in entries)
    max_lines: int = max(
        (len(f"{entry.stats.lines:,}") if entry.stats else 0) for entry in entries
    )
    max_chars: int = max(
        (len(f"{entry.stats.chars:,}") if entry.stats else 0) for entry in entries
    )
    max_tokens: int = (
        max(
            (
                len(f"{entry.stats.tokens:,}")
                if entry.stats and entry.stats.tokens
                else 0
            )
            for entry in entries
        )
        if show_tokens
        else 0
    )

    formatted: list[str] = []
    for entry in entries:
        line: str = entry.line.ljust(max_line_len + 2)
        if entry.stats:
            lines_str: str = f"{entry.stats.lines:,}L".rjust(max_lines + 1)
            chars_str: str = f"{entry.stats.chars:,}C".rjust(max_chars + 1)
            stats_str: str = f"[{lines_str} {chars_str}"
            if show_tokens and entry.stats.tokens is not None:
                tokens_str: str = f"{entry.stats.tokens:,}T".rjust(max_tokens + 1)
                stats_str += f" {tokens_str}"
            stats_str += "]"
            formatted.append(f"{line}{stats_str}")
        else:
            formatted.append(line)

    return formatted


def walk_and_collect(
    root_dir: Path,
    config: LMCatConfig | None = None,
) -> tuple[list[str], list[Path]]:
    """Walk filesystem from root_dir and gather tree listing plus file paths"""
    if config is None:
        config = LMCatConfig()

    tokenizer: TokenizerWrapper = config.get_tokenizer_obj()

    ignore_handler = IgnoreHandler(root_dir, config)
    base_name = root_dir.resolve().name

    # Start with root directory name
    tree_output = [TreeEntry(base_name)]

    # Walk the directory tree
    sub_output, sub_files = walk_dir(
        directory=root_dir,
        ignore_handler=ignore_handler,
        config=config,
        tokenizer=tokenizer,
        prefix="",
    )
    tree_output.extend(sub_output)

    # Format tree with stats
    formatted_tree = format_tree_with_stats(
        tree_output, show_tokens=tokenizer is not None
    )

    return formatted_tree, sub_files


def assemble_summary(
    root_dir: Path,
    config: LMCatConfig,
) -> str:
    """Assemble the summary output and return"""

    processing_pipeline: ProcessingPipeline = config.get_processing_pipeline()

    tree_output: list[str]
    collected_files: list[Path]
    tree_output, collected_files = walk_and_collect(
        root_dir=root_dir,
        config=config,
    )

    output: list[str] = []
    output.append("# File Tree")
    output.append("\n```")
    output.extend(tree_output)
    output.append("```\n")

    # Add file contents if not suppressed
    if not config.tree_only:
        output.append("# File Contents")

        for fpath in collected_files:
            # get the path
            relpath_posix: str = fpath.relative_to(root_dir).as_posix()

            # process the contents
            f_contents: str
            p_name: str | None
            f_contents, p_name = processing_pipeline.process_file(fpath)
            processed_with: str = f'processed_with="{p_name}"' if p_name else ""

            # start of file marker
            pathspec_start: str = f'{{ path="{relpath_posix}" {processed_with} }}'
            pathspec_end: str = f'{{ end_of_file="{relpath_posix}" }}'
            output.append("")
            output.append(config.content_divider + pathspec_start)

            # the file contents, already processed by the pipeline
            output.append(f_contents)

            # add the end of file marker
            output.append(config.content_divider + pathspec_end)

    output_joined: str = "\n".join(output)

    stats_dict_ints: dict[str, int] = {
        "files": len(collected_files),
        "lines": len(output_joined.splitlines()),
        "chars": len(output_joined),
    }

    tokenizer: TokenizerWrapper = config.get_tokenizer_obj()

    n_tokens: int = tokenizer.n_tokens(output_joined)
    stats_dict_ints[f"`{tokenizer.name}` tokens"] = n_tokens

    stats_header: list[str] = ["# Stats"]
    for key, val in stats_dict_ints.items():
        val_str: str = str(val)
        val_short: str = shorten_numerical_to_str(val)
        if val_str != val_short:
            stats_header.append(f"- {val} ({val_short}) {key}")
        else:
            stats_header.append(f"- {val} {key}")

    output_complete: str = "\n".join(stats_header) + "\n\n" + output_joined

    return output_complete


def main() -> None:
    """Main entry point for the script"""
    arg_parser = argparse.ArgumentParser(
        description="lmcat - list tree and content, combining .gitignore + .lmignore",
        add_help=False,
    )
    arg_parser.add_argument(
        "-t",
        "--tree-only",
        action="store_true",
        default=False,
        help="Only print the tree, not the file contents.",
    )
    arg_parser.add_argument(
        "-o",
        "--output",
        action="store",
        default=None,
        help="Output file to write the tree and contents to.",
    )
    arg_parser.add_argument(
        "-h", "--help", action="help", help="Show this help message and exit."
    )
    arg_parser.add_argument(
        "--print-cfg",
        action="store_true",
        default=False,
        help="Print the configuration as json and exit.",
    )
    arg_parser.add_argument(
        "--allow-plugins",
        action="store_true",
        default=False,
        help="Allow plugins to be loaded from the plugins file. WARNING: this will execute arbitrary code found in the file pointed to by `config.plugins_file`, and **is a security risk**.",
    )

    args: argparse.Namespace = arg_parser.parse_known_args()[0]
    root_dir: Path = Path(".").resolve()
    config: LMCatConfig = LMCatConfig.read(root_dir)

    # CLI overrides
    config.tree_only = args.tree_only
    config.allow_plugins = args.allow_plugins

    # print cfg and exit if requested
    if args.print_cfg:
        print(json.dumps(config.serialize(), indent="\t"))
        return

    # assemble summary
    summary: str = assemble_summary(root_dir=root_dir, config=config)

    # Write output
    if args.output:
        output_path: Path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(summary, encoding="utf-8")
    else:
        if sys.platform == "win32":
            sys.stdout = io.TextIOWrapper(
                sys.stdout.buffer, encoding="utf-8", errors="replace"
            )
            sys.stderr = io.TextIOWrapper(
                sys.stderr.buffer, encoding="utf-8", errors="replace"
            )

        print(summary)


if __name__ == "__main__":
    main()
```
```python
@serializable_dataclass(kw_only=True)
class LMCatConfig(SerializableDataclass):
    """Configuration dataclass for lmcat

    # Parameters:
    - `content_divider: str` (default "``````")
    - `tree_only: bool` (default False)
    - `ignore_patterns: list[str]`, `ignore_patterns_files: list[Path]`
    - `plugins_file: Path | None`, `allow_plugins: bool`
    - `glob_process: dict[str, str]`, `decider_process: dict[str, str]`,
      `on_multiple_processors: OnMultipleProcessors`
    - `tokenizer: str` (default "gpt2" if `tokenizers` is installed, else "whitespace-split")
    - `tree_divider: str`, `tree_file_divider: str`, `tree_indent: str`
    """

    content_divider: str = serializable_field(default="``````")
    tree_only: bool = serializable_field(default=False)

    # ignoring
    ignore_patterns: list[str] = serializable_field(default_factory=list)
    ignore_patterns_files: list[Path] = serializable_field(
        default_factory=lambda: [Path(".gitignore"), Path(".lmignore")],
        serialization_fn=lambda x: [p.as_posix() for p in x],
        deserialize_fn=lambda x: [Path(p) for p in x],
    )

    # this file will be imported, and any functions in it decorated with one of
    # the `register_*` decorators will be added to the functions available to
    # the processing pipeline.
    # --allow-plugins is a command-line-only option and must be set for this to work
    plugins_file: Path | None = serializable_field(
        default=None,
        serialization_fn=lambda x: x.as_posix() if x else None,
        deserialize_fn=lambda x: Path(x) if x else None,
    )
    allow_plugins: bool = serializable_field(
        default=False,
        deserialize_fn=lambda x: False,  # this can only be overridden through the command line
    )

    # processing pipeline
    glob_process: dict[str, str] = serializable_field(default_factory=dict)
    decider_process: dict[str, str] = serializable_field(default_factory=dict)
    on_multiple_processors: OnMultipleProcessors = serializable_field(
        default="except",
        assert_type=False,
    )

    # tokenization
    tokenizer: str = serializable_field(
        default="gpt2" if TOKENIZERS_PRESENT else "whitespace-split"
    )
    "Tokenizer used to count tokens in the output. Defaults to `gpt2`; the value is passed to `tokenizers.Tokenizer.from_pretrained()`. Requesting a pretrained tokenizer without `tokenizers` installed raises an exception; the `whitespace-split` fallback avoids this when `tokenizers` is not installed."

    # tree formatting
    tree_divider: str = serializable_field(default="│ ")
    tree_file_divider: str = serializable_field(default="├── ")
    tree_indent: str = serializable_field(default=" ")

    def get_tokenizer_obj(self) -> TokenizerWrapper:
        """Get the tokenizer object"""
        return TokenizerWrapper(self.tokenizer)

    def get_processing_pipeline(self) -> ProcessingPipeline:
        """Get the processing pipeline object"""
        plugins_file: Path | None = self.plugins_file if self.allow_plugins else None
        return ProcessingPipeline(
            plugins_file=plugins_file,
            decider_process_keys=self.decider_process,
            glob_process_keys=self.glob_process,
            on_multiple_processors=self.on_multiple_processors,
        )

    @classmethod
    def read(cls, root_dir: Path) -> "LMCatConfig":
        """Attempt to read config from pyproject.toml, lmcat.toml, or lmcat.json."""
        pyproject_path: Path = root_dir / "pyproject.toml"
        lmcat_toml_path: Path = root_dir / "lmcat.toml"
        lmcat_json_path: Path = root_dir / "lmcat.json"

        if (
            sum(
                int(p.is_file())
                for p in (pyproject_path, lmcat_toml_path, lmcat_json_path)
            )
            > 1
        ):
            raise ValueError(
                "Multiple configuration files found. Please only use one of pyproject.toml, lmcat.toml, or lmcat.json."
            )

        # Try pyproject.toml first
        if tomllib is not None and pyproject_path.is_file():
            with pyproject_path.open("rb") as f:
                pyproject_data = tomllib.load(f)
            if "tool" in pyproject_data and "lmcat" in pyproject_data["tool"]:
                return cls.load(pyproject_data["tool"]["lmcat"])

        # Then try lmcat.toml
        if tomllib is not None and lmcat_toml_path.is_file():
            with lmcat_toml_path.open("rb") as f:
                toml_data = tomllib.load(f)
            return cls.load(toml_data)

        # Finally try lmcat.json
        if lmcat_json_path.is_file():
            with lmcat_json_path.open("r", encoding="utf-8") as f:
                json_data = json.load(f)
            return cls.load(json_data)

        # Fallback to defaults
        return cls()
```
Configuration dataclass for lmcat
Parameters:
- `content_divider: str` (default "``````")
- `tree_only: bool` (default False)
- `ignore_patterns: list[str]`, `ignore_patterns_files: list[Path]`
- `plugins_file: Path | None`, `allow_plugins: bool`
- `glob_process: dict[str, str]`, `decider_process: dict[str, str]`, `on_multiple_processors: OnMultipleProcessors`
- `tokenizer: str` (default "gpt2" if `tokenizers` is installed, else "whitespace-split")
- `tree_divider: str`, `tree_file_divider: str`, `tree_indent: str`
Tokenizer used to count tokens in the output. Defaults to `gpt2`; the value is passed to `tokenizers.Tokenizer.from_pretrained()`. Requesting a pretrained tokenizer without `tokenizers` installed raises an exception; the `whitespace-split` fallback avoids this when `tokenizers` is not installed.
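A minimal construction-and-serialization sketch (the field names come from the dataclass above; the specific values are illustrative only):

```python
from pathlib import Path
from lmcat.lmcat import LMCatConfig

# illustrative overrides -- every field shown here has a default
config = LMCatConfig(
    tree_only=False,
    ignore_patterns=["*.log", "build/"],
    ignore_patterns_files=[Path(".gitignore"), Path(".lmignore")],
    tokenizer="whitespace-split",
)

# SerializableDataclass provides a plain-dict form, suitable for lmcat.json
print(config.serialize())
```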
```python
    def get_tokenizer_obj(self) -> TokenizerWrapper:
        """Get the tokenizer object"""
        return TokenizerWrapper(self.tokenizer)
```
Get the tokenizer object
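A short sketch of the wrapper in use, assuming `TokenizerWrapper` exposes the tokenizer string as `.name` (the `.name` and `.n_tokens` members are exactly what `assemble_summary` relies on below):

```python
from lmcat.lmcat import LMCatConfig

# whitespace-split avoids requiring the `tokenizers` package
tok = LMCatConfig(tokenizer="whitespace-split").get_tokenizer_obj()

# count tokens the same way the stats footer does
print(tok.name, tok.n_tokens("some example text"))  # expect: whitespace-split 3
```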
```python
    def get_processing_pipeline(self) -> ProcessingPipeline:
        """Get the processing pipeline object"""
        plugins_file: Path | None = self.plugins_file if self.allow_plugins else None
        return ProcessingPipeline(
            plugins_file=plugins_file,
            decider_process_keys=self.decider_process,
            glob_process_keys=self.glob_process,
            on_multiple_processors=self.on_multiple_processors,
        )
```
Get the processing pipeline object
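A sketch of wiring up and running the pipeline; the processor name `remove_comments` is hypothetical and would need to be registered (in `lmcat.processing_pipeline` or via a plugins file) for the lookup to succeed:

```python
from pathlib import Path
from lmcat.lmcat import LMCatConfig

# map *.py files to a (hypothetical) registered processor
config = LMCatConfig(glob_process={"*.py": "remove_comments"})
pipeline = config.get_processing_pipeline()

# process_file returns the (possibly transformed) contents and the name
# of the processor that handled the file, or None if none applied
contents, processor_name = pipeline.process_file(Path("example.py"))
```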
```python
    @classmethod
    def read(cls, root_dir: Path) -> "LMCatConfig":
        """Attempt to read config from pyproject.toml, lmcat.toml, or lmcat.json."""
        pyproject_path: Path = root_dir / "pyproject.toml"
        lmcat_toml_path: Path = root_dir / "lmcat.toml"
        lmcat_json_path: Path = root_dir / "lmcat.json"

        if (
            sum(
                int(p.is_file())
                for p in (pyproject_path, lmcat_toml_path, lmcat_json_path)
            )
            > 1
        ):
            raise ValueError(
                "Multiple configuration files found. Please only use one of pyproject.toml, lmcat.toml, or lmcat.json."
            )

        # Try pyproject.toml first
        if tomllib is not None and pyproject_path.is_file():
            with pyproject_path.open("rb") as f:
                pyproject_data = tomllib.load(f)
            if "tool" in pyproject_data and "lmcat" in pyproject_data["tool"]:
                return cls.load(pyproject_data["tool"]["lmcat"])

        # Then try lmcat.toml
        if tomllib is not None and lmcat_toml_path.is_file():
            with lmcat_toml_path.open("rb") as f:
                toml_data = tomllib.load(f)
            return cls.load(toml_data)

        # Finally try lmcat.json
        if lmcat_json_path.is_file():
            with lmcat_json_path.open("r", encoding="utf-8") as f:
                json_data = json.load(f)
            return cls.load(json_data)

        # Fallback to defaults
        return cls()
```
Attempt to read config from pyproject.toml, lmcat.toml, or lmcat.json.
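Usage is a single call; the precedence and the multiple-config `ValueError` follow directly from the source above:

```python
from pathlib import Path
from lmcat.lmcat import LMCatConfig

# checks pyproject.toml ([tool.lmcat] table), then lmcat.toml, then lmcat.json;
# raises ValueError if more than one is present, falls back to defaults if none
config = LMCatConfig.read(Path(".").resolve())
```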
```python
    def serialize(self) -> dict[str, Any]:
        result: dict[str, Any] = {
            "__format__": f"{self.__class__.__name__}(SerializableDataclass)"
        }
        # for each field in the class
        for field in dataclasses.fields(self):  # type: ignore[arg-type]
            # need it to be our special SerializableField
            if not isinstance(field, SerializableField):
                raise NotSerializableFieldException(
                    f"Field '{field.name}' on class {self.__class__.__module__}.{self.__class__.__name__} is not a `SerializableField`, "
                    f"but a {type(field)} "
                    "this state should be inaccessible, please report this bug!"
                )

            # try to save it
            if field.serialize:
                try:
                    # get the val
                    value = getattr(self, field.name)
                    # if it is a serializable dataclass, serialize it
                    if isinstance(value, SerializableDataclass):
                        value = value.serialize()
                    # if the value has a serialization function, use that
                    if hasattr(value, "serialize") and callable(value.serialize):
                        value = value.serialize()
                    # if the field has a serialization function, use that
                    # it would be nice to be able to override a class's `.serialize()`, but that could lead to some inconsistencies!
                    elif field.serialization_fn:
                        value = field.serialization_fn(value)

                    # store the value in the result
                    result[field.name] = value
                except Exception as e:
                    raise FieldSerializationError(
                        "\n".join(
                            [
                                f"Error serializing field '{field.name}' on class {self.__class__.__module__}.{self.__class__.__name__}",
                                f"{field = }",
                                f"{value = }",
                                f"{self = }",
                            ]
                        )
                    ) from e

        # store each property if we can get it
        for prop in self._properties_to_serialize:
            if hasattr(cls, prop):
                value = getattr(self, prop)
                result[prop] = value
            else:
                raise AttributeError(
                    f"Cannot serialize property '{prop}' on class {self.__class__.__module__}.{self.__class__.__name__}"
                    + f"but it is in {self._properties_to_serialize = }"
                    + f"\n{self = }"
                )

        return result
```
returns the class as a dict, implemented by using the `@serializable_dataclass` decorator
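A round-trip sketch using the `LMCatConfig` documented above:

```python
from lmcat.lmcat import LMCatConfig

config = LMCatConfig(tree_only=True)
data = config.serialize()

# the serialized dict carries a "__format__" marker plus one key per field
assert data["__format__"] == "LMCatConfig(SerializableDataclass)"
assert data["tree_only"] is True

# load() (below) reconstructs an equivalent instance from the dict
assert LMCatConfig.load(data).tree_only is True
```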
```python
    @classmethod  # type: ignore[misc]
    def load(cls, data: dict[str, Any] | T) -> Type[T]:
        # HACK: this is kind of ugly, but it fixes a lot of issues for when we do recursive loading with ZANJ
        if isinstance(data, cls):
            return data

        assert isinstance(
            data, typing.Mapping
        ), f"When loading {cls.__name__ = } expected a Mapping, but got {type(data) = }:\n{data = }"

        cls_type_hints: dict[str, Any] = get_cls_type_hints(cls)

        # initialize dict for keeping what we will pass to the constructor
        ctor_kwargs: dict[str, Any] = dict()

        # iterate over the fields of the class
        for field in dataclasses.fields(cls):
            # check if the field is a SerializableField
            assert isinstance(
                field, SerializableField
            ), f"Field '{field.name}' on class {cls.__name__} is not a SerializableField, but a {type(field)}. this state should be inaccessible, please report this bug!\nhttps://github.com/mivanit/muutils/issues/new"

            # check if the field is in the data and if it should be initialized
            if (field.name in data) and field.init:
                # get the value, we will be processing it
                value: Any = data[field.name]

                # get the type hint for the field
                field_type_hint: Any = cls_type_hints.get(field.name, None)

                # we rely on the init of `SerializableField` to check that only one of `loading_fn` and `deserialize_fn` is set
                if field.deserialize_fn:
                    # if it has a deserialization function, use that
                    value = field.deserialize_fn(value)
                elif field.loading_fn:
                    # if it has a loading function, use that
                    value = field.loading_fn(data)
                elif (
                    field_type_hint is not None
                    and hasattr(field_type_hint, "load")
                    and callable(field_type_hint.load)
                ):
                    # if no loading function but has a type hint with a load method, use that
                    if isinstance(value, dict):
                        value = field_type_hint.load(value)
                    else:
                        raise FieldLoadingError(
                            f"Cannot load value into {field_type_hint}, expected {type(value) = } to be a dict\n{value = }"
                        )
                else:
                    # assume no loading needs to happen, keep `value` as-is
                    pass

                # store the value in the constructor kwargs
                ctor_kwargs[field.name] = value

        # create a new instance of the class with the constructor kwargs
        output: cls = cls(**ctor_kwargs)

        # validate the types of the fields if needed
        if on_typecheck_mismatch != ErrorMode.IGNORE:
            fields_valid: dict[str, bool] = (
                SerializableDataclass__validate_fields_types__dict(
                    output,
                    on_typecheck_error=on_typecheck_error,
                )
            )

            # if there are any fields that are not valid, raise an error
            if not all(fields_valid.values()):
                msg: str = (
                    f"Type mismatch in fields of {cls.__name__}:\n"
                    + "\n".join(
                        [
                            f"{k}:\texpected {cls_type_hints[k] = }, but got value {getattr(output, k) = }, {type(getattr(output, k)) = }"
                            for k, v in fields_valid.items()
                            if not v
                        ]
                    )
                )

                on_typecheck_mismatch.process(
                    msg, except_cls=FieldTypeMismatchError
                )

        # return the new instance
        return output
```
takes in an appropriately structured dict and returns an instance of the class, implemented by using the `@serializable_dataclass` decorator
```python
def SerializableDataclass__validate_fields_types(
    self: SerializableDataclass,
    on_typecheck_error: ErrorMode = _DEFAULT_ON_TYPECHECK_ERROR,
) -> bool:
    """validate the types of all the fields on a `SerializableDataclass`. calls `SerializableDataclass__validate_field_type` for each field"""
    return all(
        SerializableDataclass__validate_fields_types__dict(
            self, on_typecheck_error=on_typecheck_error
        ).values()
    )
```
validate the types of all the fields on a SerializableDataclass. calls SerializableDataclass__validate_field_type for each field
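The member listing here suggests this is available as a bound method; a one-line check under that assumption:

```python
from lmcat.lmcat import LMCatConfig

# True when every field's value matches its declared type annotation
assert LMCatConfig().validate_fields_types()
```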
Inherited Members
- muutils.json_serialize.serializable_dataclass.SerializableDataclass
- validate_field_type
- diff
- update_from_nested_dict
```python
class IgnoreHandler:
    """Handles all ignore pattern matching using igittigitt"""

    def __init__(self, root_dir: Path, config: LMCatConfig):
        self.root_dir: Path = root_dir
        self.config: LMCatConfig = config

        # set up parser
        self.parser: igittigitt.IgnoreParser = igittigitt.IgnoreParser()

        # first from the files
        for ignore_file in self.config.ignore_patterns_files:
            self.parser.parse_rule_files(self.root_dir, filename=ignore_file.name)

        # then from the config itself
        for pattern in self.config.ignore_patterns:
            self.parser.add_rule(pattern=pattern, base_path=self.root_dir)

    def is_ignored(self, path: Path) -> bool:
        """Check if a path should be ignored"""
        # Always ignore the gitignore/lmignore files themselves
        if path.name in {".gitignore", ".lmignore"}:
            return True

        # Use igittigitt's matching
        return self.parser.match(path)
```
Handles all ignore pattern matching using igittigitt
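A usage sketch (the `uv.lock` filename is purely illustrative):

```python
from pathlib import Path
from lmcat.lmcat import IgnoreHandler, LMCatConfig

root = Path(".").resolve()
handler = IgnoreHandler(root, LMCatConfig(ignore_patterns=["*.lock"]))

# rules come from .gitignore/.lmignore files plus config patterns;
# the ignore files themselves are always excluded from output
print(handler.is_ignored(root / "uv.lock"))     # True -- matches "*.lock"
print(handler.is_ignored(root / ".gitignore"))  # True -- always excluded
```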
```python
    def __init__(self, root_dir: Path, config: LMCatConfig):
        self.root_dir: Path = root_dir
        self.config: LMCatConfig = config

        # set up parser
        self.parser: igittigitt.IgnoreParser = igittigitt.IgnoreParser()

        # first from the files
        for ignore_file in self.config.ignore_patterns_files:
            self.parser.parse_rule_files(self.root_dir, filename=ignore_file.name)

        # then from the config itself
        for pattern in self.config.ignore_patterns:
            self.parser.add_rule(pattern=pattern, base_path=self.root_dir)
```
```python
    def is_ignored(self, path: Path) -> bool:
        """Check if a path should be ignored"""
        # Always ignore the gitignore/lmignore files themselves
        if path.name in {".gitignore", ".lmignore"}:
            return True

        # Use igittigitt's matching
        return self.parser.match(path)
```
Check if a path should be ignored
```python
def sorted_entries(directory: Path) -> list[Path]:
    """Return directory contents sorted: directories first, then files"""
    subdirs: list[Path] = sorted(
        [p for p in directory.iterdir() if p.is_dir()], key=lambda x: x.name
    )
    files: list[Path] = sorted(
        [p for p in directory.iterdir() if p.is_file()], key=lambda x: x.name
    )
    return subdirs + files
```
Return directory contents sorted: directories first, then files
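For example:

```python
from pathlib import Path
from lmcat.lmcat import sorted_entries

# directories first, then files, each group sorted by name
for p in sorted_entries(Path(".")):
    print("dir " if p.is_dir() else "file", p.name)
```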
```python
def walk_dir(
    directory: Path,
    ignore_handler: IgnoreHandler,
    config: LMCatConfig,
    tokenizer: TokenizerWrapper,
    prefix: str = "",
) -> tuple[list[TreeEntry], list[Path]]:
    """Recursively walk a directory, building tree lines and collecting file paths"""
    tree_output: list[TreeEntry] = []
    collected_files: list[Path] = []

    entries: list[Path] = sorted_entries(directory)
    for i, entry in enumerate(entries):
        if ignore_handler.is_ignored(entry):
            continue

        is_last: bool = i == len(entries) - 1
        connector: str = (
            config.tree_file_divider
            if not is_last
            else config.tree_file_divider.replace("├", "└")
        )

        if entry.is_dir():
            tree_output.append(TreeEntry(f"{prefix}{connector}{entry.name}", None))
            extension: str = config.tree_divider if not is_last else config.tree_indent
            sub_output: list[TreeEntry]
            sub_files: list[Path]
            sub_output, sub_files = walk_dir(
                directory=entry,
                ignore_handler=ignore_handler,
                config=config,
                tokenizer=tokenizer,
                prefix=prefix + extension,
            )
            tree_output.extend(sub_output)
            collected_files.extend(sub_files)
        else:
            stats: FileStats = FileStats.from_file(entry, tokenizer)
            tree_output.append(TreeEntry(f"{prefix}{connector}{entry.name}", stats))
            collected_files.append(entry)

    return tree_output, collected_files
```
Recursively walk a directory, building tree lines and collecting file paths
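`walk_dir` is normally driven by `walk_and_collect` (below), but it can be called directly once its dependencies are assembled; a sketch:

```python
from pathlib import Path
from lmcat.lmcat import IgnoreHandler, LMCatConfig, walk_dir

root = Path(".").resolve()
config = LMCatConfig()

# returns un-formatted TreeEntry rows plus the non-ignored file paths
tree_entries, files = walk_dir(
    directory=root,
    ignore_handler=IgnoreHandler(root, config),
    config=config,
    tokenizer=config.get_tokenizer_obj(),
)
print(len(files), "files collected")
```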
```python
def format_tree_with_stats(
    entries: list[TreeEntry], show_tokens: bool = False
) -> list[str]:
    """Format tree entries with aligned statistics

    # Parameters:
    - `entries : list[TreeEntry]`
        List of tree entries with optional stats
    - `show_tokens : bool`
        Whether to show token counts

    # Returns:
    - `list[str]`
        Formatted tree lines with aligned stats
    """
    # Find max widths for alignment
    max_line_len: int = max(len(entry.line) for entry in entries)
    max_lines: int = max(
        (len(f"{entry.stats.lines:,}") if entry.stats else 0) for entry in entries
    )
    max_chars: int = max(
        (len(f"{entry.stats.chars:,}") if entry.stats else 0) for entry in entries
    )
    max_tokens: int = (
        max(
            (
                len(f"{entry.stats.tokens:,}")
                if entry.stats and entry.stats.tokens
                else 0
            )
            for entry in entries
        )
        if show_tokens
        else 0
    )

    formatted: list[str] = []
    for entry in entries:
        line: str = entry.line.ljust(max_line_len + 2)
        if entry.stats:
            lines_str: str = f"{entry.stats.lines:,}L".rjust(max_lines + 1)
            chars_str: str = f"{entry.stats.chars:,}C".rjust(max_chars + 1)
            stats_str: str = f"[{lines_str} {chars_str}"
            if show_tokens and entry.stats.tokens is not None:
                tokens_str: str = f"{entry.stats.tokens:,}T".rjust(max_tokens + 1)
                stats_str += f" {tokens_str}"
            stats_str += "]"
            formatted.append(f"{line}{stats_str}")
        else:
            formatted.append(line)

    return formatted
```
Format tree entries with aligned statistics
Parameters:
- `entries : list[TreeEntry]`
  List of tree entries with optional stats
- `show_tokens : bool`
  Whether to show token counts
Returns:
- `list[str]`
  Formatted tree lines with aligned stats
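A sketch, assuming `FileStats` from `lmcat.file_stats` can be constructed directly from its `lines`/`chars`/`tokens` fields (only attribute access on it appears in the source above):

```python
from lmcat.file_stats import FileStats, TreeEntry
from lmcat.lmcat import format_tree_with_stats

# assumption: FileStats accepts these fields as keyword arguments
entries = [
    TreeEntry("project", None),
    TreeEntry("├── main.py", FileStats(lines=120, chars=3400, tokens=900)),
]
for line in format_tree_with_stats(entries, show_tokens=True):
    print(line)  # stats are right-aligned, e.g. "├── main.py  [ 120L 3,400C 900T]"
```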
```python
def walk_and_collect(
    root_dir: Path,
    config: LMCatConfig | None = None,
) -> tuple[list[str], list[Path]]:
    """Walk filesystem from root_dir and gather tree listing plus file paths"""
    if config is None:
        config = LMCatConfig()

    tokenizer: TokenizerWrapper = config.get_tokenizer_obj()

    ignore_handler = IgnoreHandler(root_dir, config)
    base_name = root_dir.resolve().name

    # Start with root directory name
    tree_output = [TreeEntry(base_name)]

    # Walk the directory tree
    sub_output, sub_files = walk_dir(
        directory=root_dir,
        ignore_handler=ignore_handler,
        config=config,
        tokenizer=tokenizer,
        prefix="",
    )
    tree_output.extend(sub_output)

    # Format tree with stats
    formatted_tree = format_tree_with_stats(
        tree_output, show_tokens=tokenizer is not None
    )

    return formatted_tree, sub_files
```
Walk filesystem from root_dir and gather tree listing plus file paths
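Typical usage:

```python
from pathlib import Path
from lmcat.lmcat import LMCatConfig, walk_and_collect

tree_lines, files = walk_and_collect(Path(".").resolve(), LMCatConfig())
print("\n".join(tree_lines))  # formatted tree with [lines chars tokens] columns
print(len(files), "files collected")
```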
```python
def assemble_summary(
    root_dir: Path,
    config: LMCatConfig,
) -> str:
    """Assemble the summary output and return"""

    processing_pipeline: ProcessingPipeline = config.get_processing_pipeline()

    tree_output: list[str]
    collected_files: list[Path]
    tree_output, collected_files = walk_and_collect(
        root_dir=root_dir,
        config=config,
    )

    output: list[str] = []
    output.append("# File Tree")
    output.append("\n```")
    output.extend(tree_output)
    output.append("```\n")

    # Add file contents if not suppressed
    if not config.tree_only:
        output.append("# File Contents")

        for fpath in collected_files:
            # get the path
            relpath_posix: str = fpath.relative_to(root_dir).as_posix()

            # process the contents
            f_contents: str
            p_name: str | None
            f_contents, p_name = processing_pipeline.process_file(fpath)
            processed_with: str = f'processed_with="{p_name}"' if p_name else ""

            # start of file marker
            pathspec_start: str = f'{{ path="{relpath_posix}" {processed_with} }}'
            pathspec_end: str = f'{{ end_of_file="{relpath_posix}" }}'
            output.append("")
            output.append(config.content_divider + pathspec_start)

            # the file contents, already processed by the pipeline
            output.append(f_contents)

            # add the end of file marker
            output.append(config.content_divider + pathspec_end)

    output_joined: str = "\n".join(output)

    stats_dict_ints: dict[str, int] = {
        "files": len(collected_files),
        "lines": len(output_joined.splitlines()),
        "chars": len(output_joined),
    }

    tokenizer: TokenizerWrapper = config.get_tokenizer_obj()

    n_tokens: int = tokenizer.n_tokens(output_joined)
    stats_dict_ints[f"`{tokenizer.name}` tokens"] = n_tokens

    stats_header: list[str] = ["# Stats"]
    for key, val in stats_dict_ints.items():
        val_str: str = str(val)
        val_short: str = shorten_numerical_to_str(val)
        if val_str != val_short:
            stats_header.append(f"- {val} ({val_short}) {key}")
        else:
            stats_header.append(f"- {val} {key}")

    output_complete: str = "\n".join(stats_header) + "\n\n" + output_joined

    return output_complete
```
Assemble the summary output and return
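A sketch mirroring what `main()` does, minus the CLI parsing:

```python
from pathlib import Path
from lmcat.lmcat import LMCatConfig, assemble_summary

root = Path(".").resolve()
config = LMCatConfig.read(root)
config.tree_only = True  # same override the -t/--tree-only flag applies

summary = assemble_summary(root_dir=root, config=config)
Path("summary.md").write_text(summary, encoding="utf-8")
```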
```python
def main() -> None:
    """Main entry point for the script"""
    arg_parser = argparse.ArgumentParser(
        description="lmcat - list tree and content, combining .gitignore + .lmignore",
        add_help=False,
    )
    arg_parser.add_argument(
        "-t",
        "--tree-only",
        action="store_true",
        default=False,
        help="Only print the tree, not the file contents.",
    )
    arg_parser.add_argument(
        "-o",
        "--output",
        action="store",
        default=None,
        help="Output file to write the tree and contents to.",
    )
    arg_parser.add_argument(
        "-h", "--help", action="help", help="Show this help message and exit."
    )
    arg_parser.add_argument(
        "--print-cfg",
        action="store_true",
        default=False,
        help="Print the configuration as json and exit.",
    )
    arg_parser.add_argument(
        "--allow-plugins",
        action="store_true",
        default=False,
        help="Allow plugins to be loaded from the plugins file. WARNING: this will execute arbitrary code found in the file pointed to by `config.plugins_file`, and **is a security risk**.",
    )

    args: argparse.Namespace = arg_parser.parse_known_args()[0]
    root_dir: Path = Path(".").resolve()
    config: LMCatConfig = LMCatConfig.read(root_dir)

    # CLI overrides
    config.tree_only = args.tree_only
    config.allow_plugins = args.allow_plugins

    # print cfg and exit if requested
    if args.print_cfg:
        print(json.dumps(config.serialize(), indent="\t"))
        return

    # assemble summary
    summary: str = assemble_summary(root_dir=root_dir, config=config)

    # Write output
    if args.output:
        output_path: Path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(summary, encoding="utf-8")
    else:
        if sys.platform == "win32":
            sys.stdout = io.TextIOWrapper(
                sys.stdout.buffer, encoding="utf-8", errors="replace"
            )
            sys.stderr = io.TextIOWrapper(
                sys.stderr.buffer, encoding="utf-8", errors="replace"
            )

        print(summary)
```
Main entry point for the script
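Since the flags are read from `sys.argv`, the entry point can also be driven programmatically; a sketch (the installed `lmcat` console-script name is an assumption):

```python
import sys
from lmcat.lmcat import main

# equivalent to `lmcat --tree-only --output out.md` on the command line
sys.argv = ["lmcat", "--tree-only", "--output", "out.md"]
main()
```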