docs for lmcat v0.1.0

lmcat.lmcat


import argparse
import io
import json

# from dataclasses import dataclass, field
from pathlib import Path
import sys

from lmcat.processing_pipeline import ProcessingPipeline


# Handle Python 3.11+ vs older Python for TOML parsing
try:
	import tomllib
except ImportError:
	try:
		import tomli as tomllib  # type: ignore
	except ImportError:
		tomllib = None  # type: ignore[assignment]

import igittigitt  # noqa: E402

from muutils.json_serialize import (
	SerializableDataclass,
	serializable_dataclass,
	serializable_field,
)
from muutils.misc import shorten_numerical_to_str  # noqa: E402


from lmcat.file_stats import FileStats, TokenizerWrapper, TreeEntry, TOKENIZERS_PRESENT
from lmcat.processing_pipeline import OnMultipleProcessors


@serializable_dataclass(kw_only=True)
class LMCatConfig(SerializableDataclass):
	"""Configuration dataclass for lmcat

	# Parameters:
	 - `tree_divider: str`
	 - `tree_indent: str`
	 - `tree_file_divider: str`
	 - `content_divider: str`
	 - `ignore_patterns: list[str]`
	 - `ignore_patterns_files: list[Path]`  (default `[".gitignore", ".lmignore"]`)
	 - `tree_only: bool`  (default False)
	"""

	content_divider: str = serializable_field(default="``````")
	tree_only: bool = serializable_field(default=False)

	# ignoring
	ignore_patterns: list[str] = serializable_field(default_factory=list)
	ignore_patterns_files: list[Path] = serializable_field(
		default_factory=lambda: [Path(".gitignore"), Path(".lmignore")],
		serialization_fn=lambda x: [p.as_posix() for p in x],
		deserialize_fn=lambda x: [Path(p) for p in x],
	)

	# this file will be imported; any functions in it decorated with one of the
	# `register_*` decorators are added to the set of functions available
	# to the processing pipeline.
	# `--allow-plugins` is a command-line-only option and must be set for this to work
	plugins_file: Path | None = serializable_field(
		default=None,
		serialization_fn=lambda x: x.as_posix() if x else None,
		deserialize_fn=lambda x: Path(x) if x else None,
	)
	allow_plugins: bool = serializable_field(
		default=False,
		deserialize_fn=lambda x: False,  # this can only be overridden via the command line
	)

	# processing pipeline
	glob_process: dict[str, str] = serializable_field(default_factory=dict)
	decider_process: dict[str, str] = serializable_field(default_factory=dict)
	on_multiple_processors: OnMultipleProcessors = serializable_field(
		default="except",
		assert_type=False,
	)

	# tokenization
	tokenizer: str = serializable_field(
		default="gpt2" if TOKENIZERS_PRESENT else "whitespace-split"
	)
	"Tokenizer used for counting tokens in the output. Defaults to `gpt2` when the `tokenizers` package is installed; the name is passed to `tokenizers.Tokenizer.from_pretrained()`. If a tokenizer is specified but `tokenizers` is not installed, an exception is raised. The `whitespace-split` fallback avoids this when `tokenizers` is unavailable."

	# tree formatting
	tree_divider: str = serializable_field(default="│   ")
	tree_file_divider: str = serializable_field(default="├── ")
	tree_indent: str = serializable_field(default=" ")

	def get_tokenizer_obj(self) -> TokenizerWrapper:
		"""Get the tokenizer object"""
		return TokenizerWrapper(self.tokenizer)

	def get_processing_pipeline(self) -> ProcessingPipeline:
		"""Get the processing pipeline object"""
		plugins_file: Path | None = self.plugins_file if self.allow_plugins else None
		return ProcessingPipeline(
			plugins_file=plugins_file,
			decider_process_keys=self.decider_process,
			glob_process_keys=self.glob_process,
			on_multiple_processors=self.on_multiple_processors,
		)

	@classmethod
	def read(cls, root_dir: Path) -> "LMCatConfig":
		"""Attempt to read config from pyproject.toml, lmcat.toml, or lmcat.json."""
		pyproject_path: Path = root_dir / "pyproject.toml"
		lmcat_toml_path: Path = root_dir / "lmcat.toml"
		lmcat_json_path: Path = root_dir / "lmcat.json"

		if (
			sum(
				int(p.is_file())
				for p in (pyproject_path, lmcat_toml_path, lmcat_json_path)
			)
			> 1
		):
			raise ValueError(
				"Multiple configuration files found. Please only use one of pyproject.toml, lmcat.toml, or lmcat.json."
			)

		# Try pyproject.toml first
		if tomllib is not None and pyproject_path.is_file():
			with pyproject_path.open("rb") as f:
				pyproject_data = tomllib.load(f)
			if "tool" in pyproject_data and "lmcat" in pyproject_data["tool"]:
				return cls.load(pyproject_data["tool"]["lmcat"])

		# Then try lmcat.toml
		if tomllib is not None and lmcat_toml_path.is_file():
			with lmcat_toml_path.open("rb") as f:
				toml_data = tomllib.load(f)
			return cls.load(toml_data)

		# Finally try lmcat.json
		if lmcat_json_path.is_file():
			with lmcat_json_path.open("r", encoding="utf-8") as f:
				json_data = json.load(f)
			return cls.load(json_data)

		# Fallback to defaults
		return cls()


class IgnoreHandler:
	"""Handles all ignore pattern matching using igittigitt"""

	def __init__(self, root_dir: Path, config: LMCatConfig):
		self.root_dir: Path = root_dir
		self.config: LMCatConfig = config

		# set up parser
		self.parser: igittigitt.IgnoreParser = igittigitt.IgnoreParser()

		# first from the files
		for ignore_file in self.config.ignore_patterns_files:
			self.parser.parse_rule_files(self.root_dir, filename=ignore_file.name)

		# then from the config itself
		for pattern in self.config.ignore_patterns:
			self.parser.add_rule(pattern=pattern, base_path=self.root_dir)

	def is_ignored(self, path: Path) -> bool:
		"""Check if a path should be ignored"""
		# the .gitignore/.lmignore files themselves are always ignored,
		# so they never appear in the tree or the output
		if path.name in {".gitignore", ".lmignore"}:
			return True

		# Use igittigitt's matching
		return self.parser.match(path)


def sorted_entries(directory: Path) -> list[Path]:
	"""Return directory contents sorted: directories first, then files"""
	subdirs: list[Path] = sorted(
		[p for p in directory.iterdir() if p.is_dir()], key=lambda x: x.name
	)
	files: list[Path] = sorted(
		[p for p in directory.iterdir() if p.is_file()], key=lambda x: x.name
	)
	return subdirs + files


def walk_dir(
	directory: Path,
	ignore_handler: IgnoreHandler,
	config: LMCatConfig,
	tokenizer: TokenizerWrapper,
	prefix: str = "",
) -> tuple[list[TreeEntry], list[Path]]:
	"""Recursively walk a directory, building tree lines and collecting file paths"""
	tree_output: list[TreeEntry] = []
	collected_files: list[Path] = []

	# filter ignored entries up front so that `is_last` refers to the last
	# *visible* entry; otherwise the └── connector can land on the wrong line
	entries: list[Path] = [
		p for p in sorted_entries(directory) if not ignore_handler.is_ignored(p)
	]
	for i, entry in enumerate(entries):
		is_last: bool = i == len(entries) - 1
		connector: str = (
			config.tree_file_divider
			if not is_last
			else config.tree_file_divider.replace("├", "└")
		)

		if entry.is_dir():
			tree_output.append(TreeEntry(f"{prefix}{connector}{entry.name}", None))
			extension: str = config.tree_divider if not is_last else config.tree_indent
			sub_output: list[TreeEntry]
			sub_files: list[Path]
			sub_output, sub_files = walk_dir(
				directory=entry,
				ignore_handler=ignore_handler,
				config=config,
				tokenizer=tokenizer,
				prefix=prefix + extension,
			)
			tree_output.extend(sub_output)
			collected_files.extend(sub_files)
		else:
			stats: FileStats = FileStats.from_file(entry, tokenizer)
			tree_output.append(TreeEntry(f"{prefix}{connector}{entry.name}", stats))
			collected_files.append(entry)

	return tree_output, collected_files


def format_tree_with_stats(
	entries: list[TreeEntry], show_tokens: bool = False
) -> list[str]:
	"""Format tree entries with aligned statistics

	# Parameters:
	 - `entries : list[TreeEntry]`
		List of tree entries with optional stats
	 - `show_tokens : bool`
		Whether to show token counts

	# Returns:
	 - `list[str]`
		Formatted tree lines with aligned stats
	"""
	# Find max widths for alignment
	max_line_len: int = max(len(entry.line) for entry in entries)
	max_lines: int = max(
		(len(f"{entry.stats.lines:,}") if entry.stats else 0) for entry in entries
	)
	max_chars: int = max(
		(len(f"{entry.stats.chars:,}") if entry.stats else 0) for entry in entries
	)
	max_tokens: int = (
		max(
			(
				len(f"{entry.stats.tokens:,}")
				if entry.stats and entry.stats.tokens
				else 0
			)
			for entry in entries
		)
		if show_tokens
		else 0
	)

	formatted: list[str] = []
	for entry in entries:
		line: str = entry.line.ljust(max_line_len + 2)
		if entry.stats:
			lines_str: str = f"{entry.stats.lines:,}L".rjust(max_lines + 1)
			chars_str: str = f"{entry.stats.chars:,}C".rjust(max_chars + 1)
			stats_str: str = f"[{lines_str} {chars_str}"
			if show_tokens and entry.stats.tokens is not None:
				tokens_str: str = f"{entry.stats.tokens:,}T".rjust(max_tokens + 1)
				stats_str += f" {tokens_str}"
			stats_str += "]"
			formatted.append(f"{line}{stats_str}")
		else:
			formatted.append(line)

	return formatted


def walk_and_collect(
	root_dir: Path,
	config: LMCatConfig | None = None,
) -> tuple[list[str], list[Path]]:
	"""Walk filesystem from root_dir and gather tree listing plus file paths"""
	if config is None:
		config = LMCatConfig()

	tokenizer: TokenizerWrapper = config.get_tokenizer_obj()

	ignore_handler = IgnoreHandler(root_dir, config)
	base_name = root_dir.resolve().name

	# Start with root directory name
	tree_output = [TreeEntry(base_name)]

	# Walk the directory tree
	sub_output, sub_files = walk_dir(
		directory=root_dir,
		ignore_handler=ignore_handler,
		config=config,
		tokenizer=tokenizer,
		prefix="",
	)
	tree_output.extend(sub_output)

	# Format tree with stats
	formatted_tree = format_tree_with_stats(
		tree_output, show_tokens=tokenizer is not None
	)

	return formatted_tree, sub_files


def assemble_summary(
	root_dir: Path,
	config: LMCatConfig,
) -> str:
	"""Assemble the summary output and return"""

	processing_pipeline: ProcessingPipeline = config.get_processing_pipeline()

	tree_output: list[str]
	collected_files: list[Path]
	tree_output, collected_files = walk_and_collect(
		root_dir=root_dir,
		config=config,
	)

	output: list[str] = []
	output.append("# File Tree")
	output.append("\n```")
	output.extend(tree_output)
	output.append("```\n")

	# Add file contents if not suppressed
	if not config.tree_only:
		output.append("# File Contents")

		for fpath in collected_files:
			# get the path
			relpath_posix: str = fpath.relative_to(root_dir).as_posix()

			# process the contents
			f_contents: str
			p_name: str | None
			f_contents, p_name = processing_pipeline.process_file(fpath)
			processed_with: str = f'processed_with="{p_name}"' if p_name else ""

			# start of file marker
			pathspec_start: str = f'{{ path="{relpath_posix}" {processed_with} }}'
			pathspec_end: str = f'{{ end_of_file="{relpath_posix}" }}'
			output.append("")
			output.append(config.content_divider + pathspec_start)

			# the file contents, already processed by the pipeline
			output.append(f_contents)

			# add the end of file marker
			output.append(config.content_divider + pathspec_end)

	output_joined: str = "\n".join(output)

	stats_dict_ints: dict[str, int] = {
		"files": len(collected_files),
		"lines": len(output_joined.splitlines()),
		"chars": len(output_joined),
	}

	tokenizer: TokenizerWrapper = config.get_tokenizer_obj()

	n_tokens: int = tokenizer.n_tokens(output_joined)
	stats_dict_ints[f"`{tokenizer.name}` tokens"] = n_tokens

	stats_header: list[str] = ["# Stats"]
	for key, val in stats_dict_ints.items():
		val_str: str = str(val)
		val_short: str = shorten_numerical_to_str(val)
		if val_str != val_short:
			stats_header.append(f"- {val} ({val_short}) {key}")
		else:
			stats_header.append(f"- {val} {key}")

	output_complete: str = "\n".join(stats_header) + "\n\n" + output_joined

	return output_complete


def main() -> None:
	"""Main entry point for the script"""
	arg_parser = argparse.ArgumentParser(
		description="lmcat - list tree and content, combining .gitignore + .lmignore",
		add_help=False,
	)
	arg_parser.add_argument(
		"-t",
		"--tree-only",
		action="store_true",
		default=False,
		help="Only print the tree, not the file contents.",
	)
	arg_parser.add_argument(
		"-o",
		"--output",
		action="store",
		default=None,
		help="Output file to write the tree and contents to.",
	)
	arg_parser.add_argument(
		"-h", "--help", action="help", help="Show this help message and exit."
	)
	arg_parser.add_argument(
		"--print-cfg",
		action="store_true",
		default=False,
		help="Print the configuration as json and exit.",
	)
	arg_parser.add_argument(
		"--allow-plugins",
		action="store_true",
		default=False,
		help="Allow plugins to be loaded from the plugins file. WARNING: this will execute arbitrary code found in the file pointed to by `config.plugins_file`, and **is a security risk**.",
	)

	args: argparse.Namespace = arg_parser.parse_known_args()[0]
	root_dir: Path = Path(".").resolve()
	config: LMCatConfig = LMCatConfig.read(root_dir)

	# CLI overrides
	config.tree_only = args.tree_only
	config.allow_plugins = args.allow_plugins

	# print cfg and exit if requested
	if args.print_cfg:
		print(json.dumps(config.serialize(), indent="\t"))
		return

	# assemble summary
	summary: str = assemble_summary(root_dir=root_dir, config=config)

	# Write output
	if args.output:
		output_path: Path = Path(args.output)
		output_path.parent.mkdir(parents=True, exist_ok=True)
		output_path.write_text(summary, encoding="utf-8")
	else:
		if sys.platform == "win32":
			sys.stdout = io.TextIOWrapper(
				sys.stdout.buffer, encoding="utf-8", errors="replace"
			)
			sys.stderr = io.TextIOWrapper(
				sys.stderr.buffer, encoding="utf-8", errors="replace"
			)

		print(summary)


if __name__ == "__main__":
	main()

@serializable_dataclass(kw_only=True)
class LMCatConfig(muutils.json_serialize.serializable_dataclass.SerializableDataclass):

Configuration dataclass for lmcat

Parameters:

  • tree_divider: str
  • tree_indent: str
  • tree_file_divider: str
  • content_divider: str
  • ignore_patterns: list[str]
  • ignore_patterns_files: list[Path] (default [".gitignore", ".lmignore"])
  • tree_only: bool (default False)
LMCatConfig( *, content_divider: str = '``````', tree_only: bool = False, ignore_patterns: list[str] = <factory>, ignore_patterns_files: list[pathlib.Path] = <factory>, plugins_file: pathlib.Path | None = None, allow_plugins: bool = False, glob_process: dict[str, str] = <factory>, decider_process: dict[str, str] = <factory>, on_multiple_processors: Literal['warn', 'except', 'do_first', 'do_last', 'skip'] = 'except', tokenizer: str = 'gpt2', tree_divider: str = '│   ', tree_file_divider: str = '├── ', tree_indent: str = ' ')
content_divider: str = '``````'
tree_only: bool = False
ignore_patterns: list[str]
ignore_patterns_files: list[pathlib.Path]
plugins_file: pathlib.Path | None = None
allow_plugins: bool = False
glob_process: dict[str, str]
decider_process: dict[str, str]
on_multiple_processors: Literal['warn', 'except', 'do_first', 'do_last', 'skip'] = 'except'
tokenizer: str = 'gpt2'

Tokenizer used for counting tokens in the output. Defaults to gpt2 when the tokenizers package is installed; the name is passed to tokenizers.Tokenizer.from_pretrained(). If a tokenizer is specified but tokenizers is not installed, an exception is raised. The whitespace-split fallback avoids this when tokenizers is unavailable.

tree_divider: str = '│   '
tree_file_divider: str = '├── '
tree_indent: str = ' '
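
For illustration, a config can be built from a plain dict with the inherited load classmethod; the keys are the field names listed above:

from lmcat.lmcat import LMCatConfig

config = LMCatConfig.load(
	{
		"tree_only": False,
		"ignore_patterns": ["*.log", "node_modules"],
		"tokenizer": "whitespace-split",
	}
)
assert config.ignore_patterns == ["*.log", "node_modules"]
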
def get_tokenizer_obj(self) -> lmcat.file_stats.TokenizerWrapper:

Get the tokenizer object
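
A minimal sketch of counting tokens through the wrapper, assuming only the n_tokens and name attributes that this module itself uses:

from lmcat.lmcat import LMCatConfig

tok = LMCatConfig(tokenizer="whitespace-split").get_tokenizer_obj()
print(tok.name)                     # whitespace-split
print(tok.n_tokens("hello world"))  # 2 when splitting on whitespace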

def get_processing_pipeline(self) -> lmcat.processing_pipeline.ProcessingPipeline:

Get the processing pipeline object
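
A sketch of wiring up the pipeline from config. The processor name "ipynb_to_md" is hypothetical; glob_process is assumed to map glob patterns to the names of registered processors:

from lmcat.lmcat import LMCatConfig

config = LMCatConfig.load(
	{
		"glob_process": {"*.ipynb": "ipynb_to_md"},  # hypothetical processor name
		"on_multiple_processors": "warn",
	}
)
pipeline = config.get_processing_pipeline()
# note: plugins_file is passed through only when allow_plugins was set on the CLI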

@classmethod
def read(cls, root_dir: pathlib.Path) -> LMCatConfig:

Attempt to read config from pyproject.toml, lmcat.toml, or lmcat.json.
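
For example, with a pyproject.toml containing a [tool.lmcat] table (or an lmcat.toml / lmcat.json with the same keys) in the root directory, the config is picked up automatically; if more than one of the three files exists, read raises ValueError:

from pathlib import Path
from lmcat.lmcat import LMCatConfig

config = LMCatConfig.read(Path("."))
print(config.tree_only)  # falls back to defaults when no config file is present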

def serialize(self) -> dict[str, typing.Any]:
        def serialize(self) -> dict[str, Any]:
            result: dict[str, Any] = {
                "__format__": f"{self.__class__.__name__}(SerializableDataclass)"
            }
            # for each field in the class
            for field in dataclasses.fields(self):  # type: ignore[arg-type]
                # need it to be our special SerializableField
                if not isinstance(field, SerializableField):
                    raise NotSerializableFieldException(
                        f"Field '{field.name}' on class {self.__class__.__module__}.{self.__class__.__name__} is not a `SerializableField`, "
                        f"but a {type(field)} "
                        "this state should be inaccessible, please report this bug!"
                    )

                # try to save it
                if field.serialize:
                    try:
                        # get the val
                        value = getattr(self, field.name)
                        # if it is a serializable dataclass, serialize it
                        if isinstance(value, SerializableDataclass):
                            value = value.serialize()
                        # if the value has a serialization function, use that
                        if hasattr(value, "serialize") and callable(value.serialize):
                            value = value.serialize()
                        # if the field has a serialization function, use that
                        # it would be nice to be able to override a class's `.serialize()`, but that could lead to some inconsistencies!
                        elif field.serialization_fn:
                            value = field.serialization_fn(value)

                        # store the value in the result
                        result[field.name] = value
                    except Exception as e:
                        raise FieldSerializationError(
                            "\n".join(
                                [
                                    f"Error serializing field '{field.name}' on class {self.__class__.__module__}.{self.__class__.__name__}",
                                    f"{field = }",
                                    f"{value = }",
                                    f"{self = }",
                                ]
                            )
                        ) from e

            # store each property if we can get it
            for prop in self._properties_to_serialize:
                if hasattr(cls, prop):
                    value = getattr(self, prop)
                    result[prop] = value
                else:
                    raise AttributeError(
                        f"Cannot serialize property '{prop}' on class {self.__class__.__module__}.{self.__class__.__name__}"
                        + f"but it is in {self._properties_to_serialize = }"
                        + f"\n{self = }"
                    )

            return result

returns the class as a dict, implemented by using @serializable_dataclass decorator
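
Round-tripping a config is then straightforward; note the extra __format__ key that serialize adds:

from lmcat.lmcat import LMCatConfig

data = LMCatConfig(tree_only=True).serialize()
print(data["__format__"])  # LMCatConfig(SerializableDataclass)
print(data["tree_only"])   # True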

@classmethod
def load(cls, data: Union[dict[str, Any], ~T]) -> Type[~T]:
        @classmethod  # type: ignore[misc]
        def load(cls, data: dict[str, Any] | T) -> Type[T]:
            # HACK: this is kind of ugly, but it fixes a lot of issues for when we do recursive loading with ZANJ
            if isinstance(data, cls):
                return data

            assert isinstance(
                data, typing.Mapping
            ), f"When loading {cls.__name__ = } expected a Mapping, but got {type(data) = }:\n{data = }"

            cls_type_hints: dict[str, Any] = get_cls_type_hints(cls)

            # initialize dict for keeping what we will pass to the constructor
            ctor_kwargs: dict[str, Any] = dict()

            # iterate over the fields of the class
            for field in dataclasses.fields(cls):
                # check if the field is a SerializableField
                assert isinstance(
                    field, SerializableField
                ), f"Field '{field.name}' on class {cls.__name__} is not a SerializableField, but a {type(field)}. this state should be inaccessible, please report this bug!\nhttps://github.com/mivanit/muutils/issues/new"

                # check if the field is in the data and if it should be initialized
                if (field.name in data) and field.init:
                    # get the value, we will be processing it
                    value: Any = data[field.name]

                    # get the type hint for the field
                    field_type_hint: Any = cls_type_hints.get(field.name, None)

                    # we rely on the init of `SerializableField` to check that only one of `loading_fn` and `deserialize_fn` is set
                    if field.deserialize_fn:
                        # if it has a deserialization function, use that
                        value = field.deserialize_fn(value)
                    elif field.loading_fn:
                        # if it has a loading function, use that
                        value = field.loading_fn(data)
                    elif (
                        field_type_hint is not None
                        and hasattr(field_type_hint, "load")
                        and callable(field_type_hint.load)
                    ):
                        # if no loading function but has a type hint with a load method, use that
                        if isinstance(value, dict):
                            value = field_type_hint.load(value)
                        else:
                            raise FieldLoadingError(
                                f"Cannot load value into {field_type_hint}, expected {type(value) = } to be a dict\n{value = }"
                            )
                    else:
                        # assume no loading needs to happen, keep `value` as-is
                        pass

                    # store the value in the constructor kwargs
                    ctor_kwargs[field.name] = value

            # create a new instance of the class with the constructor kwargs
            output: cls = cls(**ctor_kwargs)

            # validate the types of the fields if needed
            if on_typecheck_mismatch != ErrorMode.IGNORE:
                fields_valid: dict[str, bool] = (
                    SerializableDataclass__validate_fields_types__dict(
                        output,
                        on_typecheck_error=on_typecheck_error,
                    )
                )

                # if there are any fields that are not valid, raise an error
                if not all(fields_valid.values()):
                    msg: str = (
                        f"Type mismatch in fields of {cls.__name__}:\n"
                        + "\n".join(
                            [
                                f"{k}:\texpected {cls_type_hints[k] = }, but got value {getattr(output, k) = }, {type(getattr(output, k)) = }"
                                for k, v in fields_valid.items()
                                if not v
                            ]
                        )
                    )

                    on_typecheck_mismatch.process(
                        msg, except_cls=FieldTypeMismatchError
                    )

            # return the new instance
            return output

takes in an appropriately structured dict and returns an instance of the class, implemented by using @serializable_dataclass decorator
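
Because ignore_patterns_files declares a deserialize_fn, plain strings in the input dict come back as pathlib.Path objects:

from lmcat.lmcat import LMCatConfig

config = LMCatConfig.load({"ignore_patterns_files": [".lmignore"]})
print(config.ignore_patterns_files)  # e.g. [PosixPath('.lmignore')]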

def validate_fields_types( self: muutils.json_serialize.serializable_dataclass.SerializableDataclass, on_typecheck_error: muutils.errormode.ErrorMode = ErrorMode.Except) -> bool:
def SerializableDataclass__validate_fields_types(
    self: SerializableDataclass,
    on_typecheck_error: ErrorMode = _DEFAULT_ON_TYPECHECK_ERROR,
) -> bool:
    """validate the types of all the fields on a `SerializableDataclass`. calls `SerializableDataclass__validate_field_type` for each field"""
    return all(
        SerializableDataclass__validate_fields_types__dict(
            self, on_typecheck_error=on_typecheck_error
        ).values()
    )

validate the types of all the fields on a SerializableDataclass. calls SerializableDataclass__validate_field_type for each field

Inherited Members
muutils.json_serialize.serializable_dataclass.SerializableDataclass
  • validate_field_type
  • diff
  • update_from_nested_dict
class IgnoreHandler:

Handles all ignore pattern matching using igittigitt

IgnoreHandler(root_dir: pathlib.Path, config: LMCatConfig)
root_dir: pathlib.Path
config: LMCatConfig
parser: igittigitt.igittigitt.IgnoreParser
def is_ignored(self, path: pathlib.Path) -> bool:

Check if a path should be ignored
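
A short usage sketch; note that .gitignore and .lmignore themselves always report as ignored, so they never appear in the output:

from pathlib import Path
from lmcat.lmcat import IgnoreHandler, LMCatConfig

root = Path(".").resolve()
handler = IgnoreHandler(root, LMCatConfig(ignore_patterns=["*.lock"]))
print(handler.is_ignored(root / "poetry.lock"))  # True: matches "*.lock"
print(handler.is_ignored(root / ".gitignore"))   # True: always excluded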

def sorted_entries(directory: pathlib.Path) -> list[pathlib.Path]:

Return directory contents sorted: directories first, then files

def walk_dir( directory: pathlib.Path, ignore_handler: IgnoreHandler, config: LMCatConfig, tokenizer: lmcat.file_stats.TokenizerWrapper, prefix: str = '') -> tuple[list[lmcat.file_stats.TreeEntry], list[pathlib.Path]]:

Recursively walk a directory, building tree lines and collecting file paths
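
A sketch of calling walk_dir directly; walk_and_collect below wraps exactly this pattern:

from pathlib import Path
from lmcat.lmcat import IgnoreHandler, LMCatConfig, walk_dir

root = Path(".").resolve()
config = LMCatConfig()
entries, files = walk_dir(
	directory=root,
	ignore_handler=IgnoreHandler(root, config),
	config=config,
	tokenizer=config.get_tokenizer_obj(),
)
print(len(files), "files collected")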

def format_tree_with_stats( entries: list[lmcat.file_stats.TreeEntry], show_tokens: bool = False) -> list[str]:

Format tree entries with aligned statistics

Parameters:

  • entries : list[TreeEntry] List of tree entries with optional stats
  • show_tokens : bool Whether to show token counts

Returns:

  • list[str] Formatted tree lines with aligned stats
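
For illustration, formatting a tiny hand-built tree; the stats come from FileStats.from_file, exactly as in walk_dir:

from pathlib import Path
from lmcat.file_stats import FileStats, TokenizerWrapper, TreeEntry
from lmcat.lmcat import format_tree_with_stats

tok = TokenizerWrapper("whitespace-split")
entries = [
	TreeEntry("myproject", None),
	TreeEntry("└── pyproject.toml", FileStats.from_file(Path("pyproject.toml"), tok)),
]
print("\n".join(format_tree_with_stats(entries, show_tokens=True)))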
def walk_and_collect( root_dir: pathlib.Path, config: LMCatConfig | None = None) -> tuple[list[str], list[pathlib.Path]]:

Walk filesystem from root_dir and gather tree listing plus file paths
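
Typical usage, getting the formatted tree and the list of collected files in one call:

from pathlib import Path
from lmcat.lmcat import LMCatConfig, walk_and_collect

tree_lines, files = walk_and_collect(Path(".").resolve(), LMCatConfig())
print("\n".join(tree_lines))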

def assemble_summary(root_dir: pathlib.Path, config: LMCatConfig) -> str:

Assemble the summary output and return
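
This is the function that main() drives; used programmatically, it returns the complete stats + tree + contents document as one string:

from pathlib import Path
from lmcat.lmcat import LMCatConfig, assemble_summary

root = Path(".").resolve()
summary = assemble_summary(root_dir=root, config=LMCatConfig.read(root))
Path("lmcat_output.md").write_text(summary, encoding="utf-8")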

def main() -> None:

Main entry point for the script
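
main() reads the config from the current working directory and applies the CLI flags on top of it; a rough sketch of the equivalent calls, with argument parsing elided:

from pathlib import Path
from lmcat.lmcat import LMCatConfig, assemble_summary

root = Path(".").resolve()
config = LMCatConfig.read(root)
config.tree_only = True      # as with the -t / --tree-only flag
config.allow_plugins = False # --allow-plugins is opt-in and a security risk
print(assemble_summary(root_dir=root, config=config))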