docs for pattern_lens v0.4.0
View Source on GitHub

pattern_lens.prompts

implements load_text_data for loading prompts


 1"implements `load_text_data` for loading prompts"
 2
 3import json
 4import random
 5from pathlib import Path
 6
 7
 8def load_text_data(
 9	fname: Path,
10	min_chars: int | None = None,
11	max_chars: int | None = None,
12	shuffle: bool = False,
13) -> list[dict]:
14	"""given `fname`, the path to a jsonl file, split prompts up into more reasonable sizes
15
16	# Parameters:
17	- `fname : Path`
18		jsonl file with prompts. Expects a list of dicts with a "text" key
19	- `min_chars : int | None`
20		(defaults to `None`)
21	- `max_chars : int | None`
22		(defaults to `None`)
23	- `shuffle : bool`
24		(defaults to `False`)
25
26	# Returns:
27	- `list[dict]`
28		processed list of prompts. Each prompt has a "text" key w/ a string value and some metadata.
29		this is not guaranteed to be the same length as the input list!
30	"""
31	# read raw data
32	with open(fname, "r") as f:
33		data_raw: list[dict] = [json.loads(d) for d in f.readlines()]
34
35	# add fname metadata
36	for d in data_raw:
37		d["source_fname"] = fname.as_posix()
38
39	# trim too-short samples
40	if min_chars is not None:
41		data_raw = list(
42			filter(
43				lambda x: len(x["text"]) >= min_chars,
44				data_raw,
45			),
46		)
47
48	# split up too-long samples
49	if max_chars is not None:
50		data_new: list[dict] = []
51		for d in data_raw:
52			d_text: str = d["text"]
53			while len(d_text) > max_chars:
54				data_new.append(
55					{
56						**d,
57						"text": d_text[:max_chars],
58					},
59				)
60				d_text = d_text[max_chars:]
61			data_new.append(
62				{
63					**d,
64					"text": d_text,
65				},
66			)
67		data_raw = data_new
68
69	# trim too-short samples again
70	if min_chars is not None:
71		data_raw = list(
72			filter(
73				lambda x: len(x["text"]) >= min_chars,
74				data_raw,
75			),
76		)
77
78	# shuffle
79	if shuffle:
80		random.shuffle(data_raw)
81
82	return data_raw

def load_text_data( fname: pathlib.Path, min_chars: int | None = None, max_chars: int | None = None, shuffle: bool = False) -> list[dict]:
 9def load_text_data(
10	fname: Path,
11	min_chars: int | None = None,
12	max_chars: int | None = None,
13	shuffle: bool = False,
14) -> list[dict]:
15	"""given `fname`, the path to a jsonl file, split prompts up into more reasonable sizes
16
17	# Parameters:
18	- `fname : Path`
19		jsonl file with prompts. Expects a list of dicts with a "text" key
20	- `min_chars : int | None`
21		(defaults to `None`)
22	- `max_chars : int | None`
23		(defaults to `None`)
24	- `shuffle : bool`
25		(defaults to `False`)
26
27	# Returns:
28	- `list[dict]`
29		processed list of prompts. Each prompt has a "text" key w/ a string value and some metadata.
30		this is not guaranteed to be the same length as the input list!
31	"""
32	# read raw data
33	with open(fname, "r") as f:
34		data_raw: list[dict] = [json.loads(d) for d in f.readlines()]
35
36	# add fname metadata
37	for d in data_raw:
38		d["source_fname"] = fname.as_posix()
39
40	# trim too-short samples
41	if min_chars is not None:
42		data_raw = list(
43			filter(
44				lambda x: len(x["text"]) >= min_chars,
45				data_raw,
46			),
47		)
48
49	# split up too-long samples
50	if max_chars is not None:
51		data_new: list[dict] = []
52		for d in data_raw:
53			d_text: str = d["text"]
54			while len(d_text) > max_chars:
55				data_new.append(
56					{
57						**d,
58						"text": d_text[:max_chars],
59					},
60				)
61				d_text = d_text[max_chars:]
62			data_new.append(
63				{
64					**d,
65					"text": d_text,
66				},
67			)
68		data_raw = data_new
69
70	# trim too-short samples again
71	if min_chars is not None:
72		data_raw = list(
73			filter(
74				lambda x: len(x["text"]) >= min_chars,
75				data_raw,
76			),
77		)
78
79	# shuffle
80	if shuffle:
81		random.shuffle(data_raw)
82
83	return data_raw

given fname, the path to a jsonl file, split prompts up into more reasonable sizes

Parameters:

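  • fname : Path jsonl file with prompts. Expects one JSON object per line, each with a "text" key
  • min_chars : int | None if given, prompts with fewer than this many characters are dropped, both before and after splitting (defaults to None)
  • max_chars : int | None if given, prompts with more than this many characters are split into consecutive chunks of at most max_chars characters (defaults to None)
  • shuffle : bool if True, the returned list is shuffled (defaults to False)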

Returns:

  • list[dict] processed list of prompts. Each prompt has a "text" key with a string value and some metadata. This is not guaranteed to be the same length as the input list!