docs for pattern_lens v0.2.0
View Source on GitHub

pattern_lens.prompts

implements load_text_data for loading prompts


 1"implements `load_text_data` for loading prompts"
 2
 3import json
 4import random
 5from pathlib import Path
 6
 7
 8def load_text_data(
 9    fname: Path,
10    min_chars: int | None = None,
11    max_chars: int | None = None,
12    shuffle: bool = False,
13) -> list[dict]:
14    """given `fname`, the path to a jsonl file, split prompts up into more reasonable sizes
15
16    # Parameters:
17     - `fname : Path`
18        jsonl file with prompts. Expects a list of dicts with a "text" key
19     - `min_chars : int | None`
20       (defaults to `None`)
21     - `max_chars : int | None`
22       (defaults to `None`)
23     - `shuffle : bool`
24       (defaults to `False`)
25
26    # Returns:
27     - `list[dict]`
28       new, processed list of prompts. Each prompt has a "text" key with a string value, and some metadata. this is not guaranteed to be the same length as the input list!
29    """
30    # read raw data
31    with open(fname, "r") as f:
32        data_raw: list[dict] = [json.loads(d) for d in f.readlines()]
33
34    # add fname metadata
35    for d in data_raw:
36        d["source_fname"] = fname.as_posix()
37
38    # trim too-short samples
39    if min_chars is not None:
40        data_raw = list(
41            filter(
42                lambda x: len(x["text"]) >= min_chars,
43                data_raw,
44            )
45        )
46
47    # split up too-long samples
48    if max_chars is not None:
49        data_new: list[dict] = []
50        for d in data_raw:
51            d_text: str = d["text"]
52            while len(d_text) > max_chars:
53                data_new.append(
54                    {
55                        **d,
56                        "text": d_text[:max_chars],
57                    }
58                )
59                d_text = d_text[max_chars:]
60            data_new.append(
61                {
62                    **d,
63                    "text": d_text,
64                }
65            )
66        data_raw = data_new
67
68    # trim too-short samples again
69    if min_chars is not None:
70        data_raw = list(
71            filter(
72                lambda x: len(x["text"]) >= min_chars,
73                data_raw,
74            )
75        )
76
77    # shuffle
78    if shuffle:
79        random.shuffle(data_raw)
80
81    return data_raw

def load_text_data(fname: Path, min_chars: int | None = None, max_chars: int | None = None, shuffle: bool = False) -> list[dict]:
 9def load_text_data(
10    fname: Path,
11    min_chars: int | None = None,
12    max_chars: int | None = None,
13    shuffle: bool = False,
14) -> list[dict]:
15    """given `fname`, the path to a jsonl file, split prompts up into more reasonable sizes
16
17    # Parameters:
18     - `fname : Path`
19        jsonl file with prompts. Expects a list of dicts with a "text" key
20     - `min_chars : int | None`
21       (defaults to `None`)
22     - `max_chars : int | None`
23       (defaults to `None`)
24     - `shuffle : bool`
25       (defaults to `False`)
26
27    # Returns:
28     - `list[dict]`
29       new, processed list of prompts. Each prompt has a "text" key with a string value, and some metadata. this is not guaranteed to be the same length as the input list!
30    """
31    # read raw data
32    with open(fname, "r") as f:
33        data_raw: list[dict] = [json.loads(d) for d in f.readlines()]
34
35    # add fname metadata
36    for d in data_raw:
37        d["source_fname"] = fname.as_posix()
38
39    # trim too-short samples
40    if min_chars is not None:
41        data_raw = list(
42            filter(
43                lambda x: len(x["text"]) >= min_chars,
44                data_raw,
45            )
46        )
47
48    # split up too-long samples
49    if max_chars is not None:
50        data_new: list[dict] = []
51        for d in data_raw:
52            d_text: str = d["text"]
53            while len(d_text) > max_chars:
54                data_new.append(
55                    {
56                        **d,
57                        "text": d_text[:max_chars],
58                    }
59                )
60                d_text = d_text[max_chars:]
61            data_new.append(
62                {
63                    **d,
64                    "text": d_text,
65                }
66            )
67        data_raw = data_new
68
69    # trim too-short samples again
70    if min_chars is not None:
71        data_raw = list(
72            filter(
73                lambda x: len(x["text"]) >= min_chars,
74                data_raw,
75            )
76        )
77
78    # shuffle
79    if shuffle:
80        random.shuffle(data_raw)
81
82    return data_raw

given fname, the path to a jsonl file, split prompts up into more reasonable sizes

Parameters:

  • fname : Path jsonl file with prompts. Expects a list of dicts with a "text" key
  • min_chars : int | None — minimum length of a prompt's "text"; shorter prompts are dropped (defaults to None)
  • max_chars : int | None — maximum length of a prompt's "text"; longer prompts are split into consecutive chunks of at most this many characters (defaults to None)
  • shuffle : bool (defaults to False)

Returns:

  • list[dict] new, processed list of prompts. Each prompt has a "text" key with a string value, and some metadata. this is not guaranteed to be the same length as the input list!