pattern_lens.prompts
implements load_text_data for loading prompts
1"implements `load_text_data` for loading prompts" 2 3import json 4import random 5from pathlib import Path 6 7 8def load_text_data( 9 fname: Path, 10 min_chars: int | None = None, 11 max_chars: int | None = None, 12 shuffle: bool = False, 13) -> list[dict]: 14 """given `fname`, the path to a jsonl file, split prompts up into more reasonable sizes 15 16 # Parameters: 17 - `fname : Path` 18 jsonl file with prompts. Expects a list of dicts with a "text" key 19 - `min_chars : int | None` 20 (defaults to `None`) 21 - `max_chars : int | None` 22 (defaults to `None`) 23 - `shuffle : bool` 24 (defaults to `False`) 25 26 # Returns: 27 - `list[dict]` 28 new, processed list of prompts. Each prompt has a "text" key with a string value, and some metadata. this is not guaranteed to be the same length as the input list! 29 """ 30 # read raw data 31 with open(fname, "r") as f: 32 data_raw: list[dict] = [json.loads(d) for d in f.readlines()] 33 34 # add fname metadata 35 for d in data_raw: 36 d["source_fname"] = fname.as_posix() 37 38 # trim too-short samples 39 if min_chars is not None: 40 data_raw = list( 41 filter( 42 lambda x: len(x["text"]) >= min_chars, 43 data_raw, 44 ) 45 ) 46 47 # split up too-long samples 48 if max_chars is not None: 49 data_new: list[dict] = [] 50 for d in data_raw: 51 d_text: str = d["text"] 52 while len(d_text) > max_chars: 53 data_new.append( 54 { 55 **d, 56 "text": d_text[:max_chars], 57 } 58 ) 59 d_text = d_text[max_chars:] 60 data_new.append( 61 { 62 **d, 63 "text": d_text, 64 } 65 ) 66 data_raw = data_new 67 68 # trim too-short samples again 69 if min_chars is not None: 70 data_raw = list( 71 filter( 72 lambda x: len(x["text"]) >= min_chars, 73 data_raw, 74 ) 75 ) 76 77 # shuffle 78 if shuffle: 79 random.shuffle(data_raw) 80 81 return data_raw
def
load_text_data( fname: pathlib.Path, min_chars: int | None = None, max_chars: int | None = None, shuffle: bool = False) -> list[dict]:
9def load_text_data( 10 fname: Path, 11 min_chars: int | None = None, 12 max_chars: int | None = None, 13 shuffle: bool = False, 14) -> list[dict]: 15 """given `fname`, the path to a jsonl file, split prompts up into more reasonable sizes 16 17 # Parameters: 18 - `fname : Path` 19 jsonl file with prompts. Expects a list of dicts with a "text" key 20 - `min_chars : int | None` 21 (defaults to `None`) 22 - `max_chars : int | None` 23 (defaults to `None`) 24 - `shuffle : bool` 25 (defaults to `False`) 26 27 # Returns: 28 - `list[dict]` 29 new, processed list of prompts. Each prompt has a "text" key with a string value, and some metadata. this is not guaranteed to be the same length as the input list! 30 """ 31 # read raw data 32 with open(fname, "r") as f: 33 data_raw: list[dict] = [json.loads(d) for d in f.readlines()] 34 35 # add fname metadata 36 for d in data_raw: 37 d["source_fname"] = fname.as_posix() 38 39 # trim too-short samples 40 if min_chars is not None: 41 data_raw = list( 42 filter( 43 lambda x: len(x["text"]) >= min_chars, 44 data_raw, 45 ) 46 ) 47 48 # split up too-long samples 49 if max_chars is not None: 50 data_new: list[dict] = [] 51 for d in data_raw: 52 d_text: str = d["text"] 53 while len(d_text) > max_chars: 54 data_new.append( 55 { 56 **d, 57 "text": d_text[:max_chars], 58 } 59 ) 60 d_text = d_text[max_chars:] 61 data_new.append( 62 { 63 **d, 64 "text": d_text, 65 } 66 ) 67 data_raw = data_new 68 69 # trim too-short samples again 70 if min_chars is not None: 71 data_raw = list( 72 filter( 73 lambda x: len(x["text"]) >= min_chars, 74 data_raw, 75 ) 76 ) 77 78 # shuffle 79 if shuffle: 80 random.shuffle(data_raw) 81 82 return data_raw
given fname, the path to a jsonl file, split prompts up into more reasonable sizes
Parameters:
- `fname : Path` — jsonl file with prompts. Expects a list of dicts with a "text" key
- `min_chars : int | None` (defaults to `None`)
- `max_chars : int | None` (defaults to `None`)
- `shuffle : bool` (defaults to `False`)
Returns:
- `list[dict]` — new, processed list of prompts. Each prompt has a "text" key with a string value, and some metadata. This is not guaranteed to be the same length as the input list!