pattern_lens.prompts
implements load_text_data for loading prompts
1"implements `load_text_data` for loading prompts" 2 3import json 4import random 5from pathlib import Path 6 7 8def load_text_data( 9 fname: Path, 10 min_chars: int | None = None, 11 max_chars: int | None = None, 12 shuffle: bool = False, 13) -> list[dict]: 14 """given `fname`, the path to a jsonl file, split prompts up into more reasonable sizes 15 16 # Parameters: 17 - `fname : Path` 18 jsonl file with prompts. Expects a list of dicts with a "text" key 19 - `min_chars : int | None` 20 (defaults to `None`) 21 - `max_chars : int | None` 22 (defaults to `None`) 23 - `shuffle : bool` 24 (defaults to `False`) 25 26 # Returns: 27 - `list[dict]` 28 processed list of prompts. Each prompt has a "text" key w/ a string value and some metadata. 29 this is not guaranteed to be the same length as the input list! 30 """ 31 # read raw data 32 with open(fname, "r") as f: 33 data_raw: list[dict] = [json.loads(d) for d in f.readlines()] 34 35 # add fname metadata 36 for d in data_raw: 37 d["source_fname"] = fname.as_posix() 38 39 # trim too-short samples 40 if min_chars is not None: 41 data_raw = list( 42 filter( 43 lambda x: len(x["text"]) >= min_chars, 44 data_raw, 45 ), 46 ) 47 48 # split up too-long samples 49 if max_chars is not None: 50 data_new: list[dict] = [] 51 for d in data_raw: 52 d_text: str = d["text"] 53 while len(d_text) > max_chars: 54 data_new.append( 55 { 56 **d, 57 "text": d_text[:max_chars], 58 }, 59 ) 60 d_text = d_text[max_chars:] 61 data_new.append( 62 { 63 **d, 64 "text": d_text, 65 }, 66 ) 67 data_raw = data_new 68 69 # trim too-short samples again 70 if min_chars is not None: 71 data_raw = list( 72 filter( 73 lambda x: len(x["text"]) >= min_chars, 74 data_raw, 75 ), 76 ) 77 78 # shuffle 79 if shuffle: 80 random.shuffle(data_raw) 81 82 return data_raw
def
load_text_data(fname: pathlib.Path, min_chars: int | None = None, max_chars: int | None = None, shuffle: bool = False) -> list[dict]:
9def load_text_data( 10 fname: Path, 11 min_chars: int | None = None, 12 max_chars: int | None = None, 13 shuffle: bool = False, 14) -> list[dict]: 15 """given `fname`, the path to a jsonl file, split prompts up into more reasonable sizes 16 17 # Parameters: 18 - `fname : Path` 19 jsonl file with prompts. Expects a list of dicts with a "text" key 20 - `min_chars : int | None` 21 (defaults to `None`) 22 - `max_chars : int | None` 23 (defaults to `None`) 24 - `shuffle : bool` 25 (defaults to `False`) 26 27 # Returns: 28 - `list[dict]` 29 processed list of prompts. Each prompt has a "text" key w/ a string value and some metadata. 30 this is not guaranteed to be the same length as the input list! 31 """ 32 # read raw data 33 with open(fname, "r") as f: 34 data_raw: list[dict] = [json.loads(d) for d in f.readlines()] 35 36 # add fname metadata 37 for d in data_raw: 38 d["source_fname"] = fname.as_posix() 39 40 # trim too-short samples 41 if min_chars is not None: 42 data_raw = list( 43 filter( 44 lambda x: len(x["text"]) >= min_chars, 45 data_raw, 46 ), 47 ) 48 49 # split up too-long samples 50 if max_chars is not None: 51 data_new: list[dict] = [] 52 for d in data_raw: 53 d_text: str = d["text"] 54 while len(d_text) > max_chars: 55 data_new.append( 56 { 57 **d, 58 "text": d_text[:max_chars], 59 }, 60 ) 61 d_text = d_text[max_chars:] 62 data_new.append( 63 { 64 **d, 65 "text": d_text, 66 }, 67 ) 68 data_raw = data_new 69 70 # trim too-short samples again 71 if min_chars is not None: 72 data_raw = list( 73 filter( 74 lambda x: len(x["text"]) >= min_chars, 75 data_raw, 76 ), 77 ) 78 79 # shuffle 80 if shuffle: 81 random.shuffle(data_raw) 82 83 return data_raw
given fname, the path to a jsonl file, split prompts up into more reasonable sizes
Parameters:
- `fname : Path` — jsonl file with prompts. Expects a list of dicts with a "text" key
- `min_chars : int | None` (defaults to `None`)
- `max_chars : int | None` (defaults to `None`)
- `shuffle : bool` (defaults to `False`)
Returns:
- `list[dict]` — processed list of prompts. Each prompt has a "text" key w/ a string value and some metadata. This is not guaranteed to be the same length as the input list!