Coverage for intelligence_toolkit/query_text_data/helper_functions.py: 0%
94 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
1# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
2# Licensed under the MIT license. See LICENSE file in the project.
4import json
6from intelligence_toolkit.AI.base_embedder import BaseEmbedder
7from intelligence_toolkit.AI.classes import VectorData
8from intelligence_toolkit.AI.utils import hash_text
def get_adjacent_chunks(source, previous_chunk_dict, next_chunk_dict, steps):
    """Collect the chunks reachable within ``steps`` links of ``source``.

    The predecessor chain is followed through ``previous_chunk_dict`` and the
    successor chain through ``next_chunk_dict``, each for at most ``steps``
    hops. A chain stops early as soon as the current chunk has no entry (or
    maps to ``None``).

    Args:
        source: Chunk identifier to start walking from.
        previous_chunk_dict: Mapping of chunk -> preceding chunk.
        next_chunk_dict: Mapping of chunk -> following chunk.
        steps: Maximum number of hops to take in each direction.

    Returns:
        A set with every chunk visited in either direction. ``source`` itself
        is never added explicitly.
    """

    def _follow(links):
        # Walk one chain of links, recording each neighbour in visit order.
        visited = []
        cursor = source
        for _ in range(steps):
            neighbour = links.get(cursor)
            if neighbour is None:
                break
            visited.append(neighbour)
            cursor = neighbour
        return visited

    before = _follow(previous_chunk_dict)
    after = _follow(next_chunk_dict)
    return set(before + after)
def get_test_progress(test_history):
    """Summarise relevance-test results as a Markdown/HTML progress string.

    ``test_history`` is iterated as ``(search, chunk, response)`` triples.
    Consecutive entries sharing the same ``search`` value form one round; a
    response equal to ``"Yes"`` counts as relevant. Rounds with no relevant
    chunk are wrapped in a red ``<span>``. A round whose search label is the
    empty string is counted in the totals but never listed.

    Args:
        test_history: Iterable of ``(search, chunk, response)`` triples.

    Returns:
        The overall ``relevant/tested`` totals in bold, followed by a
        parenthesised, ``"; "``-separated list of per-round tallies when at
        least one round was recorded.
    """

    def _describe_round(label, hits, tried):
        # Rounds that found nothing relevant are highlighted in red.
        if hits > 0:
            return f"{label}: {hits}/{tried}"
        return f"<span style='color: red'>{label}: {hits}/{tried}</span>"

    round_summaries = []
    active_search = ""
    round_hits = 0
    round_tried = 0
    overall_hits = 0
    overall_tried = 0

    for search, _chunk, response in test_history:
        if search != active_search:
            # A new search label closes the round that was in progress.
            if active_search != "":
                round_summaries.append(
                    _describe_round(active_search, round_hits, round_tried)
                )
            active_search = search
            round_hits = 0
            round_tried = 0
        round_tried += 1
        overall_tried += 1
        if response == "Yes":
            round_hits += 1
            overall_hits += 1

    # Flush the round that was still open when the history ended.
    if active_search != "":
        round_summaries.append(
            _describe_round(active_search, round_hits, round_tried)
        )

    progress = f"**Relevant chunks / tested chunks: {overall_hits}/{overall_tried}**"
    if len(round_summaries) > 0:
        progress += " (" + "; ".join(round_summaries) + ")"
    return progress
def test_history_elements(test_history, previous_cid, next_cid, adjacent_search_steps):
    """Split a test history into relevant, seen and still-untested adjacent chunks.

    Despite the ``test_`` prefix this is a regular helper that takes arguments
    and returns data; it is not a unit test.

    Each history entry is indexed positionally: index 1 is the chunk and
    index 2 is the response, where ``"Yes"`` marks the chunk as relevant.

    Args:
        test_history: Sequence of indexable entries (chunk at index 1,
            response at index 2).
        previous_cid: Mapping of chunk -> preceding chunk, forwarded to
            ``get_adjacent_chunks``.
        next_cid: Mapping of chunk -> following chunk, forwarded to
            ``get_adjacent_chunks``.
        adjacent_search_steps: Number of hops to explore around each relevant
            chunk, forwarded to ``get_adjacent_chunks``.

    Returns:
        A tuple ``(relevant_list, seen_list, adjacent_list)`` where
        ``relevant_list`` holds the chunks answered ``"Yes"``, ``seen_list``
        holds every chunk in the history, and ``adjacent_list`` holds the
        neighbours of relevant chunks that do not appear in the history.
    """
    relevant_list = [x[1] for x in test_history if x[2] == "Yes"]
    seen_list = [x[1] for x in test_history]

    adjacent_targets = set()
    for c in relevant_list:
        adjacent_targets.update(
            get_adjacent_chunks(c, previous_cid, next_cid, adjacent_search_steps)
        )

    # Set membership keeps the filter linear instead of rescanning seen_list
    # for every candidate. Assumes chunk ids are hashable, as they are already
    # used as dict keys and set members for relevant chunks.
    seen_chunks = set(seen_list)
    adjacent_list = [x for x in adjacent_targets if x not in seen_chunks]
    return relevant_list, seen_list, adjacent_list
async def embed_texts(
    cid_to_text, text_embedder: BaseEmbedder, cache_data=True, callbacks=None
) -> dict:
    """Embed chunk texts and return a mapping of chunk id to embedding vector.

    One payload per ``cid_to_text`` entry is built, carrying the text, its
    ``hash_text`` hash and ``{"cid": cid}`` as additional details. The
    payloads are sent to ``text_embedder.embed_store_many`` and the chunk id
    of each returned item is read back from its JSON-encoded
    ``additional_details``.

    Args:
        cid_to_text: Mapping of chunk id -> text to embed.
        text_embedder: Embedder exposing an awaitable ``embed_store_many``.
        cache_data: Forwarded unchanged to ``embed_store_many``.
        callbacks: Forwarded to ``embed_store_many``; a new empty list is used
            when omitted.

    Returns:
        Dict mapping each chunk id found in the returned items to that item's
        ``"vector"``. Items whose decoded additional details are empty are
        skipped.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if callbacks is None:
        callbacks = []

    data: list[VectorData] = [
        {"hash": hash_text(text), "text": text, "additional_details": {"cid": cid}}
        for cid, text in cid_to_text.items()
    ]

    embedded_data = await text_embedder.embed_store_many(data, callbacks, cache_data)

    cid_to_vector = {}
    for item in embedded_data:
        # Returned items carry their additional details as a JSON string.
        details = json.loads(item["additional_details"])
        if not details:
            continue
        cid_to_vector[details["cid"]] = item["vector"]
    return cid_to_vector
async def embed_queries(
    qid_to_text, text_embedder: BaseEmbedder, cache_data=True, callbacks=None
) -> dict:
    """Embed query texts and return a mapping of query id to embedding vector.

    One payload per ``qid_to_text`` entry is built, carrying the text, its
    ``hash_text`` hash and ``{"qid": qid}`` as additional details. After
    ``text_embedder.embed_store_many`` returns, each embedded item is matched
    back to the submitted payload with the same ``"hash"``, and the query id
    is taken from that payload rather than from the returned item.

    Args:
        qid_to_text: Mapping of query id -> query text to embed.
        text_embedder: Embedder exposing an awaitable ``embed_store_many``.
        cache_data: Forwarded unchanged to ``embed_store_many``.
        callbacks: Forwarded to ``embed_store_many``; a new empty list is used
            when omitted.

    Returns:
        Dict mapping query id -> the ``"vector"`` of the matching embedded
        item. Items with no matching payload, or whose payload has no ``qid``,
        are reported with ``print`` and skipped.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if callbacks is None:
        callbacks = []

    data: list[VectorData] = [
        {"hash": hash_text(text), "text": text, "additional_details": {"qid": qid}}
        for qid, text in qid_to_text.items()
    ]

    embedded_data = await text_embedder.embed_store_many(data, callbacks, cache_data)

    # Index the submitted payloads by hash once, after the embedder call so
    # the index reflects the payloads as the embedder left them. setdefault
    # keeps the first payload for a given hash.
    # NOTE(review): if two query ids share the same hash, only the first one
    # receives a vector -- confirm whether duplicate query texts can occur.
    data_by_hash = {}
    for entry in data:
        data_by_hash.setdefault(entry["hash"], entry)

    qid_to_vector = {}
    for item in embedded_data:
        data_item = data_by_hash.get(item["hash"])
        if data_item is None:
            print(f"No matching data item for {item}")
            continue

        additional_details = data_item["additional_details"]
        # The payload's details may be a JSON string by this point; decode
        # them before reading the qid.
        if isinstance(additional_details, str):
            additional_details = json.loads(additional_details)

        qid = additional_details.get("qid")
        if qid is None:
            print(f"No qid found in additional details for {item}")
            continue

        qid_to_vector[qid] = item["vector"]
    return qid_to_vector