Coverage for intelligence_toolkit/query_text_data/helper_functions.py: 0%

94 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1# Copyright (c) 2024 Microsoft Corporation. All rights reserved. 

2# Licensed under the MIT license. See LICENSE file in the project. 

3 

4import json 

5 

6from intelligence_toolkit.AI.base_embedder import BaseEmbedder 

7from intelligence_toolkit.AI.classes import VectorData 

8from intelligence_toolkit.AI.utils import hash_text 

9 

10 

def get_adjacent_chunks(source, previous_chunk_dict, next_chunk_dict, steps):
    """Collect the chunks reachable from ``source`` within ``steps`` hops.

    The walk follows ``previous_chunk_dict`` backwards and ``next_chunk_dict``
    forwards, one link per hop, and stops early in a direction as soon as a
    chunk has no entry (or maps to ``None``) in that direction's mapping.

    Args:
        source: The chunk to start walking from; it is not part of the result.
        previous_chunk_dict: Mapping of a chunk to the chunk before it.
        next_chunk_dict: Mapping of a chunk to the chunk after it.
        steps: Maximum number of hops to take in each direction.

    Returns:
        A set containing every chunk visited in either direction.
    """
    neighbours = set()
    # Same walk in both directions; only the link mapping differs.
    for links in (previous_chunk_dict, next_chunk_dict):
        cursor = source
        for _ in range(steps):
            cursor = links.get(cursor)
            if cursor is None:
                break
            neighbours.add(cursor)
    return neighbours

29 

30 

def get_test_progress(test_history):
    """Build a markdown progress line from a relevance-test history.

    Consecutive entries that share the same search label form one round.
    Each round with a non-empty label is reported as ``label: relevant/tested``
    and is wrapped in a red ``<span>`` when none of its chunks were relevant.
    Entries whose label is the empty string count towards the overall totals
    but never produce a round summary.

    Args:
        test_history: Iterable of ``(search, chunk, response)`` triples; a
            response equal to ``"Yes"`` marks the chunk as relevant.

    Returns:
        A string with the overall relevant/tested totals in bold, followed by
        the per-round summaries in parentheses when there are any.
    """

    def _describe_round(label, hits, tried):
        # Rounds that found nothing relevant are highlighted in red.
        if hits > 0:
            return f"{label}: {hits}/{tried}"
        return f"<span style='color: red'>{label}: {hits}/{tried}</span>"

    round_summaries = []
    active_search = ""
    round_hits = 0
    round_tried = 0
    overall_hits = 0
    overall_tried = 0

    for search, _chunk, answer in test_history:
        if search != active_search:
            # A new search label closes the round that was in progress.
            if active_search != "":
                round_summaries.append(
                    _describe_round(active_search, round_hits, round_tried)
                )
            active_search = search
            round_hits = 0
            round_tried = 0
        round_tried += 1
        overall_tried += 1
        if answer == "Yes":
            round_hits += 1
            overall_hits += 1

    # Flush the round still open when the history ends.
    if active_search != "":
        round_summaries.append(
            _describe_round(active_search, round_hits, round_tried)
        )

    progress = f"**Relevant chunks / tested chunks: {overall_hits}/{overall_tried}**"
    if round_summaries:
        progress += " (" + "; ".join(round_summaries) + ")"
    return progress

68 

69 

def test_history_elements(test_history, previous_cid, next_cid, adjacent_search_steps):
    """Split a relevance-test history into relevant, seen and adjacent chunks.

    Args:
        test_history: Sequence of entries whose item ``[1]`` is the tested
            chunk and whose item ``[2]`` is the response; ``"Yes"`` marks the
            chunk as relevant.
        previous_cid: Mapping of a chunk to the chunk before it.
        next_cid: Mapping of a chunk to the chunk after it.
        adjacent_search_steps: Maximum number of hops to walk in each
            direction from every relevant chunk.

    Returns:
        A ``(relevant_list, seen_list, adjacent_list)`` tuple:
        ``relevant_list`` holds the chunks answered ``"Yes"``, ``seen_list``
        holds every tested chunk, and ``adjacent_list`` holds the neighbours
        of relevant chunks that have not been tested yet.
    """
    relevant_list = [entry[1] for entry in test_history if entry[2] == "Yes"]
    seen_list = [entry[1] for entry in test_history]

    adjacent_targets = set()
    for chunk in relevant_list:
        adjacent_targets.update(
            get_adjacent_chunks(chunk, previous_cid, next_cid, adjacent_search_steps)
        )

    # Membership is checked against a set built once, so filtering is linear
    # rather than scanning the whole seen list for every candidate.
    seen_lookup = set(seen_list)
    adjacent_list = [
        chunk for chunk in adjacent_targets if chunk not in seen_lookup
    ]
    return relevant_list, seen_list, adjacent_list

80 

81 

async def embed_texts(
    cid_to_text, text_embedder: BaseEmbedder, cache_data=True, callbacks=None
) -> dict:
    """Embed every text in ``cid_to_text`` and map each cid to its vector.

    One record per text (hash, text and ``{"cid": cid}`` details) is handed to
    ``text_embedder.embed_store_many``. Each returned item's
    ``additional_details`` is parsed as JSON to recover the cid it belongs to.

    Args:
        cid_to_text: Mapping of an identifier (cid) to the text to embed.
        text_embedder: Embedder whose ``embed_store_many`` coroutine is
            awaited with the prepared records.
        cache_data: Forwarded unchanged to ``embed_store_many``.
        callbacks: Forwarded to ``embed_store_many``. A fresh empty list is
            used when omitted, so no list object is shared between calls.

    Returns:
        A dict mapping each cid found in the returned items to that item's
        ``"vector"``. Items whose parsed details are empty are skipped.
    """
    if callbacks is None:
        callbacks = []

    data: list[VectorData] = [
        {"hash": hash_text(text), "text": text, "additional_details": {"cid": cid}}
        for cid, text in cid_to_text.items()
    ]

    embedded_data = await text_embedder.embed_store_many(data, callbacks, cache_data)

    cid_to_vector = {}
    for item in embedded_data:
        details = json.loads(item["additional_details"])
        if not details:
            # Nothing identifies which cid this vector belongs to.
            continue
        cid_to_vector[details["cid"]] = item["vector"]
    return cid_to_vector

100 

101 

async def embed_queries(
    qid_to_text, text_embedder: BaseEmbedder, cache_data=True, callbacks=None
) -> dict:
    """Embed every text in ``qid_to_text`` and map each qid to its vector.

    One record per text (hash, text and ``{"qid": qid}`` details) is handed to
    ``text_embedder.embed_store_many``. Returned items are matched back to the
    submitted queries through their ``"hash"`` value rather than through the
    returned ``additional_details``.

    Args:
        qid_to_text: Mapping of an identifier (qid) to the text to embed.
        text_embedder: Embedder whose ``embed_store_many`` coroutine is
            awaited with the prepared records.
        cache_data: Forwarded unchanged to ``embed_store_many``.
        callbacks: Forwarded to ``embed_store_many``. A fresh empty list is
            used when omitted, so no list object is shared between calls.

    Returns:
        A dict mapping each qid to the ``"vector"`` of the returned item whose
        hash matches the qid's text. Queries with identical text share a hash
        and all receive that vector.
    """
    if callbacks is None:
        callbacks = []

    data: list[VectorData] = []
    # Index built up front: hash -> every qid submitted with that text. It is
    # kept separate from ``data`` so it does not depend on whatever the
    # embedder does with the records it is given.
    hash_to_qids = {}
    for qid, text in qid_to_text.items():
        text_hash = hash_text(text)
        data.append(
            {"hash": text_hash, "text": text, "additional_details": {"qid": qid}}
        )
        hash_to_qids.setdefault(text_hash, []).append(qid)

    embedded_data = await text_embedder.embed_store_many(data, callbacks, cache_data)

    qid_to_vector = {}
    for item in embedded_data:
        matching_qids = hash_to_qids.get(item["hash"])
        if matching_qids is None:
            print(f"No matching data item for {item}")
            continue

        for qid in matching_qids:
            if qid is None:
                print(f"No qid found in additional details for {item}")
                continue
            qid_to_vector[qid] = item["vector"]
    return qid_to_vector

137 return qid_to_vector