Coverage for intelligence_toolkit/query_text_data/classes.py: 41%
37 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
1# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
2# Licensed under the MIT license. See LICENSE file in the project.
4import networkx as nx
5import graspologic as gc
class ProcessedChunks:
    """Represents the results of processing text chunks into concepts and communities."""

    def __init__(
        self,
        cid_to_text: dict[int, str],
        text_to_cid: dict[str, int],
        period_concept_graphs: dict[str, nx.Graph],
        hierarchical_communities: gc.partition.HierarchicalCluster,
        community_to_label: dict[int, dict[int, str]],
        concept_to_cids: dict[str, list[int]],
        cid_to_concepts: dict[int, list[str]],
        previous_cid: dict[int, int],
        next_cid: dict[int, int],
        period_to_cids: dict[str, list[int]],
        node_period_counts: dict[str, dict[str, int]],
        edge_period_counts: dict[tuple[str, str], dict[str, int]],
    ) -> None:
        """
        Store the lookup structures produced by chunk processing.

        Every argument is kept as-is (no copy) on an attribute of the same name.

        Args:
            cid_to_text (dict[int, str]): A dictionary of chunk IDs to text
            text_to_cid (dict[str, int]): A dictionary of text to chunk IDs
            period_concept_graphs (dict[str, nx.Graph]): A dictionary of period to concept graph
            hierarchical_communities (gc.partition.HierarchicalCluster): A hierarchical community structure
            community_to_label (dict[int, dict[int, str]]): A dictionary of community ID to a dictionary of node ID to label
            concept_to_cids (dict[str, list[int]]): A dictionary of concept to chunk IDs
            cid_to_concepts (dict[int, list[str]]): A dictionary of chunk ID to concepts
            previous_cid (dict[int, int]): A dictionary of chunk ID to previous chunk ID
            next_cid (dict[int, int]): A dictionary of chunk ID to next chunk ID
            period_to_cids (dict[str, list[int]]): A dictionary of period to chunk IDs
            node_period_counts (dict[str, dict[str, int]]): A nested dictionary of counts keyed by node and period.
                NOTE(review): presumably node to period to count, mirroring edge_period_counts; both keys are
                strings, so the nesting order cannot be read off the type. Confirm against the producer.
            edge_period_counts (dict[tuple[str, str], dict[str, int]]): A dictionary of edge (a tuple of two
                strings) to period to count
        """
        self.cid_to_text = cid_to_text
        self.text_to_cid = text_to_cid
        self.period_concept_graphs = period_concept_graphs
        self.hierarchical_communities = hierarchical_communities
        self.community_to_label = community_to_label
        self.concept_to_cids = concept_to_cids
        self.cid_to_concepts = cid_to_concepts
        self.previous_cid = previous_cid
        self.next_cid = next_cid
        self.period_to_cids = period_to_cids
        self.node_period_counts = node_period_counts
        self.edge_period_counts = edge_period_counts

    def __repr__(self):
        """Return a short summary showing only the number of chunks held."""
        # len() of a dict already counts its keys; no need to build a keys view.
        return f"ProcessedChunks(num_chunks={len(self.cid_to_text)})"
class ChunkSearchConfig:
    """Represents the configuration used to search for relevant text chunks."""

    def __init__(
        self,
        adjacent_test_steps: int,
        community_relevance_tests: int,
        community_ranking_chunks: int,
        relevance_test_batch_size: int,
        relevance_test_budget: int,
        irrelevant_community_restart: int,
        analysis_update_interval: int = 0,
    ) -> None:
        """
        Store the search settings on attributes of the same names.

        Args:
            adjacent_test_steps (int): How many chunks before and after each relevant chunk to test, once the relevance test budget is near or the search process has terminated
            community_relevance_tests (int): How many relevance tests to run on each community in turn
            community_ranking_chunks (int): How many chunks to use to rank communities by relevance
            relevance_test_batch_size (int): How many relevance tests to run in parallel at a time
            relevance_test_budget (int): How many relevance tests are permitted per query. Higher values may provide higher quality results at higher cost
            irrelevant_community_restart (int): When to restart testing communities in relevance order
            analysis_update_interval (int): How many chunks to process before updating the analysis. Use 0 to skip analysis updates
        """
        self.adjacent_test_steps = adjacent_test_steps
        self.community_relevance_tests = community_relevance_tests
        self.community_ranking_chunks = community_ranking_chunks
        self.relevance_test_batch_size = relevance_test_batch_size
        self.relevance_test_budget = relevance_test_budget
        self.irrelevant_community_restart = irrelevant_community_restart
        self.analysis_update_interval = analysis_update_interval

    def __repr__(self):
        """Return a string listing every configuration field in declaration order."""
        # community_ranking_chunks was previously missing from this output even
        # though it is a stored setting; include it so the repr is complete.
        return (
            f"ChunkSearchConfig(adjacent_test_steps={self.adjacent_test_steps}, "
            f"community_relevance_tests={self.community_relevance_tests}, "
            f"community_ranking_chunks={self.community_ranking_chunks}, "
            f"relevance_test_batch_size={self.relevance_test_batch_size}, "
            f"relevance_test_budget={self.relevance_test_budget}, "
            f"irrelevant_community_restart={self.irrelevant_community_restart}, "
            f"analysis_update_interval={self.analysis_update_interval})"
        )
class AnswerObject:
    """Represents the answer to a user query."""

    def __init__(
        self,
        extended_answer: str,
        references: list[str],
        referenced_chunks: list[int],
        net_new_sources: int,
    ) -> None:
        """
        Keep the answer text and its supporting references on the instance.

        Args:
            extended_answer (str): The extended answer to the user query
            references (list[str]): A list of references used in the answer
            referenced_chunks (list[int]): A list of chunk IDs referenced in the answer
            net_new_sources (int): The number of new sources used in the answer
        """
        self.extended_answer = extended_answer
        self.references = references
        self.referenced_chunks = referenced_chunks
        self.net_new_sources = net_new_sources

    def __repr__(self):
        """Return a compact summary: answer preview plus reference counts."""
        # Only the first 100 characters of the answer are shown; the two lists
        # are reported by length rather than by content.
        preview = self.extended_answer[:100]
        reference_count = len(self.references)
        chunk_count = len(self.referenced_chunks)
        fields = (
            f"extended_answer={preview}",
            f"references={reference_count}",
            f"referenced_chunks={chunk_count}",
            f"net_new_sources={self.net_new_sources}",
        )
        return "AnswerObject(" + ", ".join(fields) + ")"