Coverage for intelligence_toolkit/query_text_data/classes.py: 41%

37 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1# Copyright (c) 2024 Microsoft Corporation. All rights reserved. 

2# Licensed under the MIT license. See LICENSE file in the project. 

3 

4import networkx as nx 

5import graspologic as gc 

6 

class ProcessedChunks:
    """Container for the results of processing text chunks into concepts and communities."""

    def __init__(
        self,
        cid_to_text: dict[int, str],
        text_to_cid: dict[str, int],
        period_concept_graphs: dict[str, nx.Graph],
        hierarchical_communities: gc.partition.HierarchicalCluster,
        community_to_label: dict[int, dict[int, str]],
        concept_to_cids: dict[str, list[int]],
        cid_to_concepts: dict[int, list[str]],
        previous_cid: dict[int, int],
        next_cid: dict[int, int],
        period_to_cids: dict[str, list[int]],
        node_period_counts: dict[str, dict[str, int]],
        edge_period_counts: dict[tuple[str, str], dict[str, int]],
    ) -> None:
        """
        Represents the results of processing text chunks into concepts and communities.

        Every argument is stored unchanged on the instance under the same name;
        no copying or validation is performed.

        Args:
            cid_to_text (dict[int, str]): A dictionary of chunk IDs to text
            text_to_cid (dict[str, int]): A dictionary of text to chunk IDs
            period_concept_graphs (dict[str, nx.Graph]): A dictionary of period to concept graph
            hierarchical_communities (gc.partition.HierarchicalCluster): A hierarchical community structure
            community_to_label (dict[int, dict[int, str]]): A dictionary of community ID to a dictionary of node ID to label
            concept_to_cids (dict[str, list[int]]): A dictionary of concept to chunk IDs
            cid_to_concepts (dict[int, list[str]]): A dictionary of chunk ID to concepts
            previous_cid (dict[int, int]): A dictionary of chunk ID to previous chunk ID
            next_cid (dict[int, int]): A dictionary of chunk ID to next chunk ID
            period_to_cids (dict[str, list[int]]): A dictionary of period to chunk IDs
            node_period_counts (dict[str, dict[str, int]]): A nested dictionary of node/period counts.
                NOTE(review): the annotation alone does not fix which key is outermost; by analogy
                with edge_period_counts this is presumably node to period to count. Confirm against
                the code that builds it.
            edge_period_counts (dict[tuple[str, str], dict[str, int]]): A dictionary of edge (a pair of
                strings) to a dictionary of period to count
        """
        self.cid_to_text = cid_to_text
        self.text_to_cid = text_to_cid
        self.period_concept_graphs = period_concept_graphs
        self.hierarchical_communities = hierarchical_communities
        self.community_to_label = community_to_label
        self.concept_to_cids = concept_to_cids
        self.cid_to_concepts = cid_to_concepts
        self.previous_cid = previous_cid
        self.next_cid = next_cid
        self.period_to_cids = period_to_cids
        self.node_period_counts = node_period_counts
        self.edge_period_counts = edge_period_counts

    def __repr__(self) -> str:
        """Return a short summary showing only the number of chunks held."""
        # len() of a dict already counts its keys; no .keys() view is needed.
        return f"ProcessedChunks(num_chunks={len(self.cid_to_text)})"

55 

class ChunkSearchConfig:
    """Configuration values used to search for relevant text chunks."""

    def __init__(
        self,
        adjacent_test_steps: int,
        community_relevance_tests: int,
        community_ranking_chunks: int,
        relevance_test_batch_size: int,
        relevance_test_budget: int,
        irrelevant_community_restart: int,
        analysis_update_interval: int = 0,
    ) -> None:
        """
        Represents the configuration used to search for relevant text chunks.

        Every argument is stored unchanged on the instance under the same name.

        Args:
            adjacent_test_steps (int): How many chunks before and after each relevant chunk to test, once the relevance test budget is near or the search process has terminated
            community_relevance_tests (int): How many relevance tests to run on each community in turn
            community_ranking_chunks (int): How many chunks to use to rank communities by relevance
            relevance_test_batch_size (int): How many relevance tests to run in parallel at a time
            relevance_test_budget (int): How many relevance tests are permitted per query. Higher values may provide higher quality results at higher cost
            irrelevant_community_restart (int): When to restart testing communities in relevance order
            analysis_update_interval (int): How many chunks to process before updating the analysis. Use 0 to skip analysis updates
        """
        self.adjacent_test_steps = adjacent_test_steps
        self.community_relevance_tests = community_relevance_tests
        self.community_ranking_chunks = community_ranking_chunks
        self.relevance_test_batch_size = relevance_test_batch_size
        self.relevance_test_budget = relevance_test_budget
        self.irrelevant_community_restart = irrelevant_community_restart
        self.analysis_update_interval = analysis_update_interval

    def __repr__(self) -> str:
        """Return a debug string listing every configuration field in constructor order."""
        # community_ranking_chunks was previously missing from this output,
        # so the repr did not reflect the full configuration.
        return (
            f"ChunkSearchConfig(adjacent_test_steps={self.adjacent_test_steps}, "
            f"community_relevance_tests={self.community_relevance_tests}, "
            f"community_ranking_chunks={self.community_ranking_chunks}, "
            f"relevance_test_batch_size={self.relevance_test_batch_size}, "
            f"relevance_test_budget={self.relevance_test_budget}, "
            f"irrelevant_community_restart={self.irrelevant_community_restart}, "
            f"analysis_update_interval={self.analysis_update_interval})"
        )

89 

class AnswerObject:
    def __init__(
        self,
        extended_answer: str,
        references: list[str],
        referenced_chunks: list[int],
        net_new_sources: int,
    ) -> None:
        """
        Holds the answer produced for a user query together with its sourcing details.

        Each argument is kept as-is on the instance under the same name.

        Args:
            extended_answer (str): The extended answer to the user query
            references (list[str]): The references used in the answer
            referenced_chunks (list[int]): The chunk IDs referenced in the answer
            net_new_sources (int): The number of new sources used in the answer
        """
        self.net_new_sources = net_new_sources
        self.referenced_chunks = referenced_chunks
        self.references = references
        self.extended_answer = extended_answer

    def __repr__(self):
        # Only the first 100 characters of the answer are shown, and the two
        # lists are summarised by their lengths, to keep the output short.
        answer_preview = self.extended_answer[:100]
        reference_count = len(self.references)
        chunk_count = len(self.referenced_chunks)
        return (
            f"AnswerObject(extended_answer={answer_preview}, "
            f"references={reference_count}, "
            f"referenced_chunks={chunk_count}, "
            f"net_new_sources={self.net_new_sources})"
        )

114