Coverage for intelligence_toolkit/query_text_data/graph_builder.py: 0%

106 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1# Copyright (c) 2024 Microsoft Corporation. All rights reserved. 

2# Licensed under the MIT license. See LICENSE file in the project. 

3 

4import re 

5from collections import defaultdict 

6 

7import nltk 

8import numpy as np 

9import networkx as nx 

10from graspologic import partition 

11from nltk.data import find 

12from textblob import TextBlob 

13 

14 

15def download_if_not_exists(resource_name) -> None: 

16 try: 

17 find(resource_name) 

18 except LookupError: 

19 nltk.download(resource_name) 

20 

21 

22download_if_not_exists("brown") 

23download_if_not_exists("punkt") 

24download_if_not_exists("punkt_tab") 

25 

26 

27def update_concept_graph_edges(node_to_period_counts, edge_to_period_counts, periods, chunk, cid, concept_to_cids, cid_to_concepts): 

28 nps = sorted(set(TextBlob(chunk).noun_phrases)) 

29 filtered_nps = [] 

30 for np in nps: 

31 # split on space 

32 parts = [p for p in re.split(r"[\s]+", np) if len(p) > 0] 

33 if len(parts) > 1 and all([re.match(r"^[a-zA-Z0-9\-]+\n?$", part) for part in parts]): 

34 filtered_nps.append(np.replace("\n", "")) 

35 filtered_nps = sorted(filtered_nps) 

36 for np in filtered_nps: 

37 concept_to_cids[np].append(cid) 

38 cid_to_concepts[cid] = filtered_nps 

39 for period in periods: 

40 for np in filtered_nps: 

41 node_to_period_counts[np][period] += 1 

42 for ix, np1 in enumerate(filtered_nps): 

43 for np2 in filtered_nps[ix + 1 :]: 

44 edge_to_period_counts[(np1, np2)][period] += 1 

45 

46 

47def prepare_concept_graphs(period_concept_graphs, max_cluster_size, min_edge_weight, min_node_degree): 

48 prepare_concept_graph(period_concept_graphs["ALL"], min_edge_weight, min_node_degree) 

49 hierarchical_communities, community_to_label = ( 

50 detect_concept_communities(period_concept_graphs["ALL"], max_cluster_size) 

51 ) 

52 if hierarchical_communities is not None: 

53 concept_to_community = hierarchical_communities.final_level_hierarchical_clustering() 

54 for period, G in period_concept_graphs.items(): 

55 for node in list(G.nodes()): 

56 if node not in concept_to_community: 

57 G.remove_node(node) 

58 for component in list(nx.connected_components(G)): 

59 highest_degree_node = max(component, key=lambda x: G.degree(x)) 

60 G.add_edge(highest_degree_node, 'dummynode', weight=1) 

61 for node, data in G.nodes(data=True): 

62 data['community'] = concept_to_community[node] if node in concept_to_community else -1 

63 

64 return hierarchical_communities, community_to_label 

65 

66def build_meta_graph(G, hierarchical_communities): 

67 level_to_communities = {} 

68 level_to_label_to_network = defaultdict(dict) 

69 if hierarchical_communities is not None and len(hierarchical_communities) > 0: 

70 max_level = max([hc.level for hc in hierarchical_communities]) 

71 level_community_nodes = {} 

72 for level in range(max_level+1): 

73 filtered_nodes = [hc for hc in hierarchical_communities if hc.level == level and hc.node != "dummynode"] 

74 community_nodes = defaultdict(set) 

75 for hc in filtered_nodes: 

76 community_nodes[hc.cluster].add(hc.node) 

77 level_community_nodes[level] = community_nodes 

78 sorted_communities = sorted(community_nodes.keys(), key=lambda x: len(community_nodes[x]), reverse=True) 

79 level_to_communities[level] = sorted_communities 

80 for level, community_list in level_to_communities.items(): 

81 for c in community_list: 

82 c_nodes = level_community_nodes[level][c] 

83 S = nx.subgraph(G, c_nodes) 

84 top_nodes = sorted(c_nodes, key=lambda x: G.degree(x), reverse=True)[:7] 

85 label = "; ".join(top_nodes) 

86 level_to_label_to_network[level][label] = S 

87 return level_to_label_to_network 

88 

89 

90def prepare_concept_graph(G, min_edge_weight, min_node_degree, std_trim=3): 

91 degrees = [x[1] for x in G.degree()] 

92 mean_degree = np.mean(degrees) 

93 std_degree = np.std(degrees) 

94 cutoff = mean_degree + std_trim * std_degree 

95 G.remove_nodes_from([n for n, d in G.degree() if d > cutoff]) 

96 G.remove_edges_from( 

97 [ 

98 (u, v) 

99 for u, v, d in G.edges(data=True) 

100 if u in G.nodes() and v in G.nodes() and d["weight"] < min_edge_weight 

101 ] 

102 ) 

103 G.remove_nodes_from([n for n, d in G.degree() if d < min_node_degree]) 

104 for component in list(nx.connected_components(G)): 

105 highest_degree_node = max(component, key=lambda x: G.degree(x)) 

106 G.add_edge(highest_degree_node, 'dummynode', weight=1) 

107 

108 

109def detect_concept_communities(G, max_cluster_size): 

110 hierarchical_communities = None 

111 community_to_label = {} 

112 if len(G.nodes()) > 0: 

113 hierarchical_communities = partition.hierarchical_leiden(G, max_cluster_size, random_seed=42) 

114 level_to_community_to_concepts = defaultdict(lambda: defaultdict(set)) 

115 level_to_community_to_parent = defaultdict(dict) 

116 level_to_community_to_children = defaultdict(lambda: defaultdict(set)) 

117 for hc in hierarchical_communities: 

118 level_to_community_to_concepts[hc.level][hc.cluster].add(hc.node) 

119 if hc.parent_cluster is not None: 

120 level_to_community_to_parent[hc.level][hc.cluster] = hc.parent_cluster 

121 level_to_community_to_children[hc.level][hc.parent_cluster].add(hc.cluster) 

122 

123 community_to_label = {} 

124 # rename communities as X.Y.Z where X is the size rank of the top-level community, Y is the size rank of the second-level community among all children of X, Y is the size rank of the third-level community among all children of Y, etc. 

125 for level, community_to_concepts in level_to_community_to_concepts.items(): 

126 for community, concepts in community_to_concepts.items(): 

127 if level == 0: 

128 root_rank = sorted(community_to_concepts, key=lambda x: len(community_to_concepts[x]), reverse=True).index(community) 

129 community_to_label[community] = f"1.{root_rank+1}" 

130 else: 

131 parent = level_to_community_to_parent[level][community] 

132 parent_label = community_to_label[parent] 

133 children = level_to_community_to_children[level][parent] 

134 child_rank = sorted(children, key=lambda x: len(level_to_community_to_concepts[level][x]), reverse=True).index(community) 

135 community_to_label[community] = f"{parent_label}.{child_rank+1}" 

136 return hierarchical_communities, community_to_label