Coverage for intelligence_toolkit/query_text_data/graph_builder.py: 0%
106 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
1# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
2# Licensed under the MIT license. See LICENSE file in the project.
4import re
5from collections import defaultdict
7import nltk
8import numpy as np
9import networkx as nx
10from graspologic import partition
11from nltk.data import find
12from textblob import TextBlob
15def download_if_not_exists(resource_name) -> None:
16 try:
17 find(resource_name)
18 except LookupError:
19 nltk.download(resource_name)
22download_if_not_exists("brown")
23download_if_not_exists("punkt")
24download_if_not_exists("punkt_tab")
27def update_concept_graph_edges(node_to_period_counts, edge_to_period_counts, periods, chunk, cid, concept_to_cids, cid_to_concepts):
28 nps = sorted(set(TextBlob(chunk).noun_phrases))
29 filtered_nps = []
30 for np in nps:
31 # split on space
32 parts = [p for p in re.split(r"[\s]+", np) if len(p) > 0]
33 if len(parts) > 1 and all([re.match(r"^[a-zA-Z0-9\-]+\n?$", part) for part in parts]):
34 filtered_nps.append(np.replace("\n", ""))
35 filtered_nps = sorted(filtered_nps)
36 for np in filtered_nps:
37 concept_to_cids[np].append(cid)
38 cid_to_concepts[cid] = filtered_nps
39 for period in periods:
40 for np in filtered_nps:
41 node_to_period_counts[np][period] += 1
42 for ix, np1 in enumerate(filtered_nps):
43 for np2 in filtered_nps[ix + 1 :]:
44 edge_to_period_counts[(np1, np2)][period] += 1
47def prepare_concept_graphs(period_concept_graphs, max_cluster_size, min_edge_weight, min_node_degree):
48 prepare_concept_graph(period_concept_graphs["ALL"], min_edge_weight, min_node_degree)
49 hierarchical_communities, community_to_label = (
50 detect_concept_communities(period_concept_graphs["ALL"], max_cluster_size)
51 )
52 if hierarchical_communities is not None:
53 concept_to_community = hierarchical_communities.final_level_hierarchical_clustering()
54 for period, G in period_concept_graphs.items():
55 for node in list(G.nodes()):
56 if node not in concept_to_community:
57 G.remove_node(node)
58 for component in list(nx.connected_components(G)):
59 highest_degree_node = max(component, key=lambda x: G.degree(x))
60 G.add_edge(highest_degree_node, 'dummynode', weight=1)
61 for node, data in G.nodes(data=True):
62 data['community'] = concept_to_community[node] if node in concept_to_community else -1
64 return hierarchical_communities, community_to_label
66def build_meta_graph(G, hierarchical_communities):
67 level_to_communities = {}
68 level_to_label_to_network = defaultdict(dict)
69 if hierarchical_communities is not None and len(hierarchical_communities) > 0:
70 max_level = max([hc.level for hc in hierarchical_communities])
71 level_community_nodes = {}
72 for level in range(max_level+1):
73 filtered_nodes = [hc for hc in hierarchical_communities if hc.level == level and hc.node != "dummynode"]
74 community_nodes = defaultdict(set)
75 for hc in filtered_nodes:
76 community_nodes[hc.cluster].add(hc.node)
77 level_community_nodes[level] = community_nodes
78 sorted_communities = sorted(community_nodes.keys(), key=lambda x: len(community_nodes[x]), reverse=True)
79 level_to_communities[level] = sorted_communities
80 for level, community_list in level_to_communities.items():
81 for c in community_list:
82 c_nodes = level_community_nodes[level][c]
83 S = nx.subgraph(G, c_nodes)
84 top_nodes = sorted(c_nodes, key=lambda x: G.degree(x), reverse=True)[:7]
85 label = "; ".join(top_nodes)
86 level_to_label_to_network[level][label] = S
87 return level_to_label_to_network
90def prepare_concept_graph(G, min_edge_weight, min_node_degree, std_trim=3):
91 degrees = [x[1] for x in G.degree()]
92 mean_degree = np.mean(degrees)
93 std_degree = np.std(degrees)
94 cutoff = mean_degree + std_trim * std_degree
95 G.remove_nodes_from([n for n, d in G.degree() if d > cutoff])
96 G.remove_edges_from(
97 [
98 (u, v)
99 for u, v, d in G.edges(data=True)
100 if u in G.nodes() and v in G.nodes() and d["weight"] < min_edge_weight
101 ]
102 )
103 G.remove_nodes_from([n for n, d in G.degree() if d < min_node_degree])
104 for component in list(nx.connected_components(G)):
105 highest_degree_node = max(component, key=lambda x: G.degree(x))
106 G.add_edge(highest_degree_node, 'dummynode', weight=1)
109def detect_concept_communities(G, max_cluster_size):
110 hierarchical_communities = None
111 community_to_label = {}
112 if len(G.nodes()) > 0:
113 hierarchical_communities = partition.hierarchical_leiden(G, max_cluster_size, random_seed=42)
114 level_to_community_to_concepts = defaultdict(lambda: defaultdict(set))
115 level_to_community_to_parent = defaultdict(dict)
116 level_to_community_to_children = defaultdict(lambda: defaultdict(set))
117 for hc in hierarchical_communities:
118 level_to_community_to_concepts[hc.level][hc.cluster].add(hc.node)
119 if hc.parent_cluster is not None:
120 level_to_community_to_parent[hc.level][hc.cluster] = hc.parent_cluster
121 level_to_community_to_children[hc.level][hc.parent_cluster].add(hc.cluster)
123 community_to_label = {}
124 # rename communities as X.Y.Z where X is the size rank of the top-level community, Y is the size rank of the second-level community among all children of X, Y is the size rank of the third-level community among all children of Y, etc.
125 for level, community_to_concepts in level_to_community_to_concepts.items():
126 for community, concepts in community_to_concepts.items():
127 if level == 0:
128 root_rank = sorted(community_to_concepts, key=lambda x: len(community_to_concepts[x]), reverse=True).index(community)
129 community_to_label[community] = f"1.{root_rank+1}"
130 else:
131 parent = level_to_community_to_parent[level][community]
132 parent_label = community_to_label[parent]
133 children = level_to_community_to_children[level][parent]
134 child_rank = sorted(children, key=lambda x: len(level_to_community_to_concepts[level][x]), reverse=True).index(community)
135 community_to_label[community] = f"{parent_label}.{child_rank+1}"
136 return hierarchical_communities, community_to_label