Coverage for .tox/p312/lib/python3.10/site-packages/scicom/utilities/statistics.py: 92%
106 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-28 12:02 +0200
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-28 12:02 +0200
1"""Prune a network."""
2import igraph as ig
3import numpy as np
4import pandas as pd
class PruneNetwork:
    """Create statistics for communication networks by deletion.

    For a given dataset with sender and receiver information,
    create a weighted network with igraph. For a given number
    of iterations, deletion amounts, and deletion types, the
    algorithm then generates network statistics for randomly
    sampled subnetworks.
    """

    def __init__(self, dataframe:pd.DataFrame) -> None:
        """Initialize pruning.

        The dataframe is expected to provide the columns
        "sender", "receiver", "sender_location",
        "receiver_location" and "step".
        """
        self.inputDF = dataframe

    def makeNet(self, dataframe:pd.DataFrame) -> ig.Graph:
        """Create network from dataframe.

        Assumes the existence of sender, receiver and step
        column names. Builds a directed graph whose edge weight
        is the number of contacts per (sender, receiver) pair and
        annotates every node with the location of its most recent
        activity in the *full* input data.

        Raises:
            ValueError: If an agent has neither send nor receive
                steps in the input data, so no location exists.
        """
        networkdata = dataframe.groupby(["sender", "receiver"]).agg({"step": lambda x: x.to_list()}).reset_index()
        counts = networkdata.step.apply(len)
        networkdata.insert(3, "weight", counts)
        graph = ig.Graph.TupleList(
            networkdata.itertuples(index=False), directed=True, edge_attrs=["step", "weight"],
        )
        for node in graph.vs:
            agent = node["name"]
            edgSend = self.inputDF.query("sender == @agent")
            maxSend = edgSend.step.max()
            edgRec = self.inputDF.query("receiver == @agent")
            maxRec = edgRec.step.max()
            # Guard first: with both maxima NaN the branches below would
            # query an empty frame and fail with IndexError on .iloc[0]
            # instead of a meaningful error (previously unreachable else).
            if np.isnan(maxSend) and np.isnan(maxRec):
                text = f"No location for agent {agent}, got max send {maxSend} and max rec {maxRec}."
                raise ValueError(text)
            if maxSend > maxRec or np.isnan(maxRec):
                # Last activity was sending (or the agent never received).
                lastLoc = edgSend.query("step == @maxSend")["sender_location"].iloc[0]
            else:
                # Last activity was receiving; ties resolve to receiving,
                # matching the original branch ordering.
                lastLoc = edgRec.query("step == @maxRec")["receiver_location"].iloc[0]
            node["location"] = lastLoc
        return graph

    def setSurvivalProb(self, graph:ig.Graph, *, method:str = "agents", ranked:bool = True) -> pd.DataFrame:
        """Generate probabilities for different survival modes.

        Depending on ``method``, units are agents (ranked by
        in-degree), regions (ranked by number of distinct agents
        last seen there) or time steps (ranked by recency). Each
        unit is then paired row-by-row with descending draws from
        several distributions.

        Raises:
            ValueError: If ``method`` is not one of "agents",
                "regions" or "time".
        """
        if method == "agents":
            tempData = pd.DataFrame(
                {"id": graph.vs["name"], "degree": graph.indegree()},
            )
            tempData = tempData.sort_values("degree", ascending=False) if ranked else tempData.sample(frac=1)
        elif method == "regions":
            tempData = pd.DataFrame(
                pd.concat(
                    [self.inputDF.sender_location, self.inputDF.receiver_location],
                ).unique(), columns = ["location"],
            )
            locations = pd.DataFrame({"id":graph.vs["name"], "location":graph.vs["location"]})
            locations = locations.groupby("location")["id"].nunique().reset_index(name = "count")
            tempData = tempData.merge(locations, how="left").fillna(0)
            tempData = tempData.sort_values("count", ascending = False) if ranked else tempData.sample(frac=1)
        elif method == "time":
            tempData = pd.DataFrame({"step": range(self.inputDF.step.max() + 1)})
            tempData = tempData.sort_values("step", ascending = False) if ranked else tempData.sample(frac=1)
        else:
            # Previously an unknown method crashed later with NameError.
            text = f"Unknown method {method}, expected one of agents, regions or time."
            raise ValueError(text)
        # Reset the permuted index: concat(axis=1) aligns on index, so
        # without this the sorted/sampled order would be silently undone
        # and the descending probability draws would pair with arbitrary
        # rows instead of the ranked ones.
        tempData = tempData.reset_index(drop=True)
        rng = np.random.default_rng()
        probabilities = pd.DataFrame(
            {
                "unif": -np.sort(-rng.uniform(0, 1, len(tempData))),
                "log_normal1": -np.sort(-rng.lognormal(0, 1/2, len(tempData))),
                "log_normal2": -np.sort(-rng.lognormal(0, 1, len(tempData))),
                "log_normal3": -np.sort(-rng.lognormal(0, 2, len(tempData))),
                "exp": -np.sort(-rng.exponential(10, len(tempData))),
                "beta": -np.sort(-rng.beta(a=4, b=5, size=len(tempData))),
            },
        )
        return pd.concat([tempData, probabilities], axis = 1)

    def scaleSurvivalProb(self, probabilities:pd.DataFrame, *, method:str = "agents") -> pd.DataFrame:
        """Scale survival for methods agents and regions.

        Joins the per-unit probabilities onto the edge list and
        replaces each distribution column with the normalized
        product of the sender-side and receiver-side values.
        Method "time" needs no scaling and is returned unchanged.

        Raises:
            ValueError: If ``method`` is not one of "agents",
                "regions" or "time".
        """
        colsType = ["unif", "beta", "exp", "log_normal1", "log_normal2", "log_normal3"]
        if method == "time":
            return probabilities
        if method == "agents":
            cols = ["sender", "receiver"]
            cols.extend(colsType)
            tempData = self.inputDF[["sender", "receiver"]].drop_duplicates().merge(
                probabilities, left_on="sender", right_on="id",
            )
            tempData = tempData.merge(probabilities, left_on="receiver", right_on="id")
        elif method == "regions":
            cols = ["sender_location", "receiver_location"]
            cols.extend(colsType)
            tempData = self.inputDF[["sender_location", "receiver_location"]].drop_duplicates().merge(
                probabilities, left_on="sender_location", right_on="location",
            )
            tempData = tempData.merge(probabilities, left_on="receiver_location", right_on="location")
        else:
            # Previously an unknown method crashed later with NameError.
            text = f"Unknown method {method}, expected one of agents, regions or time."
            raise ValueError(text)
        for i in colsType:
            # Product of both endpoints' values, normalized by the dot
            # product so the column sums to 1 (a probability vector).
            tempData[i] = tempData[i + "_x"] * tempData[i + "_y"] / np.dot(tempData[i + "_x"], tempData[i + "_y"])
        return tempData[cols]

    def basicNetStats(self, graph:ig.Graph) -> pd.DataFrame:
        """Generate base statistics of network.

        Returns one row per node with degree, degree rank and
        several centrality measures.
        """
        # Find the degree centrality
        tempData = pd.DataFrame({"Degree":graph.degree()})
        # Find the ranking (ties share the lowest rank)
        tempData["Rank"] = tempData["Degree"].rank(method = "min", ascending = False)
        # Adding other types of centrality
        tempData["Betweenness"] = graph.betweenness()
        tempData["Closeness"] = graph.closeness()
        tempData["Eigenvector"] = graph.eigenvector_centrality()
        tempData["Page_Rank"] = graph.pagerank()
        return tempData

    def netStats(self, G:ig.Graph) -> pd.DataFrame:
        """Generate network statistics.

        Returns a single-row dataframe of whole-graph measures.
        """
        # Number of components:
        no_components = len(G.components())
        # Number of maximal cliques:
        # TODO(Malte): Consider if these are necessary. Performance!
        # no_cliques = len(G.maximal_cliques())
        # Size of the largest clique:
        # size_clique = G.omega()
        # Average path length:
        avg_path = G.average_path_length()
        # Diameter:
        diameter = G.diameter()
        # Modularity:
        modularity = G.modularity(G.components())
        # Transitivity:
        transitivity = G.transitivity_undirected()
        # Cohesion
        cohesion = G.cohesion()
        # Degree assortativity:
        assortativity = G.assortativity_degree()
        # Find the in-degree centrality for each node:
        indegrees = G.indegree()
        # Average relative degree:
        N = len(G.vs)
        avg_rel_degree = np.mean([x/N for x in indegrees])
        # Tail estimator (Hill):
        hill = ig.statistics.power_law_fit(
            indegrees,
            xmin=None,
            method = "hill",
        ).alpha
        # Centralization (Freeman-style, normalized by (N-1)^2):
        max_indegree = max(indegrees)
        centralization = float(N*max_indegree - sum(indegrees))/(N-1)**2
        return pd.DataFrame([{
            "no_components":no_components,
            # "no_cliques":no_cliques,
            # "size_clique":size_clique,
            "diameter":diameter,
            "avg_path":avg_path,
            "modularity":modularity,
            "transitivity":transitivity,
            "cohesion":cohesion,
            "assortativity":assortativity,
            "avg_degree":avg_rel_degree,
            "centralization":centralization,
            "hill":hill,
        }])

    def deleteFromNetwork(
        self,
        iterations: int = 10,
        delAmounts: tuple = (0.1, 0.25, 0.5, 0.75, 0.9),
        delTypes: tuple = ("unif", "log_normal1", "exp", "beta", "log_normal2", "log_normal3"),
        delMethod: tuple = ("agents", "regions", "time"),
        rankedVals: tuple = (True, False),
    ) -> pd.DataFrame:
        """Run the deletion by sampling.

        Computes statistics for the full network first (tagged
        with delVal=0), then for every combination of iteration,
        method, ranking, deletion amount and deletion type draws
        a weighted subsample of the input and records its
        statistics. Returns all rows concatenated.
        """
        results = []
        fullNet = self.makeNet(
            self.inputDF,
        )
        fullStats = self.netStats(fullNet)
        fullStats = fullStats.assign(
            delVal=0, delType="NA", delIteration=0, delMethod="NA", rankedVal="NA",
        )
        results.append(fullStats)
        for idx in range(1, iterations + 1):
            for method in delMethod:
                for ranked in rankedVals:
                    probVals = self.setSurvivalProb(
                        fullNet, method=method, ranked=ranked,
                    )
                    prunVals = self.scaleSurvivalProb(
                        probVals, method=method,
                    )
                    # Merge on the shared columns to attach a survival
                    # probability to every interaction record.
                    tempDF = self.inputDF.merge(
                        prunVals,
                    )
                    for val in delAmounts:
                        for deltype in delTypes:
                            # Keep (1 - val) of the records, weighted by
                            # the chosen survival distribution.
                            delDF = tempDF.sample(
                                frac = (1 - val),
                                weights=deltype,
                            )
                            delNet = self.makeNet(delDF)
                            delStats = self.netStats(delNet)
                            delStats = delStats.assign(
                                delVal=val, delType=deltype, delIteration=idx, delMethod=method, rankedVal=ranked,
                            )
                            results.append(delStats)
        return pd.concat(results)
def prune(
        modelparameters: dict,
        network: tuple,
        columns: list,
        iterations: int = 10,
        delAmounts: tuple = (0.1, 0.25, 0.5, 0.75, 0.9),
        delTypes: tuple = ("unif", "log_normal1", "exp", "beta", "log_normal2", "log_normal3"),
        delMethod: tuple = ("agents", "regions", "time"),
        rankedVals: tuple = (True, False)) -> pd.DataFrame:
    """Generate pruned networks from input.

    Assumes existence of columns "sender", "receiver",
    "sender_location", "receiver_location" and "step".

    Builds a dataframe from the raw network tuples, delegates the
    deletion sweep to PruneNetwork, and tags every result row with
    the supplied model parameters.
    """
    inputData = pd.DataFrame(network, columns=columns)
    pruned = PruneNetwork(inputData).deleteFromNetwork(
        iterations=iterations,
        delAmounts=delAmounts,
        delTypes=delTypes,
        delMethod=delMethod,
        rankedVals=rankedVals,
    )
    # Attach the model parameters as constant columns for later grouping.
    return pruned.assign(**modelparameters)