Coverage for .tox/p312/lib/python3.10/site-packages/scicom/utilities/statistics.py: 92%

106 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-05-28 12:02 +0200

1"""Prune a network.""" 

2import igraph as ig 

3import numpy as np 

4import pandas as pd 

5 

6 

class PruneNetwork:
    """Compute network statistics under simulated record loss.

    From a table of sender/receiver events, builds a weighted,
    directed igraph network, then repeatedly samples reduced
    versions of it for a given number of iterations, deletion
    amounts, and deletion types, recording summary statistics
    for every pruned subnetwork.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        """Store the raw event table used by all pruning runs."""
        # Expected columns: sender, receiver, sender_location,
        # receiver_location, step.
        self.inputDF = dataframe

20 

21 def makeNet(self, dataframe:pd.DataFrame) -> ig.Graph: 

22 """Create network from dataframe. 

23 

24 Assumes the existence of sender, receiver and step 

25 column names. 

26 """ 

27 networkdata = dataframe.groupby(["sender", "receiver"]).agg({"step": lambda x: x.to_list()}).reset_index() 

28 counts = networkdata.step.apply(lambda x : len(x)) 

29 networkdata.insert(3, "weight", counts) 

30 graph = ig.Graph.TupleList( 

31 networkdata.itertuples(index=False), directed=True, edge_attrs=["step", "weight"], 

32 ) 

33 for node in graph.vs: 

34 agent = node["name"] 

35 edgSend = self.inputDF.query("sender == @agent") 

36 maxSend = edgSend.step.max() 

37 edgRec = self.inputDF.query("receiver == @agent") 

38 maxRec = edgRec.step.max() 

39 if maxSend > maxRec or np.isnan(maxRec): 

40 lastLoc = edgSend.query("step == @maxSend")["sender_location"].iloc[0] 

41 elif maxSend < maxRec or maxSend == maxRec or np.isnan(maxSend): 

42 lastLoc = edgRec.query("step == @maxRec")["receiver_location"].iloc[0] 

43 else: 

44 text = f"No location for agent {agent}, got max send {maxSend} and max rec {maxRec}." 

45 raise ValueError(text) 

46 node["location"] = lastLoc 

47 return graph 

48 

49 def setSurvivalProb(self, graph:ig.Graph, *, method:str = "agents", ranked:bool = True) -> pd.DataFrame: 

50 """Generate probabilities for different survival modes.""" 

51 if method == "agents": 

52 tempData = pd.DataFrame( 

53 {"id": graph.vs["name"], "degree": graph.indegree()}, 

54 ) 

55 tempData = tempData.sort_values("degree", ascending=False) if ranked else tempData.sample(frac=1) 

56 elif method == "regions": 

57 tempData = pd.DataFrame( 

58 pd.concat( 

59 [self.inputDF.sender_location, self.inputDF.receiver_location], 

60 ).unique(), columns = ["location"], 

61 ) 

62 locations = pd.DataFrame({"id":graph.vs["name"], "location":graph.vs["location"]}) 

63 locations = locations.groupby("location")["id"].nunique().reset_index(name = "count") 

64 tempData = tempData.merge(locations, how="left").fillna(0) 

65 tempData = tempData.sort_values("count", ascending = False) if ranked else tempData.sample(frac=1) 

66 elif method == "time": 

67 tempData = pd.DataFrame({"step": range(self.inputDF.step.max() + 1)}) 

68 tempData = tempData.sort_values("step", ascending = False) if ranked else tempData.sample(frac=1) 

69 rng = np.random.default_rng() 

70 probabilities = pd.DataFrame( 

71 { 

72 "unif": -np.sort(-rng.uniform(0, 1, len(tempData))), 

73 "log_normal1": -np.sort(-rng.lognormal(0, 1/2, len(tempData))), 

74 "log_normal2": -np.sort(-rng.lognormal(0, 1, len(tempData))), 

75 "log_normal3": -np.sort(-rng.lognormal(0, 2, len(tempData))), 

76 "exp": -np.sort(-rng.exponential(10, len(tempData))), 

77 "beta": -np.sort(-rng.beta(a=4, b=5, size=len(tempData))), 

78 }, 

79 ) 

80 return pd.concat([tempData, probabilities], axis = 1) 

81 

82 def scaleSurvivalProb(self, probabilities:pd.DataFrame, *, method:str = "agents") -> pd.DataFrame: 

83 """Scale survival for methods agents and regions.""" 

84 colsType = ["unif", "beta", "exp", "log_normal1", "log_normal2", "log_normal3"] 

85 if method == "time": 

86 return probabilities 

87 if method == "agents": 

88 cols = ["sender", "receiver"] 

89 cols.extend(colsType) 

90 tempData = self.inputDF[["sender", "receiver"]].drop_duplicates().merge( 

91 probabilities, left_on="sender", right_on="id", 

92 ) 

93 tempData = tempData.merge(probabilities, left_on="receiver", right_on="id") 

94 if method == "regions": 

95 cols = ["sender_location", "receiver_location"] 

96 cols.extend(colsType) 

97 tempData = self.inputDF[["sender_location", "receiver_location"]].drop_duplicates().merge( 

98 probabilities, left_on="sender_location", right_on="location", 

99 ) 

100 tempData = tempData.merge(probabilities, left_on="receiver_location", right_on="location") 

101 for i in colsType: 

102 tempData[i] = tempData[i + "_x"] * tempData[i + "_y"] / np.dot(tempData[i + "_x"], tempData[i + "_y"]) 

103 return tempData[cols] 

104 

105 def basicNetStats(self, graph:ig.Graph) -> pd.DataFrame: 

106 """Generate base statistics of network.""" 

107 #Find the degree centrality 

108 tempData = pd.DataFrame({"Degree":graph.degree()}) 

109 

110 #Find the ranking 

111 tempData["Rank"] = tempData["Degree"].rank(method = "min", ascending = False) 

112 

113 #Adding other types of centrality 

114 tempData["Betweenness"] = graph.betweenness() 

115 tempData["Closeness"] = graph.closeness() 

116 tempData["Eigenvector"] = graph.eigenvector_centrality() 

117 tempData["Page_Rank"] = graph.pagerank() 

118 

119 return tempData 

120 

121 def netStats(self, G:ig.Graph) -> pd.DataFrame: 

122 """Generate network statistics.""" 

123 #Number of components: 

124 no_components = len(G.components()) 

125 #Number of maximal cliques: 

126 # TODO(Malte): Consider if these are necessary. Performance! 

127 # no_cliques = len(G.maximal_cliques()) 

128 #Size of the largest clique: 

129 # size_clique = G.omega() 

130 #Average path length: 

131 avg_path = G.average_path_length() 

132 #Diameter: 

133 diameter = G.diameter() 

134 #Modularity: 

135 modularity = G.modularity(G.components()) 

136 #Transitivity: 

137 transitivity = G.transitivity_undirected() 

138 #Cohesion 

139 cohesion = G.cohesion() 

140 #Degree assortativity: 

141 assortativity = G.assortativity_degree() 

142 #Find the in-degree centrality for each node: 

143 indegrees = G.indegree() 

144 #Average relative degree: 

145 N = len(G.vs) 

146 avg_rel_degree = np.mean([x/N for x in indegrees]) 

147 #Tail estimator (Hill): 

148 hill = ig.statistics.power_law_fit( 

149 indegrees, 

150 xmin=None, 

151 method = "hill", 

152 ).alpha 

153 #Centralization: 

154 max_indegree = max(indegrees) 

155 centralization = float(N*max_indegree - sum(indegrees))/(N-1)**2 

156 

157 return pd.DataFrame([{ 

158 "no_components":no_components, 

159 # "no_cliques":no_cliques, 

160 # "size_clique":size_clique, 

161 "diameter":diameter, 

162 "avg_path":avg_path, 

163 "modularity":modularity, 

164 "transitivity":transitivity, 

165 "cohesion":cohesion, 

166 "assortativity":assortativity, 

167 "avg_degree":avg_rel_degree, 

168 "centralization":centralization, 

169 "hill":hill, 

170 }]) 

171 

172 def deleteFromNetwork( 

173 self, 

174 iterations: int = 10, 

175 delAmounts: tuple = (0.1, 0.25, 0.5, 0.75, 0.9), 

176 delTypes: tuple = ("unif", "log_normal1", "exp", "beta", "log_normal2", "log_normal3"), 

177 delMethod: tuple = ("agents", "regions", "time"), 

178 rankedVals: tuple = (True, False), 

179 ) -> pd.DataFrame: 

180 """Run the deletion by sampling.""" 

181 results = [] 

182 fullNet = self.makeNet( 

183 self.inputDF, 

184 ) 

185 fullStats = self.netStats(fullNet) 

186 fullStats = fullStats.assign( 

187 delVal=0, delType="NA", delIteration=0, delMethod="NA", rankedVal="NA", 

188 ) 

189 results.append(fullStats) 

190 for idx in range(1, iterations + 1): 

191 for method in delMethod: 

192 for ranked in rankedVals: 

193 probVals = self.setSurvivalProb( 

194 fullNet, method=method, ranked=ranked, 

195 ) 

196 prunVals = self.scaleSurvivalProb( 

197 probVals, method=method, 

198 ) 

199 tempDF = self.inputDF.merge( 

200 prunVals, 

201 ) 

202 for val in list(delAmounts): 

203 for deltype in list(delTypes): 

204 delDF = tempDF.sample( 

205 frac = (1 - val), 

206 weights=deltype, 

207 ) 

208 delNet = self.makeNet(delDF) 

209 delStats = self.netStats(delNet) 

210 delStats = delStats.assign( 

211 delVal=val, delType=deltype, delIteration=idx, delMethod=method, rankedVal=ranked, 

212 ) 

213 results.append(delStats) 

214 return pd.concat(results) 

215 

216 

217 

def prune(
        modelparameters: dict,
        network: tuple,
        columns: list,
        iterations: int = 10,
        delAmounts: tuple = (0.1, 0.25, 0.5, 0.75, 0.9),
        delTypes: tuple = ("unif", "log_normal1", "exp", "beta", "log_normal2", "log_normal3"),
        delMethod: tuple = ("agents", "regions", "time"),
        rankedVals: tuple = (True, False)) -> pd.DataFrame:
    """Generate pruned networks from input.

    Assumes existence of columns "sender", "receiver",
    "sender_location", "receiver_location" and "step".
    """
    events = pd.DataFrame(network, columns=columns)
    stats = PruneNetwork(events).deleteFromNetwork(
        iterations=iterations,
        delAmounts=delAmounts,
        delTypes=delTypes,
        delMethod=delMethod,
        rankedVals=rankedVals,
    )
    # Tag every result row with the model parameters that produced it.
    return stats.assign(**modelparameters)