import re
from typing import List

import numpy as np

from pandas.util._decorators import Appender, deprecate_kwarg

from pandas.core.dtypes.common import is_extension_array_dtype, is_list_like
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.generic import ABCMultiIndex
from pandas.core.dtypes.missing import notna

from pandas.core.arrays import Categorical
import pandas.core.common as com
from pandas.core.frame import DataFrame, _shared_docs
from pandas.core.indexes.base import Index
from pandas.core.reshape.concat import concat
from pandas.core.tools.numeric import to_numeric


@Appender(
    _shared_docs["melt"]
    % dict(caller="pd.melt(df, ", versionadded="", other="DataFrame.melt")
)
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
) -> DataFrame:
    # TODO: what about the existing index?
    # If multiindex, gather names of columns on all levels for checking presence
    # of `id_vars` and `value_vars`
    if isinstance(frame.columns, ABCMultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        elif isinstance(frame.columns, ABCMultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'id_vars' are not present "
                    "in the DataFrame: {missing}"
                    "".format(missing=list(missing))
                )
    else:
        id_vars = []

    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns, ABCMultiIndex) and not isinstance(
            value_vars, list
        ):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'value_vars' are not present in "
                    "the DataFrame: {missing}"
                    "".format(missing=list(missing))
                )
        frame = frame.loc[:, id_vars + value_vars]
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, ABCMultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = [
                    "variable_{i}".format(i=i) for i in range(len(frame.columns.names))
                ]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    N, K = frame.shape
    K -= len(id_vars)

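    # From here on the melted frame is assembled column by column: it has
    # N * K rows, each id column is repeated K times, and the value block is
    # flattened column-major ("F" order). For example, with
    #     frame = DataFrame({"id": [1, 2], "a": [3, 4], "b": [5, 6]})
    # melt(frame, id_vars="id") tiles "id" to [1, 2, 1, 2] and yields
    # value = [3, 4, 5, 6] alongside variable = ["a", "a", "b", "b"].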

    mdata = {}
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            id_data = concat([id_data] * K, ignore_index=True)
        else:
            id_data = np.tile(id_data.values, K)
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    mdata[value_name] = frame.values.ravel("F")
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(frame.columns._get_level_values(i)).repeat(N)

    return frame._constructor(mdata, columns=mcolumns)


@deprecate_kwarg(old_arg_name="label", new_arg_name=None)
def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFrame:
    """
    Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.

    Parameters
    ----------
    data : DataFrame
    groups : dict
        {new_name : list_of_columns}
    dropna : bool, default True

    Examples
    --------
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2007], 'year2': [2008, 2008]})
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2007   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team  year   hr
    0  Red Sox  2007  514
    1  Yankees  2007  573
    2  Red Sox  2008  545
    3  Yankees  2008  526

    Returns
    -------
    reshaped : DataFrame
    """
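    # `groups` is accepted either as a dict, {new_name: list_of_columns}, or as
    # a sequence of (new_name, list_of_columns) pairs, e.g.
    #     lreshape(data, [("year", ["year1", "year2"]), ("hr", ["hr1", "hr2"])])
    # which is unpacked by the zip(*groups) branch below.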

    if isinstance(groups, dict):
        keys = list(groups.keys())
        values = list(groups.values())
    else:
        keys, values = zip(*groups)

    all_cols = list(set.union(*[set(x) for x in values]))
    id_cols = list(data.columns.difference(all_cols))

    K = len(values[0])

    for seq in values:
        if len(seq) != K:
            raise ValueError("All column lists must be same length")

    mdata = {}
    pivot_cols = []

    for target, names in zip(keys, values):
        to_concat = [data[col].values for col in names]

        mdata[target] = concat_compat(to_concat)
        pivot_cols.append(target)

    for col in id_cols:
        mdata[col] = np.tile(data[col].values, K)

    if dropna:
        mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for c in pivot_cols:
            mask &= notna(mdata[c])
        if not mask.all():
            mdata = {k: v[mask] for k, v in mdata.items()}

    return data._constructor(mdata, columns=id_cols + pivot_cols)


def wide_to_long(
    df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
) -> DataFrame:
    r"""
    Wide panel to long format. Less flexible but more user-friendly than melt.

    With stubnames ['A', 'B'], this function expects to find one or more
    groups of columns with format
    A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
    You specify what you want to call this suffix in the resulting long format
    with `j` (for example `j='year'`).

    Each row of these wide variables is assumed to be uniquely identified by
    `i` (can be a single column name or a list of column names).

    All remaining variables in the data frame are left intact.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame.
    stubnames : str or list-like
        The stub name(s). The wide format variables are assumed to
        start with the stub names.
    i : str or list-like
        Column(s) to use as id variable(s).
    j : str
        The name of the sub-observation variable. What you wish to name your
        suffix in the long format.
    sep : str, default ""
        A character indicating the separation of the variable names
        in the wide format, to be stripped from the names in the long format.
        For example, if your column names are A-suffix1, A-suffix2, you
        can strip the hyphen by specifying `sep='-'`.
    suffix : str, default '\\d+'
        A regular expression capturing the wanted suffixes. '\\d+' captures
        numeric suffixes. Suffixes with no numbers could be specified with the
        negated character class '\\D+'. You can also further disambiguate
        suffixes, for example, if your wide variables are of the form
        A-one, B-two,..., and you have an unrelated column A-rating, you can
        ignore the last one by specifying `suffix='(one|two)'`.

        .. versionchanged:: 0.23.0
            When all suffixes are numeric, they are cast to int64/float64.

    Returns
    -------
    DataFrame
        A DataFrame that contains each stub name as a variable, with new index
        (i, j).

    Notes
    -----
    All extra variables are left untouched. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.

    Examples
    --------
    >>> np.random.seed(123)
    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
    ...                    "X"     : dict(zip(range(3), np.random.randn(3)))
    ...                   })
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
    ... # doctest: +NORMALIZE_WHITESPACE
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    With multiple id columns

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9
    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     1    2.8
                2    3.4
          2     1    2.9
                2    3.8
          3     1    2.2
                2    2.9
    2     1     1    2.0
                2    3.2
          2     1    1.8
                2    2.8
          3     1    1.9
                2    2.4
    3     1     1    2.2
                2    3.3
          2     1    2.3
                2    3.4
          3     1    2.1
                2    2.9

    Going from long back to wide just takes some creative use of `unstack`

    >>> w = l.unstack()
    >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
    >>> w.reset_index()
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9

    Less wieldy column names are also handled

    >>> np.random.seed(0)
    >>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3),
    ...                    'A(weekly)-2011': np.random.rand(3),
    ...                    'B(weekly)-2010': np.random.rand(3),
    ...                    'B(weekly)-2011': np.random.rand(3),
    ...                    'X' : np.random.randint(3, size=3)})
    >>> df['id'] = df.index
    >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
       A(weekly)-2010  A(weekly)-2011  B(weekly)-2010  B(weekly)-2011  X  id
    0        0.548814        0.544883        0.437587        0.383442  0   0
    1        0.715189        0.423655        0.891773        0.791725  1   1
    2        0.602763        0.645894        0.963663        0.528895  1   2

    >>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id',
    ...                 j='year', sep='-')
    ... # doctest: +NORMALIZE_WHITESPACE
             X  A(weekly)  B(weekly)
    id year
    0  2010  0   0.548814   0.437587
    1  2010  1   0.715189   0.891773
    2  2010  1   0.602763   0.963663
    0  2011  0   0.544883   0.383442
    1  2011  1   0.423655   0.791725
    2  2011  1   0.645894   0.528895

    If we have many columns, we could also use a regex to find our
    stubnames and pass that list on to wide_to_long

    >>> stubnames = sorted(
    ...     set([match[0] for match in df.columns.str.findall(
    ...         r'[A-B]\(.*\)').values if match != []])
    ... )
    >>> list(stubnames)
    ['A(weekly)', 'B(weekly)']

    All of the above examples have integers as suffixes. It is possible to
    have non-integers as suffixes.

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht_one  ht_two
    0      1      1     2.8     3.4
    1      1      2     2.9     3.8
    2      1      3     2.2     2.9
    3      2      1     2.0     3.2
    4      2      2     1.8     2.8
    5      2      3     1.9     2.4
    6      3      1     2.2     3.3
    7      3      2     2.3     3.4
    8      3      3     2.1     2.9

    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
    ...                     sep='_', suffix='\w+')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     one  2.8
                two  3.4
          2     one  2.9
                two  3.8
          3     one  2.2
                two  2.9
    2     1     one  2.0
                two  3.2
          2     one  1.8
                two  2.8
          3     one  1.9
                two  2.4
    3     1     one  2.2
                two  3.3
          2     one  2.3
                two  3.4
          3     one  2.1
                two  2.9
    """

    def get_var_names(df, stub: str, sep: str, suffix: str) -> List[str]:
        regex = r"^{stub}{sep}{suffix}$".format(
            stub=re.escape(stub), sep=re.escape(sep), suffix=suffix
        )
        pattern = re.compile(regex)
        return [col for col in df.columns if pattern.match(col)]
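    # For example, with stub "ht", sep "_" and suffix r"\w+", the compiled
    # pattern is equivalent to r"^ht_\w+$", so columns such as "ht_one" and
    # "ht_two" match while "ht" or an unrelated "height" column would not.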

    def melt_stub(df, stub: str, i, j, value_vars, sep: str):
        newdf = melt(
            df,
            id_vars=i,
            value_vars=value_vars,
            value_name=stub.rstrip(sep),
            var_name=j,
        )
        newdf[j] = Categorical(newdf[j])
        newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")

        # GH17627 Cast numeric suffixes to int/float
        newdf[j] = to_numeric(newdf[j], errors="ignore")

        return newdf.set_index(i + [j])
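    # melt_stub above strips the "<stub><sep>" prefix from each melted column
    # name, so e.g. "ht_one" contributes the value "one" to the `j` level;
    # purely numeric suffixes (e.g. "1970") are then cast by to_numeric, while
    # non-numeric suffixes stay as strings (errors="ignore").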

    if not is_list_like(stubnames):
        stubnames = [stubnames]
    else:
        stubnames = list(stubnames)

    if any(col in stubnames for col in df.columns):
        raise ValueError("stubname can't be identical to a column name")

    if not is_list_like(i):
        i = [i]
    else:
        i = list(i)

    if df[i].duplicated().any():
        raise ValueError("the id variables need to uniquely identify each row")

    value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]

    value_vars_flattened = [e for sublist in value_vars for e in sublist]
    id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))

    _melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)]
    melted = _melted[0].join(_melted[1:], how="outer")

    if len(i) == 1:
        new = df[id_vars].set_index(i).join(melted)
        return new

    new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])

    return new
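

# A minimal, runnable sketch (not part of pandas): it exercises the three
# reshapers defined above on a toy frame when this module is run directly,
# assuming a working pandas installation. The column names and data below are
# made up purely for illustration.
if __name__ == "__main__":
    _demo = DataFrame(
        {
            "famid": [1, 1, 2, 2],
            "birth": [1, 2, 1, 2],
            "ht1": [2.8, 2.9, 2.0, 1.8],
            "ht2": [3.4, 3.8, 3.2, 2.8],
        }
    )
    # melt: the wide columns ht1/ht2 become (variable, value) pairs
    print(melt(_demo, id_vars=["famid", "birth"], value_vars=["ht1", "ht2"]))
    # lreshape: the same reshape expressed as {new_name: list_of_columns}
    print(lreshape(_demo, {"ht": ["ht1", "ht2"]}))
    # wide_to_long: stub "ht" with numeric suffixes becomes index level "age"
    print(wide_to_long(_demo, stubnames="ht", i=["famid", "birth"], j="age"))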