Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

"""
Table Schema builders

http://specs.frictionlessdata.io/json-table-schema/
"""

6import warnings 

7 

8import pandas._libs.json as json 

9 

10from pandas.core.dtypes.common import ( 

11 is_bool_dtype, 

12 is_categorical_dtype, 

13 is_datetime64_dtype, 

14 is_datetime64tz_dtype, 

15 is_integer_dtype, 

16 is_numeric_dtype, 

17 is_period_dtype, 

18 is_string_dtype, 

19 is_timedelta64_dtype, 

20) 

21from pandas.core.dtypes.dtypes import CategoricalDtype 

22 

23from pandas import DataFrame 

24import pandas.core.common as com 

25 

26loads = json.loads 

27 

28 

def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data types

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes,
    and Table Schema dtypes.

    ===============  =================
    Pandas type      Table Schema type
    ===============  =================
    int64            integer
    float64          number
    bool             boolean
    datetime64[ns]   datetime
    timedelta64[ns]  duration
    object           string
    categorical      any
    ===============  =================

    Timezone-aware datetimes and periods are reported as ``datetime`` as
    well. Anything that matches none of the checks falls back to ``any``.
    """
    # The checks are order-sensitive: the first predicate that matches wins,
    # so do not reorder them without re-checking overlapping dtypes.
    if is_integer_dtype(x):
        return "integer"
    if is_bool_dtype(x):
        return "boolean"
    if is_numeric_dtype(x):
        return "number"
    if is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x):
        return "datetime"
    if is_timedelta64_dtype(x):
        return "duration"
    if is_categorical_dtype(x):
        return "any"
    if is_string_dtype(x):
        return "string"
    return "any"

75 

76 

def set_default_names(data):
    """
    Sets index names to 'index' for regular, or 'level_x' for Multi.

    Parameters
    ----------
    data : Series or DataFrame
        Object whose ``index`` names are inspected.

    Returns
    -------
    Series or DataFrame
        ``data`` itself (not a copy) when every index level already has a
        name; otherwise a copy of ``data`` whose unnamed levels have been
        given default names.

    Notes
    -----
    When all levels are already named, a warning is emitted if the names
    collide with the defaults used here ('index' for a single level, a
    'level_' prefix for multiple levels).
    """
    index_names = data.index.names
    if com.all_not_none(*index_names):
        if len(index_names) == 1 and data.index.name == "index":
            warnings.warn("Index name of 'index' is not round-trippable")
        elif len(index_names) > 1 and any(
            # Level names are not necessarily strings (e.g. integers), so
            # only string names can carry the reserved prefix.
            isinstance(nm, str) and nm.startswith("level_")
            for nm in index_names
        ):
            warnings.warn("Index names beginning with 'level_' are not round-trippable")
        return data

    # Work on a copy so the caller's object keeps its original index names.
    data = data.copy()
    if data.index.nlevels > 1:
        data.index.names = [
            nm if nm is not None else f"level_{pos}"
            for pos, nm in enumerate(data.index.names)
        ]
    else:
        data.index.name = data.index.name or "index"
    return data

97 

98 

def _tz_name(tz):
    """Return the name of ``tz`` from ``zone``, else ``key``, else ``str(tz)``."""
    # pytz timezones expose ``zone``; ``zoneinfo.ZoneInfo`` exposes ``key``.
    # The attribute value is returned unchanged whenever the attribute
    # exists, so objects that already worked keep their previous output.
    if hasattr(tz, "zone"):
        return tz.zone
    if hasattr(tz, "key"):
        return tz.key
    return str(tz)


def convert_pandas_type_to_json_field(arr, dtype=None):
    """
    Build a Table Schema field descriptor for a named array.

    Parameters
    ----------
    arr : Series or Index
        Object providing ``name`` and ``dtype``.
    dtype : dtype, optional
        Type to describe instead of ``arr.dtype``. Only the ``type`` entry
        is derived from it; the extra entries are still read from ``arr``.

    Returns
    -------
    field : dict
        Always has ``name`` ('values' when ``arr.name`` is None) and
        ``type``. Categorical data adds ``constraints`` (with ``enum``) and
        ``ordered``; period data adds ``freq``; timezone-aware datetime
        data adds ``tz``.
    """
    # Explicit None check: the truthiness of a dtype object is not a
    # reliable "was it passed" signal.
    if dtype is None:
        dtype = arr.dtype
    name = "values" if arr.name is None else arr.name
    field = {"name": name, "type": as_json_table_type(dtype)}

    if is_categorical_dtype(arr):
        # Index-like objects expose categories directly; otherwise go
        # through the ``cat`` accessor.
        cat_source = arr if hasattr(arr, "categories") else arr.cat
        field["constraints"] = {"enum": list(cat_source.categories)}
        field["ordered"] = cat_source.ordered
    elif is_period_dtype(arr):
        # NOTE(review): assumes ``arr`` exposes ``freqstr`` directly
        # (index-like input) -- confirm behaviour for other containers.
        field["freq"] = arr.freqstr
    elif is_datetime64tz_dtype(arr):
        tz = arr.dt.tz if hasattr(arr, "dt") else arr.tz
        field["tz"] = _tz_name(tz)
    return field

124 

125 

def convert_json_field_to_pandas_type(field):
    """
    Converts a JSON field descriptor into its corresponding NumPy / pandas type

    Parameters
    ----------
    field
        A JSON field descriptor

    Returns
    -------
    dtype
        A dtype string, or a ``CategoricalDtype`` instance for an ``any``
        field that carries both an ``enum`` constraint and ``ordered``.

    Raises
    ------
    ValueError
        If the type of the provided field is unknown or currently unsupported

    Examples
    --------
    >>> convert_json_field_to_pandas_type({'name': 'an_int',
    ...                                    'type': 'integer'})
    'int64'
    >>> convert_json_field_to_pandas_type({'name': 'a_categorical',
    ...                                    'type': 'any',
    ...                                    'constraints': {'enum': [
    ...                                        'a', 'b', 'c']},
    ...                                    'ordered': True})
    CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)
    >>> convert_json_field_to_pandas_type({'name': 'a_datetime',
    ...                                    'type': 'datetime'})
    'datetime64[ns]'
    >>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz',
    ...                                    'type': 'datetime',
    ...                                    'tz': 'US/Central'})
    'datetime64[ns, US/Central]'
    """
    typ = field["type"]
    if typ == "string":
        return "object"
    if typ == "integer":
        return "int64"
    if typ == "number":
        return "float64"
    if typ == "boolean":
        return "bool"
    if typ == "duration":
        return "timedelta64"
    if typ == "datetime":
        if field.get("tz"):
            return f"datetime64[ns, {field['tz']}]"
        return "datetime64[ns]"
    if typ == "any":
        # Only an ``enum`` constraint describes a categorical. Constraints
        # without it (or missing/empty constraints) mean a plain object
        # column rather than an error.
        constraints = field.get("constraints") or {}
        if "enum" in constraints and "ordered" in field:
            return CategoricalDtype(
                categories=constraints["enum"], ordered=field["ordered"]
            )
        return "object"

    raise ValueError(f"Unsupported or invalid field type: {typ}")

188 

189 

def build_table_schema(data, index=True, primary_key=None, version=True):
    """
    Create a Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool, default True
        Whether to include ``data.index`` in the schema.
    primary_key : column name(s) or None, default None
        Column names to designate as the primary key; stored verbatim
        under ``'primaryKey'``.
        The default `None` will set `'primaryKey'` to the index
        level or levels if the index is included and unique.
    version : bool, default True
        Whether to include a field `pandas_version` with the version
        of pandas that generated the schema.

    Returns
    -------
    schema : dict

    Notes
    -----
    See `as_json_table_type` for conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.

    Categoricals are converted to the `any` dtype, and use the `enum` field
    constraint to list the allowed values. The `ordered` attribute is included
    in an `ordered` field.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...     }, index=pd.Index(range(3), name='idx'))
    >>> build_table_schema(df)
    {'fields': [{'name': 'idx', 'type': 'integer'},
    {'name': 'A', 'type': 'integer'},
    {'name': 'B', 'type': 'string'},
    {'name': 'C', 'type': 'datetime'}],
    'pandas_version': '0.20.0',
    'primaryKey': ['idx']}
    """
    fields = []

    if index:
        # Name unnamed index levels first so every index field (and the
        # derived primary key) has a usable name. The same truthiness test
        # guards both the naming and the field emission.
        data = set_default_names(data)
        if data.index.nlevels > 1:
            for level, level_name in zip(data.index.levels, data.index.names):
                level_field = convert_pandas_type_to_json_field(level)
                level_field["name"] = level_name
                fields.append(level_field)
        else:
            fields.append(convert_pandas_type_to_json_field(data.index))

    if data.ndim > 1:
        fields.extend(
            convert_pandas_type_to_json_field(col) for _, col in data.items()
        )
    else:
        fields.append(convert_pandas_type_to_json_field(data))

    schema = {"fields": fields}

    if primary_key is not None:
        schema["primaryKey"] = primary_key
    elif index and data.index.is_unique:
        # No explicit key: a unique index doubles as the primary key.
        if data.index.nlevels == 1:
            schema["primaryKey"] = [data.index.name]
        else:
            schema["primaryKey"] = data.index.names

    if version:
        schema["pandas_version"] = "0.20.0"
    return schema

269 

270 

def parse_table_schema(json, precise_float):
    """
    Builds a DataFrame from a given schema

    Parameters
    ----------
    json :
        A JSON table schema
    precise_float : boolean
        Flag controlling precision when decoding string to double values, as
        dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains either timezone or timedelta data
    ValueError
        If a field has a type that ``convert_json_field_to_pandas_type``
        does not support

    Notes
    -----
    Because :func:`DataFrame.to_json` uses the string 'index' to denote a
    name-less :class:`Index`, this function sets the index name of the
    returned :class:`DataFrame` to ``None`` when said string is encountered
    with a normal :class:`Index`. For a :class:`MultiIndex`, the same
    limitation applies to any strings beginning with 'level_'. Therefore, an
    :class:`Index` name of 'index' and :class:`MultiIndex` names starting
    with 'level_' are not supported.

    See Also
    --------
    build_table_schema : Inverse function.
    pandas.read_json
    """
    table = loads(json, precise_float=precise_float)
    schema = table["schema"]
    schema_fields = schema["fields"]

    col_order = [field["name"] for field in schema_fields]
    df = DataFrame(table["data"], columns=col_order)[col_order]

    dtypes = {
        field["name"]: convert_json_field_to_pandas_type(field)
        for field in schema_fields
    }

    # Cannot directly use astype with timezone data on object; raise for now
    if any(str(dt).startswith("datetime64[ns, ") for dt in dtypes.values()):
        raise NotImplementedError('orient="table" can not yet read timezone data')

    # No ISO constructor for Timedelta as of yet, so need to raise
    if "timedelta64" in dtypes.values():
        raise NotImplementedError(
            'orient="table" can not yet read ISO-formatted Timedelta data'
        )

    df = df.astype(dtypes)

    if "primaryKey" in schema:
        df = df.set_index(schema["primaryKey"])
        # Undo the placeholder names that stand in for unnamed index levels.
        if len(df.index.names) == 1:
            if df.index.name == "index":
                df.index.name = None
        else:
            df.index.names = [
                # Field names come from the JSON document and need not be
                # strings; only string names can carry the prefix.
                None if isinstance(nm, str) and nm.startswith("level_") else nm
                for nm in df.index.names
            ]

    return df

333 else: 

334 df.index.names = [ 

335 None if x.startswith("level_") else x for x in df.index.names 

336 ] 

337 

338 return df