Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/io/json/_table_schema.py : 11%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Table Schema builders
4http://specs.frictionlessdata.io/json-table-schema/
5"""
6import warnings
8import pandas._libs.json as json
10from pandas.core.dtypes.common import (
11 is_bool_dtype,
12 is_categorical_dtype,
13 is_datetime64_dtype,
14 is_datetime64tz_dtype,
15 is_integer_dtype,
16 is_numeric_dtype,
17 is_period_dtype,
18 is_string_dtype,
19 is_timedelta64_dtype,
20)
21from pandas.core.dtypes.dtypes import CategoricalDtype
23from pandas import DataFrame
24import pandas.core.common as com
26loads = json.loads
def as_json_table_type(x):
    """
    Map a NumPy / pandas array or dtype to a Table Schema type name.

    Parameters
    ----------
    x : array or dtype
        Object handed to the pandas dtype predicates.

    Returns
    -------
    str
        The Table Schema type name.

    Notes
    -----
    The predicates are tried in the order listed below and the first one
    that matches decides the result; anything unmatched becomes ``"any"``.

    =================  =================
    Pandas type        Table Schema type
    =================  =================
    int64              integer
    bool               boolean
    float64            number
    datetime64[ns]     datetime
    datetime64[ns, tz] datetime
    period             datetime
    timedelta64[ns]    duration
    categorical        any
    object             string
    =================  =================
    """
    # Ordered (predicate, Table Schema type) pairs; order matters because the
    # integer and boolean checks must run before the generic numeric check.
    ordered_checks = (
        (is_integer_dtype, "integer"),
        (is_bool_dtype, "boolean"),
        (is_numeric_dtype, "number"),
        (is_datetime64_dtype, "datetime"),
        (is_datetime64tz_dtype, "datetime"),
        (is_period_dtype, "datetime"),
        (is_timedelta64_dtype, "duration"),
        (is_categorical_dtype, "any"),
        (is_string_dtype, "string"),
    )
    for matches, table_type in ordered_checks:
        if matches(x):
            return table_type
    return "any"
def set_default_names(data):
    """
    Give unnamed index levels a default name.

    A single unnamed index is named ``'index'``; unnamed levels of a
    multi-level index are named ``'level_<position>'``.

    Parameters
    ----------
    data : Series or DataFrame
        Object whose ``index`` names are inspected.

    Returns
    -------
    Series or DataFrame
        ``data`` itself when every index level already has a name, otherwise
        a copy of ``data`` with the missing names filled in.

    Warns
    -----
    UserWarning
        When the existing names collide with the default names (``'index'``
        for a single level, a ``'level_'`` prefix for multiple levels).
    """
    names = data.index.names
    if com.all_not_none(*names):
        if len(names) == 1 and data.index.name == "index":
            warnings.warn("Index name of 'index' is not round-trippable")
        elif len(names) > 1 and any(
            # Level names may be any hashable (e.g. integers); only string
            # names can collide with the generated 'level_<n>' defaults.
            isinstance(name, str) and name.startswith("level_")
            for name in names
        ):
            warnings.warn("Index names beginning with 'level_' are not round-trippable")
        return data

    # Work on a copy so the caller's object keeps its original index names.
    data = data.copy()
    if data.index.nlevels > 1:
        data.index.names = [
            name if name is not None else f"level_{i}"
            for i, name in enumerate(data.index.names)
        ]
    else:
        data.index.name = data.index.name or "index"
    return data
def convert_pandas_type_to_json_field(arr, dtype=None):
    """
    Build the Table Schema field descriptor for one array-like.

    Parameters
    ----------
    arr : array-like with ``name`` and ``dtype`` attributes
        Presumably a Series or an Index; its ``name`` becomes the field name
        (``"values"`` when the name is ``None``).
    dtype : dtype, optional
        Used instead of ``arr.dtype`` to determine the field ``type``.

    Returns
    -------
    dict
        Always has ``name`` and ``type``. Categorical data additionally gets
        ``constraints`` (an ``enum`` of the categories) and ``ordered``;
        period data gets ``freq``; timezone-aware datetimes get ``tz``.
    """
    # Test against None explicitly: ``dtype or arr.dtype`` would silently
    # discard a caller-supplied dtype that evaluates as falsy (NumPy dtypes
    # without fields have a length of 0).
    if dtype is None:
        dtype = arr.dtype

    name = "values" if arr.name is None else arr.name
    field = {"name": name, "type": as_json_table_type(dtype)}

    if is_categorical_dtype(arr):
        # Use ``categories``/``ordered`` directly when the object exposes
        # them, otherwise go through the ``.cat`` accessor.
        holder = arr if hasattr(arr, "categories") else arr.cat
        field["constraints"] = {"enum": list(holder.categories)}
        field["ordered"] = holder.ordered
    elif is_period_dtype(arr):
        field["freq"] = arr.freqstr
    elif is_datetime64tz_dtype(arr):
        # Use the ``.dt`` accessor when present, otherwise read ``tz``
        # directly from the object.
        tz = arr.dt.tz if hasattr(arr, "dt") else arr.tz
        field["tz"] = tz.zone
    return field
def convert_json_field_to_pandas_type(field):
    """
    Translate a JSON field descriptor into a NumPy / pandas dtype.

    Parameters
    ----------
    field : dict
        A JSON field descriptor; its ``"type"`` entry is required, while
        ``"tz"``, ``"constraints"`` and ``"ordered"`` are consulted when
        present.

    Returns
    -------
    str or CategoricalDtype
        A dtype string, or a ``CategoricalDtype`` for an ``"any"`` field that
        carries both ``"constraints"`` and ``"ordered"``.

    Raises
    ------
    ValueError
        If the type of the provided field is unknown or currently unsupported.

    Examples
    --------
    >>> convert_json_field_to_pandas_type({'name': 'an_int',
    ...                                    'type': 'integer'})
    'int64'
    >>> convert_json_field_to_pandas_type({'name': 'a_categorical',
    ...                                    'type': 'any',
    ...                                    'constraints': {'enum': [
    ...                                        'a', 'b', 'c']},
    ...                                    'ordered': True})
    CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)
    >>> convert_json_field_to_pandas_type({'name': 'a_datetime',
    ...                                    'type': 'datetime'})
    'datetime64[ns]'
    >>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz',
    ...                                    'type': 'datetime',
    ...                                    'tz': 'US/Central'})
    'datetime64[ns, US/Central]'
    """
    typ = field["type"]

    # Types whose pandas dtype does not depend on any other descriptor entry.
    # Compared with ``==`` (not hashed) so odd ``type`` values still end up
    # in the ValueError below.
    fixed_dtypes = (
        ("string", "object"),
        ("integer", "int64"),
        ("number", "float64"),
        ("boolean", "bool"),
        ("duration", "timedelta64"),
    )
    for schema_type, pandas_dtype in fixed_dtypes:
        if typ == schema_type:
            return pandas_dtype

    if typ == "datetime":
        if field.get("tz"):
            return f"datetime64[ns, {field['tz']}]"
        return "datetime64[ns]"

    if typ == "any":
        # Only a descriptor carrying both keys describes a categorical.
        if "constraints" in field and "ordered" in field:
            return CategoricalDtype(
                categories=field["constraints"]["enum"], ordered=field["ordered"]
            )
        return "object"

    raise ValueError(f"Unsupported or invalid field type: {typ}")
def build_table_schema(data, index=True, primary_key=None, version=True):
    """
    Create a Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool, default True
        Whether to include ``data.index`` in the schema.
    primary_key : column name(s) or None, default None
        Value stored under ``'primaryKey'``. With the default ``None``,
        ``'primaryKey'`` is set to the index level or levels, provided the
        index is included and unique.
    version : bool, default True
        Whether to include a field ``pandas_version`` with the version
        of pandas that generated the schema.

    Returns
    -------
    schema : dict

    Notes
    -----
    See ``as_json_table_type`` for conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.

    Categoricals are converted to the ``any`` dtype, and use the ``enum``
    field constraint to list the allowed values. The ``ordered`` attribute
    is included in an ``ordered`` field.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...     }, index=pd.Index(range(3), name='idx'))
    >>> build_table_schema(df)
    {'fields': [{'name': 'idx', 'type': 'integer'},
    {'name': 'A', 'type': 'integer'},
    {'name': 'B', 'type': 'string'},
    {'name': 'C', 'type': 'datetime'}],
    'pandas_version': '0.20.0',
    'primaryKey': ['idx']}
    """
    # Default names are only filled in for a literal ``True``.
    if index is True:
        data = set_default_names(data)

    fields = []

    # Index fields come first, one per level.
    if index:
        idx = data.index
        if idx.nlevels > 1:
            for level, level_name in zip(idx.levels, idx.names):
                level_field = convert_pandas_type_to_json_field(level)
                level_field["name"] = level_name
                fields.append(level_field)
        else:
            fields.append(convert_pandas_type_to_json_field(idx))

    # Then one field per column (DataFrame) or a single field (Series).
    if data.ndim > 1:
        fields.extend(
            convert_pandas_type_to_json_field(values) for _, values in data.items()
        )
    else:
        fields.append(convert_pandas_type_to_json_field(data))

    schema = {"fields": fields}

    if primary_key is not None:
        # An explicit primary key always wins.
        schema["primaryKey"] = primary_key
    elif index and data.index.is_unique:
        if data.index.nlevels == 1:
            schema["primaryKey"] = [data.index.name]
        else:
            schema["primaryKey"] = data.index.names

    if version:
        schema["pandas_version"] = "0.20.0"
    return schema
def parse_table_schema(json, precise_float):
    """
    Build a DataFrame from a JSON document holding a table schema and data.

    Parameters
    ----------
    json :
        A JSON document with a ``"schema"`` entry (the table schema) and a
        ``"data"`` entry (the records).
    precise_float : boolean
        Flag controlling precision when decoding string to double values, as
        dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains either timezone or timedelta data

    Notes
    -----
    Because :func:`DataFrame.to_json` uses the string 'index' to denote a
    name-less :class:`Index`, this function sets the index name of the
    returned :class:`DataFrame` to ``None`` when said string is encountered
    with a normal :class:`Index`. For a :class:`MultiIndex`, the same
    limitation applies to any strings beginning with 'level_'. Therefore, an
    :class:`Index` name of 'index' and :class:`MultiIndex` names starting
    with 'level_' are not supported.

    See Also
    --------
    build_table_schema : Inverse function.
    pandas.read_json
    """
    table = loads(json, precise_float=precise_float)
    schema = table["schema"]
    schema_fields = schema["fields"]

    col_order = [descriptor["name"] for descriptor in schema_fields]
    df = DataFrame(table["data"], columns=col_order)[col_order]

    dtypes = {
        descriptor["name"]: convert_json_field_to_pandas_type(descriptor)
        for descriptor in schema_fields
    }
    requested = dtypes.values()

    # Cannot directly use as_type with timezone data on object; raise for now
    if any(str(dtype).startswith("datetime64[ns, ") for dtype in requested):
        raise NotImplementedError('table="orient" can not yet read timezone data')

    # No ISO constructor for Timedelta as of yet, so need to raise
    if "timedelta64" in requested:
        raise NotImplementedError(
            'table="orient" can not yet read ISO-formatted Timedelta data'
        )

    df = df.astype(dtypes)

    if "primaryKey" in schema:
        df = df.set_index(schema["primaryKey"])
        index_names = df.index.names
        if len(index_names) == 1:
            # 'index' is the placeholder written for an unnamed Index.
            if df.index.name == "index":
                df.index.name = None
        else:
            # 'level_<n>' are the placeholders for unnamed MultiIndex levels.
            df.index.names = [
                None if level_name.startswith("level_") else level_name
                for level_name in index_names
            ]

    return df