1"""
2Module contains tools for processing Stata files into DataFrames
4The StataReader below was originally written by Joe Presbrey as part of PyDTA.
5It has been extended and improved by Skipper Seabold from the Statsmodels
6project who also developed the StataWriter and was finally added to pandas in
7a once again improved version.
9You can find more information on http://presbrey.mit.edu/PyDTA and
10http://www.statsmodels.org/devel/
11"""
12from collections import abc
13import datetime
14from io import BytesIO
15import os
16import struct
17import sys
18from typing import Any, Dict, Hashable, Optional, Sequence
19import warnings
21from dateutil.relativedelta import relativedelta
22import numpy as np
24from pandas._libs.lib import infer_dtype
25from pandas._libs.writers import max_len_string_array
26from pandas._typing import FilePathOrBuffer
27from pandas.util._decorators import Appender
29from pandas.core.dtypes.common import (
30 ensure_object,
31 is_categorical_dtype,
32 is_datetime64_dtype,
33)
35from pandas import (
36 Categorical,
37 DatetimeIndex,
38 NaT,
39 Timestamp,
40 concat,
41 isna,
42 to_datetime,
43 to_timedelta,
44)
45from pandas.core.frame import DataFrame
46from pandas.core.series import Series
48from pandas.io.common import get_filepath_or_buffer, stringify_path
50_version_error = (
51 "Version of given Stata file is {version}. pandas supports importing "
52 "versions 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), "
53 "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),"
54 "and 119 (Stata 15/16, over 32,767 variables)."
55)
57_statafile_processing_params1 = """\
58convert_dates : bool, default True
59 Convert date variables to DataFrame time values.
60convert_categoricals : bool, default True
61 Read value labels and convert columns to Categorical/Factor variables."""
63_statafile_processing_params2 = """\
64index_col : str, optional
65 Column to set as index.
66convert_missing : bool, default False
67 Flag indicating whether to convert missing values to their Stata
68 representations. If False, missing values are replaced with nan.
69 If True, columns containing missing values are returned with
70 object data types and missing values are represented by
71 StataMissingValue objects.
72preserve_dtypes : bool, default True
73 Preserve Stata datatypes. If False, numeric data are upcast to pandas
74 default types for foreign data (float64 or int64).
75columns : list or None
76 Columns to retain. Columns will be returned in the given order. None
77 returns all columns.
78order_categoricals : bool, default True
79 Flag indicating whether converted categorical data are ordered."""
81_chunksize_params = """\
82chunksize : int, default None
83 Return StataReader object for iterations, returns chunks with
84 given number of lines."""
86_iterator_params = """\
87iterator : bool, default False
88 Return StataReader object."""
90_read_stata_doc = f"""
91Read Stata file into DataFrame.
93Parameters
94----------
95filepath_or_buffer : str, path object or file-like object
96 Any valid string path is acceptable. The string could be a URL. Valid
97 URL schemes include http, ftp, s3, and file. For file URLs, a host is
98 expected. A local file could be: ``file://localhost/path/to/table.dta``.
100 If you want to pass in a path object, pandas accepts any ``os.PathLike``.
102 By file-like object, we refer to objects with a ``read()`` method,
103 such as a file handler (e.g. via builtin ``open`` function)
104 or ``StringIO``.
105{_statafile_processing_params1}
106{_statafile_processing_params2}
107{_chunksize_params}
108{_iterator_params}
110Returns
111-------
112DataFrame or StataReader
114See Also
115--------
116io.stata.StataReader : Low-level reader for Stata data files.
117DataFrame.to_stata: Export Stata data files.
119Examples
120--------
121Read a Stata dta file:
123>>> df = pd.read_stata('filename.dta')
125Read a Stata dta file in 10,000 line chunks:
127>>> itr = pd.read_stata('filename.dta', chunksize=10000)
128>>> for chunk in itr:
129... do_something(chunk)
130"""
132_read_method_doc = f"""\
133Reads observations from Stata file, converting them into a dataframe
135Parameters
136----------
137nrows : int
138 Number of lines to read from data file, if None read whole file.
139{_statafile_processing_params1}
140{_statafile_processing_params2}
142Returns
143-------
144DataFrame
145"""
147_stata_reader_doc = f"""\
148Class for reading Stata dta files.
150Parameters
151----------
152path_or_buf : path (string), buffer or path object
153 string, path object (pathlib.Path or py._path.local.LocalPath) or object
154 implementing a binary read() functions.
156 .. versionadded:: 0.23.0 support for pathlib, py.path.
157{_statafile_processing_params1}
158{_statafile_processing_params2}
159{_chunksize_params}
160"""
163@Appender(_read_stata_doc)
164def read_stata(
165 filepath_or_buffer,
166 convert_dates=True,
167 convert_categoricals=True,
168 index_col=None,
169 convert_missing=False,
170 preserve_dtypes=True,
171 columns=None,
172 order_categoricals=True,
173 chunksize=None,
174 iterator=False,
175):
177 reader = StataReader(
178 filepath_or_buffer,
179 convert_dates=convert_dates,
180 convert_categoricals=convert_categoricals,
181 index_col=index_col,
182 convert_missing=convert_missing,
183 preserve_dtypes=preserve_dtypes,
184 columns=columns,
185 order_categoricals=order_categoricals,
186 chunksize=chunksize,
187 )
189 if iterator or chunksize:
190 data = reader
191 else:
192 try:
193 data = reader.read()
194 finally:
195 reader.close()
196 return data
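

# A minimal usage sketch (not part of this module): when ``chunksize`` is
# given, the returned reader is also a context manager and an iterator, so a
# large file can be processed without loading it whole. The file name and
# ``process`` helper are hypothetical.
#
# >>> with StataReader("filename.dta", chunksize=5000) as reader:
# ...     for chunk in reader:
# ...         process(chunk)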


_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]


stata_epoch = datetime.datetime(1960, 1, 1)


def _stata_elapsed_date_to_datetime_vec(dates, fmt):
    """
    Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime

    Parameters
    ----------
    dates : Series
        The Stata Internal Format date to convert to datetime according to fmt
    fmt : str
        The format to convert to. Can be tc, td, tw, tm, tq, th, ty

    Returns
    -------
    converted : Series
        The converted dates

    Examples
    --------
    >>> dates = pd.Series([52])
    >>> _stata_elapsed_date_to_datetime_vec(dates, "%tw")
    0   1961-01-01
    dtype: datetime64[ns]

    Notes
    -----
    datetime/c - tc
        milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day
    datetime/C - tC - NOT IMPLEMENTED
        milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds
    date - td
        days since 01jan1960 (01jan1960 = 0)
    weekly date - tw
        weeks since 1960w1
        This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.
        The datetime value is the start of the week in terms of days in the
        year, not ISO calendar weeks.
    monthly date - tm
        months since 1960m1
    quarterly date - tq
        quarters since 1960q1
    half-yearly date - th
        half-years since 1960h1
    yearly date - ty
        years since 0000

    If you don't have pandas with datetime support, then you can't do
    milliseconds accurately.
    """
    MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year
    MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days
    MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days
    MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
    MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000

    def convert_year_month_safe(year, month):
        """
        Convert year and month to datetimes, using pandas vectorized versions
        when the date range falls within the range supported by pandas.
        Otherwise it falls back to a slower but more robust method
        using datetime.
        """
        if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
            return to_datetime(100 * year + month, format="%Y%m")
        else:
            index = getattr(year, "index", None)
            return Series(
                [datetime.datetime(y, m, 1) for y, m in zip(year, month)], index=index
            )

    def convert_year_days_safe(year, days):
        """
        Converts year (e.g. 1999) and days since the start of the year to a
        datetime or datetime64 Series
        """
        if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
            return to_datetime(year, format="%Y") + to_timedelta(days, unit="d")
        else:
            index = getattr(year, "index", None)
            value = [
                datetime.datetime(y, 1, 1) + relativedelta(days=int(d))
                for y, d in zip(year, days)
            ]
            return Series(value, index=index)

    def convert_delta_safe(base, deltas, unit):
        """
        Convert base dates and deltas to datetimes, using pandas vectorized
        versions if the deltas satisfy restrictions required to be expressed
        as dates in pandas.
        """
        index = getattr(deltas, "index", None)
        if unit == "d":
            if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
                values = [base + relativedelta(days=int(d)) for d in deltas]
                return Series(values, index=index)
        elif unit == "ms":
            if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
                values = [
                    base + relativedelta(microseconds=(int(d) * 1000)) for d in deltas
                ]
                return Series(values, index=index)
        else:
            raise ValueError("format not understood")
        base = to_datetime(base)
        deltas = to_timedelta(deltas, unit=unit)
        return base + deltas

    # TODO: If/when pandas supports more than datetime64[ns], this should be
    # improved to use correct range, e.g. datetime[Y] for yearly
    bad_locs = np.isnan(dates)
    has_bad_values = False
    if bad_locs.any():
        has_bad_values = True
        data_col = Series(dates)
        data_col[bad_locs] = 1.0  # Replace with NaT
    dates = dates.astype(np.int64)

    if fmt.startswith(("%tc", "tc")):  # Delta ms relative to base
        base = stata_epoch
        ms = dates
        conv_dates = convert_delta_safe(base, ms, "ms")
    elif fmt.startswith(("%tC", "tC")):

        warnings.warn("Encountered %tC format. Leaving in Stata Internal Format.")
        conv_dates = Series(dates, dtype=np.object)
        if has_bad_values:
            conv_dates[bad_locs] = NaT
        return conv_dates
    # Delta days relative to base
    elif fmt.startswith(("%td", "td", "%d", "d")):
        base = stata_epoch
        days = dates
        conv_dates = convert_delta_safe(base, days, "d")
    # does not count leap days - 7 days is a week.
    # 52nd week may have more than 7 days
    elif fmt.startswith(("%tw", "tw")):
        year = stata_epoch.year + dates // 52
        days = (dates % 52) * 7
        conv_dates = convert_year_days_safe(year, days)
    elif fmt.startswith(("%tm", "tm")):  # Delta months relative to base
        year = stata_epoch.year + dates // 12
        month = (dates % 12) + 1
        conv_dates = convert_year_month_safe(year, month)
    elif fmt.startswith(("%tq", "tq")):  # Delta quarters relative to base
        year = stata_epoch.year + dates // 4
        month = (dates % 4) * 3 + 1
        conv_dates = convert_year_month_safe(year, month)
    elif fmt.startswith(("%th", "th")):  # Delta half-years relative to base
        year = stata_epoch.year + dates // 2
        month = (dates % 2) * 6 + 1
        conv_dates = convert_year_month_safe(year, month)
    elif fmt.startswith(("%ty", "ty")):  # Years -- not delta
        year = dates
        month = np.ones_like(dates)
        conv_dates = convert_year_month_safe(year, month)
    else:
        raise ValueError(f"Date fmt {fmt} not understood")

    if has_bad_values:  # Restore NaT for bad values
        conv_dates[bad_locs] = NaT

    return conv_dates
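

# Illustrative sketch (not part of this module): the %tm branch above maps a
# month count since 1960m1 to a year/month pair. For example, 722 months gives
# 1960 + 722 // 12 == 2020 and (722 % 12) + 1 == 3, i.e. 2020-03-01:
#
# >>> _stata_elapsed_date_to_datetime_vec(Series([722]), "%tm")
# 0   2020-03-01
# dtype: datetime64[ns]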


def _datetime_to_stata_elapsed_vec(dates, fmt):
    """
    Convert from datetime to SIF. http://www.stata.com/help.cgi?datetime

    Parameters
    ----------
    dates : Series
        Series or array containing datetime.datetime or datetime64[ns] to
        convert to the Stata Internal Format given by fmt
    fmt : str
        The format to convert to. Can be tc, td, tw, tm, tq, th, ty
    """
    index = dates.index
    NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000
    US_PER_DAY = NS_PER_DAY / 1000

    def parse_dates_safe(dates, delta=False, year=False, days=False):
        d = {}
        if is_datetime64_dtype(dates.values):
            if delta:
                delta = dates - stata_epoch
                d["delta"] = delta.values.astype(np.int64) // 1000  # microseconds
            if days or year:
                dates = DatetimeIndex(dates)
                d["year"], d["month"] = dates.year, dates.month
            if days:
                days = dates.astype(np.int64) - to_datetime(
                    d["year"], format="%Y"
                ).astype(np.int64)
                d["days"] = days // NS_PER_DAY

        elif infer_dtype(dates, skipna=False) == "datetime":
            if delta:
                delta = dates.values - stata_epoch
                f = lambda x: US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds
                v = np.vectorize(f)
                d["delta"] = v(delta)
            if year:
                year_month = dates.apply(lambda x: 100 * x.year + x.month)
                d["year"] = year_month.values // 100
                d["month"] = year_month.values - d["year"] * 100
            if days:
                f = lambda x: (x - datetime.datetime(x.year, 1, 1)).days
                v = np.vectorize(f)
                d["days"] = v(dates)
        else:
            raise ValueError(
                "Columns containing dates must contain either "
                "datetime64, datetime.datetime or null values."
            )

        return DataFrame(d, index=index)

    bad_loc = isna(dates)
    index = dates.index
    if bad_loc.any():
        dates = Series(dates)
        if is_datetime64_dtype(dates):
            dates[bad_loc] = to_datetime(stata_epoch)
        else:
            dates[bad_loc] = stata_epoch

    if fmt in ["%tc", "tc"]:
        d = parse_dates_safe(dates, delta=True)
        conv_dates = d.delta / 1000
    elif fmt in ["%tC", "tC"]:
        warnings.warn("Stata Internal Format tC not supported.")
        conv_dates = dates
    elif fmt in ["%td", "td"]:
        d = parse_dates_safe(dates, delta=True)
        conv_dates = d.delta // US_PER_DAY
    elif fmt in ["%tw", "tw"]:
        d = parse_dates_safe(dates, year=True, days=True)
        conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7
    elif fmt in ["%tm", "tm"]:
        d = parse_dates_safe(dates, year=True)
        conv_dates = 12 * (d.year - stata_epoch.year) + d.month - 1
    elif fmt in ["%tq", "tq"]:
        d = parse_dates_safe(dates, year=True)
        conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3
    elif fmt in ["%th", "th"]:
        d = parse_dates_safe(dates, year=True)
        conv_dates = 2 * (d.year - stata_epoch.year) + (d.month > 6).astype(np.int)
    elif fmt in ["%ty", "ty"]:
        d = parse_dates_safe(dates, year=True)
        conv_dates = d.year
    else:
        raise ValueError(f"Format {fmt} is not a known Stata date format")

    conv_dates = Series(conv_dates, dtype=np.float64)
    missing_value = struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]
    conv_dates[bad_loc] = missing_value

    return Series(conv_dates, index=index)
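

# Illustrative sketch (not part of this module): for supported formats the two
# helpers above are inverses, so a round trip through %td should return the
# original dates.
#
# >>> ts = Series([Timestamp("1960-01-02"), Timestamp("2000-01-01")])
# >>> elapsed = _datetime_to_stata_elapsed_vec(ts, "%td")  # days since epoch
# >>> _stata_elapsed_date_to_datetime_vec(elapsed, "%td").equals(ts)
# True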


excessive_string_length_error = """
Fixed width strings in Stata .dta files are limited to 244 (or fewer)
characters. Column '{0}' does not satisfy this restriction. Use the
'version=117' parameter to write the newer (Stata 13 and later) format.
"""


class PossiblePrecisionLoss(Warning):
    pass


precision_loss_doc = """
Column converted from %s to %s, and some data are outside of the lossless
conversion range. This may result in a loss of precision in the saved data.
"""


class ValueLabelTypeMismatch(Warning):
    pass


value_label_mismatch_doc = """
Stata value labels (pandas categories) must be strings. Column {0} contains
non-string labels which will be converted to strings. Please check that the
Stata data file created has not lost information due to duplicate labels.
"""


class InvalidColumnName(Warning):
    pass


invalid_name_doc = """
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    {0}

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)
"""


def _cast_to_stata_types(data):
    """Checks the dtypes of the columns of a pandas DataFrame for
    compatibility with the data types and ranges supported by Stata, and
    converts if necessary.

    Parameters
    ----------
    data : DataFrame
        The DataFrame to check and convert

    Notes
    -----
    Numeric columns in Stata must be one of int8, int16, int32, float32 or
    float64, with some additional value restrictions. int8 and int16 columns
    are checked for violations of the value restrictions and upcast if needed.
    int64 data is not usable in Stata, and so it is downcast to int32 whenever
    the values are in the int32 range, and cast to float64 when larger than
    this range. If the int64 values are outside of the range of those
    perfectly representable as float64 values, a warning is raised.

    bool columns are cast to int8. uint columns are converted to int of the
    same size if there is no loss in precision, otherwise are upcast to a
    larger type. uint64 is currently not supported since it is converted to
    object in a DataFrame.
    """
    ws = ""
    # original, if small, if large
    conversion_data = (
        (np.bool, np.int8, np.int8),
        (np.uint8, np.int8, np.int16),
        (np.uint16, np.int16, np.int32),
        (np.uint32, np.int32, np.int64),
    )

    float32_max = struct.unpack("<f", b"\xff\xff\xff\x7e")[0]
    float64_max = struct.unpack("<d", b"\xff\xff\xff\xff\xff\xff\xdf\x7f")[0]

    for col in data:
        dtype = data[col].dtype
        # Cast from unsupported types to supported types
        for c_data in conversion_data:
            if dtype == c_data[0]:
                if data[col].max() <= np.iinfo(c_data[1]).max:
                    dtype = c_data[1]
                else:
                    dtype = c_data[2]
                if c_data[2] == np.float64:  # Warn if necessary
                    if data[col].max() >= 2 ** 53:
                        ws = precision_loss_doc.format("uint64", "float64")

        data[col] = data[col].astype(dtype)

        # Check values and upcast if necessary
        if dtype == np.int8:
            if data[col].max() > 100 or data[col].min() < -127:
                data[col] = data[col].astype(np.int16)
        elif dtype == np.int16:
            if data[col].max() > 32740 or data[col].min() < -32767:
                data[col] = data[col].astype(np.int32)
        elif dtype == np.int64:
            if data[col].max() <= 2147483620 and data[col].min() >= -2147483647:
                data[col] = data[col].astype(np.int32)
            else:
                data[col] = data[col].astype(np.float64)
                if data[col].max() >= 2 ** 53 or data[col].min() <= -(2 ** 53):
                    ws = precision_loss_doc.format("int64", "float64")
        elif dtype in (np.float32, np.float64):
            value = data[col].max()
            if np.isinf(value):
                raise ValueError(
                    f"Column {col} has a maximum value of infinity which is outside "
                    "the range supported by Stata."
                )
            if dtype == np.float32 and value > float32_max:
                data[col] = data[col].astype(np.float64)
            elif dtype == np.float64:
                if value > float64_max:
                    raise ValueError(
                        f"Column {col} has a maximum value ({value}) outside the range "
                        f"supported by Stata ({float64_max})"
                    )

    if ws:
        warnings.warn(ws, PossiblePrecisionLoss)

    return data
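

# Illustrative sketch (not part of this module): int64 values beyond the int32
# range become float64; values at or beyond 2 ** 53 would also trigger a
# PossiblePrecisionLoss warning.
#
# >>> df = DataFrame({"a": np.array([1, 2 ** 40], dtype=np.int64)})
# >>> _cast_to_stata_types(df).dtypes
# a    float64
# dtype: object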


class StataValueLabel:
    """
    Parse a categorical column and prepare formatted output

    Parameters
    ----------
    catarray : Categorical
        Categorical Series to encode
    encoding : {"latin-1", "utf-8"}
        Encoding to use for value labels.
    """

    def __init__(self, catarray, encoding="latin-1"):

        if encoding not in ("latin-1", "utf-8"):
            raise ValueError("Only latin-1 and utf-8 are supported.")
        self.labname = catarray.name
        self._encoding = encoding
        categories = catarray.cat.categories
        self.value_labels = list(zip(np.arange(len(categories)), categories))
        self.value_labels.sort(key=lambda x: x[0])
        self.text_len = np.int32(0)
        self.off = []
        self.val = []
        self.txt = []
        self.n = 0

        # Compute lengths and setup lists of offsets and labels
        for vl in self.value_labels:
            category = vl[1]
            if not isinstance(category, str):
                category = str(category)
                warnings.warn(
                    value_label_mismatch_doc.format(catarray.name),
                    ValueLabelTypeMismatch,
                )
            category = category.encode(encoding)
            self.off.append(self.text_len)
            self.text_len += len(category) + 1  # +1 for the padding
            self.val.append(vl[0])
            self.txt.append(category)
            self.n += 1

        if self.text_len > 32000:
            raise ValueError(
                "Stata value labels for a single variable must "
                "have a combined length less than 32,000 "
                "characters."
            )

        # Ensure int32
        self.off = np.array(self.off, dtype=np.int32)
        self.val = np.array(self.val, dtype=np.int32)

        # Total length
        self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len

    def _encode(self, s):
        """
        Python 3 compatibility shim
        """
        return s.encode(self._encoding)

    def generate_value_label(self, byteorder):
        """
        Generate the binary representation of the value labels.

        Parameters
        ----------
        byteorder : str
            Byte order of the output

        Returns
        -------
        value_label : bytes
            Bytes containing the formatted value label
        """
        encoding = self._encoding
        bio = BytesIO()
        null_byte = b"\x00"

        # len
        bio.write(struct.pack(byteorder + "i", self.len))

        # labname
        labname = self.labname[:32].encode(encoding)
        lab_len = 32 if encoding not in ("utf-8", "utf8") else 128
        labname = _pad_bytes(labname, lab_len + 1)
        bio.write(labname)

        # padding - 3 bytes
        for i in range(3):
            bio.write(struct.pack("c", null_byte))

        # value_label_table
        # n - int32
        bio.write(struct.pack(byteorder + "i", self.n))

        # textlen - int32
        bio.write(struct.pack(byteorder + "i", self.text_len))

        # off - int32 array (n elements)
        for offset in self.off:
            bio.write(struct.pack(byteorder + "i", offset))

        # val - int32 array (n elements)
        for value in self.val:
            bio.write(struct.pack(byteorder + "i", value))

        # txt - Text labels, null terminated
        for text in self.txt:
            bio.write(text + null_byte)

        bio.seek(0)
        return bio.read()
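

# Illustrative sketch (not part of this module): encoding the value labels of
# a small categorical column. The generated bytes follow the layout written
# above (len, labname, padding, n, textlen, off, val, txt). Categories sort
# alphabetically, so "high" and "low" give offsets of 5 and 4 bytes.
#
# >>> cat = Series(["low", "high", "low"], dtype="category", name="grade")
# >>> vl = StataValueLabel(cat)
# >>> raw = vl.generate_value_label("<")  # little-endian
# >>> vl.n, vl.text_len
# (2, 9)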


class StataMissingValue:
    """
    An observation's missing value.

    Parameters
    ----------
    value : int8, int16, int32, float32 or float64
        The Stata missing value code

    Attributes
    ----------
    string : string
        String representation of the Stata missing value
    value : int8, int16, int32, float32 or float64
        The original encoded missing value

    Notes
    -----
    More information: <http://www.stata.com/help.cgi?missing>

    Integer missing values map the codes '.', '.a', ..., '.z' to the ranges
    101 ... 127 (for int8), 32741 ... 32767 (for int16) and 2147483621 ...
    2147483647 (for int32). Missing values for floating point data types are
    more complex but the pattern is simple to discern from the following table.

    np.float32 missing values (float in Stata)
    0000007f    .
    0008007f    .a
    0010007f    .b
    ...
    00c0007f    .x
    00c8007f    .y
    00d0007f    .z

    np.float64 missing values (double in Stata)
    000000000000e07f    .
    000000000001e07f    .a
    000000000002e07f    .b
    ...
    000000000018e07f    .x
    000000000019e07f    .y
    00000000001ae07f    .z
    """

    # Construct a dictionary of missing values
    MISSING_VALUES = {}
    bases = (101, 32741, 2147483621)
    for b in bases:
        # Conversion to long to avoid hash issues on 32 bit platforms #8968
        MISSING_VALUES[b] = "."
        for i in range(1, 27):
            MISSING_VALUES[i + b] = "." + chr(96 + i)

    float32_base = b"\x00\x00\x00\x7f"
    increment = struct.unpack("<i", b"\x00\x08\x00\x00")[0]
    for i in range(27):
        value = struct.unpack("<f", float32_base)[0]
        MISSING_VALUES[value] = "."
        if i > 0:
            MISSING_VALUES[value] += chr(96 + i)
        int_value = struct.unpack("<i", struct.pack("<f", value))[0] + increment
        float32_base = struct.pack("<i", int_value)

    float64_base = b"\x00\x00\x00\x00\x00\x00\xe0\x7f"
    increment = struct.unpack("q", b"\x00\x00\x00\x00\x00\x01\x00\x00")[0]
    for i in range(27):
        value = struct.unpack("<d", float64_base)[0]
        MISSING_VALUES[value] = "."
        if i > 0:
            MISSING_VALUES[value] += chr(96 + i)
        int_value = struct.unpack("q", struct.pack("<d", value))[0] + increment
        float64_base = struct.pack("q", int_value)

    BASE_MISSING_VALUES = {
        "int8": 101,
        "int16": 32741,
        "int32": 2147483621,
        "float32": struct.unpack("<f", float32_base)[0],
        "float64": struct.unpack("<d", float64_base)[0],
    }

    def __init__(self, value):
        self._value = value
        # Conversion to int to avoid hash issues on 32 bit platforms #8968
        value = int(value) if value < 2147483648 else float(value)
        self._str = self.MISSING_VALUES[value]

    string = property(
        lambda self: self._str,
        doc="The Stata representation of the missing value: '.', '.a'..'.z'",
    )
    value = property(
        lambda self: self._value, doc="The binary representation of the missing value."
    )

    def __str__(self) -> str:
        return self.string

    def __repr__(self) -> str:
        return f"{type(self)}({self})"

    def __eq__(self, other: Any) -> bool:
        return (
            isinstance(other, type(self))
            and self.string == other.string
            and self.value == other.value
        )

    @classmethod
    def get_base_missing_value(cls, dtype):
        if dtype == np.int8:
            value = cls.BASE_MISSING_VALUES["int8"]
        elif dtype == np.int16:
            value = cls.BASE_MISSING_VALUES["int16"]
        elif dtype == np.int32:
            value = cls.BASE_MISSING_VALUES["int32"]
        elif dtype == np.float32:
            value = cls.BASE_MISSING_VALUES["float32"]
        elif dtype == np.float64:
            value = cls.BASE_MISSING_VALUES["float64"]
        else:
            raise ValueError("Unsupported dtype")
        return value
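

# Illustrative sketch (not part of this module): looking up the string form of
# Stata missing values using the tables built above.
#
# >>> StataMissingValue(101).string  # int8 generic missing
# '.'
# >>> StataMissingValue(102).string
# '.a'
# >>> StataMissingValue.get_base_missing_value(np.int16)
# 32741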


class StataParser:
    def __init__(self):

        # type          code.
        # --------------------
        # str1        1 = 0x01
        # str2        2 = 0x02
        # ...
        # str244    244 = 0xf4
        # byte      251 = 0xfb  (sic)
        # int       252 = 0xfc
        # long      253 = 0xfd
        # float     254 = 0xfe
        # double    255 = 0xff
        # --------------------
        # NOTE: the byte type seems to be reserved for categorical variables
        # with a label, but the underlying variable is -127 to 100
        # we're going to drop the label and cast to int
        self.DTYPE_MAP = dict(
            list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)]))
            + [
                (251, np.int8),
                (252, np.int16),
                (253, np.int32),
                (254, np.float32),
                (255, np.float64),
            ]
        )
        self.DTYPE_MAP_XML = dict(
            [
                (32768, np.uint8),  # Keys to GSO
                (65526, np.float64),
                (65527, np.float32),
                (65528, np.int32),
                (65529, np.int16),
                (65530, np.int8),
            ]
        )
        self.TYPE_MAP = list(range(251)) + list("bhlfd")
        self.TYPE_MAP_XML = dict(
            [
                # Not really a Q, unclear how to handle byteswap
                (32768, "Q"),
                (65526, "d"),
                (65527, "f"),
                (65528, "l"),
                (65529, "h"),
                (65530, "b"),
            ]
        )
        # NOTE: technically, some of these are wrong. there are more numbers
        # that can be represented. it's the 27 ABOVE and BELOW the max listed
        # numeric data type in [U] 12.2.2 of the 11.2 manual
        float32_min = b"\xff\xff\xff\xfe"
        float32_max = b"\xff\xff\xff\x7e"
        float64_min = b"\xff\xff\xff\xff\xff\xff\xef\xff"
        float64_max = b"\xff\xff\xff\xff\xff\xff\xdf\x7f"
        self.VALID_RANGE = {
            "b": (-127, 100),
            "h": (-32767, 32740),
            "l": (-2147483647, 2147483620),
            "f": (
                np.float32(struct.unpack("<f", float32_min)[0]),
                np.float32(struct.unpack("<f", float32_max)[0]),
            ),
            "d": (
                np.float64(struct.unpack("<d", float64_min)[0]),
                np.float64(struct.unpack("<d", float64_max)[0]),
            ),
        }

        self.OLD_TYPE_MAPPING = {
            98: 251,  # byte
            105: 252,  # int
            108: 253,  # long
            102: 254  # float
            # don't know old code for double
        }

        # These missing values are the generic '.' in Stata, and are used
        # to replace nans
        self.MISSING_VALUES = {
            "b": 101,
            "h": 32741,
            "l": 2147483621,
            "f": np.float32(struct.unpack("<f", b"\x00\x00\x00\x7f")[0]),
            "d": np.float64(
                struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]
            ),
        }
        self.NUMPY_TYPE_MAP = {
            "b": "i1",
            "h": "i2",
            "l": "i4",
            "f": "f4",
            "d": "f8",
            "Q": "u8",
        }

        # Reserved words cannot be used as variable names
        self.RESERVED_WORDS = (
            "aggregate",
            "array",
            "boolean",
            "break",
            "byte",
            "case",
            "catch",
            "class",
            "colvector",
            "complex",
            "const",
            "continue",
            "default",
            "delegate",
            "delete",
            "do",
            "double",
            "else",
            "eltypedef",
            "end",
            "enum",
            "explicit",
            "export",
            "external",
            "float",
            "for",
            "friend",
            "function",
            "global",
            "goto",
            "if",
            "inline",
            "int",
            "local",
            "long",
            "NULL",
            "pragma",
            "protected",
            "quad",
            "rowvector",
            "short",
            "typedef",
            "typename",
            "virtual",
            "_all",
            "_N",
            "_skip",
            "_b",
            "_pi",
            "str#",
            "in",
            "_pred",
            "strL",
            "_coef",
            "_rc",
            "using",
            "_cons",
            "_se",
            "with",
            "_n",
        )


class StataReader(StataParser, abc.Iterator):
    __doc__ = _stata_reader_doc

    def __init__(
        self,
        path_or_buf,
        convert_dates=True,
        convert_categoricals=True,
        index_col=None,
        convert_missing=False,
        preserve_dtypes=True,
        columns=None,
        order_categoricals=True,
        chunksize=None,
    ):
        super().__init__()
        self.col_sizes = ()

        # Arguments to the reader (can be temporarily overridden in
        # calls to read).
        self._convert_dates = convert_dates
        self._convert_categoricals = convert_categoricals
        self._index_col = index_col
        self._convert_missing = convert_missing
        self._preserve_dtypes = preserve_dtypes
        self._columns = columns
        self._order_categoricals = order_categoricals
        self._encoding = None
        self._chunksize = chunksize

        # State variables for the file
        self._has_string_data = False
        self._missing_values = False
        self._can_read_value_labels = False
        self._column_selector_set = False
        self._value_labels_read = False
        self._data_read = False
        self._dtype = None
        self._lines_read = 0

        self._native_byteorder = _set_endianness(sys.byteorder)
        path_or_buf = stringify_path(path_or_buf)
        if isinstance(path_or_buf, str):
            path_or_buf, encoding, _, should_close = get_filepath_or_buffer(path_or_buf)

        if isinstance(path_or_buf, (str, bytes)):
            self.path_or_buf = open(path_or_buf, "rb")
        else:
            # Copy to BytesIO, and ensure no encoding
            contents = path_or_buf.read()
            self.path_or_buf = BytesIO(contents)

        self._read_header()
        self._setup_dtype()

    def __enter__(self):
        """ enter context manager """
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """ exit context manager """
        self.close()

    def close(self):
        """ close the handle if it's open """
        try:
            self.path_or_buf.close()
        except IOError:
            pass

    def _set_encoding(self):
        """
        Set string encoding which depends on file version
        """
        if self.format_version < 118:
            self._encoding = "latin-1"
        else:
            self._encoding = "utf-8"

    def _read_header(self):
        first_char = self.path_or_buf.read(1)
        if struct.unpack("c", first_char)[0] == b"<":
            self._read_new_header(first_char)
        else:
            self._read_old_header(first_char)

        self.has_string_data = len([x for x in self.typlist if type(x) is int]) > 0

        # calculate size of a data record
        self.col_sizes = [self._calcsize(typ) for typ in self.typlist]

    def _read_new_header(self, first_char):
        # The first part of the header is common to 117 - 119.
        self.path_or_buf.read(27)  # stata_dta><header><release>
        self.format_version = int(self.path_or_buf.read(3))
        if self.format_version not in [117, 118, 119]:
            raise ValueError(_version_error.format(version=self.format_version))
        self._set_encoding()
        self.path_or_buf.read(21)  # </release><byteorder>
        self.byteorder = self.path_or_buf.read(3) == b"MSF" and ">" or "<"
        self.path_or_buf.read(15)  # </byteorder><K>
        nvar_type = "H" if self.format_version <= 118 else "I"
        nvar_size = 2 if self.format_version <= 118 else 4
        self.nvar = struct.unpack(
            self.byteorder + nvar_type, self.path_or_buf.read(nvar_size)
        )[0]
        self.path_or_buf.read(7)  # </K><N>

        self.nobs = self._get_nobs()
        self.path_or_buf.read(11)  # </N><label>
        self._data_label = self._get_data_label()
        self.path_or_buf.read(19)  # </label><timestamp>
        self.time_stamp = self._get_time_stamp()
        self.path_or_buf.read(26)  # </timestamp></header><map>
        self.path_or_buf.read(8)  # 0x0000000000000000
        self.path_or_buf.read(8)  # position of <map>

        self._seek_vartypes = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 16
        )
        self._seek_varnames = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 10
        )
        self._seek_sortlist = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 10
        )
        self._seek_formats = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 9
        )
        self._seek_value_label_names = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 19
        )

        # Requires version-specific treatment
        self._seek_variable_labels = self._get_seek_variable_labels()

        self.path_or_buf.read(8)  # <characteristics>
        self.data_location = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 6
        )
        self.seek_strls = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 7
        )
        self.seek_value_labels = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 14
        )

        self.typlist, self.dtyplist = self._get_dtypes(self._seek_vartypes)

        self.path_or_buf.seek(self._seek_varnames)
        self.varlist = self._get_varlist()

        self.path_or_buf.seek(self._seek_sortlist)
        self.srtlist = struct.unpack(
            self.byteorder + ("h" * (self.nvar + 1)),
            self.path_or_buf.read(2 * (self.nvar + 1)),
        )[:-1]

        self.path_or_buf.seek(self._seek_formats)
        self.fmtlist = self._get_fmtlist()

        self.path_or_buf.seek(self._seek_value_label_names)
        self.lbllist = self._get_lbllist()

        self.path_or_buf.seek(self._seek_variable_labels)
        self._variable_labels = self._get_variable_labels()

    # Get data type information, works for versions 117-119.
    def _get_dtypes(self, seek_vartypes):

        self.path_or_buf.seek(seek_vartypes)
        raw_typlist = [
            struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0]
            for i in range(self.nvar)
        ]

        def f(typ):
            if typ <= 2045:
                return typ
            try:
                return self.TYPE_MAP_XML[typ]
            except KeyError:
                raise ValueError(f"cannot convert stata types [{typ}]")

        typlist = [f(x) for x in raw_typlist]

        def f(typ):
            if typ <= 2045:
                return str(typ)
            try:
                return self.DTYPE_MAP_XML[typ]
            except KeyError:
                raise ValueError(f"cannot convert stata dtype [{typ}]")

        dtyplist = [f(x) for x in raw_typlist]

        return typlist, dtyplist

    def _get_varlist(self):
        if self.format_version == 117:
            b = 33
        elif self.format_version >= 118:
            b = 129

        return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)]

    # Returns the format list
    def _get_fmtlist(self):
        if self.format_version >= 118:
            b = 57
        elif self.format_version > 113:
            b = 49
        elif self.format_version > 104:
            b = 12
        else:
            b = 7

        return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)]

    # Returns the label list
    def _get_lbllist(self):
        if self.format_version >= 118:
            b = 129
        elif self.format_version > 108:
            b = 33
        else:
            b = 9
        return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)]

    def _get_variable_labels(self):
        if self.format_version >= 118:
            vlblist = [
                self._decode(self.path_or_buf.read(321)) for i in range(self.nvar)
            ]
        elif self.format_version > 105:
            vlblist = [
                self._decode(self.path_or_buf.read(81)) for i in range(self.nvar)
            ]
        else:
            vlblist = [
                self._decode(self.path_or_buf.read(32)) for i in range(self.nvar)
            ]
        return vlblist

    def _get_nobs(self):
        if self.format_version >= 118:
            return struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0]
        else:
            return struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0]

    def _get_data_label(self):
        if self.format_version >= 118:
            strlen = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0]
            return self._decode(self.path_or_buf.read(strlen))
        elif self.format_version == 117:
            strlen = struct.unpack("b", self.path_or_buf.read(1))[0]
            return self._decode(self.path_or_buf.read(strlen))
        elif self.format_version > 105:
            return self._decode(self.path_or_buf.read(81))
        else:
            return self._decode(self.path_or_buf.read(32))

    def _get_time_stamp(self):
        if self.format_version >= 118:
            strlen = struct.unpack("b", self.path_or_buf.read(1))[0]
            return self.path_or_buf.read(strlen).decode("utf-8")
        elif self.format_version == 117:
            strlen = struct.unpack("b", self.path_or_buf.read(1))[0]
            return self._decode(self.path_or_buf.read(strlen))
        elif self.format_version > 104:
            return self._decode(self.path_or_buf.read(18))
        else:
            raise ValueError()

    def _get_seek_variable_labels(self):
        if self.format_version == 117:
            self.path_or_buf.read(8)  # <variable_labels>, throw away
            # Stata 117 data files do not follow the described format. This is
            # a work around that uses the previous label, 33 bytes for each
            # variable, 20 for the closing tag and 17 for the opening tag
            return self._seek_value_label_names + (33 * self.nvar) + 20 + 17
        elif self.format_version >= 118:
            return struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 17
        else:
            raise ValueError()

    def _read_old_header(self, first_char):
        self.format_version = struct.unpack("b", first_char)[0]
        if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
            raise ValueError(_version_error.format(version=self.format_version))
        self._set_encoding()
        self.byteorder = (
            struct.unpack("b", self.path_or_buf.read(1))[0] == 0x1 and ">" or "<"
        )
        self.filetype = struct.unpack("b", self.path_or_buf.read(1))[0]
        self.path_or_buf.read(1)  # unused

        self.nvar = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0]
        self.nobs = self._get_nobs()

        self._data_label = self._get_data_label()

        self.time_stamp = self._get_time_stamp()

        # descriptors
        if self.format_version > 108:
            typlist = [ord(self.path_or_buf.read(1)) for i in range(self.nvar)]
        else:
            buf = self.path_or_buf.read(self.nvar)
            typlistb = np.frombuffer(buf, dtype=np.uint8)
            typlist = []
            for tp in typlistb:
                if tp in self.OLD_TYPE_MAPPING:
                    typlist.append(self.OLD_TYPE_MAPPING[tp])
                else:
                    typlist.append(tp - 127)  # bytes

        try:
            self.typlist = [self.TYPE_MAP[typ] for typ in typlist]
        except ValueError:
            invalid_types = ",".join(str(x) for x in typlist)
            raise ValueError(f"cannot convert stata types [{invalid_types}]")
        try:
            self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist]
        except ValueError:
            invalid_dtypes = ",".join(str(x) for x in typlist)
            raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]")

        if self.format_version > 108:
            self.varlist = [
                self._decode(self.path_or_buf.read(33)) for i in range(self.nvar)
            ]
        else:
            self.varlist = [
                self._decode(self.path_or_buf.read(9)) for i in range(self.nvar)
            ]
        self.srtlist = struct.unpack(
            self.byteorder + ("h" * (self.nvar + 1)),
            self.path_or_buf.read(2 * (self.nvar + 1)),
        )[:-1]

        self.fmtlist = self._get_fmtlist()

        self.lbllist = self._get_lbllist()

        self._variable_labels = self._get_variable_labels()

        # ignore expansion fields (Format 105 and later)
        # When reading, read five bytes; the last four bytes now tell you
        # the size of the next read, which you discard. You then continue
        # like this until you read 5 bytes of zeros.

        if self.format_version > 104:
            while True:
                data_type = struct.unpack(
                    self.byteorder + "b", self.path_or_buf.read(1)
                )[0]
                if self.format_version > 108:
                    data_len = struct.unpack(
                        self.byteorder + "i", self.path_or_buf.read(4)
                    )[0]
                else:
                    data_len = struct.unpack(
                        self.byteorder + "h", self.path_or_buf.read(2)
                    )[0]
                if data_type == 0:
                    break
                self.path_or_buf.read(data_len)

        # necessary data to continue parsing
        self.data_location = self.path_or_buf.tell()

    def _setup_dtype(self):
        """Map between numpy and Stata dtypes"""
        if self._dtype is not None:
            return self._dtype

        dtype = []  # Convert struct data types to numpy data type
        for i, typ in enumerate(self.typlist):
            if typ in self.NUMPY_TYPE_MAP:
                dtype.append(("s" + str(i), self.byteorder + self.NUMPY_TYPE_MAP[typ]))
            else:
                dtype.append(("s" + str(i), "S" + str(typ)))
        dtype = np.dtype(dtype)
        self._dtype = dtype

        return self._dtype

    def _calcsize(self, fmt):
        return type(fmt) is int and fmt or struct.calcsize(self.byteorder + fmt)

    def _decode(self, s):
        # have bytes not strings, so must decode
        s = s.partition(b"\0")[0]
        try:
            return s.decode(self._encoding)
        except UnicodeDecodeError:
            # GH 25960, fallback to handle incorrect format produced when 117
            # files are converted to 118 files in Stata
            encoding = self._encoding
            msg = f"""
One or more strings in the dta file could not be decoded using {encoding}, and
so the fallback encoding of latin-1 is being used. This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct."""
            warnings.warn(msg, UnicodeWarning)
            return s.decode("latin-1")

    def _read_value_labels(self):
        if self._value_labels_read:
            # Don't read twice
            return
        if self.format_version <= 108:
            # Value labels are not supported in version 108 and earlier.
            self._value_labels_read = True
            self.value_label_dict = dict()
            return

        if self.format_version >= 117:
            self.path_or_buf.seek(self.seek_value_labels)
        else:
            offset = self.nobs * self._dtype.itemsize
            self.path_or_buf.seek(self.data_location + offset)

        self._value_labels_read = True
        self.value_label_dict = dict()

        while True:
            if self.format_version >= 117:
                if self.path_or_buf.read(5) == b"</val":  # <lbl>
                    break  # end of value label table

            slength = self.path_or_buf.read(4)
            if not slength:
                break  # end of value label table (format < 117)
            if self.format_version <= 117:
                labname = self._decode(self.path_or_buf.read(33))
            else:
                labname = self._decode(self.path_or_buf.read(129))
            self.path_or_buf.read(3)  # padding

            n = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0]
            txtlen = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0]
            off = np.frombuffer(
                self.path_or_buf.read(4 * n), dtype=self.byteorder + "i4", count=n
            )
            val = np.frombuffer(
                self.path_or_buf.read(4 * n), dtype=self.byteorder + "i4", count=n
            )
            ii = np.argsort(off)
            off = off[ii]
            val = val[ii]
            txt = self.path_or_buf.read(txtlen)
            self.value_label_dict[labname] = dict()
            for i in range(n):
                end = off[i + 1] if i < n - 1 else txtlen
                self.value_label_dict[labname][val[i]] = self._decode(txt[off[i] : end])
            if self.format_version >= 117:
                self.path_or_buf.read(6)  # </lbl>
        self._value_labels_read = True

    def _read_strls(self):
        self.path_or_buf.seek(self.seek_strls)
        # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
        self.GSO = {"0": ""}
        while True:
            if self.path_or_buf.read(3) != b"GSO":
                break

            if self.format_version == 117:
                v_o = struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0]
            else:
                buf = self.path_or_buf.read(12)
                # Only tested on little endian file on little endian machine.
                v_size = 2 if self.format_version == 118 else 3
                if self.byteorder == "<":
                    buf = buf[0:v_size] + buf[4 : 12 - v_size]
                else:
                    # This path may not be correct, impossible to test
                    buf = buf[0:v_size] + buf[4 + v_size :]
                v_o = struct.unpack("Q", buf)[0]
            typ = struct.unpack("B", self.path_or_buf.read(1))[0]
            length = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0]
            va = self.path_or_buf.read(length)
            if typ == 130:
                va = va[0:-1].decode(self._encoding)
            # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
            self.GSO[str(v_o)] = va

    def __next__(self):
        return self.read(nrows=self._chunksize or 1)

    def get_chunk(self, size=None):
        """
        Reads lines from Stata file and returns as dataframe

        Parameters
        ----------
        size : int, defaults to None
            Number of lines to read. If None, reads whole file.

        Returns
        -------
        DataFrame
        """
        if size is None:
            size = self._chunksize
        return self.read(nrows=size)

    @Appender(_read_method_doc)
    def read(
        self,
        nrows=None,
        convert_dates=None,
        convert_categoricals=None,
        index_col=None,
        convert_missing=None,
        preserve_dtypes=None,
        columns=None,
        order_categoricals=None,
    ):
        # Handle empty file or chunk. If reading incrementally raise
        # StopIteration. If reading the whole thing return an empty
        # data frame.
        if (self.nobs == 0) and (nrows is None):
            self._can_read_value_labels = True
            self._data_read = True
            self.close()
            return DataFrame(columns=self.varlist)

        # Handle options
        if convert_dates is None:
            convert_dates = self._convert_dates
        if convert_categoricals is None:
            convert_categoricals = self._convert_categoricals
        if convert_missing is None:
            convert_missing = self._convert_missing
        if preserve_dtypes is None:
            preserve_dtypes = self._preserve_dtypes
        if columns is None:
            columns = self._columns
        if order_categoricals is None:
            order_categoricals = self._order_categoricals
        if index_col is None:
            index_col = self._index_col

        if nrows is None:
            nrows = self.nobs

        if (self.format_version >= 117) and (not self._value_labels_read):
            self._can_read_value_labels = True
            self._read_strls()

        # Read data
        dtype = self._dtype
        max_read_len = (self.nobs - self._lines_read) * dtype.itemsize
        read_len = nrows * dtype.itemsize
        read_len = min(read_len, max_read_len)
        if read_len <= 0:
            # Iterator has finished, should never be here unless
            # we are reading the file incrementally
            if convert_categoricals:
                self._read_value_labels()
            self.close()
            raise StopIteration
        offset = self._lines_read * dtype.itemsize
        self.path_or_buf.seek(self.data_location + offset)
        read_lines = min(nrows, self.nobs - self._lines_read)
        data = np.frombuffer(
            self.path_or_buf.read(read_len), dtype=dtype, count=read_lines
        )

        self._lines_read += read_lines
        if self._lines_read == self.nobs:
            self._can_read_value_labels = True
            self._data_read = True
        # if necessary, swap the byte order to native here
        if self.byteorder != self._native_byteorder:
            data = data.byteswap().newbyteorder()

        if convert_categoricals:
            self._read_value_labels()

        if len(data) == 0:
            data = DataFrame(columns=self.varlist)
        else:
            data = DataFrame.from_records(data)
            data.columns = self.varlist

        # If index is not specified, use actual row number rather than
        # restarting at 0 for each chunk.
        if index_col is None:
            ix = np.arange(self._lines_read - read_lines, self._lines_read)
            data = data.set_index(ix)

        if columns is not None:
            try:
                data = self._do_select_columns(data, columns)
            except ValueError:
                self.close()
                raise

        # Decode strings
        for col, typ in zip(data, self.typlist):
            if type(typ) is int:
                data[col] = data[col].apply(self._decode, convert_dtype=True)

        data = self._insert_strls(data)

        cols_ = np.where(self.dtyplist)[0]

        # Convert columns (if needed) to match input type
        ix = data.index
        requires_type_conversion = False
        data_formatted = []
        for i in cols_:
            if self.dtyplist[i] is not None:
                col = data.columns[i]
                dtype = data[col].dtype

                if dtype != np.dtype(object) and dtype != self.dtyplist[i]:
                    requires_type_conversion = True
                    data_formatted.append(
                        (col, Series(data[col], ix, self.dtyplist[i]))
                    )
                else:
                    data_formatted.append((col, data[col]))
        if requires_type_conversion:
            data = DataFrame.from_dict(dict(data_formatted))
        del data_formatted

        data = self._do_convert_missing(data, convert_missing)

        if convert_dates:

            def any_startswith(x: str) -> bool:
                return any(x.startswith(fmt) for fmt in _date_formats)

            cols = np.where([any_startswith(x) for x in self.fmtlist])[0]
            for i in cols:
                col = data.columns[i]
                try:
                    data[col] = _stata_elapsed_date_to_datetime_vec(
                        data[col], self.fmtlist[i]
                    )
                except ValueError:
                    self.close()
                    raise

        if convert_categoricals and self.format_version > 108:
            data = self._do_convert_categoricals(
                data, self.value_label_dict, self.lbllist, order_categoricals
            )

        if not preserve_dtypes:
            retyped_data = []
            convert = False
            for col in data:
                dtype = data[col].dtype
                if dtype in (np.float16, np.float32):
                    dtype = np.float64
                    convert = True
                elif dtype in (np.int8, np.int16, np.int32):
                    dtype = np.int64
                    convert = True
                retyped_data.append((col, data[col].astype(dtype)))
            if convert:
                data = DataFrame.from_dict(dict(retyped_data))

        if index_col is not None:
            data = data.set_index(data.pop(index_col))

        return data

    def _do_convert_missing(self, data, convert_missing):
        # Check for missing values, and replace if found
        replacements = {}
        for i, colname in enumerate(data):
            fmt = self.typlist[i]
            if fmt not in self.VALID_RANGE:
                continue

            nmin, nmax = self.VALID_RANGE[fmt]
            series = data[colname]
            missing = np.logical_or(series < nmin, series > nmax)

            if not missing.any():
                continue

            if convert_missing:  # Replacement follows Stata notation

                missing_loc = np.argwhere(missing._ndarray_values)
                umissing, umissing_loc = np.unique(series[missing], return_inverse=True)
                replacement = Series(series, dtype=np.object)
                for j, um in enumerate(umissing):
                    missing_value = StataMissingValue(um)

                    loc = missing_loc[umissing_loc == j]
                    replacement.iloc[loc] = missing_value
            else:  # All replacements are identical
                dtype = series.dtype
                if dtype not in (np.float32, np.float64):
                    dtype = np.float64
                replacement = Series(series, dtype=dtype)
                replacement[missing] = np.nan
            replacements[colname] = replacement
        if replacements:
            columns = data.columns
            replacements = DataFrame(replacements)
            data = concat([data.drop(replacements.columns, 1), replacements], 1)
            data = data[columns]
        return data

    def _insert_strls(self, data):
        if not hasattr(self, "GSO") or len(self.GSO) == 0:
            return data
        for i, typ in enumerate(self.typlist):
            if typ != "Q":
                continue
            # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
            data.iloc[:, i] = [self.GSO[str(k)] for k in data.iloc[:, i]]
        return data

    def _do_select_columns(self, data, columns):

        if not self._column_selector_set:
            column_set = set(columns)
            if len(column_set) != len(columns):
                raise ValueError("columns contains duplicate entries")
            unmatched = column_set.difference(data.columns)
            if unmatched:
                raise ValueError(
                    "The following columns were not found in the "
                    "Stata data set: " + ", ".join(list(unmatched))
                )
            # Copy information for retained columns for later processing
            dtyplist = []
            typlist = []
            fmtlist = []
            lbllist = []
            for col in columns:
                i = data.columns.get_loc(col)
                dtyplist.append(self.dtyplist[i])
                typlist.append(self.typlist[i])
                fmtlist.append(self.fmtlist[i])
                lbllist.append(self.lbllist[i])

            self.dtyplist = dtyplist
            self.typlist = typlist
            self.fmtlist = fmtlist
            self.lbllist = lbllist
            self._column_selector_set = True

        return data[columns]

    def _do_convert_categoricals(
        self, data, value_label_dict, lbllist, order_categoricals
    ):
        """
        Converts categorical columns to Categorical type.
        """
        value_labels = list(value_label_dict.keys())
        cat_converted_data = []
        for col, label in zip(data, lbllist):
            if label in value_labels:
                # Explicit call with ordered=True
                cat_data = Categorical(data[col], ordered=order_categoricals)
                categories = []
                for category in cat_data.categories:
                    if category in value_label_dict[label]:
                        categories.append(value_label_dict[label][category])
                    else:
                        categories.append(category)  # Partially labeled
                try:
                    cat_data.categories = categories
                except ValueError:
                    vc = Series(categories).value_counts()
                    repeats = list(vc.index[vc > 1])
                    repeats = "-" * 80 + "\n" + "\n".join(repeats)
                    # GH 25772
                    msg = f"""
Value labels for column {col} are not unique. These cannot be converted to
pandas categoricals.

Either read the file with `convert_categoricals` set to False or use the
low level interface in `StataReader` to separately read the values and the
value_labels.

The repeated labels are:
{repeats}
"""
                    raise ValueError(msg)
                # TODO: is the next line needed above in the data(...) method?
                cat_data = Series(cat_data, index=data.index)
                cat_converted_data.append((col, cat_data))
            else:
                cat_converted_data.append((col, data[col]))
        data = DataFrame.from_dict(dict(cat_converted_data))
        return data

    @property
    def data_label(self):
        """
        Return data label of Stata file.
        """
        return self._data_label

    def variable_labels(self):
        """
        Return variable labels as a dict, associating each variable name
        with corresponding label.

        Returns
        -------
        dict
        """
        return dict(zip(self.varlist, self._variable_labels))

    def value_labels(self):
        """
        Return a nested dict associating each variable name with a dict that
        maps each value to its corresponding label.

        Returns
        -------
        dict
        """
        if not self._value_labels_read:
            self._read_value_labels()

        return self.value_label_dict
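

# Illustrative sketch (not part of this module): inspecting labels without
# converting the data. The file name is hypothetical.
#
# >>> with StataReader("labelled.dta") as reader:
# ...     df = reader.read(convert_categoricals=False)
# ...     var_labels = reader.variable_labels()  # name -> description
# ...     val_labels = reader.value_labels()     # label name -> {value: label}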


def _open_file_binary_write(fname):
    """
    Open a binary file or no-op if file-like.

    Parameters
    ----------
    fname : string path, path object or buffer

    Returns
    -------
    file : file-like object
        File object supporting write
    own : bool
        True if the file was created, otherwise False
    """
    if hasattr(fname, "write"):
        # if 'b' not in fname.mode:
        return fname, False
    return open(fname, "wb"), True


def _set_endianness(endianness):
    if endianness.lower() in ["<", "little"]:
        return "<"
    elif endianness.lower() in [">", "big"]:
        return ">"
    else:  # pragma : no cover
        raise ValueError(f"Endianness {endianness} not understood")


def _pad_bytes(name, length):
    """
    Take a string and pad it with null bytes until it is `length` characters
    long.
    """
    if isinstance(name, bytes):
        return name + b"\x00" * (length - len(name))
    return name + "\x00" * (length - len(name))
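

# Illustrative sketch (not part of this module): padding behaves the same for
# str and bytes input.
#
# >>> _pad_bytes(b"abc", 5)
# b'abc\x00\x00'
# >>> _pad_bytes("abc", 5)
# 'abc\x00\x00'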
1875def _convert_datetime_to_stata_type(fmt):
1876 """
1877 Convert from one of the stata date formats to a type in TYPE_MAP.
1878 """
1879 if fmt in [
1880 "tc",
1881 "%tc",
1882 "td",
1883 "%td",
1884 "tw",
1885 "%tw",
1886 "tm",
1887 "%tm",
1888 "tq",
1889 "%tq",
1890 "th",
1891 "%th",
1892 "ty",
1893 "%ty",
1894 ]:
1895 return np.float64 # Stata expects doubles for SIFs
1896 else:
1897 raise NotImplementedError(f"Format {fmt} not implemented")
1900def _maybe_convert_to_int_keys(convert_dates, varlist):
1901 new_dict = {}
1902 for key in convert_dates:
1903 if not convert_dates[key].startswith("%"): # ensure fmt has '%' prefix
1904 convert_dates[key] = "%" + convert_dates[key]
1905 if key in varlist:
1906 new_dict.update({varlist.index(key): convert_dates[key]})
1907 else:
1908 if not isinstance(key, int):
1909 raise ValueError("convert_dates key must be a column or an integer")
1910 new_dict.update({key: convert_dates[key]})
1911 return new_dict
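# Example (illustrative, not from the original source): names are mapped to
# their position in varlist and formats gain a leading '%' if missing.
#
#     >>> _maybe_convert_to_int_keys({"date": "td"}, ["x", "date"])
#     {1: '%td'}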
1914def _dtype_to_stata_type(dtype, column):
1915 """
1916 Convert dtypes to stata types. Returns the one-byte stata type code.
1917 See TYPE_MAP and comments for an explanation. This is also explained in
1918 the dta spec.
1919 1 - 244 are strings of this length
1920 Pandas Stata
1921 251 - for int8 byte
1922 252 - for int16 int
1923 253 - for int32 long
1924 254 - for float32 float
1925 255 - for float64 double
1927 If there are dates to convert, then dtype will already have the correct
1928 type inserted.
1929 """
1930 # TODO: expand to handle datetime to integer conversion
1931 if dtype.type == np.object_: # try to coerce it to the biggest string
1932 # not memory efficient, but what else could we do?
1934 itemsize = max_len_string_array(ensure_object(column.values))
1935 return max(itemsize, 1)
1936 elif dtype == np.float64:
1937 return 255
1938 elif dtype == np.float32:
1939 return 254
1940 elif dtype == np.int32:
1941 return 253
1942 elif dtype == np.int16:
1943 return 252
1944 elif dtype == np.int8:
1945 return 251
1946 else: # pragma: no cover
1947 raise NotImplementedError(f"Data type {dtype} not supported.")
1950def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False):
1951 """
1952 Map numpy dtype to stata's default format for this type. Not terribly
1953 important since users can change this in Stata. Semantics are
1955 object -> "%DDs" where DD is the length of the string. If not a string,
1956 raise ValueError
1957 float64 -> "%10.0g"
1958 float32 -> "%9.0g"
1959 int64 -> "%9.0g"
1960 int32 -> "%12.0g"
1961 int16 -> "%8.0g"
1962 int8 -> "%8.0g"
1963 strl -> "%9s"
1964 """
1965 # TODO: Refactor to combine type with format
1966 # TODO: expand this to handle a default datetime format?
1967 if dta_version < 117:
1968 max_str_len = 244
1969 else:
1970 max_str_len = 2045
1971 if force_strl:
1972 return "%9s"
1973 if dtype.type == np.object_:
1974 itemsize = max_len_string_array(ensure_object(column.values))
1975 if itemsize > max_str_len:
1976 if dta_version >= 117:
1977 return "%9s"
1978 else:
1979 raise ValueError(excessive_string_length_error.format(column.name))
1980 return "%" + str(max(itemsize, 1)) + "s"
1981 elif dtype == np.float64:
1982 return "%10.0g"
1983 elif dtype == np.float32:
1984 return "%9.0g"
1985 elif dtype == np.int32:
1986 return "%12.0g"
1987 elif dtype == np.int8 or dtype == np.int16:
1988 return "%8.0g"
1989 else: # pragma: no cover
1990 raise NotImplementedError(f"Data type {dtype} not supported.")
1993class StataWriter(StataParser):
1994 """
1995 A class for writing Stata binary dta files
1997 Parameters
1998 ----------
1999 fname : path (string), buffer or path object
2000 string, path object (pathlib.Path or py._path.local.LocalPath) or
2001 object implementing a binary write() function. If using a buffer
2002 then the buffer will not be automatically closed after the file
2003 is written.
2005 .. versionadded:: 0.23.0 support for pathlib, py.path.
2007 data : DataFrame
2008 Input to save
2009 convert_dates : dict
2010 Dictionary mapping columns containing datetime types to stata internal
2011 format to use when writing the dates. Options are 'tc', 'td', 'tm',
2012 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
2013 Datetime columns that do not have a conversion type specified will be
2014 converted to 'tc'. Raises NotImplementedError if a datetime column has
2015 timezone information
2016 write_index : bool
2017 Write the index to Stata dataset.
2018 byteorder : str
2019 Can be ">", "<", "little", or "big". default is `sys.byteorder`
2020 time_stamp : datetime
2021 A datetime to use as file creation date. Default is the current time
2022 data_label : str
2023 A label for the data set. Must be 80 characters or smaller.
2024 variable_labels : dict
2025 Dictionary containing columns as keys and variable labels as values.
2026 Each label must be 80 characters or smaller.
2028 Returns
2029 -------
2030 writer : StataWriter instance
2031 The StataWriter instance has a write_file method, which will
2032 write the file to the given `fname`.
2034 Raises
2035 ------
2036 NotImplementedError
2037 * If datetimes contain timezone information
2038 ValueError
2039 * Columns listed in convert_dates are neither datetime64[ns]
2040 nor datetime.datetime
2041 * Column dtype is not representable in Stata
2042 * Column listed in convert_dates is not in DataFrame
2043 * Categorical label contains more than 32,000 characters
2045 Examples
2046 --------
2047 >>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b'])
2048 >>> writer = StataWriter('./data_file.dta', data)
2049 >>> writer.write_file()
2051 Or with dates
2052 >>> from datetime import datetime
2053 >>> data = pd.DataFrame([[datetime(2000,1,1)]], columns=['date'])
2054 >>> writer = StataWriter('./date_data_file.dta', data, {'date' : 'tw'})
2055 >>> writer.write_file()
2056 """
2058 _max_string_length = 244
2059 _encoding = "latin-1"
2061 def __init__(
2062 self,
2063 fname,
2064 data,
2065 convert_dates=None,
2066 write_index=True,
2067 byteorder=None,
2068 time_stamp=None,
2069 data_label=None,
2070 variable_labels=None,
2071 ):
2072 super().__init__()
2073 self._convert_dates = {} if convert_dates is None else convert_dates
2074 self._write_index = write_index
2075 self._time_stamp = time_stamp
2076 self._data_label = data_label
2077 self._variable_labels = variable_labels
2078 self._own_file = True
2079 # attach nobs, nvars, data, varlist, typlist
2080 self._prepare_pandas(data)
2082 if byteorder is None:
2083 byteorder = sys.byteorder
2084 self._byteorder = _set_endianness(byteorder)
2085 self._fname = stringify_path(fname)
2086 self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8}
2087 self._converted_names = {}
2089 def _write(self, to_write):
2090 """
2091 Helper to call encode before writing to file for Python 3 compat.
2092 """
2093 self._file.write(to_write.encode(self._encoding or self._default_encoding))
2095 def _prepare_categoricals(self, data):
2096 """Check for categorical columns, retain categorical information for
2097 Stata file and convert categorical data to int"""
2099 is_cat = [is_categorical_dtype(data[col]) for col in data]
2100 self._is_col_cat = is_cat
2101 self._value_labels = []
2102 if not any(is_cat):
2103 return data
2105 get_base_missing_value = StataMissingValue.get_base_missing_value
2106 data_formatted = []
2107 for col, col_is_cat in zip(data, is_cat):
2108 if col_is_cat:
2109 svl = StataValueLabel(data[col], encoding=self._encoding)
2110 self._value_labels.append(svl)
2111 dtype = data[col].cat.codes.dtype
2112 if dtype == np.int64:
2113 raise ValueError(
2114 "It is not possible to export "
2115 "int64-based categorical data to Stata."
2116 )
2117 values = data[col].cat.codes.values.copy()
2119 # Upcast if needed so that correct missing values can be set
2120 if values.max() >= get_base_missing_value(dtype):
2121 if dtype == np.int8:
2122 dtype = np.int16
2123 elif dtype == np.int16:
2124 dtype = np.int32
2125 else:
2126 dtype = np.float64
2127 values = np.array(values, dtype=dtype)
2129 # Replace missing values with Stata missing value for type
2130 values[values == -1] = get_base_missing_value(dtype)
2131 data_formatted.append((col, values))
2132 else:
2133 data_formatted.append((col, data[col]))
2134 return DataFrame.from_dict(dict(data_formatted))
2136 def _replace_nans(self, data):
2138 """Checks floating point data columns for nans, and replaces these with
2139 the generic Stata missing value (.)"""
2140 for c in data:
2141 dtype = data[c].dtype
2142 if dtype in (np.float32, np.float64):
2143 if dtype == np.float32:
2144 replacement = self.MISSING_VALUES["f"]
2145 else:
2146 replacement = self.MISSING_VALUES["d"]
2147 data[c] = data[c].fillna(replacement)
2149 return data
2151 def _update_strl_names(self):
2152 """No-op, forward compatibility"""
2153 pass
2155 def _validate_variable_name(self, name):
2156 """
2157 Validate variable names for Stata export.
2159 Parameters
2160 ----------
2161 name : str
2162 Variable name
2164 Returns
2165 -------
2166 str
2167 The validated name with invalid characters replaced with
2168 underscores.
2170 Notes
2171 -----
2172 Stata 114 and 117 support ascii characters in a-z, A-Z, 0-9
2173 and _.
2174 """
2175 for c in name:
2176 if (
2177 (c < "A" or c > "Z")
2178 and (c < "a" or c > "z")
2179 and (c < "0" or c > "9")
2180 and c != "_"
2181 ):
2182 name = name.replace(c, "_")
2183 return name
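# For example (illustrative): _validate_variable_name("my var!") returns
# "my_var_", since the space and "!" fall outside [a-zA-Z0-9_].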
2185 def _check_column_names(self, data):
2186 """
2187 Checks column names to ensure that they are valid Stata column names.
2188 This includes checks for:
2189 * Non-string names
2190 * Stata keywords
2191 * Variables that start with numbers
2192 * Variables with names that are too long
2194 When an illegal variable name is detected, it is converted, and if
2195 dates are exported, the variable name is propagated to the date
2196 conversion dictionary
2197 """
2198 converted_names = {}
2199 columns = list(data.columns)
2200 original_columns = columns[:]
2202 duplicate_var_id = 0
2203 for j, name in enumerate(columns):
2204 orig_name = name
2205 if not isinstance(name, str):
2206 name = str(name)
2208 name = self._validate_variable_name(name)
2210 # Variable name must not be a reserved word
2211 if name in self.RESERVED_WORDS:
2212 name = "_" + name
2214 # Variable name may not start with a number
2215 if name[0] >= "0" and name[0] <= "9":
2216 name = "_" + name
2218 name = name[: min(len(name), 32)]
2220 if not name == orig_name:
2221 # check for duplicates
2222 while columns.count(name) > 0:
2223 # prepend ascending number to avoid duplicates
2224 name = "_" + str(duplicate_var_id) + name
2225 name = name[: min(len(name), 32)]
2226 duplicate_var_id += 1
2227 converted_names[orig_name] = name
2229 columns[j] = name
2231 data.columns = columns
2233 # Check date conversion, and fix key if needed
2234 if self._convert_dates:
2235 for c, o in zip(columns, original_columns):
2236 if c != o:
2237 self._convert_dates[c] = self._convert_dates[o]
2238 del self._convert_dates[o]
2240 if converted_names:
2241 conversion_warning = []
2242 for orig_name, name in converted_names.items():
2243 # need to possibly encode the orig name if it is unicode
2244 try:
2245 orig_name = orig_name.encode("utf-8")
2246 except (UnicodeDecodeError, AttributeError):
2247 pass
2248 msg = f"{orig_name} -> {name}"
2249 conversion_warning.append(msg)
2251 ws = invalid_name_doc.format("\n ".join(conversion_warning))
2252 warnings.warn(ws, InvalidColumnName)
2254 self._converted_names = converted_names
2255 self._update_strl_names()
2257 return data
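# Illustrative examples of the renames applied above (not from the source):
#
#     "1x"  -> "_1x"  (may not start with a number)
#     "if"  -> "_if"  (assuming "if" is in RESERVED_WORDS)
#     "a b" -> "a_b"  (invalid characters become underscores)
#
# All conversions are reported in a single InvalidColumnName warning.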
2259 def _set_formats_and_types(self, dtypes):
2260 self.typlist = []
2261 self.fmtlist = []
2262 for col, dtype in dtypes.items():
2263 self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, self.data[col]))
2264 self.typlist.append(_dtype_to_stata_type(dtype, self.data[col]))
2266 def _prepare_pandas(self, data):
2267 # NOTE: we might need a different API / class for pandas objects so
2268 # we can set different semantics - handle this with a PR to pandas.io
2270 data = data.copy()
2272 if self._write_index:
2273 data = data.reset_index()
2275 # Ensure column names are strings
2276 data = self._check_column_names(data)
2278 # Check columns for compatibility with stata, upcast if necessary
2279 # Raise if outside the supported range
2280 data = _cast_to_stata_types(data)
2282 # Replace NaNs with Stata missing values
2283 data = self._replace_nans(data)
2285 # Convert categoricals to int data, and strip labels
2286 data = self._prepare_categoricals(data)
2288 self.nobs, self.nvar = data.shape
2289 self.data = data
2290 self.varlist = data.columns.tolist()
2292 dtypes = data.dtypes
2294 # Ensure all date columns are converted
2295 for col in data:
2296 if col in self._convert_dates:
2297 continue
2298 if is_datetime64_dtype(data[col]):
2299 self._convert_dates[col] = "tc"
2301 self._convert_dates = _maybe_convert_to_int_keys(
2302 self._convert_dates, self.varlist
2303 )
2304 for key in self._convert_dates:
2305 new_type = _convert_datetime_to_stata_type(self._convert_dates[key])
2306 dtypes[key] = np.dtype(new_type)
2308 # Verify object arrays are strings and encode to bytes
2309 self._encode_strings()
2311 self._set_formats_and_types(dtypes)
2313 # set the given format for the datetime cols
2314 if self._convert_dates is not None:
2315 for key in self._convert_dates:
2316 self.fmtlist[key] = self._convert_dates[key]
2318 def _encode_strings(self):
2319 """
2320 Encode strings in dta-specific encoding
2322 Do not encode columns marked for date conversion or for strL
2323 conversion. The strL converter independently handles conversion and
2324 also accepts empty string arrays.
2325 """
2326 convert_dates = self._convert_dates
2327 # _convert_strl is not available in dta 114
2328 convert_strl = getattr(self, "_convert_strl", [])
2329 for i, col in enumerate(self.data):
2330 # Skip columns marked for date conversion or strl conversion
2331 if i in convert_dates or col in convert_strl:
2332 continue
2333 column = self.data[col]
2334 dtype = column.dtype
2335 if dtype.type == np.object_:
2336 inferred_dtype = infer_dtype(column, skipna=True)
2337 if not ((inferred_dtype in ("string", "unicode")) or len(column) == 0):
2338 col = column.name
2339 raise ValueError(
2340 f"""\
2341Column `{col}` cannot be exported.\n\nOnly string-like object arrays
2342containing all strings or a mix of strings and None can be exported.
2343Object arrays containing only null values are prohibited. Other object
2344types cannot be exported and must first be converted to one of the
2345supported types."""
2346 )
2347 encoded = self.data[col].str.encode(self._encoding)
2348 # If larger than _max_string_length do nothing
2349 if (
2350 max_len_string_array(ensure_object(encoded.values))
2351 <= self._max_string_length
2352 ):
2353 self.data[col] = encoded
2355 def write_file(self):
2356 self._file, self._own_file = _open_file_binary_write(self._fname)
2357 try:
2358 self._write_header(data_label=self._data_label, time_stamp=self._time_stamp)
2359 self._write_map()
2360 self._write_variable_types()
2361 self._write_varnames()
2362 self._write_sortlist()
2363 self._write_formats()
2364 self._write_value_label_names()
2365 self._write_variable_labels()
2366 self._write_expansion_fields()
2367 self._write_characteristics()
2368 self._prepare_data()
2369 self._write_data()
2370 self._write_strls()
2371 self._write_value_labels()
2372 self._write_file_close_tag()
2373 self._write_map()
2374 except Exception as exc:
2375 self._close()
2376 if self._own_file:
2377 try:
2378 os.unlink(self._fname)
2379 except OSError:
2380 warnings.warn(
2381 f"This save was not successful but {self._fname} could not "
2382 "be deleted. This file is not valid.",
2383 ResourceWarning,
2384 )
2385 raise exc
2386 else:
2387 self._close()
2389 def _close(self):
2390 """
2391 Close the file if it was created by the writer.
2393 If a buffer or file-like object was passed in, for example a GzipFile,
2394 then leave this file open for the caller to close. In either case,
2395 attempt to flush the file contents to ensure they are written to disk
2396 (if supported)
2397 """
2398 # Some file-like objects might not support flush
2399 try:
2400 self._file.flush()
2401 except AttributeError:
2402 pass
2403 if self._own_file:
2404 self._file.close()
2406 def _write_map(self):
2407 """No-op, future compatibility"""
2408 pass
2410 def _write_file_close_tag(self):
2411 """No-op, future compatibility"""
2412 pass
2414 def _write_characteristics(self):
2415 """No-op, future compatibility"""
2416 pass
2418 def _write_strls(self):
2419 """No-op, future compatibility"""
2420 pass
2422 def _write_expansion_fields(self):
2423 """Write 5 zeros for expansion fields"""
2424 self._write(_pad_bytes("", 5))
2426 def _write_value_labels(self):
2427 for vl in self._value_labels:
2428 self._file.write(vl.generate_value_label(self._byteorder))
2430 def _write_header(self, data_label=None, time_stamp=None):
2431 byteorder = self._byteorder
2432 # ds_format - just use 114
2433 self._file.write(struct.pack("b", 114))
2434 # byteorder
2435 self._write("\x01" if byteorder == ">" else "\x02")
2436 # filetype
2437 self._write("\x01")
2438 # unused
2439 self._write("\x00")
2440 # number of vars, 2 bytes
2441 self._file.write(struct.pack(byteorder + "h", self.nvar)[:2])
2442 # number of obs, 4 bytes
2443 self._file.write(struct.pack(byteorder + "i", self.nobs)[:4])
2444 # data label 81 bytes, char, null terminated
2445 if data_label is None:
2446 self._file.write(self._null_terminate(_pad_bytes("", 80)))
2447 else:
2448 self._file.write(self._null_terminate(_pad_bytes(data_label[:80], 80)))
2449 # time stamp, 18 bytes, char, null terminated
2450 # format dd Mon yyyy hh:mm
2451 if time_stamp is None:
2452 time_stamp = datetime.datetime.now()
2453 elif not isinstance(time_stamp, datetime.datetime):
2454 raise ValueError("time_stamp should be datetime type")
2455 # GH #13856
2456 # Avoid locale-specific month conversion
2457 months = [
2458 "Jan",
2459 "Feb",
2460 "Mar",
2461 "Apr",
2462 "May",
2463 "Jun",
2464 "Jul",
2465 "Aug",
2466 "Sep",
2467 "Oct",
2468 "Nov",
2469 "Dec",
2470 ]
2471 month_lookup = {i + 1: month for i, month in enumerate(months)}
2472 ts = (
2473 time_stamp.strftime("%d ")
2474 + month_lookup[time_stamp.month]
2475 + time_stamp.strftime(" %Y %H:%M")
2476 )
2477 self._file.write(self._null_terminate(ts))
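# For example (illustrative), a time_stamp of datetime(2020, 2, 29, 14, 7)
# is written as "29 Feb 2020 14:07" regardless of the active locale.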
2479 def _write_variable_types(self):
2480 for typ in self.typlist:
2481 self._file.write(struct.pack("B", typ))
2483 def _write_varnames(self):
2484 # varlist names are checked by _check_column_names
2485 # varlist, requires null terminated
2486 for name in self.varlist:
2487 name = self._null_terminate(name, True)
2488 name = _pad_bytes(name[:32], 33)
2489 self._write(name)
2491 def _write_sortlist(self):
2492 # srtlist, 2*(nvar+1), int array, encoded by byteorder
2493 srtlist = _pad_bytes("", 2 * (self.nvar + 1))
2494 self._write(srtlist)
2496 def _write_formats(self):
2497 # fmtlist, 49*nvar, char array
2498 for fmt in self.fmtlist:
2499 self._write(_pad_bytes(fmt, 49))
2501 def _write_value_label_names(self):
2502 # lbllist, 33*nvar, char array
2503 for i in range(self.nvar):
2504 # Use variable name when categorical
2505 if self._is_col_cat[i]:
2506 name = self.varlist[i]
2507 name = self._null_terminate(name, True)
2508 name = _pad_bytes(name[:32], 33)
2509 self._write(name)
2510 else: # Default is empty label
2511 self._write(_pad_bytes("", 33))
2513 def _write_variable_labels(self):
2514 # Missing labels are 80 blank characters plus null termination
2515 blank = _pad_bytes("", 81)
2517 if self._variable_labels is None:
2518 for i in range(self.nvar):
2519 self._write(blank)
2520 return
2522 for col in self.data:
2523 if col in self._variable_labels:
2524 label = self._variable_labels[col]
2525 if len(label) > 80:
2526 raise ValueError("Variable labels must be 80 characters or fewer")
2527 is_latin1 = all(ord(c) < 256 for c in label)
2528 if not is_latin1:
2529 raise ValueError(
2530 "Variable labels must contain only characters that "
2531 "can be encoded in Latin-1"
2532 )
2533 self._write(_pad_bytes(label, 81))
2534 else:
2535 self._write(blank)
2537 def _convert_strls(self, data):
2538 """No-op, future compatibility"""
2539 return data
2541 def _prepare_data(self):
2542 data = self.data
2543 typlist = self.typlist
2544 convert_dates = self._convert_dates
2545 # 1. Convert dates
2546 if self._convert_dates is not None:
2547 for i, col in enumerate(data):
2548 if i in convert_dates:
2549 data[col] = _datetime_to_stata_elapsed_vec(
2550 data[col], self.fmtlist[i]
2551 )
2552 # 2. Convert strls
2553 data = self._convert_strls(data)
2555 # 3. Convert bad string data to '' and pad to correct length
2556 dtypes = {}
2557 native_byteorder = self._byteorder == _set_endianness(sys.byteorder)
2558 for i, col in enumerate(data):
2559 typ = typlist[i]
2560 if typ <= self._max_string_length:
2561 data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,))
2562 stype = f"S{typ}"
2563 dtypes[col] = stype
2564 data[col] = data[col].astype(stype)
2565 else:
2566 dtype = data[col].dtype
2567 if not native_byteorder:
2568 dtype = dtype.newbyteorder(self._byteorder)
2569 dtypes[col] = dtype
2571 self.data = data.to_records(index=False, column_dtypes=dtypes)
2573 def _write_data(self):
2574 data = self.data
2575 self._file.write(data.tobytes())
2577 def _null_terminate(self, s, as_string=False):
2578 null_byte = "\x00"
2579 s += null_byte
2581 if not as_string:
2582 s = s.encode(self._encoding)
2584 return s
2587def _dtype_to_stata_type_117(dtype, column, force_strl):
2588 """
2589 Converts dtypes to stata types. Returns the stata 117 type code.
2590 See TYPE_MAP and comments for an explanation. This is also explained in
2591 the dta spec.
2592 1 - 2045 are strings of this length
2593 Pandas Stata
2594 32768 - for object strL
2595 65526 - for float64 double
2596 65527 - for float32 float
2597 65528 - for int32 long
2598 65529 - for int16 int
2599 65530 - for int8 byte
2601 If there are dates to convert, then dtype will already have the correct
2602 type inserted.
2603 """
2604 # TODO: expand to handle datetime to integer conversion
2605 if force_strl:
2606 return 32768
2607 if dtype.type == np.object_: # try to coerce it to the biggest string
2608 # not memory efficient, but what else could we do?
2610 itemsize = max_len_string_array(ensure_object(column.values))
2611 itemsize = max(itemsize, 1)
2612 if itemsize <= 2045:
2613 return itemsize
2614 return 32768
2615 elif dtype == np.float64:
2616 return 65526
2617 elif dtype == np.float32:
2618 return 65527
2619 elif dtype == np.int32:
2620 return 65528
2621 elif dtype == np.int16:
2622 return 65529
2623 elif dtype == np.int8:
2624 return 65530
2625 else: # pragma: no cover
2626 raise NotImplementedError(f"Data type {dtype} not supported.")
2629def _pad_bytes_new(name, length):
2630 """
2631 Takes a bytes instance and pads it with null bytes until it is `length` bytes.
2632 """
2633 if isinstance(name, str):
2634 name = bytes(name, "utf-8")
2635 return name + b"\x00" * (length - len(name))
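# e.g. (illustrative) _pad_bytes_new("ab", 4) returns b"ab\x00\x00"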
2638class StataStrLWriter:
2639 """
2640 Converter for Stata StrLs
2642 Stata StrLs map 8-byte values to strings, which are stored using a
2643 dictionary-like format where each string is keyed to two values (v, o).
2645 Parameters
2646 ----------
2647 df : DataFrame
2648 DataFrame to convert
2649 columns : list
2650 List of columns names to convert to StrL
2651 version : int, optional
2652 dta version. Currently supports 117, 118 and 119
2653 byteorder : str, optional
2654 Can be ">", "<", "little", or "big". default is `sys.byteorder`
2656 Notes
2657 -----
2658 Supports creation of the StrL block of a dta file for dta versions
2659 117, 118 and 119. These differ in how the GSO is stored. 118 and
2660 119 store the GSO lookup value as a uint32 and a uint64, while 117
2661 uses two uint32s. 118 and 119 also encode all strings as unicode
2662 which is required by the format. 117 uses 'latin-1', a fixed-width
2663 encoding that extends the 7-bit ASCII table with an additional 128
2664 characters.
2665 """
2667 def __init__(self, df, columns, version=117, byteorder=None):
2668 if version not in (117, 118, 119):
2669 raise ValueError("Only dta versions 117, 118 and 119 supported")
2670 self._dta_ver = version
2672 self.df = df
2673 self.columns = columns
2674 self._gso_table = {"": (0, 0)}
2675 if byteorder is None:
2676 byteorder = sys.byteorder
2677 self._byteorder = _set_endianness(byteorder)
2679 gso_v_type = "I" # uint32
2680 gso_o_type = "Q" # uint64
2681 self._encoding = "utf-8"
2682 if version == 117:
2683 o_size = 4
2684 gso_o_type = "I" # 117 used uint32
2685 self._encoding = "latin-1"
2686 elif version == 118:
2687 o_size = 6
2688 else: # version == 119
2689 o_size = 5
2690 self._o_offset = 2 ** (8 * (8 - o_size))
2691 self._gso_o_type = gso_o_type
2692 self._gso_v_type = gso_v_type
2694 def _convert_key(self, key):
2695 v, o = key
2696 return v + self._o_offset * o
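# Worked example (illustrative): for dta 118, o_size = 6, so
# _o_offset = 2 ** (8 * (8 - 6)) = 65536, and the key (v=2, o=3) encodes to
# 2 + 3 * 65536 = 196610, i.e. v in the low bytes and o above it.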
2698 def generate_table(self):
2699 """
2700 Generates the GSO lookup table for the DataFrame
2702 Returns
2703 -------
2704 gso_table : dict
2705 Ordered dictionary using the strings found as keys
2706 and their lookup positions (v,o) as values
2707 gso_df : DataFrame
2708 DataFrame where strl columns have been converted to
2709 (v,o) values
2711 Notes
2712 -----
2713 Modifies the DataFrame in-place.
2715 The DataFrame returned encodes the (v,o) values as uint64s. The
2716 encoding depends on the dta version, and can be expressed as
2718 enc = v + o * 2 ** ((8 - o_size) * 8)
2720 so that v is stored in the lower bits and o is in the upper
2721 bits. o_size is
2723 * 117: 4
2724 * 118: 6
2725 * 119: 5
2726 """
2728 gso_table = self._gso_table
2729 gso_df = self.df
2730 columns = list(gso_df.columns)
2731 selected = gso_df[self.columns]
2732 col_index = [(col, columns.index(col)) for col in self.columns]
2733 keys = np.empty(selected.shape, dtype=np.uint64)
2734 for o, (idx, row) in enumerate(selected.iterrows()):
2735 for j, (col, v) in enumerate(col_index):
2736 val = row[col]
2737 # Allow columns with mixed str and None (GH 23633)
2738 val = "" if val is None else val
2739 key = gso_table.get(val, None)
2740 if key is None:
2741 # Stata uses 1-based ("human") numbering for v and o
2742 key = (v + 1, o + 1)
2743 gso_table[val] = key
2744 keys[o, j] = self._convert_key(key)
2745 for i, col in enumerate(self.columns):
2746 gso_df[col] = keys[:, i]
2748 return gso_table, gso_df
2750 def generate_blob(self, gso_table):
2751 """
2752 Generates the binary blob of GSOs that is written to the dta file.
2754 Parameters
2755 ----------
2756 gso_table : dict
2757 Ordered dictionary (str, vo)
2759 Returns
2760 -------
2761 gso : bytes
2762 Binary content of dta file to be placed between strl tags
2764 Notes
2765 -----
2766 Output format depends on dta version. 117 uses two uint32s to
2767 express v and o while 118+ uses a uint32 for v and a uint64 for o.
2768 """
2769 # Format information
2770 # Length includes null term
2771 # 117
2772 # GSOvvvvooootllllxxxxxxxxxxxxxxx...x
2773 # 3 u4 u4 u1 u4 string + null term
2774 #
2775 # 118, 119
2776 # GSOvvvvooooooootllllxxxxxxxxxxxxxxx...x
2777 # 3 u4 u8 u1 u4 string + null term
2779 bio = BytesIO()
2780 gso = bytes("GSO", "ascii")
2781 gso_type = struct.pack(self._byteorder + "B", 130)
2782 null = struct.pack(self._byteorder + "B", 0)
2783 v_type = self._byteorder + self._gso_v_type
2784 o_type = self._byteorder + self._gso_o_type
2785 len_type = self._byteorder + "I"
2786 for strl, vo in gso_table.items():
2787 if vo == (0, 0):
2788 continue
2789 v, o = vo
2791 # GSO
2792 bio.write(gso)
2794 # vvvv
2795 bio.write(struct.pack(v_type, v))
2797 # oooo / oooooooo
2798 bio.write(struct.pack(o_type, o))
2800 # t
2801 bio.write(gso_type)
2803 # llll
2804 utf8_string = bytes(strl, "utf-8")
2805 bio.write(struct.pack(len_type, len(utf8_string) + 1))
2807 # xxx...xxx
2808 bio.write(utf8_string)
2809 bio.write(null)
2811 bio.seek(0)
2812 return bio.read()
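# Minimal usage sketch (assumes a DataFrame with a single string column "s"):
#
#     >>> df = DataFrame({"s": ["a", "b", "a"]})
#     >>> ssw = StataStrLWriter(df, ["s"], version=117)
#     >>> table, converted = ssw.generate_table()
#     >>> blob = ssw.generate_blob(table)
#
# "s" now holds uint64 (v, o) keys and blob holds the GSO block bytes.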
2815class StataWriter117(StataWriter):
2816 """
2817 A class for writing Stata binary dta files in Stata 13 format (117)
2819 .. versionadded:: 0.23.0
2821 Parameters
2822 ----------
2823 fname : path (string), buffer or path object
2824 string, path object (pathlib.Path or py._path.local.LocalPath) or
2825 object implementing a binary write() function. If using a buffer
2826 then the buffer will not be automatically closed after the file
2827 is written.
2828 data : DataFrame
2829 Input to save
2830 convert_dates : dict
2831 Dictionary mapping columns containing datetime types to stata internal
2832 format to use when writing the dates. Options are 'tc', 'td', 'tm',
2833 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
2834 Datetime columns that do not have a conversion type specified will be
2835 converted to 'tc'. Raises NotImplementedError if a datetime column has
2836 timezone information
2837 write_index : bool
2838 Write the index to Stata dataset.
2839 byteorder : str
2840 Can be ">", "<", "little", or "big". default is `sys.byteorder`
2841 time_stamp : datetime
2842 A datetime to use as file creation date. Default is the current time
2843 data_label : str
2844 A label for the data set. Must be 80 characters or smaller.
2845 variable_labels : dict
2846 Dictionary containing columns as keys and variable labels as values.
2847 Each label must be 80 characters or smaller.
2848 convert_strl : list
2849 List of columns names to convert to Stata StrL format. Columns with
2850 more than 2045 characters are automatically written as StrL.
2851 Smaller columns can be converted by including the column name. Using
2852 StrLs can reduce output file size when strings are longer than 8
2853 characters, and either frequently repeated or sparse.
2855 Returns
2856 -------
2857 writer : StataWriter117 instance
2858 The StataWriter117 instance has a write_file method, which will
2859 write the file to the given `fname`.
2861 Raises
2862 ------
2863 NotImplementedError
2864 * If datetimes contain timezone information
2865 ValueError
2866 * Columns listed in convert_dates are neither datetime64[ns]
2867 nor datetime.datetime
2868 * Column dtype is not representable in Stata
2869 * Column listed in convert_dates is not in DataFrame
2870 * Categorical label contains more than 32,000 characters
2872 Examples
2873 --------
2874 >>> from pandas.io.stata import StataWriter117
2875 >>> data = pd.DataFrame([[1.0, 1, 'a']], columns=['a', 'b', 'c'])
2876 >>> writer = StataWriter117('./data_file.dta', data)
2877 >>> writer.write_file()
2879 Or with long strings stored in strl format
2881 >>> data = pd.DataFrame([['A relatively long string'], [''], ['']],
2882 ... columns=['strls'])
2883 >>> writer = StataWriter117('./data_file_with_long_strings.dta', data,
2884 ... convert_strl=['strls'])
2885 >>> writer.write_file()
2886 """
2888 _max_string_length = 2045
2889 _dta_version = 117
2891 def __init__(
2892 self,
2893 fname,
2894 data,
2895 convert_dates=None,
2896 write_index=True,
2897 byteorder=None,
2898 time_stamp=None,
2899 data_label=None,
2900 variable_labels=None,
2901 convert_strl=None,
2902 ):
2903 # Shallow copy since convert_strl might be modified later
2904 self._convert_strl = [] if convert_strl is None else convert_strl[:]
2906 super().__init__(
2907 fname,
2908 data,
2909 convert_dates,
2910 write_index,
2911 byteorder=byteorder,
2912 time_stamp=time_stamp,
2913 data_label=data_label,
2914 variable_labels=variable_labels,
2915 )
2916 self._map = None
2917 self._strl_blob = None
2919 @staticmethod
2920 def _tag(val, tag):
2921 """Surround val with <tag></tag>"""
2922 if isinstance(val, str):
2923 val = bytes(val, "utf-8")
2924 return bytes("<" + tag + ">", "utf-8") + val + bytes("</" + tag + ">", "utf-8")
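# e.g. (illustrative) _tag(b"ab", "K") returns b"<K>ab</K>"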
2926 def _update_map(self, tag):
2927 """Update map location for tag with file position"""
2928 self._map[tag] = self._file.tell()
2930 def _write_header(self, data_label=None, time_stamp=None):
2931 """Write the file header"""
2932 byteorder = self._byteorder
2933 self._file.write(bytes("<stata_dta>", "utf-8"))
2934 bio = BytesIO()
2935 # ds_format - 117
2936 bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release"))
2937 # byteorder
2938 bio.write(self._tag("MSF" if byteorder == ">" else "LSF", "byteorder"))
2939 # number of vars, 2 bytes in 117 and 118, 4 bytes in 119
2940 nvar_type = "H" if self._dta_version <= 118 else "I"
2941 bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K"))
2942 # 117 uses 4 bytes, 118 and 119 use 8
2943 nobs_size = "I" if self._dta_version == 117 else "Q"
2944 bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N"))
2945 # data label 81 bytes, char, null terminated
2946 label = data_label[:80] if data_label is not None else ""
2947 label = label.encode(self._encoding)
2948 label_size = "B" if self._dta_version == 117 else "H"
2949 label_len = struct.pack(byteorder + label_size, len(label))
2950 label = label_len + label
2951 bio.write(self._tag(label, "label"))
2952 # time stamp, 18 bytes, char, null terminated
2953 # format dd Mon yyyy hh:mm
2954 if time_stamp is None:
2955 time_stamp = datetime.datetime.now()
2956 elif not isinstance(time_stamp, datetime.datetime):
2957 raise ValueError("time_stamp should be datetime type")
2958 # Avoid locale-specific month conversion
2959 months = [
2960 "Jan",
2961 "Feb",
2962 "Mar",
2963 "Apr",
2964 "May",
2965 "Jun",
2966 "Jul",
2967 "Aug",
2968 "Sep",
2969 "Oct",
2970 "Nov",
2971 "Dec",
2972 ]
2973 month_lookup = {i + 1: month for i, month in enumerate(months)}
2974 ts = (
2975 time_stamp.strftime("%d ")
2976 + month_lookup[time_stamp.month]
2977 + time_stamp.strftime(" %Y %H:%M")
2978 )
2979 # '\x11' added due to inspection of Stata file
2980 ts = b"\x11" + bytes(ts, "utf-8")
2981 bio.write(self._tag(ts, "timestamp"))
2982 bio.seek(0)
2983 self._file.write(self._tag(bio.read(), "header"))
2985 def _write_map(self):
2986 """Called twice during file write. The first populates the values in
2987 the map with 0s. The second call writes the final map locations when
2988 all blocks have been written."""
2989 if self._map is None:
2990 self._map = {
2991 "stata_data": 0,
2992 "map": self._file.tell(),
2993 "variable_types": 0,
2994 "varnames": 0,
2995 "sortlist": 0,
2996 "formats": 0,
2997 "value_label_names": 0,
2998 "variable_labels": 0,
2999 "characteristics": 0,
3000 "data": 0,
3001 "strls": 0,
3002 "value_labels": 0,
3003 "stata_data_close": 0,
3004 "end-of-file": 0,
3005 }
3008 # Move to start of map
3009 self._file.seek(self._map["map"])
3010 bio = BytesIO()
3011 for val in self._map.values():
3012 bio.write(struct.pack(self._byteorder + "Q", val))
3013 bio.seek(0)
3014 self._file.write(self._tag(bio.read(), "map"))
3016 def _write_variable_types(self):
3017 self._update_map("variable_types")
3018 bio = BytesIO()
3019 for typ in self.typlist:
3020 bio.write(struct.pack(self._byteorder + "H", typ))
3021 bio.seek(0)
3022 self._file.write(self._tag(bio.read(), "variable_types"))
3024 def _write_varnames(self):
3025 self._update_map("varnames")
3026 bio = BytesIO()
3027 # 118 scales by 4 to accommodate utf-8 data worst case encoding
3028 vn_len = 32 if self._dta_version == 117 else 128
3029 for name in self.varlist:
3030 name = self._null_terminate(name, True)
3031 name = _pad_bytes_new(name[:32].encode(self._encoding), vn_len + 1)
3032 bio.write(name)
3033 bio.seek(0)
3034 self._file.write(self._tag(bio.read(), "varnames"))
3036 def _write_sortlist(self):
3037 self._update_map("sortlist")
3038 sort_size = 2 if self._dta_version < 119 else 4
3039 self._file.write(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist"))
3041 def _write_formats(self):
3042 self._update_map("formats")
3043 bio = BytesIO()
3044 fmt_len = 49 if self._dta_version == 117 else 57
3045 for fmt in self.fmtlist:
3046 bio.write(_pad_bytes_new(fmt.encode(self._encoding), fmt_len))
3047 bio.seek(0)
3048 self._file.write(self._tag(bio.read(), "formats"))
3050 def _write_value_label_names(self):
3051 self._update_map("value_label_names")
3052 bio = BytesIO()
3053 # 118 scales by 4 to accommodate utf-8 data worst case encoding
3054 vl_len = 32 if self._dta_version == 117 else 128
3055 for i in range(self.nvar):
3056 # Use variable name when categorical
3057 name = "" # default name
3058 if self._is_col_cat[i]:
3059 name = self.varlist[i]
3060 name = self._null_terminate(name, True)
3061 name = _pad_bytes_new(name[:32].encode(self._encoding), vl_len + 1)
3062 bio.write(name)
3063 bio.seek(0)
3064 self._file.write(self._tag(bio.read(), "value_label_names"))
3066 def _write_variable_labels(self):
3067 # Missing labels are 80 blank characters plus null termination
3068 self._update_map("variable_labels")
3069 bio = BytesIO()
3070 # 118 scales by 4 to accommodate utf-8 data worst case encoding
3071 vl_len = 80 if self._dta_version == 117 else 320
3072 blank = _pad_bytes_new("", vl_len + 1)
3074 if self._variable_labels is None:
3075 for _ in range(self.nvar):
3076 bio.write(blank)
3077 bio.seek(0)
3078 self._file.write(self._tag(bio.read(), "variable_labels"))
3079 return
3081 for col in self.data:
3082 if col in self._variable_labels:
3083 label = self._variable_labels[col]
3084 if len(label) > 80:
3085 raise ValueError("Variable labels must be 80 characters or fewer")
3086 try:
3087 encoded = label.encode(self._encoding)
3088 except UnicodeEncodeError:
3089 raise ValueError(
3090 "Variable labels must contain only characters that "
3091 f"can be encoded in {self._encoding}"
3092 )
3094 bio.write(_pad_bytes_new(encoded, vl_len + 1))
3095 else:
3096 bio.write(blank)
3097 bio.seek(0)
3098 self._file.write(self._tag(bio.read(), "variable_labels"))
3100 def _write_characteristics(self):
3101 self._update_map("characteristics")
3102 self._file.write(self._tag(b"", "characteristics"))
3104 def _write_data(self):
3105 self._update_map("data")
3106 data = self.data
3107 self._file.write(b"<data>")
3108 self._file.write(data.tobytes())
3109 self._file.write(b"</data>")
3111 def _write_strls(self):
3112 self._update_map("strls")
3113 strls = b""
3114 if self._strl_blob is not None:
3115 strls = self._strl_blob
3116 self._file.write(self._tag(strls, "strls"))
3118 def _write_expansion_fields(self):
3119 """No-op in dta 117+"""
3120 pass
3122 def _write_value_labels(self):
3123 self._update_map("value_labels")
3124 bio = BytesIO()
3125 for vl in self._value_labels:
3126 lab = vl.generate_value_label(self._byteorder)
3127 lab = self._tag(lab, "lbl")
3128 bio.write(lab)
3129 bio.seek(0)
3130 self._file.write(self._tag(bio.read(), "value_labels"))
3132 def _write_file_close_tag(self):
3133 self._update_map("stata_data_close")
3134 self._file.write(bytes("</stata_dta>", "utf-8"))
3135 self._update_map("end-of-file")
3137 def _update_strl_names(self):
3138 """Update column names for conversion to strl if they might have been
3139 changed to comply with Stata naming rules"""
3140 # Update convert_strl if names changed
3141 for orig, new in self._converted_names.items():
3142 if orig in self._convert_strl:
3143 idx = self._convert_strl.index(orig)
3144 self._convert_strl[idx] = new
3146 def _convert_strls(self, data):
3147 """Convert columns to StrLs if either very large or in the
3148 convert_strl variable"""
3149 convert_cols = [
3150 col
3151 for i, col in enumerate(data)
3152 if self.typlist[i] == 32768 or col in self._convert_strl
3153 ]
3155 if convert_cols:
3156 ssw = StataStrLWriter(data, convert_cols, version=self._dta_version)
3157 tab, new_data = ssw.generate_table()
3158 data = new_data
3159 self._strl_blob = ssw.generate_blob(tab)
3160 return data
3162 def _set_formats_and_types(self, dtypes):
3163 self.typlist = []
3164 self.fmtlist = []
3165 for col, dtype in dtypes.items():
3166 force_strl = col in self._convert_strl
3167 fmt = _dtype_to_default_stata_fmt(
3168 dtype,
3169 self.data[col],
3170 dta_version=self._dta_version,
3171 force_strl=force_strl,
3172 )
3173 self.fmtlist.append(fmt)
3174 self.typlist.append(
3175 _dtype_to_stata_type_117(dtype, self.data[col], force_strl)
3176 )
3179class StataWriterUTF8(StataWriter117):
3180 """
3181 Stata binary dta file writing in Stata 15 (118) and 16 (119) formats
3183 DTA 118 and 119 format files support unicode string data (both fixed
3184 width and strL). Unicode is also supported in value labels, variable
3185 labels and the dataset label. Format 119 is automatically used if the
3186 file contains more than 32,767 variables.
3188 .. versionadded:: 1.0.0
3190 Parameters
3191 ----------
3192 fname : path (string), buffer or path object
3193 string, path object (pathlib.Path or py._path.local.LocalPath) or
3194 object implementing a binary write() function. If using a buffer
3195 then the buffer will not be automatically closed after the file
3196 is written.
3197 data : DataFrame
3198 Input to save
3199 convert_dates : dict, default None
3200 Dictionary mapping columns containing datetime types to stata internal
3201 format to use when writing the dates. Options are 'tc', 'td', 'tm',
3202 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
3203 Datetime columns that do not have a conversion type specified will be
3204 converted to 'tc'. Raises NotImplementedError if a datetime column has
3205 timezone information
3206 write_index : bool, default True
3207 Write the index to Stata dataset.
3208 byteorder : str, default None
3209 Can be ">", "<", "little", or "big". default is `sys.byteorder`
3210 time_stamp : datetime, default None
3211 A datetime to use as file creation date. Default is the current time
3212 data_label : str, default None
3213 A label for the data set. Must be 80 characters or smaller.
3214 variable_labels : dict, default None
3215 Dictionary containing columns as keys and variable labels as values.
3216 Each label must be 80 characters or smaller.
3217 convert_strl : list, default None
3218 List of columns names to convert to Stata StrL format. Columns with
3219 more than 2045 characters are automatically written as StrL.
3220 Smaller columns can be converted by including the column name. Using
3221 StrLs can reduce output file size when strings are longer than 8
3222 characters, and either frequently repeated or sparse.
3223 version : int, default None
3224 The dta version to use. By default, uses the size of data to determine
3225 the version. 118 is used if data.shape[1] <= 32767, and 119 is used
3226 for storing larger DataFrames.
3228 Returns
3229 -------
3230 StataWriterUTF8
3231 The instance has a write_file method, which will write the file to the
3232 given `fname`.
3234 Raises
3235 ------
3236 NotImplementedError
3237 * If datetimes contain timezone information
3238 ValueError
3239 * Columns listed in convert_dates are neither datetime64[ns]
3240 nor datetime.datetime
3241 * Column dtype is not representable in Stata
3242 * Column listed in convert_dates is not in DataFrame
3243 * Categorical label contains more than 32,000 characters
3245 Examples
3246 --------
3247 Using Unicode data and column names
3249 >>> from pandas.io.stata import StataWriterUTF8
3250 >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ'])
3251 >>> writer = StataWriterUTF8('./data_file.dta', data)
3252 >>> writer.write_file()
3254 Or with long strings stored in strl format
3256 >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']],
3257 ... columns=['strls'])
3258 >>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data,
3259 ... convert_strl=['strls'])
3260 >>> writer.write_file()
3261 """
3263 _encoding = "utf-8"
3265 def __init__(
3266 self,
3267 fname: FilePathOrBuffer,
3268 data: DataFrame,
3269 convert_dates: Optional[Dict[Hashable, str]] = None,
3270 write_index: bool = True,
3271 byteorder: Optional[str] = None,
3272 time_stamp: Optional[datetime.datetime] = None,
3273 data_label: Optional[str] = None,
3274 variable_labels: Optional[Dict[Hashable, str]] = None,
3275 convert_strl: Optional[Sequence[Hashable]] = None,
3276 version: Optional[int] = None,
3277 ):
3278 if version is None:
3279 version = 118 if data.shape[1] <= 32767 else 119
3280 elif version not in (118, 119):
3281 raise ValueError("version must be either 118 or 119.")
3282 elif version == 118 and data.shape[1] > 32767:
3283 raise ValueError(
3284 "You must use version 119 for data sets containing more than"
3285 "32,767 variables"
3286 )
3288 super().__init__(
3289 fname,
3290 data,
3291 convert_dates=convert_dates,
3292 write_index=write_index,
3293 byteorder=byteorder,
3294 time_stamp=time_stamp,
3295 data_label=data_label,
3296 variable_labels=variable_labels,
3297 convert_strl=convert_strl,
3298 )
3299 # Override version set in StataWriter117 init
3300 self._dta_version = version
3302 def _validate_variable_name(self, name: str) -> str:
3303 """
3304 Validate variable names for Stata export.
3306 Parameters
3307 ----------
3308 name : str
3309 Variable name
3311 Returns
3312 -------
3313 str
3314 The validated name with invalid characters replaced with
3315 underscores.
3317 Notes
3318 -----
3319 Stata 118+ supports most unicode characters. The only limitation is in
3320 the ascii range where the characters supported are a-z, A-Z, 0-9 and _.
3321 """
3322 # High code points appear to be acceptable
3323 for c in name:
3324 if (
3325 ord(c) < 128
3326 and (c < "A" or c > "Z")
3327 and (c < "a" or c > "z")
3328 and (c < "0" or c > "9")
3329 and c != "_"
3330 ) or 128 <= ord(c) < 256:
3331 name = name.replace(c, "_")
3333 return name
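# Illustrative examples of the rule above (not from the source):
#
#     "my var" -> "my_var"  (invalid ASCII characters are replaced)
#     "é"      -> "_"       (the Latin-1 range 128-255 is also replaced)
#     "β"      -> "β"       (code points >= 256 are kept)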