
1""" 

2Module contains tools for processing Stata files into DataFrames 

3 

4The StataReader below was originally written by Joe Presbrey as part of PyDTA. 

5It has been extended and improved by Skipper Seabold from the Statsmodels 

6project who also developed the StataWriter and was finally added to pandas in 

7a once again improved version. 

8 

9You can find more information on http://presbrey.mit.edu/PyDTA and 

10http://www.statsmodels.org/devel/ 

11""" 

12from collections import abc 

13import datetime 

14from io import BytesIO 

15import os 

16import struct 

17import sys 

18from typing import Any, Dict, Hashable, Optional, Sequence 

19import warnings 

20 

21from dateutil.relativedelta import relativedelta 

22import numpy as np 

23 

24from pandas._libs.lib import infer_dtype 

25from pandas._libs.writers import max_len_string_array 

26from pandas._typing import FilePathOrBuffer 

27from pandas.util._decorators import Appender 

28 

29from pandas.core.dtypes.common import ( 

30 ensure_object, 

31 is_categorical_dtype, 

32 is_datetime64_dtype, 

33) 

34 

35from pandas import ( 

36 Categorical, 

37 DatetimeIndex, 

38 NaT, 

39 Timestamp, 

40 concat, 

41 isna, 

42 to_datetime, 

43 to_timedelta, 

44) 

45from pandas.core.frame import DataFrame 

46from pandas.core.series import Series 

47 

48from pandas.io.common import get_filepath_or_buffer, stringify_path 

49 

50_version_error = ( 

51 "Version of given Stata file is {version}. pandas supports importing " 

52 "versions 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), " 

53 "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16)," 

54 "and 119 (Stata 15/16, over 32,767 variables)." 

55) 

56 

57_statafile_processing_params1 = """\ 

58convert_dates : bool, default True 

59 Convert date variables to DataFrame time values. 

60convert_categoricals : bool, default True 

61 Read value labels and convert columns to Categorical/Factor variables.""" 

62 

63_statafile_processing_params2 = """\ 

64index_col : str, optional 

65 Column to set as index. 

66convert_missing : bool, default False 

67 Flag indicating whether to convert missing values to their Stata 

68 representations. If False, missing values are replaced with nan. 

69 If True, columns containing missing values are returned with 

70 object data types and missing values are represented by 

71 StataMissingValue objects. 

72preserve_dtypes : bool, default True 

73 Preserve Stata datatypes. If False, numeric data are upcast to pandas 

74 default types for foreign data (float64 or int64). 

75columns : list or None 

76 Columns to retain. Columns will be returned in the given order. None 

77 returns all columns. 

78order_categoricals : bool, default True 

79 Flag indicating whether converted categorical data are ordered.""" 

80 

81_chunksize_params = """\ 

82chunksize : int, default None 

83 Return StataReader object for iterations, returns chunks with 

84 given number of lines.""" 

85 

86_iterator_params = """\ 

87iterator : bool, default False 

88 Return StataReader object.""" 

89 

90_read_stata_doc = f""" 

91Read Stata file into DataFrame. 

92 

93Parameters 

94---------- 

95filepath_or_buffer : str, path object or file-like object 

96 Any valid string path is acceptable. The string could be a URL. Valid 

97 URL schemes include http, ftp, s3, and file. For file URLs, a host is 

98 expected. A local file could be: ``file://localhost/path/to/table.dta``. 

99 

100 If you want to pass in a path object, pandas accepts any ``os.PathLike``. 

101 

102 By file-like object, we refer to objects with a ``read()`` method, 

103 such as a file handler (e.g. via builtin ``open`` function) 

104 or ``StringIO``. 

105{_statafile_processing_params1} 

106{_statafile_processing_params2} 

107{_chunksize_params} 

108{_iterator_params} 

109 

110Returns 

111------- 

112DataFrame or StataReader 

113 

114See Also 

115-------- 

116io.stata.StataReader : Low-level reader for Stata data files. 

117DataFrame.to_stata: Export Stata data files. 

118 

119Examples 

120-------- 

121Read a Stata dta file: 

122 

123>>> df = pd.read_stata('filename.dta') 

124 

125Read a Stata dta file in 10,000 line chunks: 

126 

127>>> itr = pd.read_stata('filename.dta', chunksize=10000) 

128>>> for chunk in itr: 

129... do_something(chunk) 

130""" 

131 

132_read_method_doc = f"""\ 

133Reads observations from Stata file, converting them into a dataframe 

134 

135Parameters 

136---------- 

137nrows : int 

138 Number of lines to read from data file, if None read whole file. 

139{_statafile_processing_params1} 

140{_statafile_processing_params2} 

141 

142Returns 

143------- 

144DataFrame 

145""" 

146 

147_stata_reader_doc = f"""\ 

148Class for reading Stata dta files. 

149 

150Parameters 

151---------- 

152path_or_buf : path (string), buffer or path object 

153 string, path object (pathlib.Path or py._path.local.LocalPath) or object 

154 implementing a binary read() functions. 

155 

156 .. versionadded:: 0.23.0 support for pathlib, py.path. 

157{_statafile_processing_params1} 

158{_statafile_processing_params2} 

159{_chunksize_params} 

160""" 

161 

162 

163@Appender(_read_stata_doc) 

164def read_stata( 

165 filepath_or_buffer, 

166 convert_dates=True, 

167 convert_categoricals=True, 

168 index_col=None, 

169 convert_missing=False, 

170 preserve_dtypes=True, 

171 columns=None, 

172 order_categoricals=True, 

173 chunksize=None, 

174 iterator=False, 

175): 

176 

177 reader = StataReader( 

178 filepath_or_buffer, 

179 convert_dates=convert_dates, 

180 convert_categoricals=convert_categoricals, 

181 index_col=index_col, 

182 convert_missing=convert_missing, 

183 preserve_dtypes=preserve_dtypes, 

184 columns=columns, 

185 order_categoricals=order_categoricals, 

186 chunksize=chunksize, 

187 ) 

188 

189 if iterator or chunksize: 

190 data = reader 

191 else: 

192 try: 

193 data = reader.read() 

194 finally: 

195 reader.close() 

196 return data 
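

# A hedged usage sketch (editor's addition, not part of the original module):
# column selection and index assignment at read time. "filename.dta", "id"
# and "wage" are hypothetical names.
#
#   >>> import pandas as pd
#   >>> df = pd.read_stata("filename.dta", columns=["id", "wage"], index_col="id")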



_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]


stata_epoch = datetime.datetime(1960, 1, 1)


def _stata_elapsed_date_to_datetime_vec(dates, fmt):
    """
    Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime

    Parameters
    ----------
    dates : Series
        The Stata Internal Format date to convert to datetime according to fmt
    fmt : str
        The format to convert to. Can be tc, td, tw, tm, tq, th, ty

    Returns
    -------
    converted : Series
        The converted dates

    Examples
    --------
    >>> dates = pd.Series([52])
    >>> _stata_elapsed_date_to_datetime_vec(dates, "%tw")
    0   1961-01-01
    dtype: datetime64[ns]

    Notes
    -----
    datetime/c - tc
        milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day
    datetime/C - tC - NOT IMPLEMENTED
        milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds
    date - td
        days since 01jan1960 (01jan1960 = 0)
    weekly date - tw
        weeks since 1960w1
        This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.
        The datetime value is the start of the week in terms of days in the
        year, not ISO calendar weeks.
    monthly date - tm
        months since 1960m1
    quarterly date - tq
        quarters since 1960q1
    half-yearly date - th
        half-years since 1960h1
    yearly date - ty
        years since 0000

    If you don't have pandas with datetime support, then you can't do
    milliseconds accurately.
    """
    MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year
    MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days
    MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days
    MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
    MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000

    def convert_year_month_safe(year, month):
        """
        Convert year and month to datetimes, using pandas vectorized versions
        when the date range falls within the range supported by pandas.
        Otherwise it falls back to a slower but more robust method
        using datetime.
        """
        if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
            return to_datetime(100 * year + month, format="%Y%m")
        else:
            index = getattr(year, "index", None)
            return Series(
                [datetime.datetime(y, m, 1) for y, m in zip(year, month)], index=index
            )

    def convert_year_days_safe(year, days):
        """
        Converts year (e.g. 1999) and days since the start of the year to a
        datetime or datetime64 Series
        """
        if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
            return to_datetime(year, format="%Y") + to_timedelta(days, unit="d")
        else:
            index = getattr(year, "index", None)
            value = [
                datetime.datetime(y, 1, 1) + relativedelta(days=int(d))
                for y, d in zip(year, days)
            ]
            return Series(value, index=index)

    def convert_delta_safe(base, deltas, unit):
        """
        Convert base dates and deltas to datetimes, using pandas vectorized
        versions if the deltas satisfy restrictions required to be expressed
        as dates in pandas.
        """
        index = getattr(deltas, "index", None)
        if unit == "d":
            if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
                values = [base + relativedelta(days=int(d)) for d in deltas]
                return Series(values, index=index)
        elif unit == "ms":
            if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
                values = [
                    base + relativedelta(microseconds=(int(d) * 1000)) for d in deltas
                ]
                return Series(values, index=index)
        else:
            raise ValueError("format not understood")
        base = to_datetime(base)
        deltas = to_timedelta(deltas, unit=unit)
        return base + deltas

    # TODO: If/when pandas supports more than datetime64[ns], this should be
    # improved to use correct range, e.g. datetime[Y] for yearly
    bad_locs = np.isnan(dates)
    has_bad_values = False
    if bad_locs.any():
        has_bad_values = True
        data_col = Series(dates)
        # data_col shares its buffer with dates, so this replaces the NaNs in
        # dates with a valid placeholder; NaT is restored after conversion.
        data_col[bad_locs] = 1.0
    dates = dates.astype(np.int64)

    if fmt.startswith(("%tc", "tc")):  # Delta ms relative to base
        base = stata_epoch
        ms = dates
        conv_dates = convert_delta_safe(base, ms, "ms")
    elif fmt.startswith(("%tC", "tC")):

        warnings.warn("Encountered %tC format. Leaving in Stata Internal Format.")
        conv_dates = Series(dates, dtype=object)
        if has_bad_values:
            conv_dates[bad_locs] = NaT
        return conv_dates
    # Delta days relative to base
    elif fmt.startswith(("%td", "td", "%d", "d")):
        base = stata_epoch
        days = dates
        conv_dates = convert_delta_safe(base, days, "d")
    # does not count leap days - 7 days is a week.
    # 52nd week may have more than 7 days
    elif fmt.startswith(("%tw", "tw")):
        year = stata_epoch.year + dates // 52
        days = (dates % 52) * 7
        conv_dates = convert_year_days_safe(year, days)
    elif fmt.startswith(("%tm", "tm")):  # Delta months relative to base
        year = stata_epoch.year + dates // 12
        month = (dates % 12) + 1
        conv_dates = convert_year_month_safe(year, month)
    elif fmt.startswith(("%tq", "tq")):  # Delta quarters relative to base
        year = stata_epoch.year + dates // 4
        month = (dates % 4) * 3 + 1
        conv_dates = convert_year_month_safe(year, month)
    elif fmt.startswith(("%th", "th")):  # Delta half-years relative to base
        year = stata_epoch.year + dates // 2
        month = (dates % 2) * 6 + 1
        conv_dates = convert_year_month_safe(year, month)
    elif fmt.startswith(("%ty", "ty")):  # Years -- not delta
        year = dates
        month = np.ones_like(dates)
        conv_dates = convert_year_month_safe(year, month)
    else:
        raise ValueError(f"Date fmt {fmt} not understood")

    if has_bad_values:  # Restore NaT for bad values
        conv_dates[bad_locs] = NaT

    return conv_dates
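

# Worked example (editor's sketch, not part of the original module): a
# quarterly SIF value of 9 means nine quarters after 1960q1, i.e. 1962q2,
# which converts to the first day of that quarter:
#
#   >>> import pandas as pd
#   >>> _stata_elapsed_date_to_datetime_vec(pd.Series([9]), "%tq")
#   0   1962-04-01
#   dtype: datetime64[ns]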



def _datetime_to_stata_elapsed_vec(dates, fmt):
    """
    Convert from datetime to SIF. http://www.stata.com/help.cgi?datetime

    Parameters
    ----------
    dates : Series
        Series or array containing datetime.datetime or datetime64[ns] to
        convert to the Stata Internal Format given by fmt
    fmt : str
        The format to convert to. Can be tc, td, tw, tm, tq, th, ty
    """
    index = dates.index
    NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000
    US_PER_DAY = NS_PER_DAY / 1000

    def parse_dates_safe(dates, delta=False, year=False, days=False):
        d = {}
        if is_datetime64_dtype(dates.values):
            if delta:
                delta = dates - stata_epoch
                d["delta"] = delta.values.astype(np.int64) // 1000  # microseconds
            if days or year:
                dates = DatetimeIndex(dates)
                d["year"], d["month"] = dates.year, dates.month
            if days:
                days = dates.astype(np.int64) - to_datetime(
                    d["year"], format="%Y"
                ).astype(np.int64)
                d["days"] = days // NS_PER_DAY

        elif infer_dtype(dates, skipna=False) == "datetime":
            if delta:
                delta = dates.values - stata_epoch
                f = lambda x: US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds
                v = np.vectorize(f)
                d["delta"] = v(delta)
            if year:
                year_month = dates.apply(lambda x: 100 * x.year + x.month)
                d["year"] = year_month.values // 100
                d["month"] = year_month.values - d["year"] * 100
            if days:
                f = lambda x: (x - datetime.datetime(x.year, 1, 1)).days
                v = np.vectorize(f)
                d["days"] = v(dates)
        else:
            raise ValueError(
                "Columns containing dates must contain either "
                "datetime64, datetime.datetime or null values."
            )

        return DataFrame(d, index=index)

    bad_loc = isna(dates)
    index = dates.index
    if bad_loc.any():
        dates = Series(dates)
        if is_datetime64_dtype(dates):
            dates[bad_loc] = to_datetime(stata_epoch)
        else:
            dates[bad_loc] = stata_epoch

    if fmt in ["%tc", "tc"]:
        d = parse_dates_safe(dates, delta=True)
        conv_dates = d.delta / 1000
    elif fmt in ["%tC", "tC"]:
        warnings.warn("Stata Internal Format tC not supported.")
        conv_dates = dates
    elif fmt in ["%td", "td"]:
        d = parse_dates_safe(dates, delta=True)
        conv_dates = d.delta // US_PER_DAY
    elif fmt in ["%tw", "tw"]:
        d = parse_dates_safe(dates, year=True, days=True)
        conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7
    elif fmt in ["%tm", "tm"]:
        d = parse_dates_safe(dates, year=True)
        conv_dates = 12 * (d.year - stata_epoch.year) + d.month - 1
    elif fmt in ["%tq", "tq"]:
        d = parse_dates_safe(dates, year=True)
        conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3
    elif fmt in ["%th", "th"]:
        d = parse_dates_safe(dates, year=True)
        conv_dates = 2 * (d.year - stata_epoch.year) + (d.month > 6).astype(int)
    elif fmt in ["%ty", "ty"]:
        d = parse_dates_safe(dates, year=True)
        conv_dates = d.year
    else:
        raise ValueError(f"Format {fmt} is not a known Stata date format")

    conv_dates = Series(conv_dates, dtype=np.float64)
    missing_value = struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]
    conv_dates[bad_loc] = missing_value

    return Series(conv_dates, index=index)
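

# Round-trip sketch (editor's addition): the writer-side conversion inverts
# the reader-side one for daily dates. 1960-01-02 is one day after the Stata
# epoch, so its %td value is 1.0 (SIF values are stored as float64):
#
#   >>> import pandas as pd
#   >>> _datetime_to_stata_elapsed_vec(
#   ...     pd.Series([pd.Timestamp("1960-01-02")]), "%td"
#   ... )
#   0    1.0
#   dtype: float64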



excessive_string_length_error = """
Fixed width strings in Stata .dta files are limited to 244 (or fewer)
characters. Column '{0}' does not satisfy this restriction. Use the
'version=117' parameter to write the newer (Stata 13 and later) format.
"""


class PossiblePrecisionLoss(Warning):
    pass


precision_loss_doc = """
Column converted from {0} to {1}, and some data are outside of the lossless
conversion range. This may result in a loss of precision in the saved data.
"""


class ValueLabelTypeMismatch(Warning):
    pass


value_label_mismatch_doc = """
Stata value labels (pandas categories) must be strings. Column {0} contains
non-string labels which will be converted to strings. Please check that the
Stata data file created has not lost information due to duplicate labels.
"""


class InvalidColumnName(Warning):
    pass


invalid_name_doc = """
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    {0}

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)
"""


def _cast_to_stata_types(data):
    """Checks the dtypes of the columns of a pandas DataFrame for
    compatibility with the data types and ranges supported by Stata, and
    converts if necessary.

    Parameters
    ----------
    data : DataFrame
        The DataFrame to check and convert

    Notes
    -----
    Numeric columns in Stata must be one of int8, int16, int32, float32 or
    float64, with some additional value restrictions. int8 and int16 columns
    are checked for violations of the value restrictions and upcast if needed.
    int64 data is not usable in Stata, and so it is downcast to int32 whenever
    the values are in the int32 range, and cast to float64 when larger than
    this range. If the int64 values are outside of the range of those
    perfectly representable as float64 values, a warning is raised.

    bool columns are cast to int8. uint columns are converted to int of the
    same size if there is no loss in precision, otherwise are upcast to a
    larger type. uint64 is currently not supported since it is converted to
    object in a DataFrame.
    """
    ws = ""
    # original, if small, if large
    conversion_data = (
        (np.bool_, np.int8, np.int8),
        (np.uint8, np.int8, np.int16),
        (np.uint16, np.int16, np.int32),
        (np.uint32, np.int32, np.int64),
    )

    float32_max = struct.unpack("<f", b"\xff\xff\xff\x7e")[0]
    float64_max = struct.unpack("<d", b"\xff\xff\xff\xff\xff\xff\xdf\x7f")[0]

    for col in data:
        dtype = data[col].dtype
        # Cast from unsupported types to supported types
        for c_data in conversion_data:
            if dtype == c_data[0]:
                if data[col].max() <= np.iinfo(c_data[1]).max:
                    dtype = c_data[1]
                else:
                    dtype = c_data[2]
                if c_data[2] == np.float64:  # Warn if necessary
                    if data[col].max() >= 2 ** 53:
                        ws = precision_loss_doc.format("uint64", "float64")

        data[col] = data[col].astype(dtype)

        # Check values and upcast if necessary
        if dtype == np.int8:
            if data[col].max() > 100 or data[col].min() < -127:
                data[col] = data[col].astype(np.int16)
        elif dtype == np.int16:
            if data[col].max() > 32740 or data[col].min() < -32767:
                data[col] = data[col].astype(np.int32)
        elif dtype == np.int64:
            if data[col].max() <= 2147483620 and data[col].min() >= -2147483647:
                data[col] = data[col].astype(np.int32)
            else:
                data[col] = data[col].astype(np.float64)
                if data[col].max() >= 2 ** 53 or data[col].min() <= -(2 ** 53):
                    ws = precision_loss_doc.format("int64", "float64")
        elif dtype in (np.float32, np.float64):
            value = data[col].max()
            if np.isinf(value):
                raise ValueError(
                    f"Column {col} has a maximum value of infinity which is outside "
                    "the range supported by Stata."
                )
            if dtype == np.float32 and value > float32_max:
                data[col] = data[col].astype(np.float64)
            elif dtype == np.float64:
                if value > float64_max:
                    raise ValueError(
                        f"Column {col} has a maximum value ({value}) outside the range "
                        f"supported by Stata ({float64_max})"
                    )

    if ws:
        warnings.warn(ws, PossiblePrecisionLoss)

    return data
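

# A minimal sketch (editor's addition) of the casting rules described above:
# int64 values inside the int32 range are downcast, and bool becomes int8.
#
#   >>> import numpy as np
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"a": np.array([1, 2], dtype=np.int64),
#   ...                    "b": np.array([True, False])})
#   >>> _cast_to_stata_types(df).dtypes
#   a    int32
#   b     int8
#   dtype: object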



class StataValueLabel:
    """
    Parse a categorical column and prepare formatted output

    Parameters
    ----------
    catarray : Categorical
        Categorical Series to encode
    encoding : {"latin-1", "utf-8"}
        Encoding to use for value labels.
    """

    def __init__(self, catarray, encoding="latin-1"):

        if encoding not in ("latin-1", "utf-8"):
            raise ValueError("Only latin-1 and utf-8 are supported.")
        self.labname = catarray.name
        self._encoding = encoding
        categories = catarray.cat.categories
        self.value_labels = list(zip(np.arange(len(categories)), categories))
        self.value_labels.sort(key=lambda x: x[0])
        self.text_len = np.int32(0)
        self.off = []
        self.val = []
        self.txt = []
        self.n = 0

        # Compute lengths and setup lists of offsets and labels
        for vl in self.value_labels:
            category = vl[1]
            if not isinstance(category, str):
                category = str(category)
                warnings.warn(
                    value_label_mismatch_doc.format(catarray.name),
                    ValueLabelTypeMismatch,
                )
            category = category.encode(encoding)
            self.off.append(self.text_len)
            self.text_len += len(category) + 1  # +1 for the padding
            self.val.append(vl[0])
            self.txt.append(category)
            self.n += 1

        if self.text_len > 32000:
            raise ValueError(
                "Stata value labels for a single variable must "
                "have a combined length less than 32,000 "
                "characters."
            )

        # Ensure int32
        self.off = np.array(self.off, dtype=np.int32)
        self.val = np.array(self.val, dtype=np.int32)

        # Total length
        self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len

    def _encode(self, s):
        """
        Python 3 compatibility shim
        """
        return s.encode(self._encoding)

    def generate_value_label(self, byteorder):
        """
        Generate the binary representation of the value labels.

        Parameters
        ----------
        byteorder : str
            Byte order of the output

        Returns
        -------
        value_label : bytes
            Bytes containing the formatted value label
        """
        encoding = self._encoding
        bio = BytesIO()
        null_byte = b"\x00"

        # len
        bio.write(struct.pack(byteorder + "i", self.len))

        # labname
        labname = self.labname[:32].encode(encoding)
        lab_len = 32 if encoding not in ("utf-8", "utf8") else 128
        labname = _pad_bytes(labname, lab_len + 1)
        bio.write(labname)

        # padding - 3 bytes
        for i in range(3):
            bio.write(struct.pack("c", null_byte))

        # value_label_table
        # n - int32
        bio.write(struct.pack(byteorder + "i", self.n))

        # textlen - int32
        bio.write(struct.pack(byteorder + "i", self.text_len))

        # off - int32 array (n elements)
        for offset in self.off:
            bio.write(struct.pack(byteorder + "i", offset))

        # val - int32 array (n elements)
        for value in self.val:
            bio.write(struct.pack(byteorder + "i", value))

        # txt - Text labels, null terminated
        for text in self.txt:
            bio.write(text + null_byte)

        bio.seek(0)
        return bio.read()
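

# Sketch (editor's addition): building value labels from a named categorical
# Series. The codes 0..n-1 are paired with the (sorted) category strings.
#
#   >>> import pandas as pd
#   >>> s = pd.Series(pd.Categorical(["low", "high", "low"]), name="grade")
#   >>> StataValueLabel(s).value_labels
#   [(0, 'high'), (1, 'low')]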



class StataMissingValue:
    """
    An observation's missing value.

    Parameters
    ----------
    value : int8, int16, int32, float32 or float64
        The Stata missing value code

    Attributes
    ----------
    string : string
        String representation of the Stata missing value
    value : int8, int16, int32, float32 or float64
        The original encoded missing value

    Notes
    -----
    More information: <http://www.stata.com/help.cgi?missing>

    Integer missing values map the codes '.', '.a', ..., '.z' to the ranges
    101 ... 127 (for int8), 32741 ... 32767 (for int16) and 2147483621 ...
    2147483647 (for int32). Missing values for floating point data types are
    more complex but the pattern is simple to discern from the following table.

    np.float32 missing values (float in Stata)
    0000007f    .
    0008007f    .a
    0010007f    .b
    ...
    00c0007f    .x
    00c8007f    .y
    00d0007f    .z

    np.float64 missing values (double in Stata)
    000000000000e07f    .
    000000000001e07f    .a
    000000000002e07f    .b
    ...
    000000000018e07f    .x
    000000000019e07f    .y
    00000000001ae07f    .z
    """

    # Construct a dictionary of missing values
    MISSING_VALUES = {}
    bases = (101, 32741, 2147483621)
    for b in bases:
        # Conversion to int to avoid hash issues on 32 bit platforms #8968
        MISSING_VALUES[b] = "."
        for i in range(1, 27):
            MISSING_VALUES[i + b] = "." + chr(96 + i)

    float32_base = b"\x00\x00\x00\x7f"
    increment = struct.unpack("<i", b"\x00\x08\x00\x00")[0]
    for i in range(27):
        value = struct.unpack("<f", float32_base)[0]
        MISSING_VALUES[value] = "."
        if i > 0:
            MISSING_VALUES[value] += chr(96 + i)
        int_value = struct.unpack("<i", struct.pack("<f", value))[0] + increment
        float32_base = struct.pack("<i", int_value)

    float64_base = b"\x00\x00\x00\x00\x00\x00\xe0\x7f"
    increment = struct.unpack("q", b"\x00\x00\x00\x00\x00\x01\x00\x00")[0]
    for i in range(27):
        value = struct.unpack("<d", float64_base)[0]
        MISSING_VALUES[value] = "."
        if i > 0:
            MISSING_VALUES[value] += chr(96 + i)
        int_value = struct.unpack("q", struct.pack("<d", value))[0] + increment
        float64_base = struct.pack("q", int_value)

    BASE_MISSING_VALUES = {
        "int8": 101,
        "int16": 32741,
        "int32": 2147483621,
        "float32": struct.unpack("<f", float32_base)[0],
        "float64": struct.unpack("<d", float64_base)[0],
    }

    def __init__(self, value):
        self._value = value
        # Conversion to int to avoid hash issues on 32 bit platforms #8968
        value = int(value) if value < 2147483648 else float(value)
        self._str = self.MISSING_VALUES[value]

    string = property(
        lambda self: self._str,
        doc="The Stata representation of the missing value: '.', '.a'..'.z'",
    )
    value = property(
        lambda self: self._value, doc="The binary representation of the missing value."
    )

    def __str__(self) -> str:
        return self.string

    def __repr__(self) -> str:
        return f"{type(self)}({self})"

    def __eq__(self, other: Any) -> bool:
        return (
            isinstance(other, type(self))
            and self.string == other.string
            and self.value == other.value
        )

    @classmethod
    def get_base_missing_value(cls, dtype):
        if dtype == np.int8:
            value = cls.BASE_MISSING_VALUES["int8"]
        elif dtype == np.int16:
            value = cls.BASE_MISSING_VALUES["int16"]
        elif dtype == np.int32:
            value = cls.BASE_MISSING_VALUES["int32"]
        elif dtype == np.float32:
            value = cls.BASE_MISSING_VALUES["float32"]
        elif dtype == np.float64:
            value = cls.BASE_MISSING_VALUES["float64"]
        else:
            raise ValueError("Unsupported dtype")
        return value
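

# Sketch (editor's addition): decoding the integer missing-value codes listed
# in the class docstring. 101 is the int8 '.', 102 is '.a'.
#
#   >>> import numpy as np
#   >>> StataMissingValue(101).string
#   '.'
#   >>> StataMissingValue(102).string
#   '.a'
#   >>> StataMissingValue.get_base_missing_value(np.int16)
#   32741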



class StataParser:
    def __init__(self):

        # type          code.
        # --------------------
        # str1        1 = 0x01
        # str2        2 = 0x02
        # ...
        # str244    244 = 0xf4
        # byte      251 = 0xfb  (sic)
        # int       252 = 0xfc
        # long      253 = 0xfd
        # float     254 = 0xfe
        # double    255 = 0xff
        # --------------------
        # NOTE: the byte type seems to be reserved for categorical variables
        # with a label, but the underlying variable is -127 to 100
        # we're going to drop the label and cast to int
        self.DTYPE_MAP = dict(
            list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)]))
            + [
                (251, np.int8),
                (252, np.int16),
                (253, np.int32),
                (254, np.float32),
                (255, np.float64),
            ]
        )
        self.DTYPE_MAP_XML = dict(
            [
                (32768, np.uint8),  # Keys to GSO
                (65526, np.float64),
                (65527, np.float32),
                (65528, np.int32),
                (65529, np.int16),
                (65530, np.int8),
            ]
        )
        self.TYPE_MAP = list(range(251)) + list("bhlfd")
        self.TYPE_MAP_XML = dict(
            [
                # Not really a Q, unclear how to handle byteswap
                (32768, "Q"),
                (65526, "d"),
                (65527, "f"),
                (65528, "l"),
                (65529, "h"),
                (65530, "b"),
            ]
        )
        # NOTE: technically, some of these are wrong. there are more numbers
        # that can be represented. it's the 27 ABOVE and BELOW the max listed
        # numeric data type in [U] 12.2.2 of the 11.2 manual
        float32_min = b"\xff\xff\xff\xfe"
        float32_max = b"\xff\xff\xff\x7e"
        float64_min = b"\xff\xff\xff\xff\xff\xff\xef\xff"
        float64_max = b"\xff\xff\xff\xff\xff\xff\xdf\x7f"
        self.VALID_RANGE = {
            "b": (-127, 100),
            "h": (-32767, 32740),
            "l": (-2147483647, 2147483620),
            "f": (
                np.float32(struct.unpack("<f", float32_min)[0]),
                np.float32(struct.unpack("<f", float32_max)[0]),
            ),
            "d": (
                np.float64(struct.unpack("<d", float64_min)[0]),
                np.float64(struct.unpack("<d", float64_max)[0]),
            ),
        }

        self.OLD_TYPE_MAPPING = {
            98: 251,  # byte
            105: 252,  # int
            108: 253,  # long
            102: 254  # float
            # don't know old code for double
        }

        # These missing values are the generic '.' in Stata, and are used
        # to replace nans
        self.MISSING_VALUES = {
            "b": 101,
            "h": 32741,
            "l": 2147483621,
            "f": np.float32(struct.unpack("<f", b"\x00\x00\x00\x7f")[0]),
            "d": np.float64(
                struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]
            ),
        }
        self.NUMPY_TYPE_MAP = {
            "b": "i1",
            "h": "i2",
            "l": "i4",
            "f": "f4",
            "d": "f8",
            "Q": "u8",
        }

        # Reserved words cannot be used as variable names
        self.RESERVED_WORDS = (
            "aggregate",
            "array",
            "boolean",
            "break",
            "byte",
            "case",
            "catch",
            "class",
            "colvector",
            "complex",
            "const",
            "continue",
            "default",
            "delegate",
            "delete",
            "do",
            "double",
            "else",
            "eltypedef",
            "end",
            "enum",
            "explicit",
            "export",
            "external",
            "float",
            "for",
            "friend",
            "function",
            "global",
            "goto",
            "if",
            "inline",
            "int",
            "local",
            "long",
            "NULL",
            "pragma",
            "protected",
            "quad",
            "rowvector",
            "short",
            "typedef",
            "typename",
            "virtual",
            "_all",
            "_N",
            "_skip",
            "_b",
            "_pi",
            "str#",
            "in",
            "_pred",
            "strL",
            "_coef",
            "_rc",
            "using",
            "_cons",
            "_se",
            "with",
            "_n",
        )
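

# Sketch (editor's addition): the parser tables can be inspected directly,
# e.g. the valid (non-missing) range for a Stata int ('h' in the type map):
#
#   >>> StataParser().VALID_RANGE["h"]
#   (-32767, 32740)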



class StataReader(StataParser, abc.Iterator):
    __doc__ = _stata_reader_doc

    def __init__(
        self,
        path_or_buf,
        convert_dates=True,
        convert_categoricals=True,
        index_col=None,
        convert_missing=False,
        preserve_dtypes=True,
        columns=None,
        order_categoricals=True,
        chunksize=None,
    ):
        super().__init__()
        self.col_sizes = ()

        # Arguments to the reader (can be temporarily overridden in
        # calls to read).
        self._convert_dates = convert_dates
        self._convert_categoricals = convert_categoricals
        self._index_col = index_col
        self._convert_missing = convert_missing
        self._preserve_dtypes = preserve_dtypes
        self._columns = columns
        self._order_categoricals = order_categoricals
        self._encoding = None
        self._chunksize = chunksize

        # State variables for the file
        self._has_string_data = False
        self._missing_values = False
        self._can_read_value_labels = False
        self._column_selector_set = False
        self._value_labels_read = False
        self._data_read = False
        self._dtype = None
        self._lines_read = 0

        self._native_byteorder = _set_endianness(sys.byteorder)
        path_or_buf = stringify_path(path_or_buf)
        if isinstance(path_or_buf, str):
            path_or_buf, encoding, _, should_close = get_filepath_or_buffer(path_or_buf)

        if isinstance(path_or_buf, (str, bytes)):
            self.path_or_buf = open(path_or_buf, "rb")
        else:
            # Copy to BytesIO, and ensure no encoding
            contents = path_or_buf.read()
            self.path_or_buf = BytesIO(contents)

        self._read_header()
        self._setup_dtype()

    def __enter__(self):
        """ enter context manager """
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """ exit context manager """
        self.close()

    def close(self):
        """ close the handle if it's open """
        try:
            self.path_or_buf.close()
        except IOError:
            pass

    def _set_encoding(self):
        """
        Set string encoding which depends on file version
        """
        if self.format_version < 118:
            self._encoding = "latin-1"
        else:
            self._encoding = "utf-8"

    def _read_header(self):
        first_char = self.path_or_buf.read(1)
        if struct.unpack("c", first_char)[0] == b"<":
            self._read_new_header(first_char)
        else:
            self._read_old_header(first_char)

        self.has_string_data = len([x for x in self.typlist if type(x) is int]) > 0

        # calculate size of a data record
        self.col_sizes = [self._calcsize(typ) for typ in self.typlist]

    def _read_new_header(self, first_char):
        # The first part of the header is common to 117 - 119.
        self.path_or_buf.read(27)  # stata_dta><header><release>
        self.format_version = int(self.path_or_buf.read(3))
        if self.format_version not in [117, 118, 119]:
            raise ValueError(_version_error.format(version=self.format_version))
        self._set_encoding()
        self.path_or_buf.read(21)  # </release><byteorder>
        self.byteorder = ">" if self.path_or_buf.read(3) == b"MSF" else "<"
        self.path_or_buf.read(15)  # </byteorder><K>
        nvar_type = "H" if self.format_version <= 118 else "I"
        nvar_size = 2 if self.format_version <= 118 else 4
        self.nvar = struct.unpack(
            self.byteorder + nvar_type, self.path_or_buf.read(nvar_size)
        )[0]
        self.path_or_buf.read(7)  # </K><N>

        self.nobs = self._get_nobs()
        self.path_or_buf.read(11)  # </N><label>
        self._data_label = self._get_data_label()
        self.path_or_buf.read(19)  # </label><timestamp>
        self.time_stamp = self._get_time_stamp()
        self.path_or_buf.read(26)  # </timestamp></header><map>
        self.path_or_buf.read(8)  # 0x0000000000000000
        self.path_or_buf.read(8)  # position of <map>

        self._seek_vartypes = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 16
        )
        self._seek_varnames = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 10
        )
        self._seek_sortlist = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 10
        )
        self._seek_formats = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 9
        )
        self._seek_value_label_names = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 19
        )

        # Requires version-specific treatment
        self._seek_variable_labels = self._get_seek_variable_labels()

        self.path_or_buf.read(8)  # <characteristics>
        self.data_location = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 6
        )
        self.seek_strls = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 7
        )
        self.seek_value_labels = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 14
        )

        self.typlist, self.dtyplist = self._get_dtypes(self._seek_vartypes)

        self.path_or_buf.seek(self._seek_varnames)
        self.varlist = self._get_varlist()

        self.path_or_buf.seek(self._seek_sortlist)
        self.srtlist = struct.unpack(
            self.byteorder + ("h" * (self.nvar + 1)),
            self.path_or_buf.read(2 * (self.nvar + 1)),
        )[:-1]

        self.path_or_buf.seek(self._seek_formats)
        self.fmtlist = self._get_fmtlist()

        self.path_or_buf.seek(self._seek_value_label_names)
        self.lbllist = self._get_lbllist()

        self.path_or_buf.seek(self._seek_variable_labels)
        self._variable_labels = self._get_variable_labels()

    # Get data type information, works for versions 117-119.
    def _get_dtypes(self, seek_vartypes):

        self.path_or_buf.seek(seek_vartypes)
        raw_typlist = [
            struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0]
            for i in range(self.nvar)
        ]

        def f(typ):
            if typ <= 2045:
                return typ
            try:
                return self.TYPE_MAP_XML[typ]
            except KeyError:
                raise ValueError(f"cannot convert stata types [{typ}]")

        typlist = [f(x) for x in raw_typlist]

        def g(typ):
            if typ <= 2045:
                return str(typ)
            try:
                return self.DTYPE_MAP_XML[typ]
            except KeyError:
                raise ValueError(f"cannot convert stata dtype [{typ}]")

        dtyplist = [g(x) for x in raw_typlist]

        return typlist, dtyplist

    def _get_varlist(self):
        if self.format_version == 117:
            b = 33
        elif self.format_version >= 118:
            b = 129

        return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)]

    # Returns the format list
    def _get_fmtlist(self):
        if self.format_version >= 118:
            b = 57
        elif self.format_version > 113:
            b = 49
        elif self.format_version > 104:
            b = 12
        else:
            b = 7

        return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)]

    # Returns the label list
    def _get_lbllist(self):
        if self.format_version >= 118:
            b = 129
        elif self.format_version > 108:
            b = 33
        else:
            b = 9
        return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)]

    def _get_variable_labels(self):
        if self.format_version >= 118:
            vlblist = [
                self._decode(self.path_or_buf.read(321)) for i in range(self.nvar)
            ]
        elif self.format_version > 105:
            vlblist = [
                self._decode(self.path_or_buf.read(81)) for i in range(self.nvar)
            ]
        else:
            vlblist = [
                self._decode(self.path_or_buf.read(32)) for i in range(self.nvar)
            ]
        return vlblist

    def _get_nobs(self):
        if self.format_version >= 118:
            return struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0]
        else:
            return struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0]

    def _get_data_label(self):
        if self.format_version >= 118:
            strlen = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0]
            return self._decode(self.path_or_buf.read(strlen))
        elif self.format_version == 117:
            strlen = struct.unpack("b", self.path_or_buf.read(1))[0]
            return self._decode(self.path_or_buf.read(strlen))
        elif self.format_version > 105:
            return self._decode(self.path_or_buf.read(81))
        else:
            return self._decode(self.path_or_buf.read(32))

    def _get_time_stamp(self):
        if self.format_version >= 118:
            strlen = struct.unpack("b", self.path_or_buf.read(1))[0]
            return self.path_or_buf.read(strlen).decode("utf-8")
        elif self.format_version == 117:
            strlen = struct.unpack("b", self.path_or_buf.read(1))[0]
            return self._decode(self.path_or_buf.read(strlen))
        elif self.format_version > 104:
            return self._decode(self.path_or_buf.read(18))
        else:
            raise ValueError("Unsupported format version for time stamp")

    def _get_seek_variable_labels(self):
        if self.format_version == 117:
            self.path_or_buf.read(8)  # <variable_labels>, throw away
            # Stata 117 data files do not follow the described format. This is
            # a workaround that uses the previous label, 33 bytes for each
            # variable, 20 for the closing tag and 17 for the opening tag
            return self._seek_value_label_names + (33 * self.nvar) + 20 + 17
        elif self.format_version >= 118:
            return struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 17
        else:
            raise ValueError("Unsupported format version for variable labels")

    def _read_old_header(self, first_char):
        self.format_version = struct.unpack("b", first_char)[0]
        if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
            raise ValueError(_version_error.format(version=self.format_version))
        self._set_encoding()
        self.byteorder = (
            ">" if struct.unpack("b", self.path_or_buf.read(1))[0] == 0x1 else "<"
        )
        self.filetype = struct.unpack("b", self.path_or_buf.read(1))[0]
        self.path_or_buf.read(1)  # unused

        self.nvar = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0]
        self.nobs = self._get_nobs()

        self._data_label = self._get_data_label()

        self.time_stamp = self._get_time_stamp()

        # descriptors
        if self.format_version > 108:
            typlist = [ord(self.path_or_buf.read(1)) for i in range(self.nvar)]
        else:
            buf = self.path_or_buf.read(self.nvar)
            typlistb = np.frombuffer(buf, dtype=np.uint8)
            typlist = []
            for tp in typlistb:
                if tp in self.OLD_TYPE_MAPPING:
                    typlist.append(self.OLD_TYPE_MAPPING[tp])
                else:
                    typlist.append(tp - 127)  # bytes

        try:
            self.typlist = [self.TYPE_MAP[typ] for typ in typlist]
        except ValueError:
            invalid_types = ",".join(str(x) for x in typlist)
            raise ValueError(f"cannot convert stata types [{invalid_types}]")
        try:
            self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist]
        except ValueError:
            invalid_dtypes = ",".join(str(x) for x in typlist)
            raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]")

        if self.format_version > 108:
            self.varlist = [
                self._decode(self.path_or_buf.read(33)) for i in range(self.nvar)
            ]
        else:
            self.varlist = [
                self._decode(self.path_or_buf.read(9)) for i in range(self.nvar)
            ]
        self.srtlist = struct.unpack(
            self.byteorder + ("h" * (self.nvar + 1)),
            self.path_or_buf.read(2 * (self.nvar + 1)),
        )[:-1]

        self.fmtlist = self._get_fmtlist()

        self.lbllist = self._get_lbllist()

        self._variable_labels = self._get_variable_labels()

        # ignore expansion fields (Format 105 and later)
        # When reading, read five bytes; the last four bytes now tell you
        # the size of the next read, which you discard. You then continue
        # like this until you read 5 bytes of zeros.

        if self.format_version > 104:
            while True:
                data_type = struct.unpack(
                    self.byteorder + "b", self.path_or_buf.read(1)
                )[0]
                if self.format_version > 108:
                    data_len = struct.unpack(
                        self.byteorder + "i", self.path_or_buf.read(4)
                    )[0]
                else:
                    data_len = struct.unpack(
                        self.byteorder + "h", self.path_or_buf.read(2)
                    )[0]
                if data_type == 0:
                    break
                self.path_or_buf.read(data_len)

        # necessary data to continue parsing
        self.data_location = self.path_or_buf.tell()

    def _setup_dtype(self):
        """Map between numpy and Stata dtypes"""
        if self._dtype is not None:
            return self._dtype

        dtype = []  # Convert struct data types to numpy data type
        for i, typ in enumerate(self.typlist):
            if typ in self.NUMPY_TYPE_MAP:
                dtype.append(("s" + str(i), self.byteorder + self.NUMPY_TYPE_MAP[typ]))
            else:
                dtype.append(("s" + str(i), "S" + str(typ)))
        dtype = np.dtype(dtype)
        self._dtype = dtype

        return self._dtype
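
    # Sketch (editor's addition): for a hypothetical file whose typlist is
    # [10, 'd'] (a 10-byte string column followed by a double), the record
    # dtype built above would be equivalent to
    #
    #   >>> np.dtype([("s0", "S10"), ("s1", "<f8")])  # assuming little-endian
    #   dtype([('s0', 'S10'), ('s1', '<f8')])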


    def _calcsize(self, fmt):
        return fmt if type(fmt) is int else struct.calcsize(self.byteorder + fmt)

    def _decode(self, s):
        # have bytes not strings, so must decode
        s = s.partition(b"\0")[0]
        try:
            return s.decode(self._encoding)
        except UnicodeDecodeError:
            # GH 25960, fallback to handle incorrect format produced when 117
            # files are converted to 118 files in Stata
            encoding = self._encoding
            msg = f"""
One or more strings in the dta file could not be decoded using {encoding}, and
so the fallback encoding of latin-1 is being used. This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct."""
            warnings.warn(msg, UnicodeWarning)
            return s.decode("latin-1")

    def _read_value_labels(self):
        if self._value_labels_read:
            # Don't read twice
            return
        if self.format_version <= 108:
            # Value labels are not supported in version 108 and earlier.
            self._value_labels_read = True
            self.value_label_dict = dict()
            return

        if self.format_version >= 117:
            self.path_or_buf.seek(self.seek_value_labels)
        else:
            offset = self.nobs * self._dtype.itemsize
            self.path_or_buf.seek(self.data_location + offset)

        self._value_labels_read = True
        self.value_label_dict = dict()

        while True:
            if self.format_version >= 117:
                if self.path_or_buf.read(5) == b"</val":  # <lbl>
                    break  # end of value label table

            slength = self.path_or_buf.read(4)
            if not slength:
                break  # end of value label table (format < 117)
            if self.format_version <= 117:
                labname = self._decode(self.path_or_buf.read(33))
            else:
                labname = self._decode(self.path_or_buf.read(129))
            self.path_or_buf.read(3)  # padding

            n = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0]
            txtlen = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0]
            off = np.frombuffer(
                self.path_or_buf.read(4 * n), dtype=self.byteorder + "i4", count=n
            )
            val = np.frombuffer(
                self.path_or_buf.read(4 * n), dtype=self.byteorder + "i4", count=n
            )
            ii = np.argsort(off)
            off = off[ii]
            val = val[ii]
            txt = self.path_or_buf.read(txtlen)
            self.value_label_dict[labname] = dict()
            for i in range(n):
                end = off[i + 1] if i < n - 1 else txtlen
                self.value_label_dict[labname][val[i]] = self._decode(txt[off[i] : end])
            if self.format_version >= 117:
                self.path_or_buf.read(6)  # </lbl>
        self._value_labels_read = True

    def _read_strls(self):
        self.path_or_buf.seek(self.seek_strls)
        # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
        self.GSO = {"0": ""}
        while True:
            if self.path_or_buf.read(3) != b"GSO":
                break

            if self.format_version == 117:
                v_o = struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0]
            else:
                buf = self.path_or_buf.read(12)
                # Only tested on little endian file on little endian machine.
                v_size = 2 if self.format_version == 118 else 3
                if self.byteorder == "<":
                    buf = buf[0:v_size] + buf[4 : 12 - v_size]
                else:
                    # This path may not be correct, impossible to test
                    buf = buf[0:v_size] + buf[4 + v_size :]
                v_o = struct.unpack("Q", buf)[0]
            typ = struct.unpack("B", self.path_or_buf.read(1))[0]
            length = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0]
            va = self.path_or_buf.read(length)
            if typ == 130:
                va = va[0:-1].decode(self._encoding)
            # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
            self.GSO[str(v_o)] = va

    def __next__(self):
        return self.read(nrows=self._chunksize or 1)

    def get_chunk(self, size=None):
        """
        Reads lines from Stata file and returns as dataframe

        Parameters
        ----------
        size : int, defaults to None
            Number of lines to read. If None, the reader's chunksize is used;
            if that is also None, the remainder of the file is read.

        Returns
        -------
        DataFrame
        """
        if size is None:
            size = self._chunksize
        return self.read(nrows=size)
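
    # Usage sketch (editor's addition): StataReader supports the
    # context-manager and iterator protocols, so chunked reads compose
    # naturally. "filename.dta" and process() are hypothetical.
    #
    #   >>> with StataReader("filename.dta", chunksize=1000) as reader:
    #   ...     for chunk in reader:
    #   ...         process(chunk)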


    @Appender(_read_method_doc)
    def read(
        self,
        nrows=None,
        convert_dates=None,
        convert_categoricals=None,
        index_col=None,
        convert_missing=None,
        preserve_dtypes=None,
        columns=None,
        order_categoricals=None,
    ):
        # Handle empty file or chunk. If reading incrementally raise
        # StopIteration. If reading the whole thing return an empty
        # data frame.
        if (self.nobs == 0) and (nrows is None):
            self._can_read_value_labels = True
            self._data_read = True
            self.close()
            return DataFrame(columns=self.varlist)

        # Handle options
        if convert_dates is None:
            convert_dates = self._convert_dates
        if convert_categoricals is None:
            convert_categoricals = self._convert_categoricals
        if convert_missing is None:
            convert_missing = self._convert_missing
        if preserve_dtypes is None:
            preserve_dtypes = self._preserve_dtypes
        if columns is None:
            columns = self._columns
        if order_categoricals is None:
            order_categoricals = self._order_categoricals
        if index_col is None:
            index_col = self._index_col

        if nrows is None:
            nrows = self.nobs

        if (self.format_version >= 117) and (not self._value_labels_read):
            self._can_read_value_labels = True
            self._read_strls()

        # Read data
        dtype = self._dtype
        max_read_len = (self.nobs - self._lines_read) * dtype.itemsize
        read_len = nrows * dtype.itemsize
        read_len = min(read_len, max_read_len)
        if read_len <= 0:
            # Iterator has finished, should never be here unless
            # we are reading the file incrementally
            if convert_categoricals:
                self._read_value_labels()
            self.close()
            raise StopIteration
        offset = self._lines_read * dtype.itemsize
        self.path_or_buf.seek(self.data_location + offset)
        read_lines = min(nrows, self.nobs - self._lines_read)
        data = np.frombuffer(
            self.path_or_buf.read(read_len), dtype=dtype, count=read_lines
        )

        self._lines_read += read_lines
        if self._lines_read == self.nobs:
            self._can_read_value_labels = True
            self._data_read = True
        # if necessary, swap the byte order to native here
        if self.byteorder != self._native_byteorder:
            data = data.byteswap().newbyteorder()

        if convert_categoricals:
            self._read_value_labels()

        if len(data) == 0:
            data = DataFrame(columns=self.varlist)
        else:
            data = DataFrame.from_records(data)
            data.columns = self.varlist

        # If index is not specified, use actual row number rather than
        # restarting at 0 for each chunk.
        if index_col is None:
            ix = np.arange(self._lines_read - read_lines, self._lines_read)
            data = data.set_index(ix)

        if columns is not None:
            try:
                data = self._do_select_columns(data, columns)
            except ValueError:
                self.close()
                raise

        # Decode strings
        for col, typ in zip(data, self.typlist):
            if type(typ) is int:
                data[col] = data[col].apply(self._decode, convert_dtype=True)

        data = self._insert_strls(data)

        cols_ = np.where(self.dtyplist)[0]

        # Convert columns (if needed) to match input type
        ix = data.index
        requires_type_conversion = False
        data_formatted = []
        for i in cols_:
            if self.dtyplist[i] is not None:
                col = data.columns[i]
                dtype = data[col].dtype
                if dtype != np.dtype(object) and dtype != self.dtyplist[i]:
                    requires_type_conversion = True
                    data_formatted.append(
                        (col, Series(data[col], ix, self.dtyplist[i]))
                    )
                else:
                    data_formatted.append((col, data[col]))
        if requires_type_conversion:
            data = DataFrame.from_dict(dict(data_formatted))
        del data_formatted

        data = self._do_convert_missing(data, convert_missing)

        if convert_dates:

            def any_startswith(x: str) -> bool:
                return any(x.startswith(fmt) for fmt in _date_formats)

            cols = np.where([any_startswith(x) for x in self.fmtlist])[0]
            for i in cols:
                col = data.columns[i]
                try:
                    data[col] = _stata_elapsed_date_to_datetime_vec(
                        data[col], self.fmtlist[i]
                    )
                except ValueError:
                    self.close()
                    raise

        if convert_categoricals and self.format_version > 108:
            data = self._do_convert_categoricals(
                data, self.value_label_dict, self.lbllist, order_categoricals
            )

        if not preserve_dtypes:
            retyped_data = []
            convert = False
            for col in data:
                dtype = data[col].dtype
                if dtype in (np.float16, np.float32):
                    dtype = np.float64
                    convert = True
                elif dtype in (np.int8, np.int16, np.int32):
                    dtype = np.int64
                    convert = True
                retyped_data.append((col, data[col].astype(dtype)))
            if convert:
                data = DataFrame.from_dict(dict(retyped_data))

        if index_col is not None:
            data = data.set_index(data.pop(index_col))

        return data

    def _do_convert_missing(self, data, convert_missing):
        # Check for missing values, and replace if found
        replacements = {}
        for i, colname in enumerate(data):
            fmt = self.typlist[i]
            if fmt not in self.VALID_RANGE:
                continue

            nmin, nmax = self.VALID_RANGE[fmt]
            series = data[colname]
            missing = np.logical_or(series < nmin, series > nmax)

            if not missing.any():
                continue

            if convert_missing:  # Replacement follows Stata notation

                missing_loc = np.argwhere(missing._ndarray_values)
                umissing, umissing_loc = np.unique(series[missing], return_inverse=True)
                replacement = Series(series, dtype=object)
                for j, um in enumerate(umissing):
                    missing_value = StataMissingValue(um)

                    loc = missing_loc[umissing_loc == j]
                    replacement.iloc[loc] = missing_value
            else:  # All replacements are identical
                dtype = series.dtype
                if dtype not in (np.float32, np.float64):
                    dtype = np.float64
                replacement = Series(series, dtype=dtype)
                replacement[missing] = np.nan
            replacements[colname] = replacement
        if replacements:
            columns = data.columns
            replacements = DataFrame(replacements)
            data = concat(
                [data.drop(replacements.columns, axis=1), replacements], axis=1
            )
            data = data[columns]
        return data
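
    # Sketch (editor's addition): with convert_missing=True, out-of-range
    # codes survive as StataMissingValue objects in object-dtype columns
    # instead of collapsing to NaN. "filename.dta" is a hypothetical path.
    #
    #   >>> df = pd.read_stata("filename.dta", convert_missing=True)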

1715 

1716 def _insert_strls(self, data): 

1717 if not hasattr(self, "GSO") or len(self.GSO) == 0: 

1718 return data 

1719 for i, typ in enumerate(self.typlist): 

1720 if typ != "Q": 

1721 continue 

1722 # Wrap v_o in a string to allow uint64 values as keys on 32bit OS 

1723 data.iloc[:, i] = [self.GSO[str(k)] for k in data.iloc[:, i]] 

1724 return data 

1725 

1726 def _do_select_columns(self, data, columns): 

1727 

1728 if not self._column_selector_set: 

1729 column_set = set(columns) 

1730 if len(column_set) != len(columns): 

1731 raise ValueError("columns contains duplicate entries") 

1732 unmatched = column_set.difference(data.columns) 

1733 if unmatched: 

1734 raise ValueError( 

1735 "The following columns were not found in the " 

1736 "Stata data set: " + ", ".join(list(unmatched)) 

1737 ) 

1738 # Copy information for retained columns for later processing 

1739 dtyplist = [] 

1740 typlist = [] 

1741 fmtlist = [] 

1742 lbllist = [] 

1743 for col in columns: 

1744 i = data.columns.get_loc(col) 

1745 dtyplist.append(self.dtyplist[i]) 

1746 typlist.append(self.typlist[i]) 

1747 fmtlist.append(self.fmtlist[i]) 

1748 lbllist.append(self.lbllist[i]) 

1749 

1750 self.dtyplist = dtyplist 

1751 self.typlist = typlist 

1752 self.fmtlist = fmtlist 

1753 self.lbllist = lbllist 

1754 self._column_selector_set = True 

1755 

1756 return data[columns] 

1757 

1758 def _do_convert_categoricals( 

1759 self, data, value_label_dict, lbllist, order_categoricals 

1760 ): 

1761 """ 

1762 Converts categorical columns to Categorical type. 

1763 """ 

1764 value_labels = list(value_label_dict.keys()) 

1765 cat_converted_data = [] 

1766 for col, label in zip(data, lbllist): 

1767 if label in value_labels: 

1768 # Explicitly set ordered from order_categoricals 

1769 cat_data = Categorical(data[col], ordered=order_categoricals) 

1770 categories = [] 

1771 for category in cat_data.categories: 

1772 if category in value_label_dict[label]: 

1773 categories.append(value_label_dict[label][category]) 

1774 else: 

1775 categories.append(category) # Partially labeled 

1776 try: 

1777 cat_data.categories = categories 

1778 except ValueError: 

1779 vc = Series(categories).value_counts() 

1780 repeats = list(vc.index[vc > 1]) 

1781 repeats = "-" * 80 + "\n" + "\n".join(repeats) 

1782 # GH 25772 

1783 msg = f""" 

1784Value labels for column {col} are not unique. These cannot be converted to 

1785pandas categoricals. 

1786 

1787Either read the file with `convert_categoricals` set to False or use the 

1788low-level interface in `StataReader` to separately read the values and the 

1789value_labels. 

1790 

1791The repeated labels are: 

1792{repeats} 

1793""" 

1794 raise ValueError(msg) 

1795 # TODO: is the next line needed above in the data(...) method? 

1796 cat_data = Series(cat_data, index=data.index) 

1797 cat_converted_data.append((col, cat_data)) 

1798 else: 

1799 cat_converted_data.append((col, data[col])) 

1800 data = DataFrame.from_dict(dict(cat_converted_data)) 

1801 return data 

1802 

1803 @property 

1804 def data_label(self): 

1805 """ 

1806 Return data label of Stata file. 

1807 """ 

1808 return self._data_label 

1809 

1810 def variable_labels(self): 

1811 """ 

1812 Return variable labels as a dict, associating each variable name 

1813 with its corresponding label. 

1814 

1815 Returns 

1816 ------- 

1817 dict 

1818 """ 

1819 return dict(zip(self.varlist, self._variable_labels)) 

1820 

1821 def value_labels(self): 

1822 """ 

1823 Return a dict, associating each variable name with a dict that 

1824 maps each value to its corresponding label. 

1825 

1826 Returns 

1827 ------- 

1828 dict 

1829 """ 

1830 if not self._value_labels_read: 

1831 self._read_value_labels() 

1832 

1833 return self.value_label_dict 

1834 

1835 
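# Hedged example of the low-level interface mentioned in the duplicate-label
# error message above: read raw integer codes with convert_categoricals=False
# and fetch the label mappings separately. The file name "labelled.dta" is an
# assumption for illustration; this sketch is not called anywhere.
def _example_read_values_and_labels():
    """Illustrative sketch only."""
    reader = StataReader("labelled.dta")
    try:
        df = reader.read(convert_categoricals=False)  # raw codes, no Categorical
        labels = reader.value_labels()  # {variable: {code: label}}
    finally:
        reader.close()
    return df, labels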

1836def _open_file_binary_write(fname): 

1837 """ 

1838 Open a binary file or no-op if file-like. 

1839 

1840 Parameters 

1841 ---------- 

1842 fname : string path, path object or buffer 

1843 

1844 Returns 

1845 ------- 

1846 file : file-like object 

1847 File object supporting write 

1848 own : bool 

1849 True if the file was created, otherwise False 

1850 """ 

1851 if hasattr(fname, "write"): 

1853 return fname, False 

1854 return open(fname, "wb"), True 

1855 

1856 

1857def _set_endianness(endianness): 

1858 if endianness.lower() in ["<", "little"]: 

1859 return "<" 

1860 elif endianness.lower() in [">", "big"]: 

1861 return ">" 

1862 else: # pragma : no cover 

1863 raise ValueError(f"Endianness {endianness} not understood") 

1864 

1865 

1866def _pad_bytes(name, length): 

1867 """ 

1868 Take a str or bytes value and pad it with null bytes until it is `length` characters long. 

1869 """ 

1870 if isinstance(name, bytes): 

1871 return name + b"\x00" * (length - len(name)) 

1872 return name + "\x00" * (length - len(name)) 

1873 

1874 

1875def _convert_datetime_to_stata_type(fmt): 

1876 """ 

1877 Convert from one of the stata date formats to a type in TYPE_MAP. 

1878 """ 

1879 if fmt in [ 

1880 "tc", 

1881 "%tc", 

1882 "td", 

1883 "%td", 

1884 "tw", 

1885 "%tw", 

1886 "tm", 

1887 "%tm", 

1888 "tq", 

1889 "%tq", 

1890 "th", 

1891 "%th", 

1892 "ty", 

1893 "%ty", 

1894 ]: 

1895 return np.float64 # Stata expects doubles for SIFs 

1896 else: 

1897 raise NotImplementedError(f"Format {fmt} not implemented") 

1898 

1899 

1900def _maybe_convert_to_int_keys(convert_dates, varlist): 

1901 new_dict = {} 

1902 for key in convert_dates: 

1903 if not convert_dates[key].startswith("%"): # make sure proper fmts 

1904 convert_dates[key] = "%" + convert_dates[key] 

1905 if key in varlist: 

1906 new_dict.update({varlist.index(key): convert_dates[key]}) 

1907 else: 

1908 if not isinstance(key, int): 

1909 raise ValueError("convert_dates key must be a column or an integer") 

1910 new_dict.update({key: convert_dates[key]}) 

1911 return new_dict 

1912 

1913 
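# Hedged sketch of the helper above: a name present in varlist is replaced by
# its positional index, and formats gain a leading "%". Illustrative only.
def _example_convert_date_keys():
    """Returns {1: "%td"} for this input (column "when" is at position 1)."""
    return _maybe_convert_to_int_keys({"when": "td"}, ["id", "when"])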

1914def _dtype_to_stata_type(dtype, column): 

1915 """ 

1916 Convert a numpy dtype to a Stata type code. Returns the integer code for the given dtype. 

1917 See TYPE_MAP and comments for an explanation. This is also explained in 

1918 the dta spec. 

1919 1 - 244 are strings of this length 

1920 Pandas Stata 

1921 251 - for int8 byte 

1922 252 - for int16 int 

1923 253 - for int32 long 

1924 254 - for float32 float 

1925 255 - for double double 

1926 

1927 If there are dates to convert, then dtype will already have the correct 

1928 type inserted. 

1929 """ 

1930 # TODO: expand to handle datetime to integer conversion 

1931 if dtype.type == np.object_: # try to coerce it to the biggest string 

1932 # not memory efficient, what else could we 

1933 # do? 

1934 itemsize = max_len_string_array(ensure_object(column.values)) 

1935 return max(itemsize, 1) 

1936 elif dtype == np.float64: 

1937 return 255 

1938 elif dtype == np.float32: 

1939 return 254 

1940 elif dtype == np.int32: 

1941 return 253 

1942 elif dtype == np.int16: 

1943 return 252 

1944 elif dtype == np.int8: 

1945 return 251 

1946 else: # pragma : no cover 

1947 raise NotImplementedError(f"Data type {dtype} not supported.") 

1948 

1949 

1950def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False): 

1951 """ 

1952 Map numpy dtype to stata's default format for this type. Not terribly 

1953 important since users can change this in Stata. Semantics are 

1954 

1955 object -> "%DDs" where DD is the length of the string. If not a string, 

1956 raise ValueError 

1957 float64 -> "%10.0g" 

1958 float32 -> "%9.0g" 

1959 int64 -> "%9.0g" 

1960 int32 -> "%12.0g" 

1961 int16 -> "%8.0g" 

1962 int8 -> "%8.0g" 

1963 strl -> "%9s" 

1964 """ 

1965 # TODO: Refactor to combine type with format 

1966 # TODO: expand this to handle a default datetime format? 

1967 if dta_version < 117: 

1968 max_str_len = 244 

1969 else: 

1970 max_str_len = 2045 

1971 if force_strl: 

1972 return "%9s" 

1973 if dtype.type == np.object_: 

1974 itemsize = max_len_string_array(ensure_object(column.values)) 

1975 if itemsize > max_str_len: 

1976 if dta_version >= 117: 

1977 return "%9s" 

1978 else: 

1979 raise ValueError(excessive_string_length_error.format(column.name)) 

1980 return "%" + str(max(itemsize, 1)) + "s" 

1981 elif dtype == np.float64: 

1982 return "%10.0g" 

1983 elif dtype == np.float32: 

1984 return "%9.0g" 

1985 elif dtype == np.int32: 

1986 return "%12.0g" 

1987 elif dtype == np.int8 or dtype == np.int16: 

1988 return "%8.0g" 

1989 else: # pragma : no cover 

1990 raise NotImplementedError(f"Data type {dtype} not supported.") 

1991 

1992 
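# Hedged sketch showing how the two helpers above cooperate when building a
# column's type and format. Illustrative only; the values follow the mappings
# documented above (float64 -> 255 and "%10.0g").
def _example_type_and_fmt():
    """Returns (255, "%10.0g") for a float64 column."""
    col = Series([1.0, 2.0])  # float64
    typ = _dtype_to_stata_type(col.dtype, col)
    fmt = _dtype_to_default_stata_fmt(col.dtype, col)
    return typ, fmt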

1993class StataWriter(StataParser): 

1994 """ 

1995 A class for writing Stata binary dta files 

1996 

1997 Parameters 

1998 ---------- 

1999 fname : path (string), buffer or path object 

2000 string, path object (pathlib.Path or py._path.local.LocalPath) or 

2001 object implementing a binary write() function. If using a buffer 

2002 then the buffer will not be automatically closed after the file 

2003 is written. 

2004 

2005 .. versionadded:: 0.23.0 support for pathlib, py.path. 

2006 

2007 data : DataFrame 

2008 Input to save 

2009 convert_dates : dict 

2010 Dictionary mapping columns containing datetime types to stata internal 

2011 format to use when writing the dates. Options are 'tc', 'td', 'tm', 

2012 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. 

2013 Datetime columns that do not have a conversion type specified will be 

2014 converted to 'tc'. Raises NotImplementedError if a datetime column has 

2015 timezone information 

2016 write_index : bool 

2017 Write the index to Stata dataset. 

2018 byteorder : str 

2019 Can be ">", "<", "little", or "big". The default is `sys.byteorder`. 

2020 time_stamp : datetime 

2021 A datetime to use as file creation date. Default is the current time 

2022 data_label : str 

2023 A label for the data set. Must be 80 characters or smaller. 

2024 variable_labels : dict 

2025 Dictionary containing columns as keys and variable labels as values. 

2026 Each label must be 80 characters or smaller. 

2027 

2028 Returns 

2029 ------- 

2030 writer : StataWriter instance 

2031 The StataWriter instance has a write_file method, which will 

2032 write the file to the given `fname`. 

2033 

2034 Raises 

2035 ------ 

2036 NotImplementedError 

2037 * If datetimes contain timezone information 

2038 ValueError 

2039 * Columns listed in convert_dates are neither datetime64[ns] 

2040 nor datetime.datetime 

2041 * Column dtype is not representable in Stata 

2042 * Column listed in convert_dates is not in DataFrame 

2043 * Categorical label contains more than 32,000 characters 

2044 

2045 Examples 

2046 -------- 

2047 >>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b']) 

2048 >>> writer = StataWriter('./data_file.dta', data) 

2049 >>> writer.write_file() 

2050 

2051 Or with dates 

2052 >>> from datetime import datetime 

2053 >>> data = pd.DataFrame([[datetime(2000,1,1)]], columns=['date']) 

2054 >>> writer = StataWriter('./date_data_file.dta', data, {'date' : 'tw'}) 

2055 >>> writer.write_file() 

2056 """ 

2057 

2058 _max_string_length = 244 

2059 _encoding = "latin-1" 

2060 

2061 def __init__( 

2062 self, 

2063 fname, 

2064 data, 

2065 convert_dates=None, 

2066 write_index=True, 

2067 byteorder=None, 

2068 time_stamp=None, 

2069 data_label=None, 

2070 variable_labels=None, 

2071 ): 

2072 super().__init__() 

2073 self._convert_dates = {} if convert_dates is None else convert_dates 

2074 self._write_index = write_index 

2075 self._time_stamp = time_stamp 

2076 self._data_label = data_label 

2077 self._variable_labels = variable_labels 

2078 self._own_file = True 

2079 # attach nobs, nvars, data, varlist, typlist 

2080 self._prepare_pandas(data) 

2081 

2082 if byteorder is None: 

2083 byteorder = sys.byteorder 

2084 self._byteorder = _set_endianness(byteorder) 

2085 self._fname = stringify_path(fname) 

2086 self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} 

2087 self._converted_names = {} 

2088 

2089 def _write(self, to_write): 

2090 """ 

2091 Helper to call encode before writing to file for Python 3 compat. 

2092 """ 

2093 self._file.write(to_write.encode(self._encoding or self._default_encoding)) 

2094 

2095 def _prepare_categoricals(self, data): 

2096 """Check for categorical columns, retain categorical information for 

2097 Stata file and convert categorical data to int""" 

2098 

2099 is_cat = [is_categorical_dtype(data[col]) for col in data] 

2100 self._is_col_cat = is_cat 

2101 self._value_labels = [] 

2102 if not any(is_cat): 

2103 return data 

2104 

2105 get_base_missing_value = StataMissingValue.get_base_missing_value 

2106 data_formatted = [] 

2107 for col, col_is_cat in zip(data, is_cat): 

2108 if col_is_cat: 

2109 svl = StataValueLabel(data[col], encoding=self._encoding) 

2110 self._value_labels.append(svl) 

2111 dtype = data[col].cat.codes.dtype 

2112 if dtype == np.int64: 

2113 raise ValueError( 

2114 "It is not possible to export " 

2115 "int64-based categorical data to Stata." 

2116 ) 

2117 values = data[col].cat.codes.values.copy() 

2118 

2119 # Upcast if needed so that correct missing values can be set 

2120 if values.max() >= get_base_missing_value(dtype): 

2121 if dtype == np.int8: 

2122 dtype = np.int16 

2123 elif dtype == np.int16: 

2124 dtype = np.int32 

2125 else: 

2126 dtype = np.float64 

2127 values = np.array(values, dtype=dtype) 

2128 

2129 # Replace missing values with Stata missing value for type 

2130 values[values == -1] = get_base_missing_value(dtype) 

2131 data_formatted.append((col, values)) 

2132 else: 

2133 data_formatted.append((col, data[col])) 

2134 return DataFrame.from_dict(dict(data_formatted)) 

2135 

2136 def _replace_nans(self, data): 

2138 """Checks floating point data columns for nans, and replaces these with 

2139 the generic Stata missing value (.)""" 

2140 for c in data: 

2141 dtype = data[c].dtype 

2142 if dtype in (np.float32, np.float64): 

2143 if dtype == np.float32: 

2144 replacement = self.MISSING_VALUES["f"] 

2145 else: 

2146 replacement = self.MISSING_VALUES["d"] 

2147 data[c] = data[c].fillna(replacement) 

2148 

2149 return data 

2150 

2151 def _update_strl_names(self): 

2152 """No-op, forward compatibility""" 

2153 pass 

2154 

2155 def _validate_variable_name(self, name): 

2156 """ 

2157 Validate variable names for Stata export. 

2158 

2159 Parameters 

2160 ---------- 

2161 name : str 

2162 Variable name 

2163 

2164 Returns 

2165 ------- 

2166 str 

2167 The validated name with invalid characters replaced with 

2168 underscores. 

2169 

2170 Notes 

2171 ----- 

2172 Stata 114 and 117 support ASCII characters in a-z, A-Z, 0-9 

2173 and _. 

2174 """ 

2175 for c in name: 

2176 if ( 

2177 (c < "A" or c > "Z") 

2178 and (c < "a" or c > "z") 

2179 and (c < "0" or c > "9") 

2180 and c != "_" 

2181 ): 

2182 name = name.replace(c, "_") 

2183 return name 

2184 

2185 def _check_column_names(self, data): 

2186 """ 

2187 Checks column names to ensure that they are valid Stata column names. 

2188 This includes checks for: 

2189 * Non-string names 

2190 * Stata keywords 

2191 * Variables that start with numbers 

2192 * Variables with names that are too long 

2193 

2194 When an illegal variable name is detected, it is converted, and if 

2195 dates are exported, the variable name is propagated to the date 

2196 conversion dictionary 

2197 """ 

2198 converted_names = {} 

2199 columns = list(data.columns) 

2200 original_columns = columns[:] 

2201 

2202 duplicate_var_id = 0 

2203 for j, name in enumerate(columns): 

2204 orig_name = name 

2205 if not isinstance(name, str): 

2206 name = str(name) 

2207 

2208 name = self._validate_variable_name(name) 

2209 

2210 # Variable name must not be a reserved word 

2211 if name in self.RESERVED_WORDS: 

2212 name = "_" + name 

2213 

2214 # Variable name may not start with a number 

2215 if "0" <= name[0] <= "9": 

2216 name = "_" + name 

2217 

2218 name = name[: min(len(name), 32)] 

2219 

2220 if name != orig_name: 

2221 # check for duplicates 

2222 while columns.count(name) > 0: 

2223 # prepend ascending number to avoid duplicates 

2224 name = "_" + str(duplicate_var_id) + name 

2225 name = name[: min(len(name), 32)] 

2226 duplicate_var_id += 1 

2227 converted_names[orig_name] = name 

2228 

2229 columns[j] = name 

2230 

2231 data.columns = columns 

2232 

2233 # Check date conversion, and fix key if needed 

2234 if self._convert_dates: 

2235 for c, o in zip(columns, original_columns): 

2236 if c != o: 

2237 self._convert_dates[c] = self._convert_dates[o] 

2238 del self._convert_dates[o] 

2239 

2240 if converted_names: 

2241 conversion_warning = [] 

2242 for orig_name, name in converted_names.items(): 

2243 # need to possibly encode the orig name if it's unicode 

2244 try: 

2245 orig_name = orig_name.encode("utf-8") 

2246 except (UnicodeDecodeError, AttributeError): 

2247 pass 

2248 msg = f"{orig_name} -> {name}" 

2249 conversion_warning.append(msg) 

2250 

2251 ws = invalid_name_doc.format("\n ".join(conversion_warning)) 

2252 warnings.warn(ws, InvalidColumnName) 

2253 

2254 self._converted_names = converted_names 

2255 self._update_strl_names() 

2256 

2257 return data 

2258 
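    # Hedged sketch (illustrative) of the rules above: "1x" (leading digit)
    # becomes "_1x", "my var" (illegal character) becomes "my_var", reserved
    # words such as "byte" become "_byte", and names are truncated to 32
    # characters, with an InvalidColumnName warning listing each mapping:
    #
    #     DataFrame({"1x": [1]}).to_stata("renamed.dta")  # hypothetical path
    #     # InvalidColumnName warning reports: 1x -> _1x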

2259 def _set_formats_and_types(self, dtypes): 

2260 self.typlist = [] 

2261 self.fmtlist = [] 

2262 for col, dtype in dtypes.items(): 

2263 self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, self.data[col])) 

2264 self.typlist.append(_dtype_to_stata_type(dtype, self.data[col])) 

2265 

2266 def _prepare_pandas(self, data): 

2267 # NOTE: we might need a different API / class for pandas objects so 

2268 # we can set different semantics - handle this with a PR to pandas.io 

2269 

2270 data = data.copy() 

2271 

2272 if self._write_index: 

2273 data = data.reset_index() 

2274 

2275 # Ensure column names are strings 

2276 data = self._check_column_names(data) 

2277 

2278 # Check columns for compatibility with stata, upcast if necessary 

2279 # Raise if outside the supported range 

2280 data = _cast_to_stata_types(data) 

2281 

2282 # Replace NaNs with Stata missing values 

2283 data = self._replace_nans(data) 

2284 

2285 # Convert categoricals to int data, and strip labels 

2286 data = self._prepare_categoricals(data) 

2287 

2288 self.nobs, self.nvar = data.shape 

2289 self.data = data 

2290 self.varlist = data.columns.tolist() 

2291 

2292 dtypes = data.dtypes 

2293 

2294 # Ensure all date columns are converted 

2295 for col in data: 

2296 if col in self._convert_dates: 

2297 continue 

2298 if is_datetime64_dtype(data[col]): 

2299 self._convert_dates[col] = "tc" 

2300 

2301 self._convert_dates = _maybe_convert_to_int_keys( 

2302 self._convert_dates, self.varlist 

2303 ) 

2304 for key in self._convert_dates: 

2305 new_type = _convert_datetime_to_stata_type(self._convert_dates[key]) 

2306 dtypes[key] = np.dtype(new_type) 

2307 

2308 # Verify object arrays are strings and encode to bytes 

2309 self._encode_strings() 

2310 

2311 self._set_formats_and_types(dtypes) 

2312 

2313 # set the given format for the datetime cols 

2314 if self._convert_dates is not None: 

2315 for key in self._convert_dates: 

2316 self.fmtlist[key] = self._convert_dates[key] 

2317 

2318 def _encode_strings(self): 

2319 """ 

2320 Encode strings in dta-specific encoding 

2321 

2322 Do not encode columns marked for date conversion or for strL 

2323 conversion. The strL converter independently handles conversion and 

2324 also accepts empty string arrays. 

2325 """ 

2326 convert_dates = self._convert_dates 

2327 # _convert_strl is not available in dta 114 

2328 convert_strl = getattr(self, "_convert_strl", []) 

2329 for i, col in enumerate(self.data): 

2330 # Skip columns marked for date conversion or strl conversion 

2331 if i in convert_dates or col in convert_strl: 

2332 continue 

2333 column = self.data[col] 

2334 dtype = column.dtype 

2335 if dtype.type == np.object_: 

2336 inferred_dtype = infer_dtype(column, skipna=True) 

2337 if not ((inferred_dtype in ("string", "unicode")) or len(column) == 0): 

2338 col = column.name 

2339 raise ValueError( 

2340 f"""\ 

2341Column `{col}` cannot be exported.\n\nOnly string-like object arrays 

2342containing all strings or a mix of strings and None can be exported. 

2343Object arrays containing only null values are prohibited. Other object 

2344types cannot be exported and must first be converted to one of the 

2345supported types.""" 

2346 ) 

2347 encoded = self.data[col].str.encode(self._encoding) 

2348 # If larger than _max_string_length do nothing 

2349 if ( 

2350 max_len_string_array(ensure_object(encoded.values)) 

2351 <= self._max_string_length 

2352 ): 

2353 self.data[col] = encoded 

2354 
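    # Hedged sketch (illustrative) of the object-dtype gate above: a column
    # of all strings, or strings mixed with None, is accepted; any other
    # object column raises:
    #
    #     DataFrame({"ok": ["a", None]}).to_stata("ok.dta")  # accepted
    #     DataFrame({"bad": ["a", 1]}).to_stata("bad.dta")   # ValueError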

2355 def write_file(self): 

2356 self._file, self._own_file = _open_file_binary_write(self._fname) 

2357 try: 

2358 self._write_header(data_label=self._data_label, time_stamp=self._time_stamp) 

2359 self._write_map() 

2360 self._write_variable_types() 

2361 self._write_varnames() 

2362 self._write_sortlist() 

2363 self._write_formats() 

2364 self._write_value_label_names() 

2365 self._write_variable_labels() 

2366 self._write_expansion_fields() 

2367 self._write_characteristics() 

2368 self._prepare_data() 

2369 self._write_data() 

2370 self._write_strls() 

2371 self._write_value_labels() 

2372 self._write_file_close_tag() 

2373 self._write_map() 

2374 except Exception as exc: 

2375 self._close() 

2376 if self._own_file: 

2377 try: 

2378 os.unlink(self._fname) 

2379 except OSError: 

2380 warnings.warn( 

2381 f"This save was not successful but {self._fname} could not " 

2382 "be deleted. This file is not valid.", 

2383 ResourceWarning, 

2384 ) 

2385 raise exc 

2386 else: 

2387 self._close() 

2388 

2389 def _close(self): 

2390 """ 

2391 Close the file if it was created by the writer. 

2392 

2393 If a buffer or file-like object was passed in, for example a GzipFile, 

2394 then leave this file open for the caller to close. In either case, 

2395 attempt to flush the file contents to ensure they are written to disk 

2396 (if supported) 

2397 """ 

2398 # Some file-like objects might not support flush 

2399 try: 

2400 self._file.flush() 

2401 except AttributeError: 

2402 pass 

2403 if self._own_file: 

2404 self._file.close() 

2405 
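    # Hedged usage sketch (illustrative): _open_file_binary_write treats any
    # object with a write() method as caller-owned, so writing to an
    # in-memory buffer leaves the buffer open for inspection afterwards:
    #
    #     bio = BytesIO()
    #     StataWriter(bio, DataFrame({"a": [1.0]})).write_file()
    #     bio.seek(0)
    #     raw = bio.read()  # complete dta 114 payload; buffer still open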

2406 def _write_map(self): 

2407 """No-op, future compatibility""" 

2408 pass 

2409 

2410 def _write_file_close_tag(self): 

2411 """No-op, future compatibility""" 

2412 pass 

2413 

2414 def _write_characteristics(self): 

2415 """No-op, future compatibility""" 

2416 pass 

2417 

2418 def _write_strls(self): 

2419 """No-op, future compatibility""" 

2420 pass 

2421 

2422 def _write_expansion_fields(self): 

2423 """Write 5 zeros for expansion fields""" 

2424 self._write(_pad_bytes("", 5)) 

2425 

2426 def _write_value_labels(self): 

2427 for vl in self._value_labels: 

2428 self._file.write(vl.generate_value_label(self._byteorder)) 

2429 

2430 def _write_header(self, data_label=None, time_stamp=None): 

2431 byteorder = self._byteorder 

2432 # ds_format - just use 114 

2433 self._file.write(struct.pack("b", 114)) 

2434 # byteorder 

2435 self._write("\x01" if byteorder == ">" else "\x02") 

2436 # filetype 

2437 self._write("\x01") 

2438 # unused 

2439 self._write("\x00") 

2440 # number of vars, 2 bytes 

2441 self._file.write(struct.pack(byteorder + "h", self.nvar)[:2]) 

2442 # number of obs, 4 bytes 

2443 self._file.write(struct.pack(byteorder + "i", self.nobs)[:4]) 

2444 # data label 81 bytes, char, null terminated 

2445 if data_label is None: 

2446 self._file.write(self._null_terminate(_pad_bytes("", 80))) 

2447 else: 

2448 self._file.write(self._null_terminate(_pad_bytes(data_label[:80], 80))) 

2449 # time stamp, 18 bytes, char, null terminated 

2450 # format dd Mon yyyy hh:mm 

2451 if time_stamp is None: 

2452 time_stamp = datetime.datetime.now() 

2453 elif not isinstance(time_stamp, datetime.datetime): 

2454 raise ValueError("time_stamp should be datetime type") 

2455 # GH #13856 

2456 # Avoid locale-specific month conversion 

2457 months = [ 

2458 "Jan", 

2459 "Feb", 

2460 "Mar", 

2461 "Apr", 

2462 "May", 

2463 "Jun", 

2464 "Jul", 

2465 "Aug", 

2466 "Sep", 

2467 "Oct", 

2468 "Nov", 

2469 "Dec", 

2470 ] 

2471 month_lookup = {i + 1: month for i, month in enumerate(months)} 

2472 ts = ( 

2473 time_stamp.strftime("%d ") 

2474 + month_lookup[time_stamp.month] 

2475 + time_stamp.strftime(" %Y %H:%M") 

2476 ) 

2477 self._file.write(self._null_terminate(ts)) 

2478 

2479 def _write_variable_types(self): 

2480 for typ in self.typlist: 

2481 self._file.write(struct.pack("B", typ)) 

2482 

2483 def _write_varnames(self): 

2484 # varlist names are checked by _check_column_names 

2485 # varlist, requires null terminated 

2486 for name in self.varlist: 

2487 name = self._null_terminate(name, True) 

2488 name = _pad_bytes(name[:32], 33) 

2489 self._write(name) 

2490 

2491 def _write_sortlist(self): 

2492 # srtlist, 2*(nvar+1), int array, encoded by byteorder 

2493 srtlist = _pad_bytes("", 2 * (self.nvar + 1)) 

2494 self._write(srtlist) 

2495 

2496 def _write_formats(self): 

2497 # fmtlist, 49*nvar, char array 

2498 for fmt in self.fmtlist: 

2499 self._write(_pad_bytes(fmt, 49)) 

2500 

2501 def _write_value_label_names(self): 

2502 # lbllist, 33*nvar, char array 

2503 for i in range(self.nvar): 

2504 # Use variable name when categorical 

2505 if self._is_col_cat[i]: 

2506 name = self.varlist[i] 

2507 name = self._null_terminate(name, True) 

2508 name = _pad_bytes(name[:32], 33) 

2509 self._write(name) 

2510 else: # Default is empty label 

2511 self._write(_pad_bytes("", 33)) 

2512 

2513 def _write_variable_labels(self): 

2514 # Missing labels are 80 blank characters plus null termination 

2515 blank = _pad_bytes("", 81) 

2516 

2517 if self._variable_labels is None: 

2518 for i in range(self.nvar): 

2519 self._write(blank) 

2520 return 

2521 

2522 for col in self.data: 

2523 if col in self._variable_labels: 

2524 label = self._variable_labels[col] 

2525 if len(label) > 80: 

2526 raise ValueError("Variable labels must be 80 characters or fewer") 

2527 is_latin1 = all(ord(c) < 256 for c in label) 

2528 if not is_latin1: 

2529 raise ValueError( 

2530 "Variable labels must contain only characters that " 

2531 "can be encoded in Latin-1" 

2532 ) 

2533 self._write(_pad_bytes(label, 81)) 

2534 else: 

2535 self._write(blank) 

2536 

2537 def _convert_strls(self, data): 

2538 """No-op, future compatibility""" 

2539 return data 

2540 

2541 def _prepare_data(self): 

2542 data = self.data 

2543 typlist = self.typlist 

2544 convert_dates = self._convert_dates 

2545 # 1. Convert dates 

2546 if self._convert_dates is not None: 

2547 for i, col in enumerate(data): 

2548 if i in convert_dates: 

2549 data[col] = _datetime_to_stata_elapsed_vec( 

2550 data[col], self.fmtlist[i] 

2551 ) 

2552 # 2. Convert strls 

2553 data = self._convert_strls(data) 

2554 

2555 # 3. Convert bad string data to '' and pad to correct length 

2556 dtypes = {} 

2557 native_byteorder = self._byteorder == _set_endianness(sys.byteorder) 

2558 for i, col in enumerate(data): 

2559 typ = typlist[i] 

2560 if typ <= self._max_string_length: 

2561 data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,)) 

2562 stype = f"S{typ}" 

2563 dtypes[col] = stype 

2564 data[col] = data[col].astype(stype) 

2565 else: 

2566 dtype = data[col].dtype 

2567 if not native_byteorder: 

2568 dtype = dtype.newbyteorder(self._byteorder) 

2569 dtypes[col] = dtype 

2570 

2571 self.data = data.to_records(index=False, column_dtypes=dtypes) 

2572 
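    # Hedged sketch (illustrative) of the string handling above: a str5
    # column (typ == 5) is padded and stored as a fixed-width "S5" field, so
    # tobytes() in _write_data emits rows at a fixed stride:
    #
    #     padded = Series([b"ab"]).fillna("").apply(_pad_bytes, args=(5,))
    #     rec = DataFrame({"s": padded}).to_records(
    #         index=False, column_dtypes={"s": "S5"}
    #     )
    #     rec.tobytes()  # b"ab\x00\x00\x00"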

2573 def _write_data(self): 

2574 data = self.data 

2575 self._file.write(data.tobytes()) 

2576 

2577 def _null_terminate(self, s, as_string=False): 

2578 null_byte = "\x00" 

2579 s += null_byte 

2580 

2581 if not as_string: 

2582 s = s.encode(self._encoding) 

2583 

2584 return s 

2585 

2586 

2587def _dtype_to_stata_type_117(dtype, column, force_strl): 

2588 """ 

2589 Convert a numpy dtype to a Stata type code. Returns the integer code for the given dtype. 

2590 See TYPE_MAP and comments for an explanation. This is also explained in 

2591 the dta spec. 

2592 1 - 2045 are strings of this length 

2593 Pandas Stata 

2594 32768 - for object strL 

2595 65526 - for float64 double 

2596 65527 - for float32 float 

2597 65528 - for int32 long 

2598 65529 - for int16 int 

2599 65530 - for int8 byte 

2600 

2601 If there are dates to convert, then dtype will already have the correct 

2602 type inserted. 

2603 """ 

2604 # TODO: expand to handle datetime to integer conversion 

2605 if force_strl: 

2606 return 32768 

2607 if dtype.type == np.object_: # try to coerce it to the biggest string 

2608 # not memory efficient, what else could we 

2609 # do? 

2610 itemsize = max_len_string_array(ensure_object(column.values)) 

2611 itemsize = max(itemsize, 1) 

2612 if itemsize <= 2045: 

2613 return itemsize 

2614 return 32768 

2615 elif dtype == np.float64: 

2616 return 65526 

2617 elif dtype == np.float32: 

2618 return 65527 

2619 elif dtype == np.int32: 

2620 return 65528 

2621 elif dtype == np.int16: 

2622 return 65529 

2623 elif dtype == np.int8: 

2624 return 65530 

2625 else: # pragma : no cover 

2626 raise NotImplementedError(f"Data type {dtype} not supported.") 

2627 

2628 

2629def _pad_bytes_new(name, length): 

2630 """ 

2631 Take a bytes or str value and pad it with null bytes until it is `length` bytes long. 

2632 """ 

2633 if isinstance(name, str): 

2634 name = bytes(name, "utf-8") 

2635 return name + b"\x00" * (length - len(name)) 

2636 

2637 

2638class StataStrLWriter: 

2639 """ 

2640 Converter for Stata StrLs 

2641 

2642 Stata StrLs map 8 byte values to strings which are stored using a 

2643 dictionary-like format where strings are keyed to two values. 

2644 

2645 Parameters 

2646 ---------- 

2647 df : DataFrame 

2648 DataFrame to convert 

2649 columns : list 

2650 List of column names to convert to StrL 

2651 version : int, optional 

2652 dta version. Currently supports 117, 118 and 119 

2653 byteorder : str, optional 

2654 Can be ">", "<", "little", or "big". The default is `sys.byteorder`. 

2655 

2656 Notes 

2657 ----- 

2658 Supports creation of the StrL block of a dta file for dta versions 

2659 117, 118 and 119. These differ in how the GSO is stored. 118 and 

2660 119 store the GSO lookup value as a uint32 and a uint64, while 117 

2661 uses two uint32s. 118 and 119 also encode all strings as unicode 

2662 which is required by the format. 117 uses 'latin-1', a fixed-width 

2663 encoding that extends the 7-bit ascii table with an additional 128 

2664 characters. 

2665 """ 

2666 

2667 def __init__(self, df, columns, version=117, byteorder=None): 

2668 if version not in (117, 118, 119): 

2669 raise ValueError("Only dta versions 117, 118 and 119 supported") 

2670 self._dta_ver = version 

2671 

2672 self.df = df 

2673 self.columns = columns 

2674 self._gso_table = {"": (0, 0)} 

2675 if byteorder is None: 

2676 byteorder = sys.byteorder 

2677 self._byteorder = _set_endianness(byteorder) 

2678 

2679 gso_v_type = "I" # uint32 

2680 gso_o_type = "Q" # uint64 

2681 self._encoding = "utf-8" 

2682 if version == 117: 

2683 o_size = 4 

2684 gso_o_type = "I" # 117 used uint32 

2685 self._encoding = "latin-1" 

2686 elif version == 118: 

2687 o_size = 6 

2688 else: # version == 119 

2689 o_size = 5 

2690 self._o_offset = 2 ** (8 * (8 - o_size)) 

2691 self._gso_o_type = gso_o_type 

2692 self._gso_v_type = gso_v_type 

2693 

2694 def _convert_key(self, key): 

2695 v, o = key 

2696 return v + self._o_offset * o 

2697 

2698 def generate_table(self): 

2699 """ 

2700 Generates the GSO lookup table for the DataFrame 

2701 

2702 Returns 

2703 ------- 

2704 gso_table : dict 

2705 Ordered dictionary using the string found as keys 

2706 and their lookup position (v,o) as values 

2707 gso_df : DataFrame 

2708 DataFrame where strl columns have been converted to 

2709 (v,o) values 

2710 

2711 Notes 

2712 ----- 

2713 Modifies the DataFrame in-place. 

2714 

2715 The DataFrame returned encodes the (v,o) values as uint64s. The 

2716 encoding depends on the dta version, and can be expressed as 

2717 

2718 enc = v + o * 2 ** ((8 - o_size) * 8) 

2719 

2720 so that v is stored in the lower bits and o is in the upper 

2721 bits. o_size is 

2722 

2723 * 117: 4 

2724 * 118: 6 

2725 * 119: 5 

2726 """ 

2727 

2728 gso_table = self._gso_table 

2729 gso_df = self.df 

2730 columns = list(gso_df.columns) 

2731 selected = gso_df[self.columns] 

2732 col_index = [(col, columns.index(col)) for col in self.columns] 

2733 keys = np.empty(selected.shape, dtype=np.uint64) 

2734 for o, (idx, row) in enumerate(selected.iterrows()): 

2735 for j, (col, v) in enumerate(col_index): 

2736 val = row[col] 

2737 # Allow columns with mixed str and None (GH 23633) 

2738 val = "" if val is None else val 

2739 key = gso_table.get(val, None) 

2740 if key is None: 

2741 # Stata uses 1-based (human-friendly) indexing 

2742 key = (v + 1, o + 1) 

2743 gso_table[val] = key 

2744 keys[o, j] = self._convert_key(key) 

2745 for i, col in enumerate(self.columns): 

2746 gso_df[col] = keys[:, i] 

2747 

2748 return gso_table, gso_df 

2749 
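    # Hedged sketch (illustrative): first occurrences get 1-based (v, o)
    # keys and repeats reuse them, so for a two-row frame:
    #
    #     df = DataFrame({"s": ["spam", "spam"]})
    #     table, conv = StataStrLWriter(df, ["s"], version=117).generate_table()
    #     # table == {"": (0, 0), "spam": (1, 1)} and both rows of conv["s"]
    #     # hold the same packed uint64 key (1 + 1 * 2 ** 32 for dta 117).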

2750 def generate_blob(self, gso_table): 

2751 """ 

2752 Generates the binary blob of GSOs that is written to the dta file. 

2753 

2754 Parameters 

2755 ---------- 

2756 gso_table : dict 

2757 Ordered dictionary (str, vo) 

2758 

2759 Returns 

2760 ------- 

2761 gso : bytes 

2762 Binary content of dta file to be placed between strl tags 

2763 

2764 Notes 

2765 ----- 

2766 Output format depends on dta version. 117 uses two uint32s to 

2767 express v and o while 118+ uses a uint32 for v and a uint64 for o. 

2768 """ 

2769 # Format information 

2770 # Length includes null term 

2771 # 117 

2772 # GSOvvvvooootllllxxxxxxxxxxxxxxx...x 

2773 # 3 u4 u4 u1 u4 string + null term 

2774 # 

2775 # 118, 119 

2776 # GSOvvvvooooooootllllxxxxxxxxxxxxxxx...x 

2777 # 3 u4 u8 u1 u4 string + null term 

2778 

2779 bio = BytesIO() 

2780 gso = bytes("GSO", "ascii") 

2781 gso_type = struct.pack(self._byteorder + "B", 130) 

2782 null = struct.pack(self._byteorder + "B", 0) 

2783 v_type = self._byteorder + self._gso_v_type 

2784 o_type = self._byteorder + self._gso_o_type 

2785 len_type = self._byteorder + "I" 

2786 for strl, vo in gso_table.items(): 

2787 if vo == (0, 0): 

2788 continue 

2789 v, o = vo 

2790 

2791 # GSO 

2792 bio.write(gso) 

2793 

2794 # vvvv 

2795 bio.write(struct.pack(v_type, v)) 

2796 

2797 # oooo / oooooooo 

2798 bio.write(struct.pack(o_type, o)) 

2799 

2800 # t 

2801 bio.write(gso_type) 

2802 

2803 # llll 

2804 utf8_string = bytes(strl, "utf-8") 

2805 bio.write(struct.pack(len_type, len(utf8_string) + 1)) 

2806 

2807 # xxx...xxx 

2808 bio.write(utf8_string) 

2809 bio.write(null) 

2810 

2811 bio.seek(0) 

2812 return bio.read() 

2813 

2814 
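# Hedged sketch of the (v, o) packing performed by _convert_key above: v sits
# in the low bytes of a uint64 and o in the high bytes, with the split set by
# the dta version (4/4 bytes in 117, 2/6 in 118, 3/5 in 119). Illustrative only.
def _example_pack_vo(v=1, o=1, o_size=4):
    """117-style packing (o_size=4) gives v + o * 2 ** 32."""
    return v + o * 2 ** ((8 - o_size) * 8)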

2815class StataWriter117(StataWriter): 

2816 """ 

2817 A class for writing Stata binary dta files in Stata 13 format (117) 

2818 

2819 .. versionadded:: 0.23.0 

2820 

2821 Parameters 

2822 ---------- 

2823 fname : path (string), buffer or path object 

2824 string, path object (pathlib.Path or py._path.local.LocalPath) or 

2825 object implementing a binary write() function. If using a buffer 

2826 then the buffer will not be automatically closed after the file 

2827 is written. 

2828 data : DataFrame 

2829 Input to save 

2830 convert_dates : dict 

2831 Dictionary mapping columns containing datetime types to stata internal 

2832 format to use when writing the dates. Options are 'tc', 'td', 'tm', 

2833 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. 

2834 Datetime columns that do not have a conversion type specified will be 

2835 converted to 'tc'. Raises NotImplementedError if a datetime column has 

2836 timezone information 

2837 write_index : bool 

2838 Write the index to Stata dataset. 

2839 byteorder : str 

2840 Can be ">", "<", "little", or "big". The default is `sys.byteorder`. 

2841 time_stamp : datetime 

2842 A datetime to use as file creation date. Default is the current time 

2843 data_label : str 

2844 A label for the data set. Must be 80 characters or smaller. 

2845 variable_labels : dict 

2846 Dictionary containing columns as keys and variable labels as values. 

2847 Each label must be 80 characters or smaller. 

2848 convert_strl : list 

2849 List of column names to convert to Stata StrL format. Columns with 

2850 more than 2045 characters are automatically written as StrL. 

2851 Smaller columns can be converted by including the column name. Using 

2852 StrLs can reduce output file size when strings are longer than 8 

2853 characters, and either frequently repeated or sparse. 

2854 

2855 Returns 

2856 ------- 

2857 writer : StataWriter117 instance 

2858 The StataWriter117 instance has a write_file method, which will 

2859 write the file to the given `fname`. 

2860 

2861 Raises 

2862 ------ 

2863 NotImplementedError 

2864 * If datetimes contain timezone information 

2865 ValueError 

2866 * Columns listed in convert_dates are neither datetime64[ns] 

2867 nor datetime.datetime 

2868 * Column dtype is not representable in Stata 

2869 * Column listed in convert_dates is not in DataFrame 

2870 * Categorical label contains more than 32,000 characters 

2871 

2872 Examples 

2873 -------- 

2874 >>> from pandas.io.stata import StataWriter117 

2875 >>> data = pd.DataFrame([[1.0, 1, 'a']], columns=['a', 'b', 'c']) 

2876 >>> writer = StataWriter117('./data_file.dta', data) 

2877 >>> writer.write_file() 

2878 

2879 Or with long strings stored in strl format 

2880 

2881 >>> data = pd.DataFrame([['A relatively long string'], [''], ['']], 

2882 ... columns=['strls']) 

2883 >>> writer = StataWriter117('./data_file_with_long_strings.dta', data, 

2884 ... convert_strl=['strls']) 

2885 >>> writer.write_file() 

2886 """ 

2887 

2888 _max_string_length = 2045 

2889 _dta_version = 117 

2890 

2891 def __init__( 

2892 self, 

2893 fname, 

2894 data, 

2895 convert_dates=None, 

2896 write_index=True, 

2897 byteorder=None, 

2898 time_stamp=None, 

2899 data_label=None, 

2900 variable_labels=None, 

2901 convert_strl=None, 

2902 ): 

2903 # Shallow copy since convert_strl might be modified later 

2904 self._convert_strl = [] if convert_strl is None else convert_strl[:] 

2905 

2906 super().__init__( 

2907 fname, 

2908 data, 

2909 convert_dates, 

2910 write_index, 

2911 byteorder=byteorder, 

2912 time_stamp=time_stamp, 

2913 data_label=data_label, 

2914 variable_labels=variable_labels, 

2915 ) 

2916 self._map = None 

2917 self._strl_blob = None 

2918 

2919 @staticmethod 

2920 def _tag(val, tag): 

2921 """Surround val with <tag></tag>""" 

2922 if isinstance(val, str): 

2923 val = bytes(val, "utf-8") 

2924 return bytes("<" + tag + ">", "utf-8") + val + bytes("</" + tag + ">", "utf-8") 

2925 
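    # Hedged sketch (illustrative): _tag brackets a payload in the XML-like
    # markers used throughout the 117+ container, e.g.
    #
    #     StataWriter117._tag(b"\x00\x00", "K")  # -> b"<K>\x00\x00</K>"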

2926 def _update_map(self, tag): 

2927 """Update map location for tag with file position""" 

2928 self._map[tag] = self._file.tell() 

2929 

2930 def _write_header(self, data_label=None, time_stamp=None): 

2931 """Write the file header""" 

2932 byteorder = self._byteorder 

2933 self._file.write(bytes("<stata_dta>", "utf-8")) 

2934 bio = BytesIO() 

2935 # ds_format - 117 

2936 bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release")) 

2937 # byteorder 

2938 bio.write(self._tag("MSF" if byteorder == ">" else "LSF", "byteorder")) 

2939 # number of vars, 2 bytes in 117 and 118, 4 bytes in 119 

2940 nvar_type = "H" if self._dta_version <= 118 else "I" 

2941 bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K")) 

2942 # 117 uses 4 bytes, 118 and 119 use 8 

2943 nobs_size = "I" if self._dta_version == 117 else "Q" 

2944 bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N")) 

2945 # data label 81 bytes, char, null terminated 

2946 label = data_label[:80] if data_label is not None else "" 

2947 label = label.encode(self._encoding) 

2948 label_size = "B" if self._dta_version == 117 else "H" 

2949 label_len = struct.pack(byteorder + label_size, len(label)) 

2950 label = label_len + label 

2951 bio.write(self._tag(label, "label")) 

2952 # time stamp, 18 bytes, char, null terminated 

2953 # format dd Mon yyyy hh:mm 

2954 if time_stamp is None: 

2955 time_stamp = datetime.datetime.now() 

2956 elif not isinstance(time_stamp, datetime.datetime): 

2957 raise ValueError("time_stamp should be datetime type") 

2958 # Avoid locale-specific month conversion 

2959 months = [ 

2960 "Jan", 

2961 "Feb", 

2962 "Mar", 

2963 "Apr", 

2964 "May", 

2965 "Jun", 

2966 "Jul", 

2967 "Aug", 

2968 "Sep", 

2969 "Oct", 

2970 "Nov", 

2971 "Dec", 

2972 ] 

2973 month_lookup = {i + 1: month for i, month in enumerate(months)} 

2974 ts = ( 

2975 time_stamp.strftime("%d ") 

2976 + month_lookup[time_stamp.month] 

2977 + time_stamp.strftime(" %Y %H:%M") 

2978 ) 

2979 # '\x11' added due to inspection of Stata file 

2980 ts = b"\x11" + bytes(ts, "utf-8") 

2981 bio.write(self._tag(ts, "timestamp")) 

2982 bio.seek(0) 

2983 self._file.write(self._tag(bio.read(), "header")) 

2984 

2985 def _write_map(self): 

2986 """Called twice during file write. The first populates the values in 

2987 the map with 0s. The second call writes the final map locations when 

2988 all blocks have been written.""" 

2989 if self._map is None: 

2990 self._map = dict( 

2991 ( 

2992 ("stata_data", 0), 

2993 ("map", self._file.tell()), 

2994 ("variable_types", 0), 

2995 ("varnames", 0), 

2996 ("sortlist", 0), 

2997 ("formats", 0), 

2998 ("value_label_names", 0), 

2999 ("variable_labels", 0), 

3000 ("characteristics", 0), 

3001 ("data", 0), 

3002 ("strls", 0), 

3003 ("value_labels", 0), 

3004 ("stata_data_close", 0), 

3005 ("end-of-file", 0), 

3006 ) 

3007 ) 

3008 # Move to start of map 

3009 self._file.seek(self._map["map"]) 

3010 bio = BytesIO() 

3011 for val in self._map.values(): 

3012 bio.write(struct.pack(self._byteorder + "Q", val)) 

3013 bio.seek(0) 

3014 self._file.write(self._tag(bio.read(), "map")) 

3015 

3016 def _write_variable_types(self): 

3017 self._update_map("variable_types") 

3018 bio = BytesIO() 

3019 for typ in self.typlist: 

3020 bio.write(struct.pack(self._byteorder + "H", typ)) 

3021 bio.seek(0) 

3022 self._file.write(self._tag(bio.read(), "variable_types")) 

3023 

3024 def _write_varnames(self): 

3025 self._update_map("varnames") 

3026 bio = BytesIO() 

3027 # 118 scales by 4 to accommodate utf-8 data worst case encoding 

3028 vn_len = 32 if self._dta_version == 117 else 128 

3029 for name in self.varlist: 

3030 name = self._null_terminate(name, True) 

3031 name = _pad_bytes_new(name[:32].encode(self._encoding), vn_len + 1) 

3032 bio.write(name) 

3033 bio.seek(0) 

3034 self._file.write(self._tag(bio.read(), "varnames")) 

3035 

3036 def _write_sortlist(self): 

3037 self._update_map("sortlist") 

3038 sort_size = 2 if self._dta_version < 119 else 4 

3039 self._file.write(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist")) 

3040 

3041 def _write_formats(self): 

3042 self._update_map("formats") 

3043 bio = BytesIO() 

3044 fmt_len = 49 if self._dta_version == 117 else 57 

3045 for fmt in self.fmtlist: 

3046 bio.write(_pad_bytes_new(fmt.encode(self._encoding), fmt_len)) 

3047 bio.seek(0) 

3048 self._file.write(self._tag(bio.read(), "formats")) 

3049 

3050 def _write_value_label_names(self): 

3051 self._update_map("value_label_names") 

3052 bio = BytesIO() 

3053 # 118 scales by 4 to accommodate utf-8 data worst case encoding 

3054 vl_len = 32 if self._dta_version == 117 else 128 

3055 for i in range(self.nvar): 

3056 # Use variable name when categorical 

3057 name = "" # default name 

3058 if self._is_col_cat[i]: 

3059 name = self.varlist[i] 

3060 name = self._null_terminate(name, True) 

3061 name = _pad_bytes_new(name[:32].encode(self._encoding), vl_len + 1) 

3062 bio.write(name) 

3063 bio.seek(0) 

3064 self._file.write(self._tag(bio.read(), "value_label_names")) 

3065 

3066 def _write_variable_labels(self): 

3067 # Missing labels are 80 blank characters plus null termination 

3068 self._update_map("variable_labels") 

3069 bio = BytesIO() 

3070 # 118 scales by 4 to accommodate utf-8 data worst case encoding 

3071 vl_len = 80 if self._dta_version == 117 else 320 

3072 blank = _pad_bytes_new("", vl_len + 1) 

3073 

3074 if self._variable_labels is None: 

3075 for _ in range(self.nvar): 

3076 bio.write(blank) 

3077 bio.seek(0) 

3078 self._file.write(self._tag(bio.read(), "variable_labels")) 

3079 return 

3080 

3081 for col in self.data: 

3082 if col in self._variable_labels: 

3083 label = self._variable_labels[col] 

3084 if len(label) > 80: 

3085 raise ValueError("Variable labels must be 80 characters or fewer") 

3086 try: 

3087 encoded = label.encode(self._encoding) 

3088 except UnicodeEncodeError: 

3089 raise ValueError( 

3090 "Variable labels must contain only characters that " 

3091 f"can be encoded in {self._encoding}" 

3092 ) 

3093 

3094 bio.write(_pad_bytes_new(encoded, vl_len + 1)) 

3095 else: 

3096 bio.write(blank) 

3097 bio.seek(0) 

3098 self._file.write(self._tag(bio.read(), "variable_labels")) 

3099 

3100 def _write_characteristics(self): 

3101 self._update_map("characteristics") 

3102 self._file.write(self._tag(b"", "characteristics")) 

3103 

3104 def _write_data(self): 

3105 self._update_map("data") 

3106 data = self.data 

3107 self._file.write(b"<data>") 

3108 self._file.write(data.tobytes()) 

3109 self._file.write(b"</data>") 

3110 

3111 def _write_strls(self): 

3112 self._update_map("strls") 

3113 strls = b"" 

3114 if self._strl_blob is not None: 

3115 strls = self._strl_blob 

3116 self._file.write(self._tag(strls, "strls")) 

3117 

3118 def _write_expansion_fields(self): 

3119 """No-op in dta 117+""" 

3120 pass 

3121 

3122 def _write_value_labels(self): 

3123 self._update_map("value_labels") 

3124 bio = BytesIO() 

3125 for vl in self._value_labels: 

3126 lab = vl.generate_value_label(self._byteorder) 

3127 lab = self._tag(lab, "lbl") 

3128 bio.write(lab) 

3129 bio.seek(0) 

3130 self._file.write(self._tag(bio.read(), "value_labels")) 

3131 

3132 def _write_file_close_tag(self): 

3133 self._update_map("stata_data_close") 

3134 self._file.write(bytes("</stata_dta>", "utf-8")) 

3135 self._update_map("end-of-file") 

3136 

3137 def _update_strl_names(self): 

3138 """Update column names for conversion to strl if they might have been 

3139 changed to comply with Stata naming rules""" 

3140 # Update convert_strl if names changed 

3141 for orig, new in self._converted_names.items(): 

3142 if orig in self._convert_strl: 

3143 idx = self._convert_strl.index(orig) 

3144 self._convert_strl[idx] = new 

3145 

3146 def _convert_strls(self, data): 

3147 """Convert columns to StrLs if either very large or in the 

3148 convert_strl variable""" 

3149 convert_cols = [ 

3150 col 

3151 for i, col in enumerate(data) 

3152 if self.typlist[i] == 32768 or col in self._convert_strl 

3153 ] 

3154 

3155 if convert_cols: 

3156 ssw = StataStrLWriter(data, convert_cols, version=self._dta_version) 

3157 tab, new_data = ssw.generate_table() 

3158 data = new_data 

3159 self._strl_blob = ssw.generate_blob(tab) 

3160 return data 

3161 

3162 def _set_formats_and_types(self, dtypes): 

3163 self.typlist = [] 

3164 self.fmtlist = [] 

3165 for col, dtype in dtypes.items(): 

3166 force_strl = col in self._convert_strl 

3167 fmt = _dtype_to_default_stata_fmt( 

3168 dtype, 

3169 self.data[col], 

3170 dta_version=self._dta_version, 

3171 force_strl=force_strl, 

3172 ) 

3173 self.fmtlist.append(fmt) 

3174 self.typlist.append( 

3175 _dtype_to_stata_type_117(dtype, self.data[col], force_strl) 

3176 ) 

3177 

3178 

3179class StataWriterUTF8(StataWriter117): 

3180 """ 

3181 Stata binary dta file writing in Stata 15 (118) and 16 (119) formats 

3182 

3183 DTA 118 and 119 format files support unicode string data (both fixed 

3184 and strL) format. Unicode is also supported in value labels, variable 

3185 labels and the dataset label. Format 119 is automatically used if the 

3186 file contains more than 32,767 variables. 

3187 

3188 .. versionadded:: 1.0.0 

3189 

3190 Parameters 

3191 ---------- 

3192 fname : path (string), buffer or path object 

3193 string, path object (pathlib.Path or py._path.local.LocalPath) or 

3194 object implementing a binary write() function. If using a buffer 

3195 then the buffer will not be automatically closed after the file 

3196 is written. 

3197 data : DataFrame 

3198 Input to save 

3199 convert_dates : dict, default None 

3200 Dictionary mapping columns containing datetime types to stata internal 

3201 format to use when writing the dates. Options are 'tc', 'td', 'tm', 

3202 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. 

3203 Datetime columns that do not have a conversion type specified will be 

3204 converted to 'tc'. Raises NotImplementedError if a datetime column has 

3205 timezone information 

3206 write_index : bool, default True 

3207 Write the index to Stata dataset. 

3208 byteorder : str, default None 

3209 Can be ">", "<", "little", or "big". The default is `sys.byteorder`. 

3210 time_stamp : datetime, default None 

3211 A datetime to use as file creation date. Default is the current time 

3212 data_label : str, default None 

3213 A label for the data set. Must be 80 characters or smaller. 

3214 variable_labels : dict, default None 

3215 Dictionary containing columns as keys and variable labels as values. 

3216 Each label must be 80 characters or smaller. 

3217 convert_strl : list, default None 

3218 List of column names to convert to Stata StrL format. Columns with 

3219 more than 2045 characters are automatically written as StrL. 

3220 Smaller columns can be converted by including the column name. Using 

3221 StrLs can reduce output file size when strings are longer than 8 

3222 characters, and either frequently repeated or sparse. 

3223 version : int, default None 

3224 The dta version to use. By default, uses the size of data to determine 

3225 the version. 118 is used if data.shape[1] <= 32767, and 119 is used 

3226 for storing larger DataFrames. 

3227 

3228 Returns 

3229 ------- 

3230 StataWriterUTF8 

3231 The instance has a write_file method, which will write the file to the 

3232 given `fname`. 

3233 

3234 Raises 

3235 ------ 

3236 NotImplementedError 

3237 * If datetimes contain timezone information 

3238 ValueError 

3239 * Columns listed in convert_dates are neither datetime64[ns] 

3240 nor datetime.datetime 

3241 * Column dtype is not representable in Stata 

3242 * Column listed in convert_dates is not in DataFrame 

3243 * Categorical label contains more than 32,000 characters 

3244 

3245 Examples 

3246 -------- 

3247 Using Unicode data and column names 

3248 

3249 >>> from pandas.io.stata import StataWriterUTF8 

3250 >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ']) 

3251 >>> writer = StataWriterUTF8('./data_file.dta', data) 

3252 >>> writer.write_file() 

3253 

3254 Or with long strings stored in strl format 

3255 

3256 >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']], 

3257 ... columns=['strls']) 

3258 >>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data, 

3259 ... convert_strl=['strls']) 

3260 >>> writer.write_file() 

3261 """ 

3262 

3263 _encoding = "utf-8" 

3264 

3265 def __init__( 

3266 self, 

3267 fname: FilePathOrBuffer, 

3268 data: DataFrame, 

3269 convert_dates: Optional[Dict[Hashable, str]] = None, 

3270 write_index: bool = True, 

3271 byteorder: Optional[str] = None, 

3272 time_stamp: Optional[datetime.datetime] = None, 

3273 data_label: Optional[str] = None, 

3274 variable_labels: Optional[Dict[Hashable, str]] = None, 

3275 convert_strl: Optional[Sequence[Hashable]] = None, 

3276 version: Optional[int] = None, 

3277 ): 

3278 if version is None: 

3279 version = 118 if data.shape[1] <= 32767 else 119 

3280 elif version not in (118, 119): 

3281 raise ValueError("version must be either 118 or 119.") 

3282 elif version == 118 and data.shape[1] > 32767: 

3283 raise ValueError( 

3284 "You must use version 119 for data sets containing more than " 

3285 "32,767 variables" 

3286 ) 

3287 

3288 super().__init__( 

3289 fname, 

3290 data, 

3291 convert_dates=convert_dates, 

3292 write_index=write_index, 

3293 byteorder=byteorder, 

3294 time_stamp=time_stamp, 

3295 data_label=data_label, 

3296 variable_labels=variable_labels, 

3297 convert_strl=convert_strl, 

3298 ) 

3299 # Override version set in StataWriter117 init 

3300 self._dta_version = version 

3301 

3302 def _validate_variable_name(self, name: str) -> str: 

3303 """ 

3304 Validate variable names for Stata export. 

3305 

3306 Parameters 

3307 ---------- 

3308 name : str 

3309 Variable name 

3310 

3311 Returns 

3312 ------- 

3313 str 

3314 The validated name with invalid characters replaced with 

3315 underscores. 

3316 

3317 Notes 

3318 ----- 

3319 Stata 118+ supports most unicode characters. The only limitation is in 

3320 the ASCII range, where the supported characters are a-z, A-Z, 0-9 and _. 

3321 """ 

3322 # High code points appear to be acceptable 

3323 for c in name: 

3324 if ( 

3325 ord(c) < 128 

3326 and (c < "A" or c > "Z") 

3327 and (c < "a" or c > "z") 

3328 and (c < "0" or c > "9") 

3329 and c != "_" 

3330 ) or 128 <= ord(c) < 256: 

3331 name = name.replace(c, "_") 

3332 

3333 return name
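

# Hedged sketch (illustrative) of the 118+ rule above: ASCII characters
# outside [A-Za-z0-9_] and all Latin-1 code points (128-255) are replaced by
# underscores, while higher code points pass through unchanged.
def _example_utf8_name_validation():
    """Returns ("a_b", "β2") for the inputs below. Illustrative only."""
    w = StataWriterUTF8.__new__(StataWriterUTF8)  # skip __init__ for the demo
    return w._validate_variable_name("a b"), w._validate_variable_name("β2")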