Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2Input/Output tools for working with binary data. 

3 

4The Stata input tools were originally written by Joe Presbrey as part of PyDTA. 

5 

6You can find more information here http://presbrey.mit.edu/PyDTA 

7 

8See Also 

9-------- 

10numpy.lib.io 

11""" 

12import warnings 

13 

14from statsmodels.compat.python import (lzip, lmap, lrange, 

15 lfilter, asbytes, asstr) 

16from struct import unpack, calcsize, pack 

17from struct import error as struct_error 

18import datetime 

19import sys 

20 

21import numpy as np 

22import statsmodels.tools.data as data_util 

23from pandas import isnull 

24from pandas.io.stata import StataMissingValue 

25from statsmodels.iolib.openfile import get_file_obj 

26 

# Stata date display formats handled by the SIF conversion helpers below.
_date_formats = ["%tc", "%tC", "%td", "%tw", "%tm", "%tq", "%th", "%ty"]

28 

29def _datetime_to_stata_elapsed(date, fmt): 

30 """ 

31 Convert from datetime to SIF. http://www.stata.com/help.cgi?datetime 

32 

33 Parameters 

34 ---------- 

35 date : datetime.datetime 

36 The date to convert to the Stata Internal Format given by fmt 

37 fmt : str 

38 The format to convert to. Can be, tc, td, tw, tm, tq, th, ty 

39 """ 

40 if not isinstance(date, datetime.datetime): 

41 raise ValueError("date should be datetime.datetime format") 

42 stata_epoch = datetime.datetime(1960, 1, 1) 

43 if fmt in ["%tc", "tc"]: 

44 delta = date - stata_epoch 

45 return (delta.days * 86400000 + delta.seconds*1000 + 

46 delta.microseconds/1000) 

47 elif fmt in ["%tC", "tC"]: 

48 from warnings import warn 

49 warn("Stata Internal Format tC not supported.", UserWarning) 

50 return date 

51 elif fmt in ["%td", "td"]: 

52 return (date- stata_epoch).days 

53 elif fmt in ["%tw", "tw"]: 

54 return (52*(date.year-stata_epoch.year) + 

55 (date - datetime.datetime(date.year, 1, 1)).days / 7) 

56 elif fmt in ["%tm", "tm"]: 

57 return (12 * (date.year - stata_epoch.year) + date.month - 1) 

58 elif fmt in ["%tq", "tq"]: 

59 return 4*(date.year-stata_epoch.year) + int((date.month - 1)/3) 

60 elif fmt in ["%th", "th"]: 

61 return 2 * (date.year - stata_epoch.year) + int(date.month > 6) 

62 elif fmt in ["%ty", "ty"]: 

63 return date.year 

64 else: 

65 raise ValueError("fmt %s not understood" % fmt) 

66 

67def _stata_elapsed_date_to_datetime(date, fmt): 

68 """ 

69 Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime 

70 

71 Parameters 

72 ---------- 

73 date : int 

74 The Stata Internal Format date to convert to datetime according to fmt 

75 fmt : str 

76 The format to convert to. Can be, tc, td, tw, tm, tq, th, ty 

77 

78 Examples 

79 -------- 

80 >>> _stata_elapsed_date_to_datetime(52, "%tw") datetime.datetime(1961, 1, 1, 0, 0) 

81 

82 Notes 

83 ----- 

84 datetime/c - tc 

85 milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day 

86 datetime/C - tC - NOT IMPLEMENTED 

87 milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds 

88 date - td 

89 days since 01jan1960 (01jan1960 = 0) 

90 weekly date - tw 

91 weeks since 1960w1 

92 This assumes 52 weeks in a year, then adds 7 * remainder of the weeks. 

93 The datetime value is the start of the week in terms of days in the 

94 year, not ISO calendar weeks. 

95 monthly date - tm 

96 months since 1960m1 

97 quarterly date - tq 

98 quarters since 1960q1 

99 half-yearly date - th 

100 half-years since 1960h1 yearly 

101 date - ty 

102 years since 0000 

103 

104 If you do not have pandas with datetime support, then you cannot do 

105 milliseconds accurately. 

106 """ 

107 #NOTE: we could run into overflow / loss of precision situations here 

108 # casting to int, but I'm not sure what to do. datetime will not deal with 

109 # numpy types and numpy datetime is not mature enough / we cannot rely on 

110 # pandas version > 0.7.1 

111 #TODO: IIRC relative delta does not play well with np.datetime? 

112 date = int(date) 

113 stata_epoch = datetime.datetime(1960, 1, 1) 

114 if fmt in ["%tc", "tc"]: 

115 from dateutil.relativedelta import relativedelta 

116 return stata_epoch + relativedelta(microseconds=date*1000) 

117 elif fmt in ["%tC", "tC"]: 

118 from warnings import warn 

119 warn("Encountered %tC format. Leaving in Stata Internal Format.", 

120 UserWarning) 

121 return date 

122 elif fmt in ["%td", "td"]: 

123 return stata_epoch + datetime.timedelta(int(date)) 

124 elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week 

125 year = datetime.datetime(stata_epoch.year + date // 52, 1, 1) 

126 day_delta = (date % 52 ) * 7 

127 return year + datetime.timedelta(int(day_delta)) 

128 elif fmt in ["%tm", "tm"]: 

129 year = stata_epoch.year + date // 12 

130 month_delta = (date % 12 ) + 1 

131 return datetime.datetime(year, month_delta, 1) 

132 elif fmt in ["%tq", "tq"]: 

133 year = stata_epoch.year + date // 4 

134 month_delta = (date % 4) * 3 + 1 

135 return datetime.datetime(year, month_delta, 1) 

136 elif fmt in ["%th", "th"]: 

137 year = stata_epoch.year + date // 2 

138 month_delta = (date % 2) * 6 + 1 

139 return datetime.datetime(year, month_delta, 1) 

140 elif fmt in ["%ty", "ty"]: 

141 if date > 0: 

142 return datetime.datetime(date, 1, 1) 

143 else: # do not do negative years bc cannot mix dtypes in column 

144 raise ValueError("Year 0 and before not implemented") 

145 else: 

146 raise ValueError("Date fmt %s not understood" % fmt) 

147 

148 

149### Helper classes for StataReader ### 

150 

151class _StataVariable(object): 

152 """ 

153 A dataset variable. Not intended for public use. 

154 

155 Parameters 

156 ---------- 

157 variable_data 

158 

159 Attributes 

160 ---------- 

161 format : str 

162 Stata variable format. See notes for more information. 

163 index : int 

164 Zero-index column index of variable. 

165 label : str 

166 Data Label 

167 name : str 

168 Variable name 

169 type : str 

170 Stata data type. See notes for more information. 

171 value_format : str 

172 Value format. 

173 

174 Notes 

175 ----- 

176 More information: http://www.stata.com/help.cgi?format 

177 """ 

178 def __init__(self, variable_data): 

179 self._data = variable_data 

180 

181 def __int__(self): 

182 """the variable's index within an observation""" 

183 return self.index 

184 

185 def __str__(self): 

186 """the name of the variable""" 

187 return self.name 

188 

189 @property 

190 def index(self): 

191 """the variable's index within an observation""" 

192 return self._data[0] 

193 

194 @property 

195 def type(self): 

196 """ 

197 The data type of variable 

198 

199 Possible types are: 

200 {1..244:string, b:byte, h:int, l:long, f:float, d:double) 

201 """ 

202 return self._data[1] 

203 

204 @property 

205 def name(self): 

206 """the name of the variable""" 

207 return self._data[2] 

208 

209 @property 

210 def format(self): 

211 """the variable's Stata format""" 

212 return self._data[4] 

213 

214 @property 

215 def value_format(self): 

216 """the variable's value format""" 

217 return self._data[5] 

218 

219 @property 

220 def label(self): 

221 """The variable's label""" 

222 return self._data[6] 

223 

224 

class StataReader(object):
    """
    Stata .dta file reader.

    Provides methods to return the metadata of a Stata .dta file and
    a generator for the data itself.

    Parameters
    ----------
    file : file-like
        A file-like object representing a Stata .dta file.
    missing_values : bool
        If missing_values is True, parse missing_values and return a
        Missing Values object instead of None.
    encoding : str, optional
        Used for Python 3 only. Encoding to use when reading the .dta file.
        Defaults to `locale.getpreferredencoding`

    See Also
    --------
    statsmodels.iolib.foreign.genfromdta
    pandas.read_stata
    pandas.io.stata.StataReader

    Notes
    -----
    This is known only to work on file formats 113 (Stata 8/9), 114
    (Stata 10/11), and 115 (Stata 12). Needs to be tested on older versions.
    Known not to work on format 104, 108. If you have the documentation for
    older formats, please contact the developers.

    For more information about the .dta format see
    http://www.stata.com/help.cgi?dta
    http://www.stata.com/help.cgi?dta_113
    """

    # NOTE(review): _header is a mutable class-level default and
    # _parse_header fills it via item assignment without rebinding, so the
    # dict is shared by every StataReader instance -- confirm whether
    # multiple concurrent readers are ever used.
    _header = {}                # parsed header/descriptor fields
    _data_location = 0          # file offset where observation records begin
    _col_sizes = ()             # cached per-column record sizes in bytes
    _has_string_data = False    # True when any column holds fixed-width strings
    _missing_values = False     # return StataMissingValue instead of None
    #type          code
    #--------------------
    #str1        1 = 0x01
    #str2        2 = 0x02
    #...
    #str244    244 = 0xf4
    #byte      251 = 0xfb  (sic)
    #int       252 = 0xfc
    #long      253 = 0xfd
    #float     254 = 0xfe
    #double    255 = 0xff
    #--------------------
    #NOTE: the byte type seems to be reserved for categorical variables
    # with a label, but the underlying variable is -127 to 100
    # we're going to drop the label and cast to int
    DTYPE_MAP = dict(lzip(lrange(1, 245), ['a' + str(i) for i in range(1, 245)]) +
                     [(251, np.int16), (252, np.int32), (253, int),
                      (254, np.float32), (255, np.float64)])
    TYPE_MAP = lrange(251) + list('bhlfd')
    #NOTE: technically, some of these are wrong. there are more numbers
    # that can be represented. it's the 27 ABOVE and BELOW the max listed
    # numeric data type in [U] 12.2.2 of the 11.2 manual
    MISSING_VALUES = {'b': (-127, 100), 'h': (-32767, 32740), 'l':
                      (-2147483647, 2147483620), 'f': (-1.701e+38, +1.701e+38), 'd':
                      (-1.798e+308, +8.988e+307)}

    def __init__(self, fname, missing_values=False, encoding=None):
        # Deprecated shim: direct users to the maintained pandas reader.
        warnings.warn(
            "StataReader is deprecated as of 0.10.0 and will be removed in a "
            "future version. Use pandas.read_stata or "
            "pandas.io.stata.StataReader instead.",
            FutureWarning)

        if encoding is None:
            import locale
            self._encoding = locale.getpreferredencoding()
        else:
            self._encoding = encoding
        self._missing_values = missing_values
        self._parse_header(fname)

    def file_headers(self):
        """
        Returns all .dta file headers.

        out: dict
            Has keys typlist, data_label, lbllist, varlist, nvar, filetype,
            ds_format, nobs, fmtlist, vlblist, time_stamp, srtlist, byteorder
        """
        return self._header

    def file_format(self):
        """
        Returns the file format.

        Returns
        -------
        out : int

        Notes
        -----
        Format 113: Stata 8/9
        Format 114: Stata 10/11
        Format 115: Stata 12
        """
        return self._header['ds_format']

    def file_label(self):
        """
        Returns the dataset's label.

        Returns
        -------
        out: str
        """
        return self._header['data_label']

    def file_timestamp(self):
        """
        Returns the date and time Stata recorded on last file save.

        Returns
        -------
        out : str
        """
        return self._header['time_stamp']

    def variables(self):
        """
        Returns a list of the dataset's StataVariables objects.
        """
        # tuple order must match the positional accessors in _StataVariable
        return lmap(_StataVariable, zip(lrange(self._header['nvar']),
                    self._header['typlist'], self._header['varlist'],
                    self._header['srtlist'],
                    self._header['fmtlist'], self._header['lbllist'],
                    self._header['vlblist']))

    def dataset(self, as_dict=False):
        """
        Returns a Python generator object for iterating over the dataset.


        Parameters
        ----------
        as_dict : bool, optional
            If as_dict is True, yield each row of observations as a dict.
            If False, yields each row of observations as a list.

        Returns
        -------
        Generator object for iterating over the dataset.  Yields each row of
        observations as a list by default.

        Notes
        -----
        If missing_values is True during instantiation of StataReader then
        observations with StataMissingValue(s) are not filtered and should
        be handled by your application.
        """

        # rewind to the first observation; best-effort because the
        # underlying object may not be seekable
        try:
            self._file.seek(self._data_location)
        except Exception:
            pass

        if as_dict:
            vars = lmap(str, self.variables())
            for i in range(len(self)):
                yield dict(zip(vars, self._next()))
        else:
            for i in range(self._header['nobs']):
                yield self._next()

    ### Python special methods

    def __len__(self):
        """
        Return the number of observations in the dataset.

        This value is taken directly from the header and includes observations
        with missing values.
        """
        return self._header['nobs']

    def __getitem__(self, k):
        """
        Seek to an observation indexed k in the file and return it, ordered
        by Stata's output to the .dta file.

        k is zero-indexed.  Prefer using R.data() for performance.
        """
        if not (isinstance(k, int)) or k < 0 or k > len(self) - 1:
            raise IndexError(k)
        # records are fixed-width, so the offset is a simple multiple
        loc = self._data_location + sum(self._col_size()) * k
        if self._file.tell() != loc:
            self._file.seek(loc)
        return self._next()

    ### Private methods

    def _null_terminate(self, s, encoding):
        """Strip everything from the first null byte onward and decode."""
        null_byte = asbytes('\x00')
        try:
            s = s.lstrip(null_byte)[:s.index(null_byte)]
        except Exception:
            # no null byte present; decode the whole field
            pass
        return s.decode(encoding)

    def _parse_header(self, file_object):
        """Read the .dta header and descriptors, and locate the data block."""
        self._file = file_object
        encoding = self._encoding

        # parse headers
        self._header['ds_format'] = unpack('b', self._file.read(1))[0]

        if self._header['ds_format'] not in [113, 114, 115]:
            raise ValueError("Only file formats >= 113 (Stata >= 9)"
                             " are supported. Got format %s. Please report "
                             "if you think this error is incorrect." %
                             self._header['ds_format'])
        # 0x1 means big-endian; anything else little-endian
        byteorder = self._header['byteorder'] = unpack('b',
                self._file.read(1))[0] == 0x1 and '>' or '<'
        self._header['filetype'] = unpack('b', self._file.read(1))[0]
        self._file.read(1)  # skip unused padding byte
        nvar = self._header['nvar'] = unpack(byteorder + 'h',
                self._file.read(2))[0]
        self._header['nobs'] = unpack(byteorder + 'i', self._file.read(4))[0]
        self._header['data_label'] = self._null_terminate(self._file.read(81),
                                                          encoding)
        self._header['time_stamp'] = self._null_terminate(self._file.read(18),
                                                          encoding)

        # parse descriptors
        typlist = [ord(self._file.read(1)) for i in range(nvar)]
        self._header['typlist'] = [self.TYPE_MAP[typ] for typ in typlist]
        self._header['dtyplist'] = [self.DTYPE_MAP[typ] for typ in typlist]
        self._header['varlist'] = [self._null_terminate(self._file.read(33),
                                   encoding) for i in range(nvar)]
        self._header['srtlist'] = unpack(byteorder + ('h' * (nvar + 1)),
                                         self._file.read(2 * (nvar + 1)))[:-1]
        # format width changed between dta versions 113 and 114
        if self._header['ds_format'] <= 113:
            self._header['fmtlist'] = \
                    [self._null_terminate(self._file.read(12), encoding)
                     for i in range(nvar)]
        else:
            self._header['fmtlist'] = \
                    [self._null_terminate(self._file.read(49), encoding)
                     for i in range(nvar)]
        self._header['lbllist'] = [self._null_terminate(self._file.read(33),
                                   encoding) for i in range(nvar)]
        self._header['vlblist'] = [self._null_terminate(self._file.read(81),
                                   encoding) for i in range(nvar)]

        # ignore expansion fields
        # When reading, read five bytes; the last four bytes now tell you the
        # size of the next read, which you discard.  You then continue like
        # this until you read 5 bytes of zeros.

        while True:
            data_type = unpack(byteorder + 'b', self._file.read(1))[0]
            data_len = unpack(byteorder + 'i', self._file.read(4))[0]
            if data_type == 0:
                break
            self._file.read(data_len)

        # other state vars
        self._data_location = self._file.tell()
        # string columns are recorded as ints (their width) in typlist
        self._has_string_data = len(lfilter(lambda x: isinstance(x, int),
                                            self._header['typlist'])) > 0
        self._col_size()  # prime the column-size cache

    def _calcsize(self, fmt):
        """Byte width of a column: the int itself for strings, else struct size."""
        return isinstance(fmt, int) and fmt or \
            calcsize(self._header['byteorder'] + fmt)

    def _col_size(self, k=None):
        """Calculate size of a data record."""
        if len(self._col_sizes) == 0:
            self._col_sizes = lmap(lambda x: self._calcsize(x),
                                   self._header['typlist'])
        if k is None:
            return self._col_sizes
        else:
            return self._col_sizes[k]

    def _unpack(self, fmt, byt):
        """Unpack one numeric field, translating out-of-range sentinels."""
        d = unpack(self._header['byteorder'] + fmt, byt)[0]
        if fmt[-1] in self.MISSING_VALUES:
            nmin, nmax = self.MISSING_VALUES[fmt[-1]]
            if d < nmin or d > nmax:
                # value is a Stata missing-value code
                if self._missing_values:
                    return StataMissingValue(nmax, d)
                else:
                    return None
        return d

    def _next(self):
        """Read and decode the next observation record from the file."""
        typlist = self._header['typlist']
        if self._has_string_data:
            data = [None] * self._header['nvar']
            for i in range(len(data)):
                if isinstance(typlist[i], int):
                    # int entry == fixed string width in bytes
                    data[i] = self._null_terminate(self._file.read(typlist[i]),
                                                   self._encoding)
                else:
                    data[i] = self._unpack(typlist[i],
                                           self._file.read(self._col_size(i)))
            return data
        else:
            return lmap(lambda i: self._unpack(typlist[i],
                        self._file.read(self._col_size(i))),
                        lrange(self._header['nvar']))

538 

539def _set_endianness(endianness): 

540 if endianness.lower() in ["<", "little"]: 

541 return "<" 

542 elif endianness.lower() in [">", "big"]: 

543 return ">" 

544 else: # pragma : no cover 

545 raise ValueError("Endianness %s not understood" % endianness) 

546 

547def _dtype_to_stata_type(dtype): 

548 """ 

549 Converts dtype types to stata types. Returns the byte of the given ordinal. 

550 See TYPE_MAP and comments for an explanation. This is also explained in 

551 the dta spec. 

552 1 - 244 are strings of this length 

553 251 - chr(251) - for int8 and int16, byte 

554 252 - chr(252) - for int32, int 

555 253 - chr(253) - for int64, long 

556 254 - chr(254) - for float32, float 

557 255 - chr(255) - double, double 

558 

559 If there are dates to convert, then dtype will already have the correct 

560 type inserted. 

561 """ 

562 #TODO: expand to handle datetime to integer conversion 

563 if dtype.type == np.string_: 

564 return chr(dtype.itemsize) 

565 elif dtype.type == np.object_: 

566 # try to coerce it to the biggest string 

567 # not memory efficient, what else could we do? 

568 return chr(244) 

569 elif dtype == np.float64: 

570 return chr(255) 

571 elif dtype == np.float32: 

572 return chr(254) 

573 elif dtype == np.int64: 

574 return chr(253) 

575 elif dtype == np.int32: 

576 return chr(252) 

577 elif dtype == np.int8 or dtype == np.int16: # ok to assume bytes? 

578 return chr(251) 

579 else: # pragma : no cover 

580 raise ValueError("Data type %s not currently understood. " 

581 "Please report an error to the developers." % dtype) 

582 

583def _dtype_to_default_stata_fmt(dtype): 

584 """ 

585 Maps numpy dtype to stata's default format for this type. Not terribly 

586 important since users can change this in Stata. Semantics are 

587 

588 string -> "%DDs" where DD is the length of the string 

589 float64 -> "%10.0g" 

590 float32 -> "%9.0g" 

591 int64 -> "%9.0g" 

592 int32 -> "%9.0g" 

593 int16 -> "%9.0g" 

594 int8 -> "%8.0g" 

595 """ 

596 #TODO: expand this to handle a default datetime format? 

597 if dtype.type == np.string_: 

598 return "%" + str(dtype.itemsize) + "s" 

599 elif dtype.type == np.object_: 

600 return "%244s" 

601 elif dtype == np.float64: 

602 return "%10.0g" 

603 elif dtype == np.float32: 

604 return "%9.0g" 

605 elif dtype == np.int64: 

606 return "%9.0g" 

607 elif dtype == np.int32: 

608 return "%8.0g" 

609 elif dtype == np.int8 or dtype == np.int16: # ok to assume bytes? 

610 return "%8.0g" 

611 else: # pragma : no cover 

612 raise ValueError("Data type %s not currently understood. " 

613 "Please report an error to the developers." % dtype) 

614 

615def _pad_bytes(name, length): 

616 """ 

617 Takes a char string and pads it wih null bytes until it's length chars 

618 """ 

619 return name + "\x00" * (length - len(name)) 

620 

621def _default_names(nvar): 

622 """ 

623 Returns default Stata names v1, v2, ... vnvar 

624 """ 

625 return ["v%d" % i for i in range(1,nvar+1)] 

626 

627def _convert_datetime_to_stata_type(fmt): 

628 """ 

629 Converts from one of the stata date formats to a type in TYPE_MAP 

630 """ 

631 if fmt in ["tc", "%tc", "td", "%td", "tw", "%tw", "tm", "%tm", "tq", 

632 "%tq", "th", "%th", "ty", "%ty"]: 

633 return np.float64 # Stata expects doubles for SIFs 

634 else: 

635 raise ValueError("fmt %s not understood" % fmt) 

636 

637def _maybe_convert_to_int_keys(convert_dates, varlist): 

638 new_dict = {} 

639 for key in convert_dates: 

640 if not convert_dates[key].startswith("%"): # make sure proper fmts 

641 convert_dates[key] = "%" + convert_dates[key] 

642 if key in varlist: 

643 new_dict.update({varlist.index(key) : convert_dates[key]}) 

644 else: 

645 if not isinstance(key, int): 

646 raise ValueError("convery_dates key is not in varlist " 

647 "and is not an int") 

648 new_dict.update({key : convert_dates[key]}) 

649 return new_dict 

650 

651_type_converters = {253 : np.long, 252 : int} 

652 

class StataWriter(object):
    """
    A class for writing Stata binary dta files from array-like objects

    Parameters
    ----------
    fname : file path or buffer
        Where to save the dta file.
    data : array_like
        Array-like input to save. Pandas objects are also accepted.
    convert_dates : dict
        Dictionary mapping column of datetime types to the stata internal
        format that you want to use for the dates. Options are
        'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
        number or a name.
    encoding : str
        Default is latin-1. Note that Stata does not support unicode.
    byteorder : str
        Can be ">", "<", "little", or "big". The default is None which uses
        `sys.byteorder`

    Returns
    -------
    writer : StataWriter instance
        The StataWriter instance has a write_file method, which will
        write the file to the given `fname`.

    Examples
    --------
    >>> writer = StataWriter('./data_file.dta', data)
    >>> writer.write_file()

    Or with dates

    >>> writer = StataWriter('./date_data_file.dta', date, {2 : 'tw'})
    >>> writer.write_file()
    """
    #type          code
    #--------------------
    #str1        1 = 0x01
    #str2        2 = 0x02
    #...
    #str244    244 = 0xf4
    #byte      251 = 0xfb  (sic)
    #int       252 = 0xfc
    #long      253 = 0xfd
    #float     254 = 0xfe
    #double    255 = 0xff
    #--------------------
    #NOTE: the byte type seems to be reserved for categorical variables
    # with a label, but the underlying variable is -127 to 100
    # we're going to drop the label and cast to int
    DTYPE_MAP = dict(lzip(lrange(1, 245), ['a' + str(i) for i in range(1, 245)]) +
                     [(251, np.int16), (252, np.int32), (253, int),
                      (254, np.float32), (255, np.float64)])
    TYPE_MAP = lrange(251) + list('bhlfd')
    # sentinel values written in place of missing numeric data, keyed by the
    # Stata type character from TYPE_MAP
    MISSING_VALUES = {'b': 101,
                      'h': 32741,
                      'l': 2147483621,
                      'f': 1.7014118346046923e+38,
                      'd': 8.98846567431158e+307}

    def __init__(self, fname, data, convert_dates=None, encoding="latin-1",
                 byteorder=None):
        # BUG FIX: the deprecation message used to point users at the
        # nonexistent "pandas.io.stata.StatWriter".
        warnings.warn(
            "StataWriter is deprecated as of 0.10.0 and will be removed in a "
            "future version. Use pandas.DataFrame.to_stata or "
            "pandas.io.stata.StataWriter instead.",
            FutureWarning)

        self._convert_dates = convert_dates
        # attach nobs, nvars, data, varlist, typlist
        if data_util._is_using_pandas(data, None):
            self._prepare_pandas(data)

        elif data_util._is_array_like(data, None):
            data = np.asarray(data)
            if data_util._is_structured_ndarray(data):
                self._prepare_structured_array(data)
            else:
                if convert_dates is not None:
                    raise ValueError("Not able to convert dates in a plain"
                                     " ndarray.")
                self._prepare_ndarray(data)

        else:  # pragma : no cover
            raise ValueError("Type %s for data not understood" % type(data))

        if byteorder is None:
            byteorder = sys.byteorder
        self._byteorder = _set_endianness(byteorder)
        self._encoding = encoding
        self._file = get_file_obj(fname, 'wb', encoding)

    def _write(self, to_write):
        """
        Helper to call asbytes before writing to file for Python 3 compat.
        """
        self._file.write(asbytes(to_write))

    def _prepare_structured_array(self, data):
        """Set nobs/nvar/varlist/typlist/fmtlist from a structured ndarray."""
        self.nobs = len(data)
        self.nvar = len(data.dtype)
        self.data = data
        self.datarows = iter(data)
        dtype = data.dtype
        descr = dtype.descr
        if dtype.names is None:
            varlist = _default_names(self.nvar)
        else:
            varlist = dtype.names

        # check for datetime and change the type
        convert_dates = self._convert_dates
        if convert_dates is not None:
            convert_dates = _maybe_convert_to_int_keys(convert_dates,
                                                       varlist)
            self._convert_dates = convert_dates
            for key in convert_dates:
                # SIF dates are stored as doubles, so patch the descriptor
                descr[key] = (
                    descr[key][0],
                    _convert_datetime_to_stata_type(convert_dates[key])
                )
            dtype = np.dtype(descr)

        self.varlist = varlist
        self.typlist = [_dtype_to_stata_type(dtype[i])
                        for i in range(self.nvar)]
        self.fmtlist = [_dtype_to_default_stata_fmt(dtype[i])
                        for i in range(self.nvar)]
        # set the given format for the datetime cols
        if convert_dates is not None:
            for key in convert_dates:
                self.fmtlist[key] = convert_dates[key]

    def _prepare_ndarray(self, data):
        """Set nobs/nvar/varlist/typlist/fmtlist from a plain ndarray."""
        if data.ndim == 1:
            data = data[:, None]
        self.nobs, self.nvar = data.shape
        self.data = data
        self.datarows = iter(data)
        #TODO: this should be user settable
        dtype = data.dtype
        self.varlist = _default_names(self.nvar)
        self.typlist = [_dtype_to_stata_type(dtype) for i in range(self.nvar)]
        self.fmtlist = [_dtype_to_default_stata_fmt(dtype)
                        for i in range(self.nvar)]

    def _prepare_pandas(self, data):
        """Set nobs/nvar/varlist/typlist/fmtlist from a pandas DataFrame."""
        #NOTE: we might need a different API / class for pandas objects so
        # we can set different semantics - handle this with a PR to pandas.io

        class DataFrameRowIter(object):
            def __init__(self, data):
                self.data = data

            def __iter__(self):
                for i, row in data.iterrows():
                    yield row

        data = data.reset_index()
        self.datarows = DataFrameRowIter(data)
        self.nobs, self.nvar = data.shape
        self.data = data
        self.varlist = data.columns.tolist()
        dtypes = data.dtypes
        convert_dates = self._convert_dates
        if convert_dates is not None:
            convert_dates = _maybe_convert_to_int_keys(convert_dates,
                                                       self.varlist)
            self._convert_dates = convert_dates
            for key in convert_dates:
                new_type = _convert_datetime_to_stata_type(convert_dates[key])
                dtypes[key] = np.dtype(new_type)
        self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes]
        self.fmtlist = [_dtype_to_default_stata_fmt(dt) for dt in dtypes]
        # set the given format for the datetime cols
        if convert_dates is not None:
            for key in convert_dates:
                self.fmtlist[key] = convert_dates[key]

    def write_file(self):
        """Write header, descriptors, variable labels, and data to fname."""
        self._write_header()
        self._write_descriptors()
        self._write_variable_labels()
        # write 5 zeros for expansion fields
        self._write(_pad_bytes("", 5))
        if self._convert_dates is None:
            self._write_data_nodates()
        else:
            self._write_data_dates()
        #self._write_value_labels()

    def _write_header(self, data_label=None, time_stamp=None):
        """Write the 109-byte dta header (format 114)."""
        byteorder = self._byteorder
        # ds_format - just use 114
        self._write(pack("b", 114))
        # byteorder
        self._write(byteorder == ">" and "\x01" or "\x02")
        # filetype
        self._write("\x01")
        # unused
        self._write("\x00")
        # number of vars, 2 bytes
        self._write(pack(byteorder + "h", self.nvar)[:2])
        # number of obs, 4 bytes
        self._write(pack(byteorder + "i", self.nobs)[:4])
        # data label 81 bytes, char, null terminated
        if data_label is None:
            self._write(self._null_terminate(_pad_bytes("", 80),
                        self._encoding))
        else:
            self._write(self._null_terminate(_pad_bytes(data_label[:80],
                        80), self._encoding))
        # time stamp, 18 bytes, char, null terminated
        # format dd Mon yyyy hh:mm
        if time_stamp is None:
            time_stamp = datetime.datetime.now()
        elif not isinstance(time_stamp, datetime.datetime):
            # BUG FIX: the original tested isinstance against the
            # ``datetime`` module itself, which made isinstance raise
            # TypeError whenever an explicit time_stamp was supplied.
            raise ValueError("time_stamp should be datetime type")
        self._write(self._null_terminate(
            time_stamp.strftime("%d %b %Y %H:%M"),
            self._encoding))

    def _write_descriptors(self, typlist=None, varlist=None, srtlist=None,
                           fmtlist=None, lbllist=None):
        """Write the type, name, sort, format, and label-name descriptors."""
        nvar = self.nvar
        # typlist, length nvar, format byte array
        for typ in self.typlist:
            self._write(typ)

        # varlist, length 33*nvar, char array, null terminated
        for name in self.varlist:
            name = self._null_terminate(name, self._encoding)
            name = _pad_bytes(asstr(name[:32]), 33)
            self._write(name)

        # srtlist, 2*(nvar+1), int array, encoded by byteorder
        srtlist = _pad_bytes("", (2 * (nvar + 1)))
        self._write(srtlist)

        # fmtlist, 49*nvar, char array
        for fmt in self.fmtlist:
            self._write(_pad_bytes(fmt, 49))

        # lbllist, 33*nvar, char array
        #NOTE: this is where you could get fancy with pandas categorical type
        for i in range(nvar):
            self._write(_pad_bytes("", 33))

    def _write_variable_labels(self, labels=None):
        """Write the 81-byte-per-variable label block (blank labels only)."""
        nvar = self.nvar
        if labels is None:
            for i in range(nvar):
                self._write(_pad_bytes("", 81))

    def _write_data_nodates(self):
        """Write observation records when no date conversion is requested."""
        data = self.datarows
        byteorder = self._byteorder
        TYPE_MAP = self.TYPE_MAP
        typlist = self.typlist
        for row in data:
            #row = row.squeeze().tolist() # needed for structured arrays
            for i, var in enumerate(row):
                typ = ord(typlist[i])
                if typ <= 244:  # we've got a string
                    if len(var) < typ:
                        var = _pad_bytes(asstr(var), len(var) + 1)
                    self._write(var)
                else:
                    try:
                        if typ in _type_converters:
                            var = _type_converters[typ](var)
                        self._write(pack(byteorder + TYPE_MAP[typ], var))
                    except struct_error:
                        # have to be strict about type -- pack will not do
                        # any kind of casting
                        # NOTE(review): this fallback indexes
                        # _type_converters unconditionally and would raise
                        # KeyError for float types (254/255); confirm the
                        # path is only reachable for int/long columns.
                        self._write(pack(byteorder + TYPE_MAP[typ],
                                    _type_converters[typ](var)))

    def _write_data_dates(self):
        """Write observation records, converting flagged columns to SIF."""
        convert_dates = self._convert_dates
        data = self.datarows
        byteorder = self._byteorder
        TYPE_MAP = self.TYPE_MAP
        MISSING_VALUES = self.MISSING_VALUES
        typlist = self.typlist
        for row in data:
            #row = row.squeeze().tolist() # needed for structured arrays
            for i, var in enumerate(row):
                typ = ord(typlist[i])
                #NOTE: If anyone finds this terribly slow, there is
                # a vectorized way to convert dates, see genfromdta for going
                # from int to datetime and reverse it. will copy data though
                if i in convert_dates:
                    var = _datetime_to_stata_elapsed(var, self.fmtlist[i])
                if typ <= 244:  # we've got a string
                    if isnull(var):
                        var = ""  # missing string
                    if len(var) < typ:
                        var = _pad_bytes(var, len(var) + 1)
                    self._write(var)
                else:
                    if isnull(var):  # this only matters for floats
                        # BUG FIX: MISSING_VALUES is keyed by the Stata type
                        # character ('b','h','l','f','d'), not the integer
                        # ordinal, so the original MISSING_VALUES[typ]
                        # always raised KeyError for missing numerics.
                        var = MISSING_VALUES[TYPE_MAP[typ]]
                    self._write(pack(byteorder + TYPE_MAP[typ], var))

    def _null_terminate(self, s, encoding):
        """Append a null byte to `s` and encode it for writing."""
        null_byte = '\x00'
        s += null_byte
        return s.encode(encoding)

964 

965 

def genfromdta(fname, missing_flt=-999., encoding=None, pandas=False,
               convert_dates=True):
    """
    Returns an ndarray or DataFrame from a Stata .dta file.

    Parameters
    ----------
    fname : str or filehandle
        Stata .dta file.
    missing_flt : numeric
        The numeric value to replace missing values with. Will be used for
        any numeric value.
    encoding : str, optional
        Used for Python 3 only. Encoding to use when reading the .dta file.
        Defaults to `locale.getpreferredencoding`
    pandas : bool
        Optionally return a DataFrame instead of an ndarray
    convert_dates : bool
        If convert_dates is True, then Stata formatted dates will be converted
        to datetime types according to the variable's format.

    Returns
    -------
    ndarray or DataFrame
        Structured ndarray with one field per Stata variable, or a
        DataFrame built from it when ``pandas=True``.
    """
    warnings.warn(
        "genfromdta is deprecated as of 0.10.0 and will be removed in a "
        "future version. Use pandas.read_stata instead.",
        FutureWarning)

    # Track the file object only when we open it ourselves, so that it can
    # be closed afterwards (previously the handle was leaked).
    fileobj = None
    if isinstance(fname, str):
        fileobj = open(fname, 'rb')
        fhd = StataReader(fileobj, missing_values=False, encoding=encoding)
    elif not hasattr(fname, 'read'):
        raise TypeError("The input should be a string or a filehandle. "
                        "(got %s instead)" % type(fname))
    else:
        fhd = StataReader(fname, missing_values=False, encoding=encoding)

    try:
        #TODO: This needs to handle the byteorder?
        header = fhd.file_headers()
        types = header['dtyplist']
        nobs = header['nobs']
        varnames = header['varlist']
        fmtlist = header['fmtlist']
        # NOTE: header['vlblist'] (variable labels) and header['data_label']
        # are discarded -- a plain structured array has nowhere to hold them.

        stata_dta = fhd.dataset()

        dt = np.dtype(lzip(varnames, types))
        data = np.zeros((nobs), dtype=dt)  # init final array

        for rownum, line in enumerate(stata_dta):
            # does not handle missing value objects, just casts
            # None will only work without missing value object.
            if None in line:
                for i, val in enumerate(line):
                    # NOTE: only scalar types can be None here because
                    # missing strings are empty, not None, in Stata
                    if val is None:
                        line[i] = missing_flt
            data[rownum] = tuple(line)
    finally:
        if fileobj is not None:
            fileobj.close()

    if pandas:
        from pandas import DataFrame
        data = DataFrame.from_records(data)
        if convert_dates:
            cols = np.where(lmap(lambda x: x in _date_formats, fmtlist))[0]
            for i in cols:
                col = data.columns[i]
                data[col] = data[col].apply(_stata_elapsed_date_to_datetime,
                                            args=(fmtlist[i],))
    elif convert_dates:
        # NOTE: this could be vectorized but would copy the data anyway
        # make the dtype for the datetime types
        cols = np.where(lmap(lambda x: x in _date_formats, fmtlist))[0]
        dtype = data.dtype.descr
        dtype = [(sub_dtype[0], object) if i in cols else sub_dtype
                 for i, sub_dtype in enumerate(dtype)]
        data = data.astype(dtype)  # have to copy
        for col in cols:
            name = data.dtype.names[col]
            data[name] = [_stata_elapsed_date_to_datetime(x, fmtlist[col])
                          for x in data[name]]
    return data

1056 

1057 

def savetxt(fname, X, names=None, fmt='%.18e', delimiter=' '):
    """
    Save an array to a text file.

    This is just a copy of numpy.savetxt patched to support structured arrays
    or a header of names. Does not include py3 support now in savetxt.

    Parameters
    ----------
    fname : filename or file handle
        If the filename ends in ``.gz``, the file is automatically saved in
        compressed gzip format. `loadtxt` understands gzipped files
        transparently.
    X : array_like
        Data to be saved to a text file.
    names : list, optional
        If given names will be the column header in the text file. If None and
        X is a structured or recarray then the names are taken from
        X.dtype.names.
    fmt : str or sequence of strs
        A single format (%10.5f), a sequence of formats, or a
        multi-format string, e.g. 'Iteration %d -- %10.5f', in which
        case `delimiter` is ignored.
    delimiter : str
        Character separating columns.

    Raises
    ------
    AttributeError
        If `fmt` is a sequence of the wrong length or a multi-format string
        whose number of ``%`` specifiers does not match the column count.
    TypeError
        If `fmt` is neither a str nor a list/tuple.

    See Also
    --------
    save : Save an array to a binary file in NumPy ``.npy`` format
    savez : Save several arrays into a ``.npz`` compressed archive

    Notes
    -----
    Further explanation of the `fmt` parameter
    (``%[flag]width[.precision]specifier``):

    flags:
        ``-`` : left justify

        ``+`` : Forces to preceed result with + or -.

        ``0`` : Left pad the number with zeros instead of space (see width).

    width:
        Minimum number of characters to be printed. The value is not truncated
        if it has more characters.

    precision:
        - For integer specifiers (eg. ``d,i,o,x``), the minimum number of
          digits.
        - For ``e, E`` and ``f`` specifiers, the number of digits to print
          after the decimal point.
        - For ``g`` and ``G``, the maximum number of significant digits.
        - For ``s``, the maximum number of characters.

    specifiers:
        ``c`` : character

        ``d`` or ``i`` : signed decimal integer

        ``e`` or ``E`` : scientific notation with ``e`` or ``E``.

        ``f`` : decimal floating point

        ``g,G`` : use the shorter of ``e,E`` or ``f``

        ``o`` : signed octal

        ``s`` : str of characters

        ``u`` : unsigned decimal integer

        ``x,X`` : unsigned hexadecimal integer

    This explanation of ``fmt`` is not complete, for an exhaustive
    specification see [1]_.

    References
    ----------
    .. [1] `Format Specification Mini-Language
           <http://docs.python.org/library/string.html#
           format-specification-mini-language>`_, Python Documentation.

    Examples
    --------
    >>> savetxt('test.out', x, delimiter=',')   # x is an array
    >>> savetxt('test.out', (x,y,z))   # x,y,z equal sized 1D arrays
    >>> savetxt('test.out', x, fmt='%1.4e')   # use exponential notation
    """

    with get_file_obj(fname, 'w') as fh:
        X = np.asarray(X)

        # Handle 1-dimensional arrays
        if X.ndim == 1:
            if X.dtype.names is None:
                # Common case -- 1d array of numbers
                X = np.atleast_2d(X).T
                ncol = 1
            else:
                # Structured dtype -- each field indicates a separate column
                ncol = len(X.dtype.descr)
        else:
            ncol = X.shape[1]

        # `fmt` can be a string with multiple insertion points or a list of
        # formats.  E.g. '%10.5f\t%10d' or ('%10.5f', '%10d')
        # (renamed from `format` to avoid shadowing the builtin)
        if isinstance(fmt, (list, tuple)):
            if len(fmt) != ncol:
                raise AttributeError('fmt has wrong shape. %s' % str(fmt))
            row_fmt = delimiter.join(fmt)
        elif isinstance(fmt, str):
            if fmt.count('%') == 1:
                # single format -- repeat it for every column
                row_fmt = delimiter.join([fmt, ] * ncol)
            elif fmt.count('%') != ncol:
                raise AttributeError('fmt has wrong number of %% formats. %s'
                                     % fmt)
            else:
                row_fmt = fmt
        else:
            # previously this fell through and crashed later with an
            # unbound local variable; fail fast with a clear message
            raise TypeError('fmt must be a str, list, or tuple, got %s'
                            % type(fmt))

        # handle names: default to the structured array's field names
        if names is None and X.dtype.names:
            names = X.dtype.names
        if names is not None:
            fh.write(delimiter.join(names) + '\n')

        for row in X:
            fh.write(row_fmt % tuple(row) + '\n')