from statsmodels.compat.python import lrange

from io import StringIO
import shutil
from os import environ, makedirs
from os.path import expanduser, exists, dirname, abspath, join
from urllib.error import HTTPError, URLError
from urllib.request import urlopen
from urllib.parse import urljoin

import numpy as np
from pandas import read_stata, read_csv, DataFrame, Series, Index

def webuse(data, baseurl='https://www.stata-press.com/data/r11/', as_df=True):
    """
    Download and return an example dataset from Stata.

    Parameters
    ----------
    data : str
        Name of dataset to fetch.
    baseurl : str
        The base URL of the Stata datasets.
    as_df : bool
        Deprecated. A DataFrame is always returned.

    Returns
    -------
    dta : DataFrame
        A DataFrame containing the Stata dataset.

    Examples
    --------
    >>> dta = webuse('auto')

    Notes
    -----
    Make sure baseurl has a trailing forward slash. Does not do any
    error checking on the response URL.
    """
    url = urljoin(baseurl, data + '.dta')
    return read_stata(url)
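
# A minimal usage sketch (assumes network access to stata-press.com; the
# 'auto' dataset name comes from the docstring example above):
#
#   auto = webuse('auto')
#   auto.head()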

class Dataset(dict):
    def __init__(self, **kw):
        # define some default attributes, so pylint can find them
        self.endog = None
        self.exog = None
        self.data = None
        self.names = None

        dict.__init__(self, kw)
        # expose the dict's keys as attributes
        self.__dict__ = self
        # Some datasets have string variables. If you want a raw_data
        # attribute you must create this in the dataset's load function.
        try:
            self.raw_data = self.data.astype(float)
        except Exception:
            pass

    def __repr__(self):
        return str(self.__class__)
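
# A minimal sketch of Dataset's behavior (hypothetical toy data): because
# __dict__ is bound to the dict itself, key access and attribute access are
# interchangeable:
#
#   ds = Dataset(data=DataFrame({'y': [1.0, 2.0]}))
#   ds.data is ds['data']    # True
#   ds.raw_data              # float-converted copy of ds.data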

def process_pandas(data, endog_idx=0, exog_idx=None, index_idx=None):
    names = data.columns

    if isinstance(endog_idx, int):
        endog_name = names[endog_idx]
        endog = data[endog_name].copy()
        if exog_idx is None:
            exog = data.drop([endog_name], axis=1)
        else:
            exog = data[names[exog_idx]].copy()
    else:
        # endog_idx is a sequence of column labels
        endog = data.loc[:, endog_idx].copy()
        endog_name = list(endog.columns)
        if exog_idx is None:
            exog = data.drop(endog_name, axis=1)
        else:
            exog = data[names[exog_idx]].copy()

    if index_idx is not None:  # NOTE: will have to be improved for dates
        index = Index(data.iloc[:, index_idx])
        endog.index = index
        exog.index = index.copy()
        data = data.set_index(names[index_idx])

    exog_name = list(exog.columns)
    dataset = Dataset(data=data, names=list(names), endog=endog,
                      exog=exog, endog_name=endog_name, exog_name=exog_name)
    return dataset
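
# A minimal sketch of the expected usage (hypothetical toy frame):
#
#   df = DataFrame({'y': [1., 2.], 'x1': [3., 4.], 'x2': [5., 6.]})
#   ds = process_pandas(df, endog_idx=0)
#   ds.endog_name    # 'y'
#   ds.exog_name     # ['x1', 'x2']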

def _maybe_reset_index(data):
    """
    All the Rdatasets come with R's integer row labels (1-based) when
    there is no real index. Strip these in favor of a zero-based index.
    """
    if data.index.equals(Index(lrange(1, len(data) + 1))):
        data = data.reset_index(drop=True)
    return data
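
# A small sketch of the check above (hypothetical frame): a 1, 2, ..., n
# index is treated as an R row-label artifact and dropped:
#
#   df = DataFrame({'x': [10, 20]}, index=[1, 2])
#   _maybe_reset_index(df).index.tolist()    # [0, 1]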

def _get_cache(cache):
    if cache is False:
        # do not do any caching or load from cache
        cache = None
    elif cache is True:  # use default dir for cache
        cache = get_data_home(None)
    else:
        cache = get_data_home(cache)
    return cache
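
# How the cache argument resolves (paths here are illustrative):
#
#   _get_cache(False)          # None - caching disabled
#   _get_cache(True)           # default dir, e.g. ~/statsmodels_data
#   _get_cache('~/my_cache')   # expanded and created if missing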

def _cache_it(data, cache_path):
    import zlib
    with open(cache_path, "wb") as fh:
        fh.write(zlib.compress(data))

def _open_cache(cache_path):
    import zlib
    with open(cache_path, 'rb') as fh:
        data = zlib.decompress(fh.read())
    # return the raw bytes; callers decode (utf-8) so that cached and
    # freshly downloaded data behave identically
    return data
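
# A round-trip sketch for the two helpers above (hypothetical path):
#
#   _cache_it(b'x,y\n1,2\n', '/tmp/example-v2.csv.zip')
#   _open_cache('/tmp/example-v2.csv.zip')    # b'x,y\n1,2\n'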

def _urlopen_cached(url, cache):
    """
    Tries to load data from the cache location, otherwise downloads it.
    If it downloads the data and cache is not None, then it will put the
    downloaded data in the cache path.
    """
    from_cache = False
    if cache is not None:
        # derive a flat file name from the URL and tag it with a cache
        # format version
        file_name = url.split("://")[-1].replace('/', ',')
        file_name = file_name.split('.')
        if len(file_name) > 1:
            file_name[-2] += '-v2'
        else:
            file_name[0] += '-v2'
        file_name = '.'.join(file_name) + ".zip"
        cache_path = join(cache, file_name)
        try:
            data = _open_cache(cache_path)
            from_cache = True
        except Exception:
            # cache miss or unreadable cache file; fall through to download
            pass

    # not using the cache or did not find it in cache
    if not from_cache:
        data = urlopen(url, timeout=3).read()
        if cache is not None:  # then put it in the cache
            _cache_it(data, cache_path)
    return data, from_cache
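
# A sketch of the cache file name derivation above (illustrative URL):
#
#   url = 'https://example.com/csv/datasets/iris.csv'
#   # becomes 'example.com,csv,datasets,iris-v2.csv.zip' inside the
#   # cache directory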

def _get_data(base_url, dataname, cache, extension="csv"):
    url = base_url + dataname + "." + extension
    try:
        data, from_cache = _urlopen_cached(url, cache)
    except HTTPError as err:
        if '404' in str(err):
            raise ValueError("Dataset %s was not found." % dataname)
        else:
            raise err

    data = data.decode('utf-8', 'strict')
    return StringIO(data), from_cache
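
# A sketch of how the loaders above compose (arguments mirror the base URL
# that get_rdataset builds below):
#
#   csv_io, from_cache = _get_data(
#       'https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/'
#       'master/csv/datasets/', 'iris', cache=None)
#   read_csv(csv_io, index_col=0).head()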

def _get_dataset_meta(dataname, package, cache):
    # get the index; you'll probably want this cached because you have to
    # download info about all the data to get info about any of the data
    index_url = ("https://raw.githubusercontent.com/vincentarelbundock/"
                 "Rdatasets/master/datasets.csv")
    data, _ = _urlopen_cached(index_url, cache)
    data = data.decode('utf-8', 'strict')
    index = read_csv(StringIO(data))
    idx = np.logical_and(index.Item == dataname, index.Package == package)
    dataset_meta = index.loc[idx]
    return dataset_meta["Title"].iloc[0]

def get_rdataset(dataname, package="datasets", cache=False):
    """download and return R dataset

    Parameters
    ----------
    dataname : str
        The name of the dataset you want to download
    package : str
        The package in which the dataset is found. The default is the core
        'datasets' package.
    cache : bool or str
        If True, will download this data into the STATSMODELS_DATA folder.
        The default location is a folder called statsmodels_data in the
        user home folder. Otherwise, you can specify a path to a folder to
        use for caching the data. If False, the data will not be cached.

    Returns
    -------
    dataset : Dataset instance
        A `statsmodels.data.utils.Dataset` instance. This object has
        attributes:

        * data - A pandas DataFrame containing the data
        * title - The dataset title
        * package - The package from which the data came
        * from_cache - Whether or not the data was retrieved from the cache
        * __doc__ - The verbatim R documentation.

    Notes
    -----
    If the R dataset has an integer index, it is reset to be zero-based.
    Otherwise the index is preserved. The caching facilities are dumb. That
    is, no download dates, e-tags, or other identifying information
    is checked to see if the data should be downloaded again or not. If the
    dataset is in the cache, it's used.
    """
    # NOTE: use raw github bc html site might not be most up to date
    data_base_url = ("https://raw.githubusercontent.com/vincentarelbundock/"
                     "Rdatasets/master/csv/" + package + "/")
    docs_base_url = ("https://raw.githubusercontent.com/vincentarelbundock/"
                     "Rdatasets/master/doc/" + package + "/rst/")
    cache = _get_cache(cache)
    data, from_cache = _get_data(data_base_url, dataname, cache)
    data = read_csv(data, index_col=0)
    data = _maybe_reset_index(data)

    title = _get_dataset_meta(dataname, package, cache)
    doc, _ = _get_data(docs_base_url, dataname, cache, "rst")

    return Dataset(data=data, __doc__=doc.read(), package=package,
                   title=title, from_cache=from_cache)
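
# A minimal usage sketch (assumes network access; 'Duncan' in the carData
# package is one dataset expected to be in the Rdatasets index):
#
#   duncan = get_rdataset('Duncan', package='carData', cache=True)
#   duncan.data.head()
#   print(duncan.__doc__)    # the verbatim R documentation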

# The functions below were taken from sklearn

def get_data_home(data_home=None):
    """Return the path of the statsmodels data dir.

    This folder is used by some large dataset loaders to avoid
    downloading the data several times.

    By default the data dir is set to a folder named 'statsmodels_data'
    in the user home folder.

    Alternatively, it can be set by the 'STATSMODELS_DATA' environment
    variable or programmatically by giving an explicit folder path. The
    '~' symbol is expanded to the user home folder.

    If the folder does not already exist, it is automatically created.
    """
    if data_home is None:
        data_home = environ.get('STATSMODELS_DATA',
                                join('~', 'statsmodels_data'))
    data_home = expanduser(data_home)
    if not exists(data_home):
        makedirs(data_home)
    return data_home
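
# A sketch of the resolution order described in the docstring:
#
#   get_data_home()               # $STATSMODELS_DATA or ~/statsmodels_data
#   get_data_home('~/elsewhere')  # explicit path wins; '~' is expanded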

def clear_data_home(data_home=None):
    """Delete all the content of the data home cache."""
    data_home = get_data_home(data_home)
    shutil.rmtree(data_home)

def check_internet(url=None):
    """Check if internet is available"""
    url = "https://github.com" if url is None else url
    try:
        urlopen(url)
    except URLError:
        return False
    return True

def strip_column_names(df):
    """
    Remove leading and trailing single quotes

    Parameters
    ----------
    df : DataFrame
        DataFrame to process

    Returns
    -------
    df : DataFrame
        DataFrame with stripped column names

    Notes
    -----
    The DataFrame is modified in place.
    """
    columns = []
    for c in df:
        if c.startswith('\'') and c.endswith('\''):
            c = c[1:-1]
        elif c.startswith('\''):
            c = c[1:]
        elif c.endswith('\''):
            c = c[:-1]
        columns.append(c)
    df.columns = columns
    return df
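
# A small sketch of the quote stripping (hypothetical column names):
#
#   df = DataFrame(columns=["'x'", "'y", "z'"])
#   strip_column_names(df).columns.tolist()    # ['x', 'y', 'z']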

def load_csv(base_file, csv_name, sep=',', convert_float=False):
    """Standard simple csv loader"""
    filepath = dirname(abspath(base_file))
    filename = join(filepath, csv_name)
    engine = 'python' if sep != ',' else 'c'
    float_precision = {}
    if engine == 'c':
        # the c engine supports configurable float precision
        float_precision = {'float_precision': 'high'}
    data = read_csv(filename, sep=sep, engine=engine, **float_precision)
    if convert_float:
        data = data.astype(float)
    return data
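
# A usage sketch: load a csv that sits next to the calling module
# ('example.csv' is a hypothetical file name):
#
#   data = load_csv(__file__, 'example.csv')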

def as_numpy_dataset(ds, as_pandas=True, retain_index=False):
    """Convert a pandas dataset to a NumPy dataset"""
    if as_pandas:
        return ds
    ds.data = ds.data.to_records(index=retain_index)
    # convert any remaining pandas attributes to plain arrays
    for d in dir(ds):
        if d.startswith('_'):
            continue
        attr = getattr(ds, d)
        if isinstance(attr, (Series, DataFrame)):
            setattr(ds, d, np.asarray(attr))

    return ds
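
# A minimal sketch of the conversion (builds on process_pandas above):
#
#   df = DataFrame({'y': [1., 2.], 'x': [3., 4.]})
#   ds = as_numpy_dataset(process_pandas(df), as_pandas=False)
#   type(ds.data)     # numpy.recarray
#   type(ds.endog)    # numpy.ndarray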