Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/datasets/star98/data.py : 50%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Star98 Educational Testing dataset."""
2from statsmodels.datasets import utils as du
__docformat__ = 'restructuredtext'

# Dataset metadata exposed as module-level string constants.
COPYRIGHT = """Used with express permission from the original author,
who retains all rights."""
TITLE = "Star98 Educational Dataset"
SOURCE = """
Jeff Gill's `Generalized Linear Models: A Unified Approach`

http://jgill.wustl.edu/research/books.html
"""
DESCRSHORT = """Math scores for 303 students with 10 explanatory factors"""
# Long-form, human-readable description of the dataset.
DESCRLONG = """
This data is on the California education policy and outcomes (STAR program
results for 1998). The data measured standardized testing by the California
Department of Education that required evaluation of 2nd - 11th grade students
by the Stanford 9 test on a variety of subjects. This dataset is at
the level of the unified school district and consists of 303 cases. The
binary response variable represents the number of 9th graders scoring
over the national median value on the mathematics exam.

The data used in this example is only a subset of the original source.
"""
# reST-formatted variable dictionary. The interaction-term names listed here
# are spelled exactly like the column labels assigned in _get_data().
NOTE = """::

    Number of Observations - 303 (unified school districts in California).

    Number of Variables - 13 and 8 interaction terms.

    Definition of variables names::

        NABOVE   - Total number of students above the national median for the
                   math section.
        NBELOW   - Total number of students below the national median for the
                   math section.
        LOWINC   - Percentage of low income students
        PERASIAN - Percentage of Asian students
        PERBLACK - Percentage of black students
        PERHISP  - Percentage of Hispanic students
        PERMINTE - Percentage of minority teachers
        AVYRSEXP - Sum of teachers' years in educational service divided by the
                   number of teachers.
        AVSALK   - Total salary budget including benefits divided by the number
                   of full-time teachers (in thousands)
        PERSPENK - Per-pupil spending (in thousands)
        PTRATIO  - Pupil-teacher ratio.
        PCTAF    - Percentage of students taking UC/CSU prep courses
        PCTCHRT  - Percentage of charter schools
        PCTYRRND - Percentage of year-round schools

        The below variables are interaction terms of the variables defined
        above.

        PERMINTE_AVYRSEXP
        PERMINTE_AVSAL
        AVYRSEXP_AVSAL
        PERSPEN_PTRATIO
        PERSPEN_PCTAF
        PTRATIO_PCTAF
        PERMINTE_AVYRSEXP_AVSAL
        PERSPEN_PTRATIO_PCTAF
"""
def load(as_pandas=None):
    """
    Build the star98 dataset and return it as a Dataset class instance.

    Parameters
    ----------
    as_pandas : bool
        Flag choosing between pandas DataFrames and Series or numpy
        recarrays and arrays for the returned data. If True, returns pandas.
        The value (including the default ``None``) is forwarded unchanged
        to ``du.as_numpy_dataset``.

    Returns
    -------
    Dataset instance:
        An object holding the data in its array attributes 'endog' and
        'exog'.
    """
    # Always start from the pandas-backed dataset; the conversion helper
    # decides the final container type from ``as_pandas``.
    pandas_dataset = load_pandas()
    return du.as_numpy_dataset(pandas_dataset, as_pandas=as_pandas)
def load_pandas():
    """
    Load the star98 data and return the result of ``du.process_pandas``.

    The frame produced by ``_get_data`` is handed to ``du.process_pandas``
    with the 'NABOVE' and 'NBELOW' columns designated through ``endog_idx``.
    """
    return du.process_pandas(_get_data(), endog_idx=['NABOVE', 'NBELOW'])
def _get_data():
    """
    Read 'star98.csv' (located relative to this module) and relabel it.

    All columns are renamed to the fixed list below, then the two response
    columns are rewritten: 'NABOVE' receives the values originally read into
    'NBELOW' (successes) and 'NBELOW' receives the original 'NABOVE' minus
    the original 'NBELOW' (failures).
    """
    frame = du.load_csv(__file__, 'star98.csv')
    frame.columns = [
        "NABOVE", "NBELOW", "LOWINC", "PERASIAN", "PERBLACK", "PERHISP",
        "PERMINTE", "AVYRSEXP", "AVSALK", "PERSPENK", "PTRATIO", "PCTAF",
        "PCTCHRT", "PCTYRRND", "PERMINTE_AVYRSEXP", "PERMINTE_AVSAL",
        "AVYRSEXP_AVSAL", "PERSPEN_PTRATIO", "PERSPEN_PCTAF",
        "PTRATIO_PCTAF", "PERMINTE_AVYRSEXP_AVSAL", "PERSPEN_PTRATIO_PCTAF",
    ]

    # Snapshot both response columns before overwriting either of them, so
    # the second assignment still sees the values as read from the file.
    # NOTE(review): this implies the file's first column is a total and its
    # second column the success count -- confirm against star98.csv.
    first_raw = frame['NABOVE'].copy()
    second_raw = frame['NBELOW'].copy()

    frame['NABOVE'] = second_raw  # successes
    frame['NBELOW'] = first_raw - second_raw  # now failures

    return frame