Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1"""Star98 Educational Testing dataset.""" 

2from statsmodels.datasets import utils as du 

3 

__docformat__ = 'restructuredtext'

# Descriptive metadata strings for the Star98 dataset.
COPYRIGHT = """Used with express permission from the original author,
who retains all rights."""
TITLE = "Star98 Educational Dataset"
SOURCE = """
Jeff Gill's `Generalized Linear Models: A Unified Approach`

http://jgill.wustl.edu/research/books.html
"""
# The 303 cases are unified school districts (see DESCRLONG), not students.
DESCRSHORT = """Math scores for 303 school districts with 10 explanatory factors"""

DESCRLONG = """
This data is on the California education policy and outcomes (STAR program
results for 1998). The data measured standardized testing by the California
Department of Education that required evaluation of 2nd - 11th grade students
by the Stanford 9 test on a variety of subjects. This dataset is at
the level of the unified school district and consists of 303 cases. The
binary response variable represents the number of 9th graders scoring
over the national median value on the mathematics exam.

The data used in this example is only a subset of the original source.
"""

27 

# reST literal block describing every column of the dataset. The interaction
# term names listed here match the column names assigned in ``_get_data``.
NOTE = """::

    Number of Observations - 303 (unified school districts in California).

    Number of Variables - 13 and 8 interaction terms.

    Definition of variables names::

        NABOVE   - Total number of students above the national median for the
                   math section.
        NBELOW   - Total number of students below the national median for the
                   math section.
        LOWINC   - Percentage of low income students
        PERASIAN - Percentage of Asian students
        PERBLACK - Percentage of black students
        PERHISP  - Percentage of Hispanic students
        PERMINTE - Percentage of minority teachers
        AVYRSEXP - Sum of teachers' years in educational service divided by the
                   number of teachers.
        AVSALK   - Total salary budget including benefits divided by the number
                   of full-time teachers (in thousands)
        PERSPENK - Per-pupil spending (in thousands)
        PTRATIO  - Pupil-teacher ratio.
        PCTAF    - Percentage of students taking UC/CSU prep courses
        PCTCHRT  - Percentage of charter schools
        PCTYRRND - Percentage of year-round schools

        The below variables are interaction terms of the variables defined
        above.

        PERMINTE_AVYRSEXP
        PERMINTE_AVSAL
        AVYRSEXP_AVSAL
        PERSPEN_PTRATIO
        PERSPEN_PCTAF
        PTRATIO_PCTAF
        PERMINTE_AVYRSEXP_AVSAL
        PERSPEN_PTRATIO_PCTAF
"""

67 

68 

69 

def load(as_pandas=None):
    """
    Load the star98 data and return a Dataset class instance.

    Parameters
    ----------
    as_pandas : bool
        Flag choosing between pandas DataFrames and Series (True) and
        numpy recarrays and arrays. The value is forwarded unchanged to
        ``du.as_numpy_dataset``.

    Returns
    -------
    Dataset
        The result of ``du.as_numpy_dataset`` applied to the pandas
        dataset; it carries the data attributes 'endog' and 'exog'.
    """
    pandas_dataset = load_pandas()
    return du.as_numpy_dataset(pandas_dataset, as_pandas=as_pandas)

86 

87 

def load_pandas():
    """
    Load the star98 data and hand it to ``du.process_pandas``.

    The two columns 'NABOVE' and 'NBELOW' are passed as ``endog_idx``.

    Returns
    -------
    The object returned by ``du.process_pandas``.
    """
    endog_columns = ['NABOVE', 'NBELOW']
    return du.process_pandas(_get_data(), endog_idx=endog_columns)

91 

92 

def _get_data():
    """
    Read ``star98.csv`` and return it with relabelled, recomputed columns.

    The loaded columns are renamed positionally. Afterwards 'NABOVE' is
    overwritten with the values loaded as 'NBELOW' (the successes), and
    'NBELOW' with the loaded 'NABOVE' minus the loaded 'NBELOW' (the
    failures).
    """
    frame = du.load_csv(__file__, 'star98.csv')

    base_columns = [
        "NABOVE", "NBELOW", "LOWINC", "PERASIAN", "PERBLACK", "PERHISP",
        "PERMINTE", "AVYRSEXP", "AVSALK", "PERSPENK", "PTRATIO", "PCTAF",
        "PCTCHRT", "PCTYRRND",
    ]
    interaction_columns = [
        "PERMINTE_AVYRSEXP", "PERMINTE_AVSAL", "AVYRSEXP_AVSAL",
        "PERSPEN_PTRATIO", "PERSPEN_PCTAF", "PTRATIO_PCTAF",
        "PERMINTE_AVYRSEXP_AVSAL", "PERSPEN_PTRATIO_PCTAF",
    ]
    frame.columns = base_columns + interaction_columns

    # Snapshot both columns before either one is overwritten.
    # NOTE(review): presumably the first raw column is a total count and the
    # second the count above the median -- confirm against star98.csv.
    first_raw = frame['NABOVE'].copy()
    second_raw = frame['NBELOW'].copy()

    frame['NABOVE'] = second_raw  # successes
    frame['NBELOW'] = first_raw - second_raw  # failures

    return frame