1 from collections import namedtuple
2 import csv
3
4 import numpy as np
5
6
# Table is a parsed data set: "types" maps each column name to its inferred
# type (str, int, float or None), "names" is the ordered list of column
# names and "rows" is a list of rows, each a list of cells.
Table = namedtuple('Table', ['types', 'names', 'rows'])

# Column is a single column of a Table: its type, its name and its cells.
Column = namedtuple('Column', ['type', 'name', 'cells'])
9
10
def read(fname, delimiter=',', skip_header=False):
    """
    read loads cell data, column headers and type information for each column
    given a file path to a CSV formatted file. A "Table" namedtuple is returned
    with fields "types", "names" and "rows".

    All cells have left and right whitespace trimmed. Once the column types
    have been inferred, the table is passed through cast, so the cells of
    the returned table are already converted.

    All rows MUST be the same length.

    delimiter is the string that separates each field in a row.

    If skip_header is set, then no column headers are read, and column names
    are set to their corresponding indices (as strings).
    """
    names, rows = _data(fname, delimiter, skip_header)
    types = _column_types(names, rows)

    return cast(Table(types=types, names=names, rows=rows))
30
31
32 -def _data(fname, delimiter=',', skip_header=False):
33 """
34 _data loads cell data and column headers, and returns the names and rows.
35
36 All cells have left and right whitespace trimmed.
37
38 All rows MUST be the same length.
39
40 delimiter and skip_header are described in read.
41 """
42 names = []
43 rows = []
44 reader = csv.reader(open(fname), delimiter=delimiter)
45 if not skip_header:
46 names = map(str.strip, reader.next())
47
48 for i, row in enumerate(reader):
49
50
51
52 if len(names) == 0:
53 names = map(str, range(0, len(row)))
54 assert len(row) == len(names), \
55 'The length of row %d is %d, but others rows have length %d' \
56 % (i, len(row), len(names))
57
58 rows.append(map(str.strip, row))
59
60 return names, rows
61
62
64 """
65 _column_types infers type information from the columns in rows. Types are
66 stored as either a Python type conversion function (str, int or float) or
67 as a None value. A dictionary of column names to types is returned.
68
69 A column has type None if and only if all cells in the column are empty.
70 (Cells are empty if the length of its value is zero after left and right
71 whitespace has been trimmed.)
72
73 A column has type float if and only if all cells in the column are empty,
74 integers or floats AND at least one value is a float.
75
76 A column has type int if and only if all cells in the column are empty or
77 integers AND at least one value is an int.
78
79 A column has type string in any other case.
80 """
81 types = dict([(name, None) for name in names])
82
83 for c in xrange(len(names)):
84
85
86 prev_typ = None
87
88
89
90
91 next_typ = None
92
93 for row in rows:
94 col = row[c]
95
96
97 if len(col) == 0:
98 next_typ = None
99
100
101 elif prev_typ is not str:
102
103
104
105 try:
106
107
108
109 int(col)
110 next_typ = int
111 except ValueError:
112 try:
113
114
115 float(col)
116 next_typ = float
117 except ValueError:
118 next_typ = str
119
120
121
122 if prev_typ is str or next_typ is str:
123 prev_typ = str
124
125 elif next_typ is float and prev_typ is int:
126 prev_typ = float
127
128 elif prev_typ is None and next_typ is not None:
129 prev_typ = next_typ
130
131 types[names[c]] = prev_typ
132 return types
133
134
# NOTE(review): the `def` line of this function is missing from this view;
# the body reads the names `table` and `f`, presumably its two parameters.
136 """
137 Executes f once for every column header in the table, with three
138 arguments, in order: column type, column index, column name. The result
139 of each call becomes the name at the corresponding header location.
140
141 A new table is returned with the new column names; table.types is carried over unchanged, so it stays keyed by the ORIGINAL names.
142 """
143 new_names = []
144 for i, name in enumerate(table.names):
# The type is looked up under the current (old) name; f returns the new one.
145 new_names.append(f(table.types[name], i, name))
# _replace builds a new tuple and leaves the input table untouched.
146 return table._replace(names=new_names)
147
148
def map_data(table, f):
    """
    map_data executes f on every cell of data with five arguments, in order:
    column type, column name, row index, column index, contents. The result
    of the function is placed in the corresponding cell location.

    Cells are visited row by row, left to right.

    A new table is returned with the converted values; the rows of the
    input table are not modified.
    """
    names = table.names
    types = table.types
    new_rows = [
        [f(types[names[c]], names[c], r, c, cell)
         for c, cell in enumerate(row)]
        for r, row in enumerate(table.rows)
    ]
    return table._replace(rows=new_rows)
166
167
def cast(table):
    """
    cast type casts all of the values in 'rows' to their corresponding types
    in types.

    The only special case here is missing values or NULL columns. If a value
    is missing or a column has type NULL (i.e., all values are missing), then
    the value is replaced with None, which is Python's version of a NULL value.

    N.B. cast is idempotent. i.e., cast(x) = cast(cast(x)).
    """
    # basestring only exists on Python 2 (where it covers str and unicode);
    # fall back to str so the check also works on Python 3.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    def f(typ, name, r, c, cell):
        if typ is None or cell is None:
            return None
        # An empty string is a missing value. The isinstance guard keeps
        # len() away from already-cast ints and floats on a second pass.
        if isinstance(cell, string_types) and len(cell) == 0:
            return None
        return typ(cell)
    return map_data(table, f)
185
186
# NOTE(review): the `def` line is missing from this view, so the defaults of
# dstr, dint and dfloat cannot be confirmed here; the one visible call passes
# only a table.
188 """
189 convert_missing_cells replaces every NULL (None) cell with dstr, dint or
190 dfloat, chosen by the type of the cell's column (str, int or float). Cells
191 in columns whose type is None, and all non-NULL cells, are left unchanged.
192 """
193 def f(typ, name, r, c, cell):
# Only NULL cells in typed columns are rewritten.
194 if cell is None and typ is not None:
195 if typ == str:
196 return dstr
197 elif typ == int:
198 return dint
199 elif typ == float:
200 return dfloat
201 else:
# Any type other than str, int or float is treated as a programming error.
202 assert False, "Unknown type: %s" % typ
203 return cell
204 return map_data(table, f)
205
206
def convert_columns(table, **kwargs):
    """
    convert_columns executes converter functions on specific columns, where
    the parameter names for kwargs are the column names, and the parameter
    values are functions of one parameter that return a single value.

    e.g., convert_columns(table, colname=lambda s: s.lower()) would
    convert all values in the column with name 'colname' to lowercase.

    Column names are matched exactly. Cells of columns without a converter
    are left unchanged. A converter is called on every cell of its column,
    including cells that are None.

    A new table is returned with the converted values.
    """
    def f(typ, name, r, c, cell):
        try:
            converter = kwargs[name]
        except KeyError:
            return cell
        return converter(cell)
    return map_data(table, f)
221
222
def convert_types(table, fstr=None, fint=None, ffloat=None):
    """
    convert_types works just like convert_columns, but on types instead of
    specific columns. This function will likely be more useful, since
    sanitization functions are typically type oriented rather than column
    oriented.

    fstr, fint and ffloat are functions of one parameter that are applied to
    every cell in columns of type str, int and float, respectively. A
    converter left as None leaves the cells of that type unchanged, as are
    cells in columns whose type is None.

    However, when there are specific kinds of columns that need special
    sanitization, convert_columns should be used.
    """
    converters = {str: fstr, int: fint, float: ffloat}

    def f(typ, name, r, c, cell):
        # A column type with no entry (e.g. None) and a converter left as
        # None both mean "leave the cell alone".
        converter = converters.get(typ)
        if converter is None:
            return cell
        return converter(cell)
    return map_data(table, f)
242
243
def column(table, colname):
    """
    column returns the column with name "colname", where the column returned
    is a triple of the column type, the column name and a NumPy array of
    cells in the column.

    Column names are compared case-insensitively, and the first matching
    column wins. An AssertionError is raised if no column matches.
    """
    wanted = colname.lower()
    for colindex, name in enumerate(table.names):
        if name.lower() == wanted:
            break
    else:
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped under -O, while keeping the exception type.
        raise AssertionError('Column name %s does not exist' % wanted)

    # Rows are rectangular, so the cell can be indexed directly instead of
    # scanning every cell of every row.
    colcells = [row[colindex] for row in table.rows]

    return Column(type=table.types[name],
                  name=name,
                  cells=np.array(colcells))
267
268
def columns(table):
    """
    columns returns a list of all columns in the data set, where each column
    is a triple of its type, name and a NumPy array of cells in the column.

    The columns are returned in the same order as table.names.
    """
    # One accumulator per column, so that a table without rows still yields
    # one (empty) column per name.
    colcells = [[] for _ in table.names]
    for row in table.rows:
        for i, cell in enumerate(row):
            colcells[i].append(cell)

    return [
        Column(type=table.types[name], name=name, cells=np.array(cells))
        for name, cells in zip(table.names, colcells)
    ]
288
289
def frequencies(column):
    """
    frequencies returns a dictionary where the keys are unique values in the
    column, and the values correspond to the frequency of each value in the
    column.

    column is anything with a "cells" attribute, such as a Column. Keys and
    counts are NumPy scalars, as produced by np.unique.
    """
    # np.unique can count occurrences itself, so there is no need to bin
    # the cells by hand with searchsorted and bincount.
    ukeys, counts = np.unique(column.cells, return_counts=True)
    return dict(zip(ukeys, counts))
299
300
def type_str(typ):
    """
    type_str returns a string representation of a column type: "None",
    "float", "int" or "str". Any other value yields "Unknown".
    """
    known = ((None, "None"), (float, "float"), (int, "int"), (str, "str"))
    # Identity comparison keeps this safe for unhashable or odd values.
    for candidate, label in known:
        if typ is candidate:
            return label
    return "Unknown"
314
315
def cell_str(cell_contents):
    """
    cell_str is a convenience function for converting cell contents to a string
    when there are still NULL values: None becomes "NULL", and everything
    else is passed through str.

    N.B. If you choose to work with data while keeping NULL values, you will
    likely need to write more functions similar to this one.
    """
    return "NULL" if cell_contents is None else str(cell_contents)
327
328
def print_data_table(table):
    """
    print_data_table is a convenience function for pretty-printing the
    data in tabular format, including header names and type annotations.

    Each header is printed as "name (type)". Every column is left-aligned
    and padded to the width of its widest entry plus two spaces. NULL cells
    are printed as "NULL".
    """
    padding = 2
    headers = ['%s (%s)' % (n, type_str(table.types[n])) for n in table.names]
    # Stringify every cell once, up front, so that measuring and printing
    # work from the same text.
    str_rows = [[cell_str(cell) for cell in row] for row in table.rows]

    widths = [len(header) for header in headers]
    for row in str_rows:
        for i, text in enumerate(row):
            widths[i] = max(widths[i], len(text))

    def format_line(texts):
        return ''.join(text.ljust(widths[i] + padding)
                       for i, text in enumerate(texts))

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print(format_line(headers))
    # The rule spans the full table width: column widths, not header lengths.
    print('-' * (sum(widths) + len(headers) * padding))
    for row in str_rows:
        print(format_line(row))
355
356
if __name__ == '__main__':
    # Demo: load sample.csv from the current directory and print the table
    # after each conversion step. The calls below expect the file to have
    # columns named "string1" and "mixed".
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    fname = "sample.csv"

    table = read(fname)

    print("# Raw data.")
    print_data_table(table)
    print('\n')

    table = convert_missing_cells(table)
    print("# Convert missing cells to arbitrary values")
    print_data_table(table)
    print('\n')

    table = convert_columns(table, string1=str.lower)
    print("# Sanitize just one column of data")
    print_data_table(table)
    print('\n')

    table = convert_types(table, fstr=str.lower)
    print("# Sanitize all cells that have type string")
    print_data_table(table)
    print('\n')

    for col in columns(table):
        print('(%s, %s) [%s]'
              % (col.name, col.type, ', '.join(map(cell_str, col.cells))))
    print('\n')

    print(column(table, "mixed"))
422