Package qcsv
[frames] | [no frames]

Source Code for Package qcsv

  1  from collections import namedtuple 
  2  import csv 
  3   
  4  import numpy as np 
  5   
  6   
# Table represents a parsed CSV file: `types` maps each column name to its
# inferred type function (str, int, float, or None for all-empty columns),
# `names` is the ordered list of column names, and `rows` is a list of rows,
# each row a list of cell values.
Table = namedtuple('Table', ['types', 'names', 'rows'])

# Column represents one extracted column: its inferred type, its name, and
# its cells (a NumPy array when built by column()/columns() below).
Column = namedtuple('Column', ['type', 'name', 'cells'])
  9   
 10   
def read(fname, delimiter=',', skip_header=False):
    """
    read loads cell data, column headers and type information for each column
    given a file path to a CSV formatted file. A "Table" namedtuple is
    returned with fields "types", "names" and "rows".

    All cells have left and right whitespace trimmed.

    All rows MUST be the same length.

    delimiter is the string that separates each field in a row.

    If skip_header is set, then no column headers are read, and column names
    are set to their corresponding indices (as strings).
    """
    column_names, row_data = _data(fname, delimiter, skip_header)
    column_types = _column_types(column_names, row_data)
    raw = Table(types=column_types, names=column_names, rows=row_data)
    # cast converts every cell from its string form to its column's type.
    return cast(raw)
30 31
32 -def _data(fname, delimiter=',', skip_header=False):
33 """ 34 _data loads cell data and column headers, and returns the names and rows. 35 36 All cells have left and right whitespace trimmed. 37 38 All rows MUST be the same length. 39 40 delimiter and skip_header are described in read. 41 """ 42 names = [] 43 rows = [] 44 reader = csv.reader(open(fname), delimiter=delimiter) 45 if not skip_header: 46 names = map(str.strip, reader.next()) 47 48 for i, row in enumerate(reader): 49 # If we haven't discovered names from column headers, then name the 50 # columns "0", "1", ..., "n-1" where "n" is the number of columns in 51 # the first row. 52 if len(names) == 0: 53 names = map(str, range(0, len(row))) 54 assert len(row) == len(names), \ 55 'The length of row %d is %d, but others rows have length %d' \ 56 % (i, len(row), len(names)) 57 58 rows.append(map(str.strip, row)) 59 60 return names, rows
61 62
63 -def _column_types(names, rows):
64 """ 65 _column_types infers type information from the columns in rows. Types are 66 stored as either a Python type conversion function (str, int or float) or 67 as a None value. A dictionary of column names to types is returned. 68 69 A column has type None if and only if all cells in the column are empty. 70 (Cells are empty if the length of its value is zero after left and right 71 whitespace has been trimmed.) 72 73 A column has type float if and only if all cells in the column are empty, 74 integers or floats AND at least one value is a float. 75 76 A column has type int if and only if all cells in the column are empty or 77 integers AND at least one value is an int. 78 79 A column has type string in any other case. 80 """ 81 types = dict([(name, None) for name in names]) 82 83 for c in xrange(len(names)): 84 # prev_typ is what we believe the type of this column to be up 85 # until this point. 86 prev_typ = None 87 88 # next_typ is the type of the current cell only. It is compared 89 # with prev_typ to determine the type of the overall column as 90 # per the conditions specified in this function's documentation. 91 next_typ = None 92 93 for row in rows: 94 col = row[c] 95 96 # A missing value always has type None. 97 if len(col) == 0: 98 next_typ = None 99 # No need to inspect the type if we've already committed to str. 100 # (We bail out because it's expensive to inspect types like this.) 101 elif prev_typ is not str: 102 # The trick here is to attempt type casting from a stirng to 103 # an int or a string to a float, and if Python doesn't like it, 104 # we try something else. 105 try: 106 # We try int first, since any integer can be successfully 107 # converted to a float, but not all floats can converted 108 # to integers. 109 int(col) 110 next_typ = int 111 except ValueError: 112 try: 113 # If we can't convert to float, then we must scale back 114 # to a string. 
115 float(col) 116 next_typ = float 117 except ValueError: 118 next_typ = str 119 120 # If a column contains a string, the column type is always a 121 # string. 122 if prev_typ is str or next_typ is str: 123 prev_typ = str 124 # A column with floats and ints has type float. 125 elif next_typ is float and prev_typ is int: 126 prev_typ = float 127 # A column with missing values and X has type X. 128 elif prev_typ is None and next_typ is not None: 129 prev_typ = next_typ 130 131 types[names[c]] = prev_typ 132 return types
133 134
def map_names(table, f):
    """
    map_names executes f on every column header in the table, with three
    arguments, in order: column type, column index, column name. The result
    of the function is placed in the corresponding header location.

    A new table is returned with the new column names.
    """
    renamed = [f(table.types[name], i, name)
               for i, name in enumerate(table.names)]
    return table._replace(names=renamed)
147 148
def map_data(table, f):
    """
    map_data executes f on every cell of data with five arguments, in order:
    column type, column name, row index, column index, contents. The result
    of the function is placed in the corresponding cell location.

    A new table is returned with the converted values.
    """
    converted = []
    for r, row in enumerate(table.rows):
        new_row = [f(table.types[table.names[c]], table.names[c], r, c, cell)
                   for c, cell in enumerate(row)]
        converted.append(new_row)
    return table._replace(rows=converted)
166 167
def cast(table):
    """
    cast type casts all of the values in 'rows' to their corresponding types
    in types.

    The only special case here is missing values or NULL columns. If a value
    is missing or a column has type NULL (i.e., all values are missing), then
    the value is replaced with None, which is Python's version of a NULL value.

    N.B. cast is idempotent. i.e., cast(x) = cast(cast(x)).
    """
    def f(typ, name, r, c, cell):
        # `cell == ""` treats any empty string as missing; the original's
        # `isinstance(cell, basestring)` check relied on the Python-2-only
        # `basestring` name and raises NameError on Python 3.
        if typ is None or cell is None or cell == "":
            return None
        return typ(cell)
    return map_data(table, f)
def convert_missing_cells(table, dstr="", dint=0, dfloat=0.0):
    """
    convert_missing_cells changes the values of all NULL cells to the values
    specified by dstr, dint and dfloat. For example, all NULL cells in columns
    with type "string" will be replaced with the value given to dstr.
    """
    # Maps a column type to the replacement value for its NULL cells.
    defaults = {str: dstr, int: dint, float: dfloat}

    def f(typ, name, r, c, cell):
        if cell is None and typ is not None:
            try:
                return defaults[typ]
            except KeyError:
                # Raise a real exception: the original's `assert False` is
                # stripped under `python -O`, which would make f fall through
                # and silently replace the cell with None.
                raise ValueError("Unknown type: %s" % typ)
        return cell
    return map_data(table, f)
def convert_columns(table, **kwargs):
    """
    convert_columns executes converter functions on specific columns, where
    the parameter names for kwargs are the column names, and the parameter
    values are functions of one parameter that return a single value.

    e.g., convert_columns(names, rows, colname=lambda s: s.lower()) would
    convert all values in the column with name 'colname' to lowercase.
    """
    def f(typ, name, r, c, cell):
        # Columns without a converter pass through untouched.
        if name not in kwargs:
            return cell
        return kwargs[name](cell)
    return map_data(table, f)
def convert_types(table, fstr=None, fint=None, ffloat=None):
    """
    convert_types works just like convert_columns, but on types instead of
    specific columns. This function will likely be more useful, since
    sanitization functions are typically type oriented rather than column
    oriented.

    However, when there are specific kinds of columns that need special
    sanitization, convert_columns should be used.
    """
    # Dispatch table: column type -> converter (None means leave as is).
    by_type = {str: fstr, int: fint, float: ffloat}

    def f(typ, name, r, c, cell):
        converter = by_type.get(typ)
        if converter is None:
            return cell
        return converter(cell)
    return map_data(table, f)
def column(table, colname):
    """
    column returns the column with name "colname", where the column returned
    is a triple of the column type, the column name and a NumPy array of
    cells in the column. The name comparison is case insensitive.
    """
    # Lowercase the wanted name once; the original lowered `colname` and then
    # redundantly called .lower() on it again inside the loop.
    wanted = colname.lower()
    colindex = -1
    for i, name in enumerate(table.names):
        if name.lower() == wanted:
            colindex = i
            break
    # Report the caller's original spelling, not the lowercased copy.
    assert colindex > -1, 'Column name %s does not exist' % colname

    # Index each row directly; the original scanned every cell of every row
    # just to pick out one column.
    colcells = [row[colindex] for row in table.rows]

    return Column(type=table.types[table.names[colindex]],
                  name=table.names[colindex],
                  cells=np.array(colcells))
def columns(table):
    """
    columns returns a list of all columns in the data set, where each column
    is a triple of its type, name and a NumPy array of cells in the column.
    """
    # Transpose the row-major data into per-column cell lists.
    percol = [[] for _ in table.names]
    for row in table.rows:
        for i, cell in enumerate(row):
            percol[i].append(cell)

    return [Column(type=table.types[name],
                   name=name,
                   cells=np.array(percol[i]))
            for i, name in enumerate(table.names)]
def frequencies(column):
    """
    frequencies returns a dictionary where the keys are unique values in the
    column, and the values correspond to the frequency of each value in the
    column.
    """
    # np.unique with return_counts=True yields the sorted unique values and
    # their occurrence counts in one pass (equivalent to the manual
    # searchsorted + bincount pairing).
    values, counts = np.unique(column.cells, return_counts=True)
    return dict(zip(values, counts))
def type_str(typ):
    """
    type_str returns a string representation of a column type.
    """
    # Lookup table instead of an if/elif chain; type objects and None are
    # all hashable and compare by identity.
    known = {None: "None", float: "float", int: "int", str: "str"}
    return known.get(typ, "Unknown")
def cell_str(cell_contents):
    """
    cell_str is a convenience function for converting cell contents to a
    string when there are still NULL values.

    N.B. If you choose to work with data while keeping NULL values, you will
    likely need to write more functions similar to this one.
    """
    return "NULL" if cell_contents is None else str(cell_contents)
327 328 344 345 line = "" 346 for i, name in enumerate(headers): 347 line += padded_cell(i, name) 348 print line 349 print '-' * (sum(map(len, headers)) + len(headers) * padding) 350 for row in table.rows: 351 line = "" 352 for i, col in enumerate(row): 353 line += padded_cell(i, cell_str(col)) 354 print line 355 356 357 if __name__ == '__main__': 358 # File name. 359 f = "sample.csv" 360 361 table = read(f) 362 363 # Print the table of raw data. 364 print "# Raw data." 365 print_data_table(table) 366 print '\n' 367 368 # Print the table after converting missing values from NULL to concrete 369 # values. The benefit here is that NULL values are inherently incomputable. 370 # Whenever they get thrown into a computation on the data, they will always 371 # provoke a runtime error. This is a Good Thing, because missing values 372 # SHOULD be given explicit treatment. Inserting values into missing cells 373 # is making an *assumption* about the data, and should never be implied. 374 # 375 # Thus, `convert_missing_cells` is an EXPLICIT way of throwing away NULL 376 # cells. If you view the output from the previous table, and the output of 377 # the next table, you'll notice that NULL values have been replaced. 378 # (String columns get empty strings, integer columns get 0 and float 379 # columns get 0.0.) 380 # 381 # If you want to change what the missing values are replaced with, you can 382 # use the function's optional parameters: 383 # 384 # rows = convert_missing_cells(types, names, rows, 385 # dstr="-9.99", dfloat=-9.99, dint=-9) 386 table = convert_missing_cells(table) 387 print "# Convert missing cells to arbitrary values" 388 print_data_table(table) 389 print '\n' 390 391 # Now that all of the NULL cells have been removed, we are free to run data 392 # sanitization functions on the columns of data without worrying about 393 # seeing those nasty NULL values. For instance, we might want to make all 394 # strings in the 'string1' column be lowercase. 
We need only to pass a 395 # function as an argument, where the function we pass takes a single 396 # argument (the cell contents) and returns the new cell contents. In this 397 # case, we tell every cell in the `string1` column to be converted using 398 # the `str.lower` function. 399 table = convert_columns(table, string1=str.lower) 400 print "# Sanitize just one column of data" 401 print_data_table(table) 402 print '\n' 403 404 # The aforementioned function has limited use, since you typically 405 # want to be more dynamic than having to give names of columns. Thus, the 406 # `convert_types` function allows you to convert cells based on their 407 # *type*. That is, instead of making only a selection of columns lowercase, 408 # we can specify that *all* string columns should be lowercase. 409 table = convert_types(table, fstr=str.lower) 410 print "# Sanitize all cells that have type string" 411 print_data_table(table) 412 print '\n' 413 414 # Finally, you can traverse your data set by columns like so: 415 for col in columns(table): 416 print '(%s, %s) [%s]' \ 417 % (col.name, col.type, ', '.join(map(cell_str, col.cells))) 418 print '\n' 419 420 # Or pick out one column in particular: 421 print column(table, "mixed") 422