Package qcsv
[frames | no frames]

Source Code for Package qcsv

  1  import csv 
  2   
  3   
def read(fname, delimiter=',', skip_header=False):
    """
    read loads cell data, column headers and type information for each column
    given a file path to a CSV formatted file.

    All cells have left and right whitespace trimmed.

    All rows MUST be the same length.

    delimiter is the string that separates each field in a row.

    If skip_header is set, then no column headers are read, and column names
    are set to their corresponding indices (as strings).

    Returns a triple (types, names, rows), where types maps each column name
    to its inferred type, names is the list of column names and rows is the
    list of rows with every cell cast to its column's type.
    """
    # Use the fname parameter here. Referring to any other name would make
    # this function depend on a module-level global that only exists when
    # the file is run as a script.
    names, rows = data(fname, delimiter, skip_header)
    types = column_types(names, rows)
    rows = cast(types, names, rows)
    return types, names, rows
22 23
def data(fname, delimiter=',', skip_header=False):
    """
    data loads cell data and column headers.

    All cells have left and right whitespace trimmed.

    All rows MUST be the same length; an AssertionError is raised otherwise.

    delimiter and skip_header are described in read.

    Returns a pair (names, rows), where names is a list of column names and
    rows is a list of rows, each row being a list of trimmed strings. An
    empty file yields ([], []).
    """
    names = []
    rows = []
    # The with-block closes the file even if a row fails validation.
    with open(fname) as handle:
        reader = csv.reader(handle, delimiter=delimiter)
        if not skip_header:
            # The next() builtin works on both Python 2.6+ and Python 3; the
            # default keeps an empty file from raising StopIteration.
            names = [cell.strip() for cell in next(reader, [])]

        for i, row in enumerate(reader):
            # If we haven't discovered names from column headers, then name
            # the columns "0", "1", ..., "n-1" where "n" is the number of
            # columns in the first row.
            if len(names) == 0:
                names = [str(n) for n in range(len(row))]
            # Raised explicitly (rather than via an assert statement) so the
            # check is not stripped when Python runs with -O.
            if len(row) != len(names):
                raise AssertionError(
                    'The length of row %d is %d, but others rows have length %d'
                    % (i, len(row), len(names)))

            # Real lists (not lazy map objects) so callers can index and
            # measure rows on any Python version.
            rows.append([cell.strip() for cell in row])

    return names, rows
53 54
def column_types(names, rows):
    """
    column_types infers type information from the columns in rows. Each
    column name is mapped to a Python type conversion function (str, int or
    float) or to None.

    None:  every cell in the column is empty (zero length).
    float: every cell is empty, an integer or a float, and at least one cell
           is a float.
    int:   every cell is empty or an integer, and at least one cell is an
           integer.
    str:   any other case.
    """
    def cell_type(cell):
        # An empty cell carries no type information at all.
        if len(cell) == 0:
            return None
        # Attempt the narrowest conversion first: every integer literal also
        # parses as a float, but not the other way round.
        for convert in (int, float):
            try:
                convert(cell)
            except ValueError:
                continue
            return convert
        # Neither numeric conversion accepted the text, so it is a string.
        return str

    inferred = dict.fromkeys(names)
    for row in rows:
        for index, cell in enumerate(row):
            name = names[index]
            # What the column looked like before this cell was examined.
            seen = inferred[name]
            # What this cell looks like on its own.
            current = cell_type(cell)

            if str in (seen, current):
                # One string anywhere makes the whole column a string column.
                inferred[name] = str
            elif seen == int and current == float:
                # Integers mixed with floats widen to float.
                inferred[name] = float
            elif seen is None and current is not None:
                # First non-empty cell decides the column's starting type.
                inferred[name] = current
    return inferred
120 121
def cast(types, names, rows):
    """
    cast type casts all of the values in 'rows' to their corresponding types
    in types.

    The only special case here is missing values or NULL columns. If a value
    is missing or a column has type NULL (i.e., all values are missing), then
    the value is replaced with None, which is Python's version of a NULL value.

    types maps column names to a conversion function (str, int, float) or
    None; names lists the column names in column order. A new list of rows is
    returned; the input rows are not modified.

    N.B. cast is idempotent. i.e., cast(x) = cast(cast(x)).
    """
    # basestring only exists on Python 2 (where it covers str and unicode);
    # fall back to str so the function also works on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    new_rows = []
    for row in rows:
        new_row = []
        for i, col in enumerate(row):
            typ = types[names[i]]
            is_empty_text = isinstance(col, string_types) and len(col) == 0
            if is_empty_text or typ is None or col is None:
                new_row.append(None)
            else:
                new_row.append(typ(col))
        new_rows.append(new_row)
    return new_rows
145 146
def convert_missing_cells(types, names, rows, dstr="", dint=0, dfloat=0.0):
    """
    convert_missing_cells returns a copy of rows in which every NULL (None)
    cell has been replaced by a default chosen by its column's type: dstr for
    string columns, dint for integer columns and dfloat for float columns.

    Cells in columns whose type is None (entirely empty columns) are left as
    None, and non-NULL cells are copied unchanged.
    """
    defaults = ((str, dstr), (int, dint), (float, dfloat))

    filled_rows = []
    for row in rows:
        filled = []
        for index, cell in enumerate(row):
            kind = types[names[index]]
            # Nothing to substitute: either the cell has a value, or the
            # whole column is NULL and has no type to pick a default from.
            if cell is not None or kind is None:
                filled.append(cell)
                continue
            for candidate, default in defaults:
                if kind == candidate:
                    filled.append(default)
                    break
            else:
                assert False, "Unknown type: %s" % kind
        filled_rows.append(filled)
    return filled_rows
172 173
def convert_columns(names, rows, **kwargs):
    """
    convert_columns applies per-column converter functions. Each keyword
    argument names a column, and its value is a one-argument function whose
    return value replaces every cell in that column. Columns without a
    matching keyword argument are copied unchanged.

    e.g., convert_columns(names, rows, colname=lambda s: s.lower()) would
    convert all values in the column with name 'colname' to lowercase.
    """
    return [
        [
            kwargs[names[index]](cell) if names[index] in kwargs else cell
            for index, cell in enumerate(row)
        ]
        for row in rows
    ]
194 195
def convert_types(types, names, rows, fstr=None, fint=None, ffloat=None):
    """
    convert_types works like convert_columns, but selects cells by column
    type rather than by column name: fstr is applied to every cell of a
    string column, fint to integer columns and ffloat to float columns. A
    converter left as None leaves cells of that type unchanged.

    This is usually the more useful of the two, since sanitization functions
    tend to be type oriented rather than column oriented. Use
    convert_columns when specific columns need special treatment.
    """
    converters = ((str, fstr), (int, fint), (float, ffloat))

    converted_rows = []
    for row in rows:
        converted = []
        for index, cell in enumerate(row):
            kind = types[names[index]]
            for candidate, convert in converters:
                if kind == candidate and convert is not None:
                    converted.append(convert(cell))
                    break
            else:
                # No converter registered for this column's type.
                converted.append(cell)
        converted_rows.append(converted)
    return converted_rows
222 223
def column(types, names, rows, colname):
    """
    column looks up a single column by name (case-insensitively; the first
    match wins) and returns it as a triple of the column type, the column
    name as it appears in names, and the list of that column's cells.

    An AssertionError is raised if no column has the given name.
    """
    wanted = colname.lower()
    position = next(
        (index for index, name in enumerate(names) if name.lower() == wanted),
        -1)
    assert position > -1, 'Column name %s does not exist' % wanted

    # Rows too short to contain the column simply contribute no cell.
    cells = [row[position] for row in rows if position < len(row)]

    found = names[position]
    return types[found], found, cells
245 246
def columns(types, names, rows):
    """
    columns transposes the data set: it returns one entry per column, in the
    order given by names, where each entry is a triple of the column's type,
    its name and the list of its cells.
    """
    # One independent bucket per column; cells are dropped into the bucket
    # matching their position within the row.
    buckets = [[] for _ in names]
    for row in rows:
        for index, cell in enumerate(row):
            buckets[index].append(cell)

    return [(types[name], name, cells) for name, cells in zip(names, buckets)]
263 264
def type_str(typ):
    """
    type_str returns a string representation of a column type: "None",
    "float", "int" or "str" for the four known column types, and "Unknown"
    for anything else.
    """
    labels = ((None, "None"), (float, "float"), (int, "int"), (str, "str"))
    for known, label in labels:
        # Identity comparison: only the exact type objects are recognised.
        if typ is known:
            return label
    return "Unknown"
278 279
def cell_str(cell_contents):
    """
    cell_str is a convenience function that renders a cell as a string while
    NULL values are still present: None becomes "NULL" and every other value
    is passed through str().

    N.B. If you choose to work with data while keeping NULL values, you will
    likely need to write more functions similar to this one.
    """
    return "NULL" if cell_contents is None else str(cell_contents)
291 292 308 309 line = "" 310 for i, name in enumerate(headers): 311 line += padded_cell(i, name) 312 print line 313 print '-' * (sum(map(len, headers)) + len(headers) * padding) 314 for row in rows: 315 line = "" 316 for i, col in enumerate(row): 317 line += padded_cell(i, cell_str(col)) 318 print line 319 320 321 if __name__ == '__main__': 322 # File name. 323 f = "sample.csv" 324 325 # Get the initial data from the CSV file, "names" will contain the values 326 # of each column in the first row if the optional "skip_header" parameter 327 # is not set. If "skip_header" is set, then the column names are the column 328 # indices as strings. 329 names, rows = data(f) 330 331 # Infers the type of each column. See the function definition for how the 332 # types are computed. 333 types = column_types(names, rows) 334 335 # Finally, Python's CSV module reads raw data as strings. Thus, arithmetic 336 # operations on integer/float columns won't work well. So we use the type 337 # information gathered from `column_types` to cast all of the cell data to 338 # their appropriate types. 339 # 340 # Note that this is the place where missing values are inserted as "None". 341 # They can be changed with the conversion functions. (Examples follow.) 342 rows = cast(types, names, rows) 343 344 # At this point, we've: 345 # 1) Loaded the data into a list of column headers and a list of rows. 346 # 2) Inferred the type of each column (string, integer, float or None). 347 # 3) Casted all cells to their appropriate Python types. 348 # These three steps can be accomplished with a single convenience function: 349 # 350 # types, names, rows = read(f) 351 # 352 # I've taken the long approach here so that the process is less opaque. 353 354 # Print the table of raw data. 355 print "# Raw data." 356 print_data_table(types, names, rows) 357 print '\n' 358 359 # Print the table after converting missing values from NULL to concrete 360 # values. 
The benefit here is that NULL values are inherently incomputable. 361 # Whenever they get thrown into a computation on the data, they will always 362 # provoke a runtime error. This is a Good Thing, because missing values 363 # SHOULD be given explicit treatment. Inserting values into missing cells 364 # is making an *assumption* about the data, and should never be implied. 365 # 366 # Thus, `convert_missing_cells` is an EXPLICIT way of throwing away NULL 367 # cells. If you view the output from the previous table, and the output of 368 # the next table, you'll notice that NULL values have been replaced. 369 # (String columns get empty strings, integer columns get 0 and float 370 # columns get 0.0.) 371 # 372 # If you want to change what the missing values are replaced with, you can 373 # use the function's optional parameters: 374 # 375 # rows = convert_missing_cells(types, names, rows, 376 # dstr="-9.99", dfloat=-9.99, dint=-9) 377 rows = convert_missing_cells(types, names, rows) 378 print "# Convert missing cells to arbitrary values" 379 print_data_table(types, names, rows) 380 print '\n' 381 382 # Now that all of the NULL cells have been removed, we are free to run data 383 # sanitization functions on the columns of data without worrying about 384 # seeing those nasty NULL values. For instance, we might want to make all 385 # strings in the 'string1' column be lowercase. We need only to pass a 386 # function as an argument, where the function we pass takes a single 387 # argument (the cell contents) and returns the new cell contents. In this 388 # case, we tell every cell in the `string1` column to be converted using 389 # the `str.lower` function. 390 rows = convert_columns(names, rows, string1=str.lower) 391 print "# Sanitize just one column of data" 392 print_data_table(types, names, rows) 393 print '\n' 394 395 # The aforementioned function has limited use, since you typically 396 # want to be more dynamic than having to give names of columns. 
Thus, the 397 # `convert_types` function allows you to convert cells based on their 398 # *type*. That is, instead of making only a selection of columns lowercase, 399 # we can specify that *all* string columns should be lowercase. 400 rows = convert_types(types, names, rows, fstr=str.lower) 401 print "# Sanitize all cells that have type string" 402 print_data_table(types, names, rows) 403 print '\n' 404 405 # Finally, you can traverse your data set by columns like so: 406 for typ, name, cells in columns(types, names, rows): 407 print '(%s, %s) [%s]' % (name, typ, ', '.join(map(cell_str, cells))) 408 print '\n' 409 410 # Or pick out one column in particular: 411 print column(types, names, rows, "mixed") 412