1 """
2 Read, query and update textual tables via a flexible SQL interface.
3
4 L{Table}s can be created and populated with data from scratch, built from TSV
5 files, 2D lists or other tables. Once the data is loaded in memory, each
6 storage operation on the table object is delegated via bridge to an SQL
7 storage backend (by default this is SQLite). However the table uses the backend
8 only as a temp storage to ensure maximum portability of the data. Tables can be
9 stored persistently as text (TSV) files and then loaded back in memory when
10 needed.
11
12 These Tables can be queried and updated in a vast number of ways; each query
13 returns a new L{Table}:
14
15 1. Using slice expressions. The general form of a slice expression is
16 C{[rows, columns]}, where C{rows} can be:
17
18
19 - a row index, 0-based, e.g. C{5}
20 - a tuple of row indices, e.g. C{(1, 3, 6)}
21 - a standard Python slice, e.g. C{1:3} or C{:5} or C{:}
22 - omitted (means: all rows)
23
24 and C{columns} can be:
25
26 - a column index, 0-based, e.g. C{5}
27 - a tuple of column indices, 0-based
28 - a column name, e.g. C{'TmScore'}
29 - a tuple of column names, e.g. C{('ID', 'TmScore')}
30 - a standard Python slice using column indices
31 - a slice using column names, e.g. C{'ID':'TM'} or C{:'TM'} or C{:}
32 - omitted (means: all columns)
33
34 2. Using query expressions, for example:
35
36
37 >>> table.where('ID').between(1, 5).select('TmScore', 'RMSD')
38 Table ('TmScore', 'RMSD')
39
40 >>> table.where('ID').between(1, 5).update('RMSD', 0.2)
41 Table (the same table)
42
43 3. With SQL queries:
44
45
46 >>> t.query(r'''SELECT ColumnB * ColumnA AS ComputedValue
47 FROM {0.name}
48 WHERE ColumnC IN ({1}, {1})'''.format(t, Predicate.PH),
49 [12, 55])
50 iterable
51
52 The data contained in a Table can be extracted in several ways:
53
54 - if you need a single (scalar) value -- with the C{table[row, column]}
55 indexing expression or with the dedicated C{table.scalar(row, column)} method.
56 - by treating the table as an iterator; each cycle will then yield a L{DataRow}
57 object
58 - with text (TSV) serialization: simply call C{table.dump(file)}.
59
60 See L{Table} for full API details.
61 """
62
63 try:
64 import __builtin__ as builtins
65 except ImportError:
66 import builtins
67
68 import os
69
70 import csb.io
71 import csb.core
72
73 from abc import ABCMeta, abstractmethod, abstractproperty
77 """
78 Abstract SQL backend interface. Defines a number of platform-specific
79 operations, that each concrete backend implementor must provide.
80 """
81
82 __metaclass__ = ABCMeta
83
85 self._table = tablename
86
87 @abstractproperty
90
91 @property
93 """
94 Table name
95 @rtype: str
96 """
97 return self._table
98
def query(self, sql, params=None):
    """
    Execute a native SQL query against the backend, as-is.

    This base implementation does nothing but raise; a backend that
    supports native SQL has to override it.

    @param sql: SQL query
    @type sql: str
    @param params: query bound parameters, if any
    @type params: tuple

    @return: data reader (2D iterable)

    @raise NotImplementedError: always, unless overridden
    """
    raise NotImplementedError()
111
112 @abstractmethod
114 """
115 Count the number of rows in the table.
116
117 @rtype: int
118 """
119 pass
120
121 @abstractmethod
123 """
124 Perform a select operation given L{expression}.
125
126 @type expression: L{Expression}
127 @return: data reader (2D iterable)
128 """
129 pass
130
@abstractmethod
def update(self, expression):
    """
    Perform an update operation given L{expression}.

    Abstract: the body here is a no-op, each concrete backend provides
    the real implementation.

    @type expression: L{Expression}
    @return: void
    """
    pass
140
141 @abstractmethod
143 """
144 Insert a new tuple in the table.
145
146 @type row: tuple
147 @return: void
148 """
149 pass
150
151 @abstractmethod
153 """
154 Create a table given L{metadata}.
155
156 @type metadata: tuple of L{ColumnInfo}
157 @return: void
158 """
159 pass
160
161 @abstractmethod
163 """
164 Perform cleanup (e.g. close connections).
165 """
166 pass
167
170
173
175 """
176 SQLite-based concrete repository implementor.
177 This is the default L{Table} backend.
178 """
179
180 PK = 'ROWID'
181 TYPES = { int: 'BIGINT', float: 'REAL', str: 'VARCHAR' }
182
184
185 SIZE = 10000
186
188 self._cursor = cursor
189
191 try:
192 while True:
193 rows = self._cursor.fetchmany(self.SIZE)
194 if not rows:
195 break
196 else:
197 for row in rows:
198 yield row
199 finally:
200 self._cursor.close()
201
209
210 @property
213
214 - def _cursor(self, sql, params=None):
221
def query(self, sql, params=None):
    """
    Execute a native SQL query and fetch all resulting rows at once.

    @param sql: SQL query
    @type sql: str
    @param params: query bound parameters, if any
    @type params: tuple

    @return: everything the cursor's C{fetchall()} returns
    """
    cursor = self._cursor(sql, params)
    try:
        return cursor.fetchall()
    finally:
        # all rows are already in memory, so the cursor is released right
        # away -- the same way the chunked reader releases its cursor
        cursor.close()
225
227
228 query = 'SELECT COUNT(*)\nFROM {0}\n'.format(self.table)
229 return self._cursor(query).fetchone()[0]
230
243
256
263
265
266 cols = []
267
268 for ci in metadata:
269 type = self._gettype(ci.type)
270 cols.append('{0} {1}'.format(ci.name, type))
271
272 statement = 'CREATE TABLE {0} (\n {1}\n);'.format(self.table, ',\n '.join(cols))
273
274 self._conn.execute(statement)
275
281
283 try:
284 return self._conn.close()
285 except:
286 pass
287
289 """
290 Holder object for column metadata.
291
292 @param name: column name
293 @type name: str
294 @param type: column data type (Python)
295 @type type: type
296 """
297
301
302 @property
304 """
305 Column name
306 @rtype: str
307 """
308 return self._name
309
310 @property
312 """
313 Python data type
314 @rtype: class
315 """
316 return self._type
317
319 return '{0.name}:{0.type.__name__}'.format(self)
320
322 """
323 @return: a deep copy of C{self}
324 """
325 return ColumnInfo(self.name, self.type)
326
328 """
329 Represents a table data row. This is basically what a table iterator
330 yields for each row in a table. Provides both index (position) and
331 column name-based access to the data.
332 """
333
def __init__(self, columns, number, row):
    """
    Bind a sequence of values to their column names.

    @param columns: column names, one for each value in C{row} and in
                    the same order
    @type columns: iterable
    @param number: row number (stored as given)
    @param row: the values of this row
    @type row: iterable
    """
    self._number = number
    self._row = tuple(row)
    # column name -> position of its value in self._row
    self._columns = dict((name, position) for position, name in enumerate(columns))
    # fails when the number of names and values differ; repeated column
    # names collapse into one key and therefore trip this check as well
    assert len(self._columns) == len(self._row)
342
344 return iter(self._row)
345
347
348 if isinstance(i, csb.core.string):
349 return self._row[self._columns[i]]
350 else:
351 return self._row[i]
352
354 return len(self._row)
355
357 return '{0}: {1}'.format(self.__class__.__name__, repr(self._row))
358
361
def dump(self, delimiter='\t'):
    """
    Dump the row as a string.

    @param delimiter: column separator (defaults to tab)
    @type delimiter: str

    @return: the C{str()} form of each value, joined by C{delimiter}
    @rtype: str
    """
    return delimiter.join(str(value) for value in self._row)
370
371 @property
373 """
374 Available column names
375 @rtype: tuple
376 """
377 c = dict((self._columns[cn], cn) for cn in self._columns)
378 return tuple(c[i] for i in sorted(c))
379
380 @property
382 """
383 Row number
384 @rtype: int
385 """
386 return self._number
387
389 """
390 Build and query a TSV Table. See the documentation of L{csb.io.tsv} for
391 details and examples.
392
393 @param definition: column definition string: C{ColA:typeA colB:typeB ...},
394 where C{ColN} is a column name and C{typeN} is one of the
395 base Python data types: str, int, long, float.
396 Alternatively, the table definition may be specified
397 directly as a list of metadata objects.
398 @type definition: str, tuple of L{ColumnInfo}
399 @param name: name of the table on the SQL backend. Useful when you need to
400 execute native SQL queries against the table.
401 @type name: str
402 @param backend: table backend storage engine. This must be a proper
403 L{RepositoryImp} bridge implementor.
404 @type backend: type (reference to a L{RepositoryImp} subclass)
405
406 @raise UnsupportedTypeError: when an unsupported type is used in the table
407 C{definition}
408 @raise ValueError: if the C{definition} is not valid
409 """
410
411 """
412 Table header string, used when saving and restoring TSV files.
413 """
414 HEADER = '# @TSV '
415
417
418 if not issubclass(backend, RepositoryImp):
419 raise TypeError('The Table Backend must be a Repository Implementor')
420
421 self._name = name
422 self._backend = backend
423 self._imp = backend(name)
424
425 try:
426 if isinstance(definition[0], ColumnInfo):
427 self._metadata = [ c.copy() for c in definition ]
428 else:
429 if isinstance(definition, csb.core.string):
430 definition = [ (d.split(':')[0], getattr(builtins, d.split(':')[1])) for d in definition.split() ]
431 self._metadata = [ ColumnInfo(c[0], c[1]) for c in definition ]
432 if len(self._metadata) < 1:
433 raise ValueError()
434 except UnsupportedTypeError:
435 raise
436 except (TypeError, IndexError, ValueError, NameError, AttributeError):
437 raise ValueError('Invalid table definition')
438
439 self._imp.create(self._metadata)
440
441 @staticmethod
443 """
444 Table factory: build a L{Table} from a TSV file.
445
446 @param tsv: TSV path and filename. This can be either a conventional
447 TSV/CSV file, or a file created with C{table.dump(tsv)}
448 @type tsv: str
449 @param definition: table column definition (see L{Table}). If defined,
450 this parameter will determine the structure of the
451 table. Otherwise, the table definition will be
452 extracted from the TSV header. If the file contains
453 no TSV header, this parameter is mandatory.
454 @type definition: str, tuple of L{ColumnInfo}
455 @param delimiter: column separator used in the file
456 @type delimiter: str
457 @param skip: skip the first N number of rows (the header can still be
458 extracted from those however)
459 @type skip: int
460
461 @rtype: L{Table}
462
463 @raise ValueError: if neither a table C{definition} is provided,
464 nor the C{tsv} file has a header line
465 """
466
467 if not definition:
468 with open(tsv) as tsvfile:
469 for line in tsvfile:
470 if line.startswith(Table.HEADER):
471 definition = line[ len(Table.HEADER) : ]
472
473 if not definition:
474 raise ValueError('No header definition found')
475
476 table = Table(definition, name=name, backend=backend)
477
478 with open(tsv) as tsvfile:
479 for i, line in enumerate(tsvfile, start=1):
480 if (skip and i <= skip) or line.startswith(Table.HEADER):
481 continue
482 table.insert(line.rstrip(os.linesep).split(delimiter))
483
484 return table
485
486 @staticmethod
488 """
489 Table factory: build a L{Table} from a 2D iterable/data reader.
490
491 @param iterable: data container
492 @type iterable: iterable (2D)
493 @param definition: table column definition (see L{Table}).
494 @type definition: str, tuple of L{ColumnInfo}
495
496 @rtype: L{Table}
497 """
498 table = Table(definition, name=name, backend=backend)
499
500 for row in iterable:
501 table.insert(list(row))
502
503 return table
504
505 @staticmethod
507 """
508 Table factory: build a L{Table} with the definition of another L{Table}.
509
510 @param table: template table
511 @type table: L{Table}
512 @param data: if True, also copy the data from the source C{table}
513 @type data: bool
514
515 @rtype: L{Table}
516 """
517 if data:
518 return Table.from_iterable(table, table._metadata, name=name, backend=backend)
519 else:
520 return Table(table._metadata, name=name, backend=backend)
521
524
526 return self._imp.count()
527
535
537 import numpy
538 return numpy.array([ tuple(row) for row in self ])
539
545
557
559
560 exp = self._interpret(i)
561
562 if len(exp.select) != 1:
563 raise NotImplementedError('single-column expression expected')
564 if csb.core.iterable(value):
565 raise NotImplementedError("single-value assignment expected")
566
567 exp.data = value
568 self._update(exp)
569
578
580 """
581 Parse a table slice and convert it into an L{Expression}.
582 @rtype: L{Expression}
583 """
584
585 if not csb.core.iterable(i):
586 i = [i, slice(None, None)]
587 else:
588 i = list(i)
589
590 if len(i) not in (1, 2):
591 raise ValueError('Tables are only 2 dimensional')
592 if len(i) == 1:
593 i.append(slice(None, None))
594
595 exp = Expression(self.columns)
596 columns = self._getcols(i[1])
597 if len(columns) < 1:
598 raise ValueError('Column slices must return at least one column')
599 exp.select = columns
600 exp.where = self.pk
601
602 if isinstance(i[0], int):
603 self._checkrow(i[0])
604 if len(columns) == 1 and isinstance(i[1], (int, csb.core.string)):
605 exp.scalar = True
606 exp.predicate = Equals(i[0] + 1)
607
608 elif csb.core.iterable(i[0]):
609 params = list(i[0])
610 self._checkrow(params)
611 params = list(map(lambda x: x + 1, params))
612 exp.predicate = In(params)
613
614 elif isinstance(i[0], slice):
615
616 sl = i[0]
617 if sl.step is not None:
618 raise NotImplementedError('Row slice steps are not supported')
619
620 if sl == slice(None, None):
621 exp.where = None
622 elif sl.start is None:
623 self._checkrow(sl.stop)
624 exp.predicate = Lower(sl.stop + 1)
625 elif sl.stop is None:
626 self._checkrow(sl.start)
627 exp.predicate = GreaterOrEquals(sl.start + 1)
628 else:
629 self._checkrow([sl.start, sl.stop])
630 exp.predicate = Between(sl.start + 1, sl.stop)
631
632 else:
633 raise TypeError("Can't handle row slice expression: {0}".format(i[0]))
634
635 return exp
636
638
639 if isinstance(i, int):
640 if i < 0:
641 raise NotImplementedError('Negative row indices are not supported')
642 elif csb.core.iterable(i):
643 for j in i:
644 self._checkrow(j)
645 else:
646 raise TypeError(i)
647
649
650 columns = list(self.columns)
651
652 if spec is None and ifnull is not None:
653 return [ifnull]
654
655 elif isinstance(spec, int):
656 try:
657 return [columns[spec]]
658 except:
659 raise IndexError('Column {0} out of range'.format(spec))
660
661 elif isinstance(spec, csb.core.string):
662 if spec in columns:
663 return [spec]
664 else:
665 raise InvalidColumnError(spec)
666
667 elif isinstance(spec, slice):
668 start = self._getcols(spec.start, columns[0])
669 start = columns.index(start[0])
670
671 end = self._getcols(spec.stop, columns[-1])
672 end = columns.index(end[0])
673 if spec.stop is None:
674 end += 1
675
676 return [columns[i] for i in range(start, end, spec.step or 1)]
677
678 elif csb.core.iterable(spec):
679 return [self._getcols(i)[0] for i in spec]
680
681 else:
682 raise TypeError("Can't handle column slice expression: {0}".format(spec))
683
684 @property
686 """
687 Table name
688 @rtype: str
689 """
690 return self._name
691
692 @property
694 """
695 All column names
696 @rtype: tuple
697 """
698 return tuple(i.name for i in self._metadata)
699
700 @property
703
def dump(self, file):
    """
    Dump the table in a file.

    A header line is written first: L{Table.HEADER} followed by the
    space-separated column definitions. Each table row then goes on a
    line of its own, with tab-separated values.

    @param file: destination stream or filename
    @type file: file (stream) or str (filename)
    """

    with csb.io.EntryWriter(file, close=False) as out:

        # built as a real list, so the header does not depend on what
        # map() returns on the running Python version
        definition = [str(column) for column in self._metadata]
        out.write(Table.HEADER)
        out.writeall(definition, delimiter=' ')
        out.write(csb.io.NEWLINE)

        for row in self:
            out.writeline(row.dump(delimiter='\t'))
721
def query(self, sql, params=None):
    """
    Execute a native SQL query against the storage engine.

    @param sql: SQL query text. May contain parameter binding placeholders
                (see L{Predicate.PH}). The SQL dialect of the query depends
                on the SQL C{backend} being used by the table.
    @type sql: str
    @param params: query bound parameters, if any; handed over to the
                   backend unchanged
    @type params: tuple

    @return: native data reader
    @rtype: iterable (2D)
    """
    return self._imp.query(sql, params)
734
736 """
737 Insert a new row in the table.
738
739 @param row: a tuple of the appropriate length
740 @type row: tuple
741 """
742 self._imp.insert(row)
743
745
746 metadata = dict((c.name, c) for c in self._metadata)
747 try:
748 return [metadata[cn].copy() for cn in columns]
749 except KeyError as ke:
750 raise InvalidColumnError(ke.message)
751 except:
752 raise
753
759
767
def where(self, column):
    """
    Start a query expression on C{column}.

    @param column: column name
    @type column: str

    @return: a L{Where} object built from this table, a new L{Expression}
             over the table's columns, and C{column}

    @raise InvalidColumnError: when an invalid column is requested
    """
    return Where(self, Expression(self.columns), column)
776
778 """
779 @return: a new L{Table}
780
781 @param columns: column names; defaults to all columns
782 @type columns: str, tuple of str
783 @raise InvalidColumnError: when an invalid column is requested
784 """
785 columns = Expression.array(columns)
786
787 exp = Expression(self.columns)
788 exp.select = columns
789
790 return self._execute(exp)
791
def update(self, column, value):
    """
    Update C{column} for all rows in the table.

    @param column: column to update (name)
    @type column: str
    @param value: new column value

    @return: whatever the table's internal C{_update} returns for the
             resulting expression

    @raise InvalidColumnError: when an invalid column is referenced
    """
    # no where/predicate is set on the expression: only the target column
    # and the new value are specified
    expression = Expression(self.columns)
    expression.select = [column]
    expression.data = value

    return self._update(expression)
806
def scalar(self, row=None, column=None):
    """
    @return: a scalar value at the specified row and column.

    @param row: row index; if not specified - take the first row
    @type row: int
    @param column: column name; if not specified - take the first
    @type column: str

    @raise IndexError: when an invalid row is requested
    @raise InvalidColumnError: when an invalid column is requested
    """

    index = 0 if row is None else row
    # the key value matched below is the requested row index plus one
    key = index + 1

    if column is None:
        column = self.columns[0]
    elif column not in self.columns:
        raise InvalidColumnError(column)

    exp = Expression(self.columns)
    exp.select = [column]
    exp.where = self.pk
    exp.predicate = Equals([key])

    rows = list(self._imp.execute(exp))
    if not rows:
        # nothing matched the key: report which row was asked for
        raise IndexError('Row {0} out of range'.format(index))

    return rows[0][0]
838
840 """
841 @return: a list of all values in the specified column
842
843 @param column: column to fetch
844 @type column: str
845 """
846 if column not in self.columns:
847 raise InvalidColumnError(column)
848
849 return [ row[column] for row in self ]
850
852 """
853 Metadata container: represents a table select or update expression.
854 """
855
857
858 self._table = None
859 self._columns = []
860
861 self._columns = list(columns)
862 self._select = []
863 self._where = None
864 self._predicate = None
865 self._data = None
866 self._scalar = False
867
868 self.select = '*'
869
870 @staticmethod
875
876 @property
878 return tuple(self._columns)
879
880 @property
886
887 @property
890 @select.setter
905
906 @property
909 @where.setter
911 if not value:
912 self._where = None
913 self._predicate = None
914 else:
915 self._where = value
916
917 @property
919 return self._predicate
920 @predicate.setter
922 if not value:
923 self._where = None
924 self._predicate = None
925 else:
926 self._predicate = value
927
928 @property
931 @data.setter
932 - def data(self, value):
934
935 @property
938 @scalar.setter
941
943
948
949 @property
952
953 @property
955 return self._expression
956
958
959 - def __init__(self, table, expression, column):
966
967 - def in_(self, *values):
969
970 - def notin(self, *values):
972
975
978
981
984
987
989
990 - def __init__(self, table, expression, predicate):
994
996 """
997 @return: a new L{Table}
998
999 @param columns: column names; defaults to all columns
1000 @type columns: str, tuple of str
1001 @raise InvalidColumnError: when an invalid column is requested
1002 """
1003 exp = self.expression
1004 exp.select = columns
1005
1006 return self.table._execute(exp)
1007
def update(self, column, value):
    """
    Update C{column} using the expression this object already carries.

    The existing expression is reused (not copied): its C{select} and
    C{data} are overwritten, everything else set on it stays as it is,
    and it is then handed to the table for execution.

    @param column: column to update (name)
    @type column: str
    @param value: new column value

    @return: whatever the table's internal C{_update} returns

    @raise InvalidColumnError: when an invalid column is referenced
    """
    expression = self.expression
    expression.select = [column]
    expression.data = value

    return self.table._update(expression)
1022
1024
1025 __metaclass__ = ABCMeta
1026
1027 PH = '?'
1028
1043
1044 @property
1046 return tuple(self._params)
1047
1049
1050 if len(self._params) < 1:
1051 raise ValueError('{0} predicate with no params'.format(self.__class__.__name__))
1052
1053 @abstractproperty
1056
1059
1060 -class In(Predicate):
1066
1073
1079
1085
1091
1097
1103
1109
1117