__doc__ = """\

C{littletable} - a Python module to give ORM-like access to a collection of objects

The C{littletable} module provides low-overhead, schema-less, in-memory database access to a
collection of user objects. C{littletable} provides a L{DataObject} class for ad hoc creation
of semi-immutable objects that can be stored in a C{littletable} L{Table}.

In addition to basic ORM-style insert/remove/query/delete access to the contents of a
Table, C{littletable} offers:
 - simple indexing for improved retrieval performance, with optional enforcement of key uniqueness
 - access to objects using indexed attributes
 - simplified joins using '+' operator syntax between annotated Tables
 - the result of any query or join is a new first-class C{littletable} Table

C{littletable} Tables do not require an upfront schema definition, but simply work off of the
attributes in the stored values, and those referenced in any query parameters.

Here is a simple C{littletable} data storage/retrieval example::

    from littletable import Table, DataObject

    customers = Table('customers')
    customers.create_index("id", unique=True)
    customers.insert(DataObject(id="0010", name="George Jetson"))
    customers.insert(DataObject(id="0020", name="Wile E. Coyote"))
    customers.insert(DataObject(id="0030", name="Jonny Quest"))

    catalog = Table('catalog')
    catalog.create_index("sku", unique=True)
    catalog.insert(DataObject(sku="ANVIL-001", descr="1000lb anvil", unitofmeas="EA", unitprice=100))
    catalog.insert(DataObject(sku="BRDSD-001", descr="Bird seed", unitofmeas="LB", unitprice=3))
    catalog.insert(DataObject(sku="MAGNT-001", descr="Magnet", unitofmeas="EA", unitprice=8))
    catalog.insert(DataObject(sku="MAGLS-001", descr="Magnifying glass", unitofmeas="EA", unitprice=12))

    wishitems = Table('wishitems')
    wishitems.create_index("custid")
    wishitems.create_index("sku")
    wishitems.insert(DataObject(custid="0020", sku="ANVIL-001"))
    wishitems.insert(DataObject(custid="0020", sku="BRDSD-001"))
    wishitems.insert(DataObject(custid="0020", sku="MAGNT-001"))
    wishitems.insert(DataObject(custid="0030", sku="MAGNT-001"))
    wishitems.insert(DataObject(custid="0030", sku="MAGLS-001"))

    # print a particular customer name
    # (unique indexes will return a single item; non-unique
    # indexes will return a Table of all matching items)
    print customers.id["0030"].name

    # print all items sold by the pound
    for item in catalog.query(unitofmeas="LB"):
        print item.sku, item.descr

    # print all items that cost more than 10
    for item in catalog.where(lambda o: o.unitprice > 10):
        print item.sku, item.descr, item.unitprice

    # join tables to create queryable wishlists collection
    wishlists = customers.join_on("id") + wishitems.join_on("custid") + catalog.join_on("sku")

    # print all wishlist items with price > 10
    bigticketitems = wishlists().where(lambda ob: ob.unitprice > 10)
    for item in bigticketitems:
        print item

    # list all wishlist items in descending order by price
    for item in wishlists().sort("unitprice desc"):
        print item
"""

__version__ = "0.8"
__versionTime__ = "5 Dec 2015 20:03"
__author__ = "Paul McGuire <ptmcg@users.sourceforge.net>"

import sys
import re
from collections import defaultdict, deque, namedtuple
from itertools import groupby, ifilter, islice, starmap, repeat
from operator import attrgetter
import csv
import json

# zero-length deque used to consume generators purely for their side effects
_consumer = deque(maxlen=0)
do_all = _consumer.extend

try:
    from itertools import product
except ImportError:
    def product(*seqs):
        tupleseqs = [[(x,) for x in s] for s in seqs]
        def _product(*seqs):
            if len(seqs) == 1:
                for x in seqs[0]:
                    yield x
            else:
                for x in seqs[0]:
                    for p in _product(*seqs[1:]):
                        yield x + p
        for p in _product(*tupleseqs):
            yield p

try:
    t = basestring
except NameError:
    basestring = str

__all__ = ["DataObject", "Table", "JoinTerm", "PivotTable"]


def _object_attrnames(obj):
    if hasattr(obj, "__dict__"):
        # normal object instance
        return obj.__dict__.keys()
    elif isinstance(obj, tuple) and hasattr(obj, "_fields"):
        # namedtuple
        return obj._fields
    elif hasattr(obj, "__slots__"):
        return obj.__slots__
    else:
        raise ValueError("object with unknown attributes")

def _to_json(obj):
    if hasattr(obj, "__dict__"):
        # normal object instance
        return json.dumps(obj.__dict__)
    elif isinstance(obj, tuple) and hasattr(obj, "_fields"):
        # namedtuple
        return json.dumps(dict(zip(obj._fields, obj)))
    elif hasattr(obj, "__slots__"):
        return json.dumps({k: v for k, v in zip(obj.__slots__,
                                                (getattr(obj, a) for a in obj.__slots__))})
    else:
        raise ValueError("object with unknown attributes")

164 """A generic semi-mutable object for storing data values in a table. Attributes
165 can be set by passing in named arguments in the constructor, or by setting them
166 as C{object.attribute = value}. New attributes can be added any time, but updates
167 are ignored. Table joins are returned as a Table of DataObjects."""
169 if kwargs:
170 self.__dict__.update(kwargs)
172 return repr(self.__dict__)
178 if hasattr(self,k):
179 return getattr(self,k)
180 else:
181 raise KeyError("object has no such attribute " + k)
183 return self.__dict__ == other.__dict__
184

class _ObjIndex(object):
    def __init__(self, attr):
        self.attr = attr
        self.obs = defaultdict(list)
        self.is_unique = False

    def __setitem__(self, k, v):
        self.obs[k].append(v)

    def __getitem__(self, k):
        return self.obs.get(k, [])

    def __len__(self):
        return len(self.obs)

    def __iter__(self):
        return iter(self.obs)

    def keys(self):
        return sorted(self.obs.keys())

    def items(self):
        return self.obs.items()

    def remove(self, obj):
        try:
            k = getattr(obj, self.attr)
            self.obs[k].remove(obj)
        except (ValueError, AttributeError, KeyError):
            pass

    def __contains__(self, key):
        return key in self.obs

    def copy_template(self):
        return self.__class__(self.attr)


class _UniqueObjIndex(_ObjIndex):
    def __init__(self, attr, accept_none=False):
        self.attr = attr
        self.obs = {}
        self.is_unique = True
        self.accept_none = accept_none
        self.none_values = set()

    def __setitem__(self, k, v):
        if k:
            if k not in self.obs:
                self.obs[k] = v
            else:
                raise KeyError("duplicate key value %s" % k)
        else:
            self.none_values.add(v)

    def __getitem__(self, k):
        if k:
            return [self.obs.get(k)] if k in self.obs else []
        else:
            return list(self.none_values)

    def __contains__(self, k):
        if k:
            return k in self.obs
        else:
            return self.accept_none and self.none_values

    def keys(self):
        return sorted(self.obs.keys()) + ([None, ] if self.none_values else [])

    def items(self):
        return [(k, [v]) for k, v in self.obs.items()]

    def remove(self, obj):
        k = getattr(obj, self.attr)
        if k:
            if k in self.obs:
                del self.obs[k]
        else:
            self.none_values.discard(obj)


class _ObjIndexWrapper(object):
    def __init__(self, ind):
        self._index = ind

    def __getattr__(self, attr):
        return getattr(self._index, attr)

    def __getitem__(self, k):
        ret = Table()
        if k in self._index:
            ret.insert_many(self._index[k])
        return ret

    def __contains__(self, k):
        return k in self._index


class _UniqueObjIndexWrapper(object):
    def __init__(self, ind):
        self._index = ind

    def __getattr__(self, attr):
        return getattr(self._index, attr)

    def __contains__(self, k):
        return k in self._index

    def __getitem__(self, k):
        if k:
            return self._index[k][0]
        else:
            ret = Table()
            if k in self._index:
                ret.insert_many(self._index[k])
            return ret


class _IndexAccessor(object):
    def __init__(self, table):
        self.table = table

    def __getattr__(self, attr):
        """A quick way to query for matching records using their indexed attributes. The attribute
        name is used to locate the index, and returns a wrapper on the index. This wrapper provides
        dict-like access to the underlying records in the table, as in::

            employees.by.socsecnum["000-00-0000"]
            customers.by.zipcode["12345"]

        The behavior differs slightly for unique and non-unique indexes:
         - if the index is unique, then retrieving a matching object will return just the object;
           if there is no matching object, C{KeyError} is raised (making a table with a unique
           index behave very much like a Python dict)
         - if the index is non-unique, then all matching objects will be returned in a new Table,
           just as if a regular query had been performed; if no objects match the key value, an
           empty Table is returned and no exception is raised.

        If there is no index defined for the given attribute, then C{AttributeError} is raised.
        """
        if attr in self.table._indexes:
            ret = self.table._indexes[attr]
            if isinstance(ret, _UniqueObjIndex):
                ret = _UniqueObjIndexWrapper(ret)
            if isinstance(ret, _ObjIndex):
                ret = _ObjIndexWrapper(ret)
            return ret
        raise AttributeError("Table '%s' has no index '%s'" % (self.table.table_name, attr))

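# A quick usage sketch of the index-accessor style described above, assuming the
# 'customers' table built in the module docstring (a unique index on "id" and,
# hypothetically, a non-unique index on "name"):
#
#     customers.create_index("name")
#     customers.by.id["0010"]               # unique index -> returns the single matching object
#     customers.by.name["George Jetson"]    # non-unique index -> returns a Table of matches
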
312 """Table is the main class in C{littletable}, for representing a collection of DataObjects or
313 user-defined objects with publicly accessible attributes or properties. Tables can be:
314 - created, with an optional name, using standard Python L{C{Table() constructor}<__init__>}
315 - indexed, with multiple indexes, with unique or non-unique values, see L{create_index}
316 - queried, specifying values to exact match in the desired records, see L{where}
317 - filtered (using L{where}), using a simple predicate function to match desired records;
318 useful for selecting using inequalities or compound conditions
319 - accessed directly for keyed values, using C{table.indexattribute[key]} - see L{__getattr__}
320 - joined, using L{join_on} to identify attribute to be used for joining with another table, and
321 L{join} or operator '+' to perform the actual join
322 - pivoted, using L{pivot} to create a nested structure of sub-tables grouping objects
323 by attribute values
324 - grouped, using L{groupby} to create a summary table of computed values, grouped by a key
325 attribute
326 - L{imported<csv_import>}/L{exported<csv_export>} to CSV-format files
327 Queries and joins return their results as new Table objects, so that queries and joins can
328 be easily performed as a succession of operations.
329 """
331 """Create a new, empty Table.
332 @param table_name: name for Table
333 @type table_name: string (optional)
334 """
335 self(table_name)
336 self.obs = []
337 self._indexes = {}
338 self._uniqueIndexes = []
339 self.by = _IndexAccessor(self)
340
342 """Return the number of objects in the Table."""
343 return len(self.obs)
344
346 """Create an iterator over the objects in the Table."""
347 return iter(self.obs)
348
350 """Provides direct indexed/sliced access to the Table's underlying list of objects."""
351 if isinstance(i, slice):
352 ret = self.copy_template()
353 ret.insert_many(self.obs[i])
354 return ret
355 else:
356 return self.obs[i]
357
359 """A quick way to query for matching records using their indexed attributes. The attribute
360 name is used to locate the index, and returns a wrapper on the index. This wrapper provides
361 dict-like access to the underlying records in the table, as in::
362
363 employees.socsecnum["000-00-0000"]
364 customers.zipcode["12345"]
365
366 The behavior differs slightly for unique and non-unique indexes:
367 - if the index is unique, then retrieving a matching object, will return just the object;
368 if there is no matching object, C{KeyError} is raised (making a table with a unique
369 index behave very much like a Python dict)
370 - if the index is non-unique, then all matching objects will be returned in a new Table,
371 just as if a regular query had been performed; if no objects match the key value, an empty
372 Table is returned and no exception is raised.
373
374 If there is no index defined for the given attribute, then C{AttributeError} is raised.
375 """
376 if attr in self._indexes:
377 ret = self._indexes[attr]
378 if isinstance(ret, _UniqueObjIndex):
379 ret = _UniqueObjIndexWrapper(ret)
380 if isinstance(ret, _ObjIndex):
381 ret = _ObjIndexWrapper(ret)
382 return ret
383 raise AttributeError("Table '%s' has no index '%s'" % (self.table_name, attr))
384
    def __bool__(self):
        return bool(self.obs)
387
388 __nonzero__ = __bool__
389
391 """Support UNION of 2 tables using "+" operator."""
392 if isinstance(other, JoinTerm):
393
394 return other + self
395 elif isinstance(other, Table):
396
397 return self.union(other)
398 else:
399
400 return self.clone().insert_many(other)
401
403 """Support UNION of 2 tables using "+=" operator."""
404 return self.insert_many(other)
405
408
410 """A simple way to assign a name to a table, such as those
411 dynamically created by joins and queries.
412 @param table_name: name for Table
413 @type table_name: string
414 """
415 self.table_name = table_name
416 return self
417
419 """Create empty copy of the current table, with copies of all
420 index definitions.
421 """
422 ret = Table(self.table_name)
423
424
425 ret._indexes.update(dict((k,v.copy_template()) for k,v in self._indexes.items()))
426 if name is not None:
427 ret(name)
428 return ret
429
    def clone(self, name=None):
431 """Create full copy of the current table, including table contents
432 and index definitions.
433 """
434 ret = self.copy_template()
435 ret.insert_many(self.obs)
436 if name is not None:
437 ret(name)
438 return ret
439
    def create_index(self, attr, unique=False, accept_none=False):
441 """Create a new index on a given attribute.
442 If C{unique} is True and records are found in the table with duplicate
443 attribute values, the index is deleted and C{KeyError} is raised.
444
445 If the table already has an index on the given attribute, then no
446 action is taken and no exception is raised.
447 @param attr: the attribute to be used for indexed access and joins
448 @type attr: string
449 @param unique: flag indicating whether the indexed field values are
450 expected to be unique across table entries
451 @type unique: boolean
452 @param accept_none: flag indicating whether None is an acceptable
453 unique key value for this attribute
454 @type accept_none: boolean
455 """
456 if attr in self._indexes:
457 return self
458
459 if unique:
460 self._indexes[attr] = _UniqueObjIndex(attr,accept_none)
461 self._uniqueIndexes = [ind for ind in self._indexes.values() if ind.is_unique]
462 else:
463 self._indexes[attr] = _ObjIndex(attr)
464 accept_none = True
465 ind = self._indexes[attr]
466 try:
467 for obj in self.obs:
468 if hasattr(obj, attr):
469 obval = getattr(obj, attr) or None
470 else:
471 obval = None
472 if obval or accept_none:
473 ind[obval] = obj
474 else:
475 raise KeyError("None is not an allowed key")
476 return self
477
478 except KeyError:
479 del self._indexes[attr]
480 self._uniqueIndexes = [ind for ind in self._indexes.values() if ind.is_unique]
481 raise
482
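    # A sketch of typical index setup, assuming the 'catalog' table from the module
    # docstring: a unique index for direct keyed lookup, plus a non-unique index for
    # grouping and joins.
    #
    #     catalog.create_index("sku", unique=True)
    #     catalog.create_index("unitofmeas")
    #     catalog.delete_index("unitofmeas")      # drop it again if no longer needed
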
484 """Deletes an index from the Table. Can be used to drop and rebuild an index,
485 or to convert a non-unique index to a unique index, or vice versa.
486 @param attr: name of an indexed attribute
487 @type attr: string
488 """
489 if attr in self._indexes:
490 del self._indexes[attr]
491 self._uniqueIndexes = [ind for ind in self._indexes.values() if ind.is_unique]
492
494 """Insert a new object into this Table.
495 @param obj: any Python object
496 Objects can be constructed using the defined DataObject type, or they can
497 be any Python object that does not use the Python C{__slots__} feature; C{littletable}
        introspects the object's C{__dict__} or C{_fields} attributes to obtain join and
499 index attributes and values.
500
501 If the table contains a unique index, and the record to be inserted would add
502 a duplicate value for the indexed attribute, then C{KeyError} is raised, and the
503 object is not inserted.
504
505 If the table has no unique indexes, then it is possible to insert duplicate
506 objects into the table.
507 """
508
509
510 uniqueIndexes = self._uniqueIndexes
511 if any((getattr(obj, ind.attr, None) is None and not ind.accept_none)
512 or (
513 hasattr(obj, ind.attr) and getattr(obj, ind.attr) in ind
514 )
515 for ind in uniqueIndexes):
516
517 for ind in uniqueIndexes:
518 if (getattr(obj, ind.attr, None) is None and not ind.accept_none):
519 raise KeyError("unique key cannot be None or blank for index %s" % ind.attr, obj)
520 if getattr(obj, ind.attr) in ind:
521 raise KeyError("duplicate unique key value '%s' for index %s" % (getattr(obj,ind.attr), ind.attr), obj)
522
523 self.obs.append(obj)
524 for attr, ind in self._indexes.items():
525 obval = getattr(obj, attr)
526 ind[obval] = obj
527 return self
528
530 """Inserts a collection of objects into the table."""
531
532
533 do_all(self.insert(ob) for ob in it)
534 return self
535
537 """Removes an object from the table. If object is not in the table, then
538 no action is taken and no exception is raised."""
539
540
541
542 do_all(ind.remove(ob) for attr,ind in self._indexes.items())
543
544
545 self.obs.remove(ob)
546
548 """Removes a collection of objects from the table."""
549
550
551 do_all(self.remove(ob) for ob in it)
552
    def _query_attr_sort_fn(self, attr_val):
        # sort key used by where() to apply the most selective indexed attribute first
        attr, v = attr_val
555 if attr in self._indexes:
556 idx = self._indexes[attr]
557 if v in idx:
558 return len(idx[v])
559 else:
560 return 0
561 else:
562 return 1e9
563
    def where(self, *args, **kwargs):
565 """
566 Retrieves matching objects from the table, based on given
567 named parameters. If multiple named parameters are given, then
568 only objects that satisfy all of the query criteria will be returned.
569
570 Special named args:
571 - C{_orderby="attr,..."} - resulting table should sort content objects
572 by the C{attr}s given in a comma-separated string; to sort in
573 descending order, reference the attribute as C{attr desc}.
574
575 - C{_limit} - maximum number of records to return
576
577 @param wherefn: a method or lambda that returns a boolean result, as in::
578
579 lambda ob : ob.unitprice > 10
580
581 @type wherefn: callable(object) returning boolean
582
583 @param kwargs: attributes for selecting records, given as additional
584 named arguments of the form C{attrname="attrvalue"}.
585
586 @return: a new Table containing the matching objects
587 """
588
589 flags = dict((k,v) for k,v in kwargs.items() if k.startswith("_"))
590 for f in flags:
591 del kwargs[f]
592
593 if kwargs:
594
595
596
597
598 kwargs = kwargs.items()
599 if len(kwargs) > 1 and len(self.obs) > 100:
600 kwargs = sorted(kwargs, key=self._query_attr_sort_fn)
601
602 ret = self
603 for k,v in kwargs:
604 newret = ret.copy_template()
605 if k in ret._indexes:
606 newret.insert_many(ret._indexes[k][v])
607 else:
608 newret.insert_many( r for r in ret.obs
609 if hasattr(r,k) and getattr(r,k) == v )
610 ret = newret
611 else:
612 ret = self.clone()
613
614
615
616 if flags:
617 if '_orderby' in flags:
618 ret.sort(flags['_orderby'])
619 if '_limit' in flags:
620 del ret.obs[flags['_limit']:]
621
622 if args:
623 wherefn = args[0]
624 newret = ret.copy_template()
625 newret.insert_many(ifilter(wherefn, ret.obs))
626 ret = newret
627
628 return ret
629
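    # A sketch of where() usage combining keyword matching, a predicate, and the
    # special _orderby/_limit flags described above (names taken from the module
    # docstring's 'catalog' table):
    #
    #     each_items = catalog.where(unitofmeas="EA")
    #     pricey     = catalog.where(lambda o: o.unitprice > 10)
    #     top_two    = catalog.where(unitofmeas="EA", _orderby="unitprice desc", _limit=2)
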
631 """Deletes matching objects from the table, based on given
632 named parameters. If multiple named parameters are given, then
633 only objects that satisfy all of the query criteria will be removed.
634 @param kwargs: attributes for selecting records, given as additional
635 named arguments of the form C{attrname="attrvalue"}.
636 @return: the number of objects removed from the table
637 """
638 if not kwargs:
639 return 0
640
641 affected = self.where(**kwargs)
642 self.remove_many(affected)
643 return len(affected)
644
    def sort(self, key, reverse=False):
646 """Sort Table in place, using given fields as sort key.
647 @param key: if this is a string, it is a comma-separated list of field names,
648 optionally followed by 'desc' to indicate descending sort instead of the
649 default ascending sort; if a list or tuple, it is a list or tuple of field names
650 or field names with ' desc' appended; if it is a function, then it is the
651 function to be used as the sort key function
652 @return: self
653 """
654 if isinstance(key, (basestring,list,tuple)):
655 if isinstance(key, basestring):
656 attrdefs = [s.strip() for s in key.split(',')]
657
658
659 attr_orders = [(a.split()+['asc',])[:2] for a in attrdefs][::-1]
660 else:
661
662 attr_orders = key
663 attrs = [attr for attr,order in attr_orders]
664
665
666 if all(order=='asc' for attr,order in attr_orders):
667 self.obs.sort(key=attrgetter(*attrs), reverse=reverse)
668 elif all(order=='desc' for attr,order in attr_orders):
669 self.obs.sort(key=attrgetter(*attrs), reverse=not reverse)
670 else:
671
672
673
674 do_all(self.obs.sort(key=attrgetter(attr), reverse=(order=="desc"))
675 for attr,order in attr_orders)
676 else:
677 keyfn = key
678 self.obs.sort(key=keyfn, reverse=reverse)
679 return self
680
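    # Sort examples (a sketch, using the 'catalog' table from the module docstring):
    #
    #     catalog.sort("unitprice")                   # ascending by one attribute
    #     catalog.sort("unitofmeas, unitprice desc")  # multi-key, mixed order
    #     catalog.sort(key=lambda o: len(o.descr))    # arbitrary key function
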
    def select(self, fields, **exprs):
682 """
683 Create a new table containing a subset of attributes, with optionally
684 newly-added fields computed from each rec in the original table.
685
686 Special kwargs:
687 - C{_unique=True} - only return a set of unique rows
688
689 @param fields: list of strings, or single space-delimited string, listing attribute name to be included in the output
690 @type fields: list, or space-delimited string
691 @param exprs: one or more named callable arguments, to compute additional fields using the given function
692 @type exprs: C{name=callable}, callable takes the record as an argument, and returns the new attribute value
        If a string is passed as a callable, it will be expanded using string interpolation, with the
        record as the source of interpolation values. For instance, C{fullName = '%(lastName)s, %(firstName)s'}
695
696 """
697 if isinstance(fields, basestring):
698 fields = fields.split()
699
700 unique = exprs.pop('_unique', False)
701
702 def _makeStringCallable(expr):
703 if isinstance(expr,basestring):
704 return lambda rec: expr % rec
705 else:
706 return expr
707
        exprs = dict((k, _makeStringCallable(v)) for k, v in exprs.items())
709
710 raw_tuples = []
711 for rec in self.obs:
712 attrvalues = tuple(getattr(rec, fieldname, None) for fieldname in fields)
713 if exprs:
714 attrvalues += tuple(expr(rec) for expr in exprs.values())
715 raw_tuples.append(attrvalues)
716
717 if unique:
718 raw_tuples = list(set(raw_tuples))
719
720 allNames = tuple(fields) + tuple(exprs.keys())
721 return Table().insert_many(DataObject(**dict(zip(allNames, outtuple))) for outtuple in raw_tuples)
722
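    # A sketch of select(), assuming the 'catalog' table from the module docstring;
    # 'price_label' is a hypothetical computed field using the string-interpolation
    # form described above:
    #
    #     skus = catalog.select("sku unitprice", _unique=True)
    #     tags = catalog.select("sku", price_label="%(descr)s @ %(unitprice)s")
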
749
    def join(self, other, attrlist=None, auto_create_indexes=True, **kwargs):
751 """
752 Join the objects of one table with the objects of another, based on the given
753 matching attributes in the named arguments. The attrlist specifies the attributes to
754 be copied from the source tables - if omitted, all attributes will be copied. Entries
755 in the attrlist may be single attribute names, or if there are duplicate names in both
756 tables, then a C{(table,attributename)} tuple can be given to disambiguate which
757 attribute is desired. A C{(table,attributename,alias)} tuple can also be passed, to
758 rename an attribute from a source table.
759
760 This method may be called directly, or can be constructed using the L{join_on} method and
761 the '+' operator. Using this syntax, the join is specified using C{table.join_on("xyz")}
762 to create a JoinTerm containing both table and joining attribute. Multiple JoinTerm
763 or tables can be added to construct a compound join expression. When complete, the
764 join expression gets executed by calling the resulting join definition,
765 using C{join_expression([attrlist])}.
766
767 @param other: other table to join to
768 @param attrlist: list of attributes to be copied to the new joined table; if
769 none provided, all attributes of both tables will be used (taken from the first
770 object in each table)
771 @type attrlist: string, or list of strings or C{(table,attribute[,alias])} tuples
772 (list may contain both strings and tuples)
773 @param kwargs: attributes to join on, given as additional named arguments
774 of the form C{table1attr="table2attr"}, or a dict mapping attribute names.
775 @returns: a new Table containing the joined data as new DataObjects
776 """
777 thiscol,othercol = kwargs.items()[0]
778
779 retname = ("(%s:%s^%s:%s)" %
780 (self.table_name, thiscol, other.table_name, othercol))
781
782 if not (self.obs and other.obs):
783 return Table(retname)
784
785 if isinstance(attrlist, basestring):
786 attrlist = re.split(r'[,\s]+', attrlist)
787
788
789 thisnames = set(_object_attrnames(self.obs[0]))
790 othernames = set(_object_attrnames(other.obs[0]))
791 fullcols = []
792 if attrlist is not None:
793 for col in attrlist:
794 if isinstance(col, tuple):
795
796
797 fullcols.append((col + (col[1],))[:3])
798 else:
799 if col in thisnames:
800 fullcols.append( (self, col, col) )
801 elif col in othernames:
802 fullcols.append( (other, col, col) )
803 else:
804 raise ValueError("join attribute not found: " + col)
805 else:
806 fullcols = [(self,n,n) for n in thisnames]
807 fullcols += [(other,n,n) for n in othernames]
808
809 thiscols = list(ifilter(lambda o:o[0] is self, fullcols))
810 othercols = list(ifilter(lambda o:o[0] is other, fullcols))
811
812 if auto_create_indexes:
813 if thiscol not in self._indexes:
814 self.create_index(thiscol)
815 if othercol not in other._indexes:
816 other.create_index(othercol)
817
818 thiscolindex = othercolindex = None
819 if thiscol in self._indexes:
820 thiscolindex = self._indexes[thiscol]
821 else:
822 raise ValueError("indexed attribute required for join: "+thiscol)
823 if othercol in other._indexes:
824 othercolindex = other._indexes[othercol]
825 else:
826 raise ValueError("indexed attribute required for join: "+othercol)
827
828
829 if len(thiscolindex) < len(othercolindex):
830 shortindex, longindex = (thiscolindex, othercolindex)
831 swap = False
832 else:
833 shortindex, longindex = (othercolindex, thiscolindex)
834 swap = True
835
836
837
838
839
840
841
842
843
844 matchingrows = list((longindex[key],rows) if swap else (rows, longindex[key])
845 for key,rows in shortindex.items())
846
847 joinrows = []
848 for thisrows,otherrows in matchingrows:
849 for trow,orow in product(thisrows,otherrows):
850 retobj = DataObject()
851
852
853 do_all(setattr(retobj, a, getattr(trow,c)) for _,c,a in thiscols)
854
855
856 do_all(setattr(retobj, a, getattr(orow,c)) for _,c,a in othercols)
857 joinrows.append(retobj)
858
859 ret = Table(retname)
860 for tbl,collist in zip([self,other],[thiscols,othercols]):
861 for _,c,a in collist:
862 if c in tbl._indexes:
863 ret.create_index(a)
864 ret.insert_many(joinrows)
865 return ret
866
868 """Creates a JoinTerm in preparation for joining with another table, to
869 indicate what attribute should be used in the join. Only indexed attributes
870 may be used in a join.
871 @param attr: attribute name to join from this table (may be different
872 from the attribute name in the table being joined to)
873 @type attr: string
874 @returns: L{JoinTerm}"""
875 if attr not in self._indexes:
876 raise ValueError("can only join on indexed attributes")
877 return JoinTerm(self, attr)
878
    def pivot(self, attrlist):
880 """Pivots the data using the given attributes, returning a L{PivotTable}.
881 @param attrlist: list of attributes to be used to construct the pivot table
882 @type attrlist: list of strings, or string of space-delimited attribute names
883 """
884 if isinstance(attrlist, basestring):
885 attrlist = attrlist.split()
886 if all(a in self._indexes for a in attrlist):
887 return PivotTable(self,[],attrlist)
888 else:
889 raise ValueError("pivot can only be called using indexed attributes")
890
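    # A sketch of pivoting, assuming the 'catalog' table from the module docstring
    # has a (non-unique) index on "unitofmeas":
    #
    #     catalog.create_index("unitofmeas")
    #     pivot = catalog.pivot("unitofmeas")
    #     pivot.dump_counts()                 # tabular count of records per unit of measure
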
    def _import(self, source, transforms=None, reader=csv.DictReader):
892 close_on_exit = False
893 if isinstance(source, basestring):
894 source = open(source)
895 close_on_exit = True
896 try:
897 csvdata = reader(source)
898 self.insert_many(DataObject(**s) for s in csvdata)
899 if transforms:
900 for attr,fn in transforms.items():
901 default = None
902 if isinstance(fn,tuple):
903 fn,default = fn
904 objfn = lambda obj : fn(getattr(obj,attr))
905 self.add_field(attr, objfn, default)
906 finally:
907 if close_on_exit:
908 source.close()
909
    def csv_import(self, csv_source, transforms=None):
911 """Imports the contents of a CSV-formatted file into this table.
912 @param csv_source: CSV file - if a string is given, the file with that name will be
913 opened, read, and closed; if a file object is given, then that object
914 will be read as-is, and left for the caller to be closed.
915 @type csv_source: string or file
916 @param transforms: dict of functions by attribute name; if given, each
917 attribute will be transformed using the corresponding transform; if there is no
918 matching transform, the attribute will be read as a string (default); the
919 transform function can also be defined as a (function, default-value) tuple; if
920 there is an Exception raised by the transform function, then the attribute will
921 be set to the given default value
922 @type transforms: dict (optional)
923 """
924 return self._import(csv_source, transforms)
925
    def _xsv_import(self, xsv_source, transforms=None, splitstr="\t"):
927 xsv_reader = lambda src: csv.DictReader(src, delimiter=splitstr)
928 return self._import(xsv_source, transforms, reader=xsv_reader)
929
    def tsv_import(self, xsv_source, transforms=None):
931 """Imports the contents of a tab-separated data file into this table.
932 @param xsv_source: tab-separated data file - if a string is given, the file with that name will be
933 opened, read, and closed; if a file object is given, then that object
934 will be read as-is, and left for the caller to be closed.
935 @type xsv_source: string or file
936 @param transforms: dict of functions by attribute name; if given, each
937 attribute will be transformed using the corresponding transform; if there is no
938 matching transform, the attribute will be read as a string (default); the
939 transform function can also be defined as a (function, default-value) tuple; if
940 there is an Exception raised by the transform function, then the attribute will
941 be set to the given default value
942 @type transforms: dict (optional)
943 """
944 return self._xsv_import(xsv_source, transforms=transforms, splitstr="\t")
945
947 """Exports the contents of the table to a CSV-formatted file.
948 @param csv_dest: CSV file - if a string is given, the file with that name will be
949 opened, written, and closed; if a file object is given, then that object
950 will be written as-is, and left for the caller to be closed.
951 @type csv_dest: string or file
952 @param fieldnames: attribute names to be exported; can be given as a single
953 string with space-delimited names, or as a list of attribute names
954 """
955 close_on_exit = False
956 if isinstance(csv_dest, basestring):
957 csv_dest = open(csv_dest,'wb')
958 close_on_exit = True
959 try:
960 if fieldnames is None:
961 fieldnames = list(_object_attrnames(self.obs[0]))
962 if isinstance(fieldnames, basestring):
963 fieldnames = fieldnames.split()
964
965 csv_dest.write(','.join(fieldnames) + '\n')
966 csvout = csv.DictWriter(csv_dest, fieldnames, extrasaction='ignore')
967 if hasattr(self.obs[0], "__dict__"):
968
969
970 do_all(csvout.writerow(o.__dict__) for o in self.obs)
971 else:
972
973
974
975
976 do_all(csvout.writerow(dict(starmap(lambda obj, fld: (fld, getattr(obj, fld)),
977 zip(repeat(o), fieldnames)))) for o in self.obs)
978 finally:
979 if close_on_exit:
980 csv_dest.close()
981
983 """Imports the contents of a JSON data file into this table.
984 @param source: JSON data file - if a string is given, the file with that name will be
985 opened, read, and closed; if a file object is given, then that object
986 will be read as-is, and left for the caller to be closed.
987 @type source: string or file
988 @param transforms: dict of functions by attribute name; if given, each
989 attribute will be transformed using the corresponding transform; if there is no
990 matching transform, the attribute will be read as a string (default); the
991 transform function can also be defined as a (function, default-value) tuple; if
992 there is an Exception raised by the transform function, then the attribute will
993 be set to the given default value
994 @type transforms: dict (optional)
995 """
996 class _JsonFileReader(object):
997 def __init__(self, src):
998 self.source = src
999 def __iter__(self):
1000 current = ''
1001 for line in self.source:
1002 if current:
1003 current += ' '
1004 current += line
1005 try:
1006 yield json.loads(current)
1007 current = ''
1008 except Exception:
1009 pass
1010 return self._import(source, transforms=transforms, reader=_JsonFileReader)
1011
1013 """Exports the contents of the table to a JSON-formatted file.
1014 @param dest: output file - if a string is given, the file with that name will be
1015 opened, written, and closed; if a file object is given, then that object
1016 will be written as-is, and left for the caller to be closed.
1017 @type dest: string or file
1018 @param fieldnames: attribute names to be exported; can be given as a single
1019 string with space-delimited names, or as a list of attribute names
1020 """
1021 close_on_exit = False
1022 if isinstance(dest, basestring):
1023 dest = open(dest,'wb')
1024 close_on_exit = True
1025 try:
1026 if isinstance(fieldnames, basestring):
1027 fieldnames = fieldnames.split()
1028
1029 if fieldnames is None:
1030 do_all(
1031 dest.write(_to_json(o)+'\n') for o in self.obs
1032 )
1033 else:
1034 do_all(
1035 dest.write(json.dumps({f:getattr(o, f) for f in fieldnames})+'\n')
1036 for o in self.obs
1037 )
1038 finally:
1039 if close_on_exit:
1040 dest.close()
1041
    def add_field(self, attrname, fn, default=None):
1043 """Computes a new attribute for each object in table, or replaces an
1044 existing attribute in each record with a computed value
1045 @param attrname: attribute to compute for each object
1046 @type attrname: string
1047 @param fn: function used to compute new attribute value, based on
1048 other values in the object, as in::
1049
1050 lambda ob : ob.commission_pct/100.0 * ob.gross_sales
1051
1052 @type fn: function(obj) returns value
1053 @param default: value to use if an exception is raised while trying
1054 to evaluate fn
1055 """
1056
1057 def _addFieldToRec(rec, fn=fn, default=default):
1058 try:
1059 val = fn(rec)
1060 except Exception:
1061 val = default
1062 if isinstance(rec, DataObject):
1063 object.__setattr__(rec, attrname, val)
1064 else:
1065 setattr(rec, attrname, val)
1066 do_all(_addFieldToRec(r) for r in self)
1067 return self
1068
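    # add_field() sketch, assuming the 'catalog' table from the module docstring;
    # records where the computation raises get the default value instead:
    #
    #     catalog.add_field("price_band",
    #                       lambda o: "high" if o.unitprice > 10 else "low",
    #                       default="unknown")
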
    def addfield(self, attrname, fn, default=None):
        # synonym for add_field
        return self.add_field(attrname, fn, default)
1072
    def groupby(self, keyexpr, **outexprs):
1074 """simple prototype of group by, with support for expressions in the group-by clause
1075 and outputs
        @param keyexpr: grouping field and optional expression for computing the key value;
             if a string is passed, it is a space-separated list of attribute names to group by;
             if a tuple is passed, it is a C{(keyname, keyvalue_function)} pair
        @type keyexpr: string or tuple
1079 @param outexprs: named arguments describing one or more summary values to
1080 compute per key
1081 @type outexprs: callable, taking a sequence of objects as input and returning
1082 a single summary value
1083 """
1084 if isinstance(keyexpr, basestring):
1085 keyattrs = keyexpr.split()
1086 keyfn = lambda o : tuple(getattr(o, k) for k in keyattrs)
1087
1088 elif isinstance(keyexpr, tuple):
1089 keyattrs = (keyexpr[0],)
1090 keyfn = keyexpr[1]
1091
1092 groupedobs = defaultdict(list)
1093
1094
1095 do_all(groupedobs[keyfn(ob)].append(ob) for ob in self.obs)
1096
1097 tbl = Table()
1098
1099
1100 do_all(tbl.create_index(k, unique=(len(keyattrs)==1)) for k in keyattrs)
1101 for key, recs in sorted(groupedobs.iteritems()):
1102 groupobj = DataObject(**dict(zip(keyattrs, key)))
1103
1104
1105 do_all(setattr(groupobj, subkey, expr(recs))
1106 for subkey, expr in outexprs.items())
1107 tbl.insert(groupobj)
1108 return tbl
1109
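    # groupby() sketch, assuming the 'wishitems' table from the module docstring;
    # 'count' is a hypothetical summary column computed per key value:
    #
    #     percust = wishitems.groupby("custid", count=len)
    #     for rec in percust:
    #         print rec.custid, rec.count
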
1112
    def unique(self):
        ret = self.copy_template()
1115 seen = set()
1116 for rec in self:
1117 reckey = tuple(rec.__dict__.values())
1118 if reckey not in seen:
1119 seen.add(reckey)
1120 ret.insert(rec)
1121 return ret
1122
1124 """Enhanced Table containing pivot results from calling table.pivot().
1125 """
    def __init__(self, parent, attr_val_path, attrlist):
1127 """PivotTable initializer - do not create these directly, use
1128 L{Table.pivot}.
1129 """
1130 super(PivotTable,self).__init__()
1131 self._attr_path = attr_val_path[:]
1132 self._pivot_attrs = attrlist[:]
1133 self._subtable_dict = {}
1134
1135
1136
1137 self._indexes.update(dict((k,v.copy_template()) for k,v in parent._indexes.items()))
1138 if not attr_val_path:
1139 self.insert_many(parent.obs)
1140 else:
1141 attr,val = attr_val_path[-1]
1142 self.insert_many(parent.where(**{attr:val}))
1143 parent._subtable_dict[val] = self
1144
1145 if len(attrlist) > 0:
1146 this_attr = attrlist[0]
1147 sub_attrlist = attrlist[1:]
1148 ind = parent._indexes[this_attr]
1149 self.subtables = [ PivotTable(self,
1150 attr_val_path + [(this_attr,k)],
1151 sub_attrlist) for k in sorted(ind.keys()) ]
1152 else:
1153 self.subtables = []
1154
    def __getitem__(self, val):
        if self._subtable_dict:
1157 return self._subtable_dict[val]
1158 else:
1159 return super(PivotTable,self).__getitem__(val)
1160
    def keys(self):
        return sorted(self._subtable_dict.keys())

    def items(self):
        return sorted(self._subtable_dict.items())

    def values(self):
        return [self._subtable_dict[k] for k in self.keys()]
1169
1171 """Return the set of attribute-value pairs that define the contents of this
1172 table within the original source table.
1173 """
1174 return self._attr_path
1175
1177 """Return the pivot_key as a displayable string.
1178 """
1179 return '/'.join("%s:%s" % (attr,key) for attr,key in self._attr_path)
1180
1182 """Return whether this table has further subtables.
1183 """
1184 return bool(self.subtables)
1185
    def dump(self, out=sys.stdout, row_fn=repr, limit=-1, indent=0):
1187 """Dump out the contents of this table in a nested listing.
1188 @param out: output stream to write to
1189 @param row_fn: function to call to display individual rows
1190 @param limit: number of records to show at deepest level of pivot (-1=show all)
1191 @param indent: current nesting level
1192 """
1193 NL = '\n'
1194 if indent:
1195 out.write(" "*indent + self.pivot_key_str())
1196 else:
1197 out.write("Pivot: %s" % ','.join(self._pivot_attrs))
1198 out.write(NL)
1199 if self.has_subtables():
1200
1201
1202
1203 do_all(sub.dump(out, row_fn, limit, indent+1) for sub in self.subtables if sub)
1204 else:
1205 if limit >= 0:
1206 showslice = slice(0,limit)
1207 else:
1208 showslice = slice(None,None)
1209
1210
1211 do_all(out.write(" "*(indent+1) + row_fn(r) + NL)
1212 for r in self.obs[showslice])
1213 out.flush()
1214
    def dump_counts(self, out=sys.stdout, count_fn=len, colwidth=10):
1216 """Dump out the summary counts of entries in this pivot table as a tabular listing.
1217 @param out: output stream to write to
1218 """
1219 if len(self._pivot_attrs) == 1:
1220 out.write("Pivot: %s\n" % ','.join(self._pivot_attrs))
1221 maxkeylen = max(len(str(k)) for k in self.keys())
1222 maxvallen = colwidth
1223 keytally = {}
1224 for k, sub in self.items():
1225 sub_v = count_fn(sub)
1226 maxvallen = max(maxvallen, len(str(sub_v)))
1227 keytally[k] = sub_v
1228 for k,sub in self.items():
1229 out.write("%-*.*s " % (maxkeylen,maxkeylen,k))
1230 out.write("%*s\n" % (maxvallen,keytally[k]))
1231 elif len(self._pivot_attrs) == 2:
1232 out.write("Pivot: %s\n" % ','.join(self._pivot_attrs))
1233 maxkeylen = max(max(len(str(k)) for k in self.keys()),5)
1234 maxvallen = max(max(len(str(k)) for k in self.subtables[0].keys()),colwidth)
1235 keytally = dict((k,0) for k in self.subtables[0].keys())
1236 out.write("%*s " % (maxkeylen,''))
1237 out.write(' '.join("%*.*s" % (maxvallen,maxvallen,k) for k in self.subtables[0].keys()))
1238 out.write(' %*s\n' % (maxvallen, 'Total'))
1239 for k,sub in self.items():
1240 out.write("%-*.*s " % (maxkeylen,maxkeylen,k))
1241 for kk,ssub in sub.items():
1242 ssub_v = count_fn(ssub)
1243 out.write("%*d " % (maxvallen,ssub_v))
1244 keytally[kk] += ssub_v
1245 maxvallen = max(maxvallen, len(str(ssub_v)))
1246 sub_v = count_fn(sub)
1247 maxvallen = max(maxvallen, len(str(sub_v)))
1248 out.write("%*d\n" % (maxvallen,sub_v))
1249 out.write('%-*.*s ' % (maxkeylen,maxkeylen,"Total"))
1250 out.write(' '.join("%*d" % (maxvallen,tally) for k,tally in sorted(keytally.items())))
1251 out.write(" %*d\n" % (maxvallen,sum(tally for k,tally in keytally.items())))
1252 else:
1253 raise ValueError("can only dump summary counts for 1 or 2-attribute pivots")
1254
1256 """Dump out the summary counts of this pivot table as a Table.
1257 """
1258 if summarycolname is None:
1259 summarycolname = col
1260 ret = Table()
1261 topattr = self._pivot_attrs[0]
1262
1263
1264 do_all(ret.create_index(attr) for attr in self._pivot_attrs)
1265 if len(self._pivot_attrs) == 1:
1266 for sub in self.subtables:
1267 subattr,subval = sub._attr_path[-1]
1268 attrdict = {subattr:subval}
1269 if fn is None:
1270 attrdict['Count'] = len(sub)
1271 else:
1272 attrdict[summarycolname] = fn(s[col] for s in sub)
1273 ret.insert(DataObject(**attrdict))
1274 elif len(self._pivot_attrs) == 2:
1275 for sub in self.subtables:
1276 for ssub in sub.subtables:
1277 attrdict = dict(ssub._attr_path)
1278 if fn is None:
1279 attrdict['Count'] = len(ssub)
1280 else:
1281 attrdict[summarycolname] = fn(s[col] for s in ssub)
1282 ret.insert(DataObject(**attrdict))
1283 elif len(self._pivot_attrs) == 3:
1284 for sub in self.subtables:
1285 for ssub in sub.subtables:
1286 for sssub in ssub.subtables:
1287 attrdict = dict(sssub._attr_path)
1288 if fn is None:
1289 attrdict['Count'] = len(sssub)
1290 else:
1291 attrdict[summarycolname] = fn(s[col] for s in sssub)
1292 ret.insert(DataObject(**attrdict))
        else:
            raise ValueError("can only create summary tables for 1, 2, or 3-attribute pivots")
1295 return ret
1296
1298 """Temporary object created while composing a join across tables using
1299 L{Table.join_on} and '+' addition. JoinTerm's are usually created by
1300 calling join_on on a Table object, as in::
1301
1302 customers.join_on("id") + orders.join_on("custid")
1303
1304 This join expression would set up the join relationship
1305 equivalent to::
1306
1307 customers.join(orders, id="custid")
1308
1309 If tables are being joined on attributes that have the same name in
1310 both tables, then a join expression could be created by adding a
1311 JoinTerm of one table directly to the other table::
1312
1313 customers.join_on("custid") + orders
1314
1315 Once the join expression is composed, the actual join is performed
1316 using function call notation::
1317
1318 customerorders = customers.join_on("custid") + orders
1319 for custord in customerorders():
1320 print custord
1321
1322 When calling the join expression, you can optionally specify a
1323 list of attributes as defined in L{Table.join}.
1324 """
    def __init__(self, sourceTable, joinfield):
1326 self.sourcetable = sourceTable
1327 self.joinfield = joinfield
1328 self.jointo = None
1329
    def __add__(self, other):
        if isinstance(other, Table):
1332 other = other.join_on(self.joinfield)
1333 if isinstance(other, JoinTerm):
1334 if self.jointo is None:
1335 if other.jointo is None:
1336 self.jointo = other
1337 else:
1338 self.jointo = other()
1339 return self
1340 else:
1341 if other.jointo is None:
1342 return self() + other
1343 else:
1344 return self() + other()
1345 raise ValueError("cannot add object of type '%s' to JoinTerm" % other.__class__.__name__)
1346
    def __radd__(self, other):
        if isinstance(other, Table):
1349 return other.join_on(self.joinfield) + self
1350 raise ValueError("cannot add object of type '%s' to JoinTerm" % other.__class__.__name__)
1351
    def __call__(self, attrs=None):
        if self.jointo:
1354 other = self.jointo
1355 if isinstance(other, Table):
1356 other = other.join_on(self.joinfield)
1357 ret = self.sourcetable.join(other.sourcetable, attrs,
1358 **{self.joinfield : other.joinfield})
1359 return ret
1360 else:
1361 return self.sourcetable.query()
1362
1365
1366
1367 if __name__ == "__main__":
1368
1369
1370 from functools import partial
1371 try:
1372 import simplejson as json
1373 json_dumps = partial(json.dumps, indent=' ')
1374 except ImportError:
1375 import json
1376 json_dumps = partial(json.dumps, indent=2)
1377
1378
1379 rawdata = """\
1380 Phoenix:AZ:85001:KPHX
1381 Phoenix:AZ:85001:KPHY
1382 Phoenix:AZ:85001:KPHA
1383 Dallas:TX:75201:KDFW""".splitlines()
1384
1385
1386 stations = Table()
1387
1388 stations.create_index("stn", unique=True)
1389
1390 fields = "city state zip stn".split()
1391 for d in rawdata:
1392 ob = DataObject()
1393 for k,v in zip(fields, d.split(':')):
1394 setattr(ob,k,v.strip())
1395 stations.insert(ob)
1396
1397
1398 for queryargs in [
1399 dict(city="Phoenix"),
1400 dict(city="Phoenix", stn="KPHX"),
1401 dict(stn="KPHA", city="Phoenix"),
1402 dict(state="TX"),
1403 dict(city="New York"),
1404 dict(city="Phoenix", _orderby="stn"),
1405 dict(city="Phoenix", _orderby="stn desc"),
1406 ]:
1407 print queryargs,
1408 result = stations.where(**queryargs)
1409 print len(result)
1410 for r in result: print r
1411 print
1412
1413
1414 print list(stations.where())
1415 print
1416
1417 amfm = Table()
1418 amfm.create_index("stn", unique=True)
1419 amfm.insert(DataObject(stn="KPHY", band="AM"))
1420 amfm.insert(DataObject(stn="KPHX", band="FM"))
1421 amfm.insert(DataObject(stn="KPHA", band="FM"))
1422 amfm.insert(DataObject(stn="KDFW", band="FM"))
1423
1424 try:
1425 amfm.insert(DataObject(stn="KPHA", band="AM"))
1426 except KeyError:
1427 print "duplicate key not allowed"
1428
1429 print
1430 for rec in (stations.join_on("stn") + amfm.join_on("stn")
1431 )(["stn", "city", (amfm,"band","AMFM"),
1432 (stations,"state","st")]).sort("AMFM"):
1433 print repr(rec)
1434
1435 print
1436 for rec in (stations.join_on("stn") + amfm.join_on("stn")
1437 )(["stn", "city", (amfm,"band"), (stations,"state","st")]):
1438 print json_dumps(rec.__dict__)
1439
1440 print
1441 for rec in (stations.join_on("stn") + amfm.join_on("stn"))():
1442 print json_dumps(rec.__dict__)
1443
1444 print
1445 stations.create_index("state")
1446 pivot = stations.pivot("state")
1447 pivot.dump_counts()
1448
1449 print
1450 amfm.create_index("band")
1451 pivot = (stations.join_on("stn") + amfm)().pivot("state band")
1452 pivot.dump_counts()
1453