Source code for lodstorage.sql

'''
Created on 2020-08-24

@author: wf
'''
# python standard library
import sqlite3
import datetime
import io
import time
import sys
import re
from lodstorage.lod import LOD

class SQLDB(object):
    '''
    Structured Query Language Database wrapper around a sqlite3 connection

    :ivar dbname(string): name of the database
    :ivar debug(boolean): True if debug info should be provided
    :ivar errorDebug(boolean): True if debug info should be provided on errors (should not be used for production since it might reveal data)
    :ivar c(Connection): the sqlite3 connection in use
    '''
    RAM = ":memory:"

    def __init__(self, dbname: str = ':memory:', connection=None, check_same_thread=True, timeout=5, debug=False, errorDebug=False):
        '''
        Construct me for the given dbname and debug

        Args:
            dbname(string): name of the database - default is a RAM based database
            connection(Connection): an optional connection to be reused
            check_same_thread(boolean): True if object handling needs to be on the same thread see https://stackoverflow.com/a/48234567/1497139
            timeout(float): number of seconds for connection timeout
            debug(boolean): if True switch on debug
            errorDebug(boolean): True if debug info should be provided on errors (should not be used for production since it might reveal data)
        '''
        self.dbname = dbname
        self.debug = debug
        self.errorDebug = errorDebug
        if connection is None:
            # PARSE_DECLTYPES lets sqlite3 convert values based on the declared column type
            self.c = sqlite3.connect(dbname, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=check_same_thread, timeout=timeout)
        else:
            self.c = connection

    def logError(self, msg):
        '''
        log the given error message to stderr

        Args:
            msg(str): the error message to display
        '''
        print(msg, file=sys.stderr, flush=True)

    def close(self):
        '''
        close my connection
        '''
        self.c.close()

    def execute(self, ddlCmd):
        '''
        execute the given Data Definition Command

        Args:
            ddlCmd(string): e.g. a CREATE TABLE or CREATE View command
        '''
        self.c.execute(ddlCmd)

    def createTable(self, listOfRecords, entityName: str, primaryKey: str = None, withCreate: bool = True, withDrop: bool = False, sampleRecordCount=1, failIfTooFew=True):
        '''
        derive the Data Definition Language CREATE TABLE command from the list of records
        by examining the first record(s) as defining sample record(s) and execute the DDL command

        auto detect column types see e.g. https://stackoverflow.com/a/57072280/1497139

        Args:
            listOfRecords(list): a list of Dicts
            entityName(string): the entity / table name to use
            primaryKey(string): the key/column to use as a primary key
            withCreate(boolean): true if the create Table command should be executed - false if only the entityInfo should be returned
            withDrop(boolean): true if the existing Table should be dropped
            sampleRecordCount(int): number of sample records expected and to be inspected - a negative value means all records
            failIfTooFew(boolean): raise an Exception if too few sample records are available else warn only (when debug is on)

        Returns:
            EntityInfo: meta data information for the created table

        Raises:
            Exception: if there are too few sample records and failIfTooFew is set
                or if the CREATE TABLE command fails
        '''
        available = len(listOfRecords)
        if sampleRecordCount < 0:
            sampleRecordCount = available
        if available < sampleRecordCount:
            msg = "only %d/%d of needed sample records to createTable available" % (available, sampleRecordCount)
            if failIfTooFew:
                raise Exception(msg)
            elif self.debug:
                self.logError(msg)

        sampleRecords = listOfRecords[:sampleRecordCount]
        entityInfo = EntityInfo(sampleRecords, entityName, primaryKey, debug=self.debug)
        if withDrop:
            self.c.execute(entityInfo.dropTableCmd)
        if withCreate:
            try:
                self.c.execute(entityInfo.createTableCmd)
            except sqlite3.OperationalError as oe:
                # keep the original sqlite3 error as the cause for easier debugging
                raise Exception(f"createTable failed with error {oe} for {entityInfo.createTableCmd}") from oe
        return entityInfo

    def getDebugInfo(self, record, index, executeMany):
        '''
        get the debug info for the given record at the given index depending on the state of executeMany

        Args:
            record(dict): the record to show
            index(int): the index of the record
            executeMany(boolean): if True the record may be valid else not

        Returns:
            str: the debug info - empty if executeMany is set since the failing record is unknown then
        '''
        debugInfo = ""
        if not executeMany:
            # shall we show the details of the record (which might be a security risk)
            if self.errorDebug:
                # show details of record
                debugInfo = "\nrecord #%d=%s" % (index, repr(record))
            else:
                # show only index
                debugInfo = "\nrecord #%d" % index
        return debugInfo

    @staticmethod
    def _columnFromBindingError(msg, columns):
        '''
        derive the affected column name from a sqlite3 parameter binding error message

        Depending on the message the parameter is either named (":name") or
        given by its 1-based position - both forms are handled here.

        Args:
            msg(str): the sqlite3 error message
            columns(list): the column names in placeholder order

        Returns:
            str: the column name or "?" if it can not be derived
        '''
        named = re.search(r':([_a-zA-Z]\w*)', msg)
        if named:
            return named.group(1)
        positional = re.search(r'\d+', msg)
        if positional:
            columnIndex = int(positional.group()) - 1
            if 0 <= columnIndex < len(columns):
                return columns[columnIndex]
        return "?"

    def store(self, listOfRecords, entityInfo, executeMany=False, fixNone=False):
        '''
        store the given list of records based on the given entityInfo

        Args:
            listOfRecords(list): the list of Dicts to be stored
            entityInfo(EntityInfo): the meta data to be used for storing
            executeMany(bool): if True the insert command is done with many/all records at once
            fixNone(bool): if True make sure empty columns in the listOfDict are filled with "None" values

        Raises:
            Exception: with the insert command and the failing column / record index if storing fails
        '''
        insertCmd = entityInfo.insertCmd
        record = None
        index = 0
        try:
            if executeMany:
                if fixNone:
                    LOD.setNone4List(listOfRecords, entityInfo.typeMap.keys())
                self.c.executemany(insertCmd, listOfRecords)
            else:
                for record in listOfRecords:
                    index += 1
                    if fixNone:
                        LOD.setNone(record, entityInfo.typeMap.keys())
                    self.c.execute(insertCmd, record)
            self.c.commit()
        except (sqlite3.ProgrammingError, sqlite3.InterfaceError) as err:
            msg = str(err.args[0]) if err.args else str(err)
            columns = list(entityInfo.typeMap.keys())
            if "You did not supply a value for binding" in msg:
                columnName = self._columnFromBindingError(msg, columns)
                debugInfo = self.getDebugInfo(record, index, executeMany)
                raise Exception("%s\nfailed: no value supplied for column '%s'%s" % (insertCmd, columnName, debugInfo)) from err
            if "Error binding parameter" in msg:
                # prefer the named form including the colon, fall back to the position
                named = re.findall(r':[_a-zA-Z]\w*', msg)
                columnName = named[0] if named else self._columnFromBindingError(msg, columns)
                debugInfo = self.getDebugInfo(record, index, executeMany)
                raise Exception("%s\nfailed: error binding column '%s'%s" % (insertCmd, columnName, debugInfo)) from err
            # unknown message: pass the original sqlite3 error on unchanged
            raise
        except Exception as ex:
            debugInfo = self.getDebugInfo(record, index, executeMany)
            msg = "%s\nfailed:%s%s" % (insertCmd, str(ex), debugInfo)
            raise Exception(msg) from ex

    def queryGen(self, sqlQuery, params=None):
        '''
        run the given sqlQuery as a generator for dicts

        Errors while fetching rows are logged via logError and end the
        iteration instead of being raised.

        Args:
            sqlQuery(string): the SQL query to be executed
            params(tuple): the query params, if any

        Returns:
            a generator of dicts
        '''
        if self.debug:
            print(sqlQuery)
            if params is not None:
                print(params)
        # https://stackoverflow.com/a/13735506/1497139
        cur = self.c.cursor()
        try:
            if params is not None:
                query = cur.execute(sqlQuery, params)
            else:
                query = cur.execute(sqlQuery)
            # description is None for statements that do not return rows
            colname = [d[0] for d in query.description or ()]
            try:
                # loop over all rows
                for row in query:
                    yield dict(zip(colname, row))
            except Exception as ex:
                # best effort: report the problem and stop yielding
                self.logError(str(ex))
        finally:
            # also reached when the generator is closed before being exhausted
            cur.close()

    def query(self, sqlQuery, params=None):
        '''
        run the given sqlQuery and return a list of Dicts

        Args:
            sqlQuery(string): the SQL query to be executed
            params(tuple): the query params, if any

        Returns:
            list: a list of Dicts
        '''
        return list(self.queryGen(sqlQuery, params))

    def queryAll(self, entityInfo, fixDates=True):
        '''
        query all records for the table of the given entityInfo

        Args:
            entityInfo(EntityInfo): the meta data of the entity/table to query
            fixDates(boolean): True if date entries should be returned as such and not as strings

        Returns:
            list: a list of Dicts with all records of the table
        '''
        sqlQuery = 'SELECT * FROM %s' % entityInfo.name
        resultList = self.query(sqlQuery)
        if fixDates:
            entityInfo.fixDates(resultList)
        return resultList

    def getTableList(self, tableType='table'):
        '''
        get the schema information from this database

        Args:
            tableType(str): table or view

        Returns:
            list: a list of dicts with the "name" from sqlite_master and
                the "columns" as derived from PRAGMA table_info
        '''
        tableQuery = "SELECT name FROM sqlite_master WHERE type=?"
        tableList = self.query(tableQuery, (tableType,))
        for table in tableList:
            # PRAGMA arguments can not be bound - escape quotes in the name instead
            tableName = table['name'].replace("'", "''")
            columnQuery = f"PRAGMA table_info('{tableName}')"
            table['columns'] = self.query(columnQuery)
        return tableList

    def getTableDict(self, tableType='table'):
        '''
        get the schema information from this database as a dict

        Args:
            tableType(str): table or view

        Returns:
            dict: Lookup map of tables with columns also being converted to dict
        '''
        tableDict = {}
        for table in self.getTableList(tableType=tableType):
            table["columns"] = {col['name']: col for col in table["columns"]}
            tableDict[table['name']] = table
        return tableDict

    def restoreProgress(self, status, remaining, total):
        '''
        progress callback for a restore

        Args:
            status(int): status as supplied by the sqlite3 backup callback
            remaining(int): number of pages still to be copied
            total(int): total number of pages
        '''
        self.progress("Restore", status, remaining, total)

    def backupProgress(self, status, remaining, total):
        '''
        progress callback for a backup

        Args:
            status(int): status as supplied by the sqlite3 backup callback
            remaining(int): number of pages still to be copied
            total(int): total number of pages
        '''
        self.progress("Backup", status, remaining, total)

    def progress(self, action, status, remaining, total):
        '''
        show progress

        Args:
            action(str): the action to display
            status(int): 0 is displayed as "... " any other value as "done"
            remaining(int): number of pages still to be copied
            total(int): total number of pages
        '''
        # avoid a division by zero when there are no pages at all
        percent = (total - remaining) / total * 100 if total else 100.0
        print('%s %s at %5.0f%%' % (action, "... " if status == 0 else "done", percent))

    def backup(self, backupDB, action="Backup", profile=False, showProgress: int = 200, doClose=True):
        '''
        create backup of this SQLDB to the given backup db

        see https://stackoverflow.com/a/59042442/1497139

        Args:
            backupDB(string): the path to the backupdb or SQLDB.RAM for in memory
            action(string): the action to display
            profile(boolean): True if timing information shall be shown
            showProgress(int): show progress at each showProgress page (0=show no progress)
            doClose(boolean): True if the connection to the backup db should be closed

        Returns:
            Connection: the open connection to the backup db if doClose is False else None
        '''
        # sys.version_info has five elements so (3, 6, x, ...) <= (3, 6) is never true
        if sys.version_info < (3, 7):
            raise Exception("backup via stdlibrary not available in python <=3.6 use copyToDB instead")
        startTime = time.time()
        bck = sqlite3.connect(backupDB)
        if showProgress > 0:
            progress = self.restoreProgress if action == "Restore" else self.backupProgress
        else:
            progress = None
        try:
            with bck:
                self.c.backup(bck, pages=showProgress, progress=progress)
        except Exception:
            # do not leak the target connection if the copy fails
            bck.close()
            raise
        elapsed = time.time() - startTime
        if profile:
            print("%s to %s took %5.1f s" % (action, backupDB, elapsed))
        if doClose:
            bck.close()
            return None
        return bck

    def showDump(self, dump, limit=10):
        '''
        show the given dump up to the given limit

        Args:
            dump(string): the SQL dump to show
            limit(int): the maximum number of lines to display
        '''
        for index, line in enumerate(io.StringIO(dump)):
            if index >= limit:
                break
            print(line)

    def executeDump(self, connection, dump, title, maxErrors=100, errorDisplayLimit=12, profile=True):
        '''
        execute the given dump for the given connection

        Args:
            connection(Connection): the sqlite3 connection to use
            dump(string): the SQL commands for the dump
            title(string): the title of the dump
            maxErrors(int): maximum number of errors to be tolerated before stopping and doing a rollback
            errorDisplayLimit(int): maximum number of error messages to print
            profile(boolean): True if profiling information should be shown

        Returns:
            a list of errors
        '''
        if self.debug:
            self.showDump(dump)
        startTime = time.time()
        if profile:
            print("dump of %s has size %4.1f MB" % (title, len(dump) / 1024 / 1024))
        errors = []
        index = 0
        # fixes https://github.com/WolfgangFahl/ProceedingsTitleParser/issues/37
        for line in dump.split(";\n"):
            try:
                connection.execute(line)
            except sqlite3.OperationalError as soe:
                msg = "SQL error %s in line %d:\n\t%s" % (soe, index, line)
                errors.append(msg)
                if len(errors) <= errorDisplayLimit:
                    print(msg)
                if len(errors) >= maxErrors:
                    # rollback() is safe even if no transaction is active
                    # while executing "ROLLBACK;" would raise in that case
                    connection.rollback()
                    break
            index = index + 1
        if profile:
            print("finished executing dump %s with %d lines and %d errors in %5.1f s" % (title, index, len(errors), time.time() - startTime))
        return errors

    def copyTo(self, copyDB, profile=True):
        '''
        copy my content to another database

        Args:
            copyDB(SQLDB): the target database - its connection copyDB.c is used
            profile(boolean): if True show profile information

        Returns:
            list: the errors that occurred while executing the dump
        '''
        startTime = time.time()
        dump = "\n".join(self.c.iterdump())
        if profile:
            print("finished getting dump of %s in %5.1f s" % (self.dbname, time.time() - startTime))
        dumpErrors = self.executeDump(copyDB.c, dump, self.dbname, profile=profile)
        return dumpErrors

    @staticmethod
    def restore(backupDB, restoreDB, profile=False, showProgress=200, debug=False):
        '''
        restore the restoreDB from the given backup DB

        Args:
            backupDB(string): path to the backupDB e.g. backup.db
            restoreDB(string): path to the restoreDB or in Memory SQLDB.RAM
            profile(boolean): True if timing information should be shown
            showProgress(int): show progress at each showProgress page (0=show no progress)
            debug(boolean): True if the restored SQLDB should show debug information

        Returns:
            SQLDB: the restored database
        '''
        backupSQLDB = SQLDB(backupDB)
        try:
            connection = backupSQLDB.backup(restoreDB, action="Restore", profile=profile, showProgress=showProgress, doClose=False)
        finally:
            # the source connection is not needed any more once the copy is done
            backupSQLDB.close()
        restoreSQLDB = SQLDB(restoreDB, connection=connection, debug=debug)
        return restoreSQLDB
class EntityInfo(object):
    """
    holds entity meta Info

    :ivar sampleRecords(list): the sample records the column types are derived from
    :ivar name(string): entity name = table name
    :ivar primaryKey(string): the name of the primary key column
    :ivar typeMap(dict): maps column names to python types
    :ivar sqlTypeMap(dict): maps column names to SQL types
    :ivar createTableCmd(string): the CREATE TABLE command for this entity
    :ivar dropTableCmd(string): the DROP TABLE command for this entity
    :ivar insertCmd(string): the INSERT command for this entity
    :ivar debug(boolean): True if debug information should be shown
    """

    def __init__(self, sampleRecords, name, primaryKey=None, debug=False):
        '''
        construct me from the given name and primary key

        Args:
            sampleRecords(list): a list of Dicts to derive the column types from
            name(string): the name of the entity
            primaryKey(string): the name of the primary key column
            debug(boolean): True if debug information should be shown
        '''
        self.sampleRecords = sampleRecords
        self.name = name
        self.primaryKey = primaryKey
        self.debug = debug
        self.typeMap = {}
        self.sqlTypeMap = {}
        # the create command fills typeMap/sqlTypeMap which the insert command relies on
        self.createTableCmd = self.getCreateTableCmd(sampleRecords)
        self.dropTableCmd = "DROP TABLE IF EXISTS %s" % self.name
        self.insertCmd = self.getInsertCmd()

    def getCreateTableCmd(self, sampleRecords):
        '''
        get the CREATE TABLE DDL command for the given sample records

        The first non-None value found for a column defines its type. A None
        value is mapped to TEXT (with a warning) if there is only one sample record.

        Args:
            sampleRecords(list): a list of Dicts of sample Records

        Returns:
            string: CREATE TABLE DDL command for this entity info

        Example:

        .. code-block:: sql

            CREATE TABLE Person(name TEXT PRIMARY KEY,born DATE,numberInLine INTEGER,wikidataurl TEXT,age FLOAT,ofAge BOOLEAN)

        '''
        # exact python type -> SQL type (bool must not be treated as int, datetime not as date)
        sqlTypes = {
            str: "TEXT",
            int: "INTEGER",
            float: "FLOAT",
            bool: "BOOLEAN",
            datetime.date: "DATE",
            datetime.datetime: "TIMESTAMP",
        }
        for sampleRecord in sampleRecords:
            for key, value in sampleRecord.items():
                valueType = None
                if value is None:
                    if len(sampleRecords) == 1:
                        print("Warning sampleRecord column %s is None - using TEXT as type" % key)
                        valueType = str
                else:
                    valueType = type(value)
                if valueType is None:
                    # undecided - another sample record may define the type
                    continue
                sqlType = sqlTypes.get(valueType)
                if sqlType is None:
                    msg = "warning: unsupported type %s for column %s " % (str(valueType), key)
                    print(msg)
                    continue
                self.addType(key, valueType, sqlType)
        columnDefs = [
            "%s %s%s" % (key, sqlType, " PRIMARY KEY" if key == self.primaryKey else "")
            for key, sqlType in self.sqlTypeMap.items()
        ]
        ddlCmd = "CREATE TABLE %s(%s)" % (self.name, ",".join(columnDefs))
        if self.debug:
            print(ddlCmd)
        return ddlCmd

    def getInsertCmd(self):
        '''
        get the INSERT command for this entityInfo

        Returns:
            the INSERT INTO SQL command for this entityInfo using named placeholders e.g.

        Example:

        .. code-block:: sql

            INSERT INTO Person (name,born,numberInLine,wikidataurl,age,ofAge) values (:name,:born,:numberInLine,:wikidataurl,:age,:ofAge)

        '''
        columns = ','.join(self.typeMap.keys())
        placeholders = ':' + ',:'.join(self.typeMap.keys())
        insertCmd = "INSERT INTO %s (%s) values (%s)" % (self.name, columns, placeholders)
        if self.debug:
            print(insertCmd)
        return insertCmd

    def addType(self, column, valueType, sqlType):
        '''
        add the python type and SQL type for the given column to the type maps
        unless the column is already known

        Args:
            column(string): the name of the column
            valueType(type): the python type of the column
            sqlType(string): the SQL type of the column
        '''
        if column not in self.typeMap:
            self.typeMap[column] = valueType
            self.sqlTypeMap[column] = sqlType

    def fixDates(self, resultList):
        '''
        fix date entries in the given resultList by parsing the date content e.g.
        converting '1926-04-21' back to datetime.date(1926, 4, 21)

        Only string values are parsed - missing/None values and values that
        already are date objects are left untouched.

        Args:
            resultList(list): the list of records to be fixed
        '''
        dateColumns = [key for key, valueType in self.typeMap.items() if valueType == datetime.date]
        for record in resultList:
            for key in dateColumns:
                value = record.get(key)
                if isinstance(value, str):
                    record[key] = datetime.datetime.strptime(value, "%Y-%m-%d").date()