"""
Created on 2020-08-24
@author: wf
"""
import datetime
import io
import re
# python standard library
import sqlite3
import sys
import time
from lodstorage.lod import LOD
[docs]
class SQLDB(object):
    """
    Structured Query Language Database wrapper around a sqlite3 connection

    :ivar dbname(string): name of the database
    :ivar debug(boolean): True if debug info should be provided
    :ivar errorDebug(boolean): True if debug info should be provided on errors (should not be used for production since it might reveal data)
    :ivar c(Connection): the sqlite3 connection in use
    """

    RAM = ":memory:"

    def __init__(
        self,
        dbname: str = ":memory:",
        connection=None,
        check_same_thread=True,
        timeout=5,
        debug=False,
        errorDebug=False,
    ):
        """
        Construct me for the given dbname and debug

        Args:
            dbname(string): name of the database - default is a RAM based database
            connection(Connection): an optional connection to be reused
            check_same_thread(boolean): True if object handling needs to be on the same thread see https://stackoverflow.com/a/48234567/1497139
            timeout(float): number of seconds for connection timeout
            debug(boolean): if True switch on debug
            errorDebug(boolean): True if debug info should be provided on errors (should not be used for production since it might reveal data)
        """
        self.dbname = dbname
        self.debug = debug
        self.errorDebug = errorDebug
        if connection is None:
            self.c = sqlite3.connect(
                dbname,
                detect_types=sqlite3.PARSE_DECLTYPES,
                check_same_thread=check_same_thread,
                timeout=timeout,
            )
        else:
            # reuse the caller's connection; the other connection options are ignored
            self.c = connection

    def logError(self, msg):
        """
        log the given error message to stderr

        Args:
            msg(str): the error message to display
        """
        print(msg, file=sys.stderr, flush=True)

    def close(self):
        """close my connection"""
        self.c.close()

    def execute(self, ddlCmd):
        """
        execute the given Data Definition Command

        Args:
            ddlCmd(string): e.g. a CREATE TABLE or CREATE View command
        """
        self.c.execute(ddlCmd)

    def createTable(
        self,
        listOfRecords,
        entityName: str,
        primaryKey: str = None,
        withCreate: bool = True,
        withDrop: bool = False,
        sampleRecordCount=1,
        failIfTooFew=True,
    ):
        """
        derive the Data Definition Language CREATE TABLE command from the list of records
        by examining the first sampleRecordCount records and execute the DDL command

        auto detect column types see e.g. https://stackoverflow.com/a/57072280/1497139

        Args:
            listOfRecords(list): a list of Dicts
            entityName(string): the entity / table name to use
            primaryKey(string): the key/column to use as a primary key
            withCreate(boolean): true if the create Table command should be executed - false if only the entityInfo should be returned
            withDrop(boolean): true if the existing Table should be dropped
            sampleRecordCount(int): number of sample records expected and to be inspected - a negative value means all records
            failIfTooFew(boolean): raise an Exception if too few sample records are available else warn only (when debug is on)

        Returns:
            EntityInfo: meta data information for the created table

        Raises:
            Exception: if there are too few sample records and failIfTooFew is set
                or if the CREATE TABLE command fails
        """
        recordCount = len(listOfRecords)
        if sampleRecordCount < 0:
            sampleRecordCount = recordCount
        if recordCount < sampleRecordCount:
            msg = "only %d/%d of needed sample records to createTable available" % (
                recordCount,
                sampleRecordCount,
            )
            if failIfTooFew:
                raise Exception(msg)
            if self.debug:
                self.logError(msg)
        sampleRecords = listOfRecords[:sampleRecordCount]
        entityInfo = EntityInfo(sampleRecords, entityName, primaryKey, debug=self.debug)
        if withDrop:
            self.c.execute(entityInfo.dropTableCmd)
        if withCreate:
            try:
                self.c.execute(entityInfo.createTableCmd)
            except sqlite3.OperationalError as oe:
                # keep the sqlite error as the cause so the traceback stays complete
                raise Exception(
                    f"createTable failed with error {oe} for {entityInfo.createTableCmd}"
                ) from oe
        return entityInfo

    def getDebugInfo(self, record, index, executeMany):
        """
        get the debug info for the given record at the given index depending on the state of executeMany

        Args:
            record(dict): the record to show
            index(int): the index of the record
            executeMany(boolean): if True no single record can be blamed and the result is empty

        Returns:
            str: the debug info - the record content is only included if errorDebug is set
        """
        if executeMany:
            return ""
        if self.errorDebug:
            # showing the details of the record might be a security risk
            return "\nrecord #%d=%s" % (index, repr(record))
        # show only the index
        return "\nrecord #%d" % index

    def store(
        self, listOfRecords, entityInfo, executeMany=False, fixNone=False, replace=False
    ):
        """
        store the given list of records based on the given entityInfo

        Args:
            listOfRecords(list): the list of Dicts to be stored
            entityInfo(EntityInfo): the meta data to be used for storing
            executeMany(bool): if True the insert command is done with many/all records at once
            fixNone(bool): if True make sure empty columns in the listOfDict are filled with "None" values
            replace(bool): if True allow replace for insert

        Raises:
            Exception: if inserting fails - the message contains the insert command
                and the debug info for the failing record
        """
        insertCmd = entityInfo.getInsertCmd(replace=replace)
        record = None
        index = 0
        try:
            if executeMany:
                if fixNone:
                    LOD.setNone4List(listOfRecords, entityInfo.typeMap.keys())
                self.c.executemany(insertCmd, listOfRecords)
            else:
                # index is 1-based and names the failing record in error messages
                for index, record in enumerate(listOfRecords, start=1):
                    if fixNone:
                        LOD.setNone(record, entityInfo.typeMap.keys())
                    self.c.execute(insertCmd, record)
            self.c.commit()
        except sqlite3.ProgrammingError as pe:
            msg = pe.args[0]
            if "You did not supply a value for binding" not in msg:
                raise
            columnName = "?"
            if ":" in msg:
                # sqlite now returns the parameter name not the number
                # You did not supply a value for binding parameter :type.
                nameMatch = re.search(r":([_a-zA-Z]\w*)", msg)
                if nameMatch:
                    columnName = nameMatch.group(1)
            else:
                # pre python 3.10
                # You did not supply a value for binding 2.
                indexMatch = re.search(r"\d+", msg)
                columnNames = list(entityInfo.typeMap.keys())
                if indexMatch and 0 < int(indexMatch.group(0)) <= len(columnNames):
                    columnName = columnNames[int(indexMatch.group(0)) - 1]
            debugInfo = self.getDebugInfo(record, index, executeMany)
            raise Exception(
                "%s\nfailed: no value supplied for column '%s'%s"
                % (insertCmd, columnName, debugInfo)
            ) from pe
        except sqlite3.InterfaceError as ie:
            msg = ie.args[0]
            if "Error binding parameter" not in msg:
                raise
            nameMatch = re.search(r":[_a-zA-Z]\w*", msg)
            # the message may name the parameter by position only
            columnName = nameMatch.group(0) if nameMatch else "?"
            debugInfo = self.getDebugInfo(record, index, executeMany)
            raise Exception(
                "%s\nfailed: error binding column '%s'%s"
                % (insertCmd, columnName, debugInfo)
            ) from ie
        except Exception as ex:
            debugInfo = self.getDebugInfo(record, index, executeMany)
            msg = "%s\nfailed:%s%s" % (insertCmd, str(ex), debugInfo)
            raise Exception(msg) from ex

    def queryGen(self, sqlQuery, params=None):
        """
        run the given sqlQuery as a generator for dicts

        Args:
            sqlQuery(string): the SQL query to be executed
            params(tuple): the query params, if any

        Returns:
            a generator of dicts mapping column names to values
        """
        if self.debug:
            print(sqlQuery)
            if params is not None:
                print(params)
        # https://stackoverflow.com/a/13735506/1497139
        cur = self.c.cursor()
        try:
            if params is not None:
                query = cur.execute(sqlQuery, params)
            else:
                query = cur.execute(sqlQuery)
            # description is None for statements that return no rows
            colname = [d[0] for d in query.description or ()]
            try:
                # loop over all rows
                for row in query:
                    yield dict(zip(colname, row))
            except Exception as ex:
                # best effort: report the problem and end the result set
                self.logError(str(ex))
        finally:
            # also reached when execute fails or the generator is closed early
            cur.close()

    def query(self, sqlQuery, params=None):
        """
        run the given sqlQuery and return a list of Dicts

        Args:
            sqlQuery(string): the SQL query to be executed
            params(tuple): the query params, if any

        Returns:
            list: a list of Dicts
        """
        return list(self.queryGen(sqlQuery, params))

    def queryAll(self, entityInfo, fixDates=True):
        """
        query all records for the table of the given entityInfo

        Args:
            entityInfo(EntityInfo): the meta data of the entity/table to query
            fixDates(boolean): True if date entries should be returned as such and not as strings

        Returns:
            list: a list of Dicts with all records of the table
        """
        sqlQuery = "SELECT * FROM %s" % entityInfo.name
        resultList = self.query(sqlQuery)
        if fixDates:
            entityInfo.fixDates(resultList)
        return resultList

    def getTableList(self, tableType="table"):
        """
        get the schema information from this database

        Args:
            tableType(str): table or view

        Return:
            list: a list of dicts with the name of each table and its
            columns as derived from PRAGMA table_info
        """
        tableQuery = "SELECT name FROM sqlite_master WHERE type=?"
        tableList = self.query(tableQuery, (tableType,))
        for table in tableList:
            # PRAGMA arguments can not be bound - escape quotes in the name instead
            quotedName = table["name"].replace("'", "''")
            table["columns"] = self.query(f"PRAGMA table_info('{quotedName}')")
        return tableList

    def getTableDict(self, tableType="table"):
        """
        get the schema information from this database as a dict

        Args:
            tableType(str): table or view

        Returns:
            dict: Lookup map of tables with columns also being converted to dict
        """
        tableDict = {}
        for table in self.getTableList(tableType=tableType):
            table["columns"] = {col["name"]: col for col in table["columns"]}
            tableDict[table["name"]] = table
        return tableDict

    def restoreProgress(self, status, remaining, total):
        """progress callback for a restore - see progress"""
        self.progress("Restore", status, remaining, total)

    def backupProgress(self, status, remaining, total):
        """progress callback for a backup - see progress"""
        self.progress("Backup", status, remaining, total)

    def progress(self, action, status, remaining, total):
        """
        show progress

        Args:
            action(str): the action to display
            status(int): 0 while the action is still running
            remaining(int): the number of pages still to be handled
            total(int): the total number of pages
        """
        # nothing to copy (total of 0) counts as completely done
        percent = (total - remaining) / total * 100 if total else 100.0
        print(
            "%s %s at %5.0f%%"
            % (
                action,
                "... " if status == 0 else "done",
                percent,
            )
        )

    def backup(
        self,
        backupDB,
        action="Backup",
        profile=False,
        showProgress: int = 200,
        doClose=True,
    ):
        """
        create backup of this SQLDB to the given backup db

        see https://stackoverflow.com/a/59042442/1497139

        Args:
            backupDB(string): the path to the backupdb or SQLDB.RAM for in memory
            action(string): the action to display
            profile(boolean): True if timing information shall be shown
            showProgress(int): show progress at each showProgress page (0=show no progress)
            doClose(boolean): True if the connection to the backup db should be closed

        Returns:
            Connection: the open connection to the backup db if doClose is False else None
        """
        if sys.version_info < (3, 7):
            raise Exception(
                "backup via stdlibrary not available in python <=3.6 use copyToDB instead"
            )
        startTime = time.time()
        bck = sqlite3.connect(backupDB)
        if showProgress > 0:
            if action == "Restore":
                progress = self.restoreProgress
            else:
                progress = self.backupProgress
        else:
            progress = None
        try:
            with bck:
                self.c.backup(bck, pages=showProgress, progress=progress)
        except BaseException:
            # do not leak the target connection when the backup fails
            bck.close()
            raise
        elapsed = time.time() - startTime
        if profile:
            print("%s to %s took %5.1f s" % (action, backupDB, elapsed))
        if doClose:
            bck.close()
            return None
        return bck

    def showDump(self, dump, limit=10):
        """
        show the given dump up to the given limit

        Args:
            dump(string): the SQL dump to show
            limit(int): the maximum number of lines to display
        """
        for index, line in enumerate(io.StringIO(dump)):
            if index >= limit:
                break
            print(line)

    def executeDump(
        self, connection, dump, title, maxErrors=100, errorDisplayLimit=12, profile=True
    ):
        """
        execute the given dump for the given connection

        Args:
            connection(Connection): the sqlite3 connection to use
            dump(string): the SQL commands for the dump
            title(string): the title of the dump
            maxErrors(int): maximum number of errors to be tolerated before stopping and doing a rollback
            errorDisplayLimit(int): maximum number of error messages to be printed
            profile(boolean): True if profiling information should be shown

        Returns:
            a list of errors
        """
        if self.debug:
            self.showDump(dump)
        startTime = time.time()
        if profile:
            print("dump of %s has size %4.1f MB" % (title, len(dump) / 1024 / 1024))
        errors = []
        index = 0
        # fixes https://github.com/WolfgangFahl/ProceedingsTitleParser/issues/37
        for line in dump.split(";\n"):
            try:
                connection.execute(line)
            except sqlite3.OperationalError as soe:
                msg = "SQL error %s in line %d:\n\t%s" % (soe, index, line)
                errors.append(msg)
                if len(errors) <= errorDisplayLimit:
                    print(msg)
                if len(errors) >= maxErrors:
                    # ROLLBACK itself fails when no transaction is open
                    if getattr(connection, "in_transaction", True):
                        connection.execute("ROLLBACK;")
                    break
            index = index + 1
        if profile:
            print(
                "finished executing dump %s with %d lines and %d errors in %5.1f s"
                % (title, index, len(errors), time.time() - startTime)
            )
        return errors

    def copyTo(self, copyDB, profile=True):
        """
        copy my content to another database

        Args:
            copyDB(SQLDB): the target database
            profile(boolean): if True show profile information

        Returns:
            list: the errors that occurred while executing the dump
        """
        startTime = time.time()
        dump = "\n".join(self.c.iterdump())
        if profile:
            print(
                "finished getting dump of %s in %5.1f s"
                % (self.dbname, time.time() - startTime)
            )
        dumpErrors = self.executeDump(copyDB.c, dump, self.dbname, profile=profile)
        return dumpErrors

    @staticmethod
    def restore(backupDB, restoreDB, profile=False, showProgress=200, debug=False):
        """
        restore the restoreDB from the given backup DB

        Args:
            backupDB(string): path to the backupDB e.g. backup.db
            restoreDB(string): path to the restoreDB or in Memory SQLDB.RAM
            profile(boolean): True if timing information should be shown
            showProgress(int): show progress at each showProgress page (0=show no progress)
            debug(boolean): if True switch on debug for the restored SQLDB

        Returns:
            SQLDB: the restored database
        """
        backupSQLDB = SQLDB(backupDB)
        try:
            connection = backupSQLDB.backup(
                restoreDB,
                action="Restore",
                profile=profile,
                showProgress=showProgress,
                doClose=False,
            )
        finally:
            # the source is only needed while copying
            backupSQLDB.close()
        restoreSQLDB = SQLDB(restoreDB, connection=connection, debug=debug)
        return restoreSQLDB
[docs]
class EntityInfo(object):
    """
    holds entity meta Info

    :ivar sampleRecords(list): the sample records the column types are derived from
    :ivar name(string): entity name = table name
    :ivar primaryKey(string): the name of the primary key column
    :ivar typeMap(dict): maps column names to python types
    :ivar sqlTypeMap(dict): maps column names to SQL column types
    :ivar createTableCmd(string): the CREATE TABLE command for this entity
    :ivar dropTableCmd(string): the DROP TABLE command for this entity
    :ivar insertCmd(string): the INSERT command for this entity
    :ivar debug(boolean): True if debug information should be shown
    """

    # python type -> SQL column type; looked up by exact type so that
    # bool is not treated as int and datetime.datetime not as datetime.date
    SQL_TYPES = {
        str: "TEXT",
        int: "INTEGER",
        float: "FLOAT",
        bool: "BOOLEAN",
        datetime.date: "DATE",
        datetime.datetime: "TIMESTAMP",
    }

    def __init__(self, sampleRecords, name, primaryKey=None, debug=False):
        """
        construct me from the given sample records, name and primary key

        Args:
            sampleRecords(list): a list of Dicts to derive the column types from
            name(string): the name of the entity
            primaryKey(string): the name of the primary key column
            debug(boolean): True if debug information should be shown
        """
        self.sampleRecords = sampleRecords
        self.name = name
        self.primaryKey = primaryKey
        self.debug = debug
        self.typeMap = {}
        self.sqlTypeMap = {}
        # fills typeMap and sqlTypeMap as a side effect - must run before getInsertCmd
        self.createTableCmd = self.getCreateTableCmd(sampleRecords)
        self.dropTableCmd = "DROP TABLE IF EXISTS %s" % self.name
        self.insertCmd = self.getInsertCmd()

    def getCreateTableCmd(self, sampleRecords):
        """
        get the CREATE TABLE DDL command for the given sample records

        Columns with an unsupported type are reported and left out. A column that is
        None is typed as TEXT if there is only one sample record, otherwise it is
        skipped in that record so that another sample record may supply its type.

        Args:
            sampleRecords(list): a list of Dicts of sample Records

        Returns:
            string: CREATE TABLE DDL command for this entity info

        Example:

        .. code-block:: sql

            CREATE TABLE Person(name TEXT PRIMARY KEY,born DATE,numberInLine INTEGER,wikidataurl TEXT,age FLOAT,ofAge BOOLEAN)
        """
        singleSample = len(sampleRecords) == 1
        for sampleRecord in sampleRecords:
            for key, value in sampleRecord.items():
                if value is None:
                    if not singleSample:
                        continue
                    print(
                        "Warning sampleRecord column %s is None - using TEXT as type"
                        % key
                    )
                    valueType = str
                else:
                    valueType = type(value)
                sqlType = self.SQL_TYPES.get(valueType)
                if sqlType is None:
                    msg = "warning: unsupported type %s for column %s " % (
                        str(valueType),
                        key,
                    )
                    print(msg)
                    continue
                self.addType(key, valueType, sqlType)
        columnDefs = [
            "%s %s%s"
            % (key, sqlType, " PRIMARY KEY" if key == self.primaryKey else "")
            for key, sqlType in self.sqlTypeMap.items()
        ]
        ddlCmd = "CREATE TABLE %s(%s)" % (self.name, ",".join(columnDefs))
        if self.debug:
            print(ddlCmd)
        return ddlCmd

    def getInsertCmd(self, replace: bool = False) -> str:
        """
        get the INSERT command for this entityInfo

        Args:
            replace(bool): if True allow replace for insert

        Returns:
            str: the INSERT INTO SQL command for this entityInfo using named placeholders e.g.

        Example:

        .. code-block:: sql

            INSERT INTO Person (name,born) values (:name,:born)
        """
        columns = ",".join(self.typeMap.keys())
        placeholders = ":" + ",:".join(self.typeMap.keys())
        replaceClause = " OR REPLACE" if replace else ""
        insertCmd = f"INSERT{replaceClause} INTO {self.name} ({columns}) values ({placeholders})"
        if self.debug:
            print(insertCmd)
        return insertCmd

    def addType(self, column, valueType, sqlType):
        """
        add the python type and SQL type for the given column to the type maps

        the first type seen for a column wins - later calls for the same column are ignored

        Args:
            column(string): the name of the column
            valueType(type): the python type of the column
            sqlType(string): the SQL type of the column
        """
        if column not in self.typeMap:
            self.typeMap[column] = valueType
            self.sqlTypeMap[column] = sqlType

    def fixDates(self, resultList):
        """
        fix date entries in the given resultList by parsing the date content e.g.
        converting '1926-04-21' back to datetime.date(1926, 4, 21)

        only string values are parsed - missing columns, None values and values
        that are already dates are left untouched

        Args:
            resultList(list): the list of records to be fixed
        """
        dateColumns = [
            key for key, valueType in self.typeMap.items() if valueType == datetime.date
        ]
        for record in resultList:
            for key in dateColumns:
                value = record.get(key)
                if isinstance(value, str):
                    record[key] = datetime.datetime.strptime(value, "%Y-%m-%d").date()