Source code for lodstorage.query

'''
Created on 2020-08-22

@author: wf
'''

from enum import Enum 
import os
import yaml
from tabulate import tabulate
import urllib
import copy
#from wikibot.mwTable import MediaWikiTable
# redundant copy in this library to avoid dependency issues
# original is at
from lodstorage.jsonable import JSONAble
from lodstorage.mwTable import MediaWikiTable
from pylatexenc.latexencode import unicode_to_latex
import re
import sys
from pathlib import Path
from pygments import highlight
from pygments.lexers import get_lexer_by_name
from pygments.formatters.html import HtmlFormatter
from pygments.formatters.latex import LatexFormatter


class Format(Enum):
    '''
    the supported formats for the results to be delivered
    '''
    csv = "csv"
    json = "json"
    xml = "xml"
    tsv = "tsv"
    latex = "latex"
    mediawiki = "mediawiki"
    github = "github"

    def __str__(self):
        '''
        Returns:
            str: the plain value of this format e.g. "csv"
        '''
        plainValue = self.value
        return plainValue
class YamlPath:
    '''
    lookup of yaml configuration file locations
    '''

    @staticmethod
    def getPaths(yamlFileName: str, yamlPath: str = None):
        '''
        get the list of yaml file paths to read for the given file name

        Args:
            yamlFileName(str): the name of the yaml file e.g. "queries.yaml"
            yamlPath(str): the primary path to use - if None the file of the
                given name in the sampledata directory next to this module is used

        Returns:
            list: the primary path followed by the path of the file with the same
                name in the .pylodstorage directory of the user's home directory
                if such a file exists
        '''
        primaryPath = yamlPath
        if primaryPath is None:
            primaryPath = f"{os.path.dirname(__file__)}/../sampledata/{yamlFileName}"
        # additional yamls from users yaml configuration
        userPath = f"{str(Path.home())}/.pylodstorage/{yamlFileName}"
        if os.path.isfile(userPath):
            return [primaryPath, userPath]
        return [primaryPath]
class ValueFormatter():
    '''
    a value Formatter that replaces string values of a record by link markup
    '''
    home = str(Path.home())
    # default location of the format specifications
    formatsPath = f"{os.path.dirname(__file__)}/../sampledata/formats.yaml"
    # class level cache of the ValueFormatters - filled by getFormats
    valueFormats = None

    def __init__(self, name: str, formatString: str, regexps: list = None,):
        '''
        constructor

        Args:
            name(str): the name of this formatter
            formatString(str): the format String to use - it is filled via
                str.format with the keyword "value"
            regexps(list): the regular expressions to apply - None is treated
                as "no regular expressions"
        '''
        self.name = name
        # normalize None so that applyFormat can always iterate
        self.regexps = [] if regexps is None else regexps
        self.formatString = formatString

    @classmethod
    def fromDict(cls, name: str, record: dict):
        '''
        create a ValueFormatter from the given dict

        Args:
            name(str): the name of the formatter
            record(dict): the specification with the mandatory key "format"
                and the optional key "regexps"

        Returns:
            ValueFormatter: the formatter for the given specification
        '''
        # a missing or empty "regexps" entry means: always format
        regexps = record.get("regexps") or []
        vf = ValueFormatter(name=name, formatString=record["format"], regexps=regexps)
        return vf

    @classmethod
    def getFormats(cls, formatsPath: str = None) -> dict:
        '''
        get the available ValueFormatters

        The result is cached on the class: once the formatters have been
        loaded later calls return the cached map and ignore formatsPath.

        Args:
            formatsPath(str): the path to the yaml file to read the format specs from

        Returns:
            dict: a map for ValueFormatters by formatter Name
        '''
        if cls.valueFormats is None:
            valueFormats = {}
            for formatPath in YamlPath.getPaths("formats.yaml", formatsPath):
                with open(formatPath, 'r') as stream:
                    valueFormatRecords = yaml.safe_load(stream)
                # an empty yaml document is loaded as None
                if not valueFormatRecords:
                    continue
                for valueFormatKey, valueFormatRecord in valueFormatRecords.items():
                    valueFormats[valueFormatKey] = ValueFormatter.fromDict(name=valueFormatKey, record=valueFormatRecord)
            cls.valueFormats = valueFormats
        return cls.valueFormats

    def applyFormat(self, record, key, resultFormat: "Format | str"):
        '''
        apply the given format to the given record

        The value is only modified if it is a string, if it matches one of my
        regular expressions (or there are none) and if the resultFormat is one
        of "github", "mediawiki" or "latex".

        Args:
            record(dict): the record to handle - modified in place
            key(str): the property key
            resultFormat(str): the resultFormat Style to apply
        '''
        if key not in record:
            return
        value = record[key]
        if not isinstance(value, str):
            return
        regexps = self.regexps if self.regexps is not None else []
        # if there are no regular expressions specified always format
        doformat = len(regexps) == 0
        for regexp in regexps:
            try:
                vmatch = re.match(regexp, value)
                if vmatch:
                    # we found a match and will format it if the value is not none
                    doformat = True
                    value = vmatch.group("value")
            except Exception as ex:
                # best effort: report the problem and continue with the next expression
                print(f"ValueFormatter: {self.name}\nInvalid regular expression:{regexp}\n{str(ex)}", file=sys.stderr)
            if value is None:
                # the "value" group did not take part in the match - nothing left to format
                break
        if value is None or not doformat:
            return
        link = self.formatString.format(value=value)
        # str() lets callers pass an enum whose __str__ yields the format name as well as a plain string
        formatName = str(resultFormat)
        newValue = None
        if formatName == "github":
            newValue = f"[{value}]({link})"
        elif formatName == "mediawiki":
            newValue = f"[{link} {value}]"
        elif formatName == "latex":
            newValue = f"\\href{{{link}}}{{{value}}}"
        if newValue is not None:
            record[key] = newValue
class QuerySyntaxHighlight:
    '''
    Syntax highlighting for queries with pygments
    '''

    def __init__(self, query, highlightFormat: str = "html"):
        '''
        construct me for the given query and highlightFormat

        Args:
            query(Query): the query to do the syntax highlighting for
            highlightFormat(str): the highlight format to be used
                - "html" and "latex" are supported
        '''
        self.query = query
        self.highlightFormat = highlightFormat
        self.lexer = get_lexer_by_name(self.query.lang)
        # stays None for unsupported formats so that highlight can report the problem
        self.formatter = None
        if self.highlightFormat == "html":
            self.formatter = HtmlFormatter()
        elif self.highlightFormat == "latex":
            self.formatter = LatexFormatter()

    def highlight(self):
        '''
        Returns:
            str: the result of the syntax highlighting with pygments

        Raises:
            ValueError: if no formatter is available for my highlightFormat
        '''
        formatter = getattr(self, "formatter", None)
        if formatter is None:
            raise ValueError(f"unsupported highlightFormat '{self.highlightFormat}' - expected 'html' or 'latex'")
        syntaxResult = highlight(self.query.query, self.lexer, formatter)
        return syntaxResult
class QueryResultDocumentation():
    '''
    documentation of a query result
    '''

    def __init__(self, query, title: str, tablefmt: str, tryItMarkup: str, sourceCodeHeader: str, sourceCode: str, resultHeader: str, result: str):
        '''
        constructor

        Args:
            query(Query): the query to be documented
            title(str): the title markup
            tablefmt(str): the tableformat that has been used
            tryItMarkup: the "try it!" markup to show - stored with a leading newline
            sourceCodeHeader(str): the header title to use for the sourceCode
            sourceCode(str): the sourceCode
            resultHeader(str): the header title to use for the result
            result(str): the result markup
        '''
        self.query = query
        self.title = title
        self.tablefmt = tablefmt
        self.tryItMarkup = f"\n{tryItMarkup}"
        self.sourceCodeHeader = sourceCodeHeader
        self.sourceCode = sourceCode
        self.resultHeader = resultHeader
        self.result = result

    @staticmethod
    def uniCode2Latex(text: str, withConvert: bool = False) -> str:
        '''
        converts unicode text to latex and fixes UTF-8 chars for latex in a certain range:
            ₀:$_0$ ... ₉:$_9$

        see https://github.com/phfaist/pylatexenc/issues/72

        Args:
            text(str): the string to fix
            withConvert(bool): if unicode to latex libary conversion should be used

        Return:
            str: latex presentation of UTF-8 char
        '''
        # code points 8320..8329 are the subscript digits zero to nine
        subscriptTable = {8320 + digit: f"$_{digit}$" for digit in range(10)}
        fixedText = text.translate(subscriptTable)
        if not withConvert:
            return fixedText
        return unicode_to_latex(fixedText)

    def __str__(self):
        '''
        simple string representation
        '''
        return self.asText()

    def asText(self):
        '''
        return my text representation

        Returns:
            str: title, description, sourceCodeHeader, sourceCode, tryIt link,
                resultHeader and result - with the subscript digits fixed
                if my table format is latex
        '''
        text = f"{self.title}\n{self.query.description}\n{self.sourceCodeHeader}\n{self.sourceCode}{self.tryItMarkup}\n{self.resultHeader}\n{self.result}"
        if self.tablefmt.lower() == "latex":
            return self.uniCode2Latex(text)
        return text
class Query(object):
    '''
    a Query e.g. for SPARQL
    '''

    def __init__(self, name: str, query: str, lang='sparql', endpoint: str = None, title: str = None, description: str = None, prefixes=None, tryItUrl: str = None, formats: list = None, debug=False):
        '''
        constructor

        Args:
            name(string): the name/label of the query
            query(string): the native Query text e.g. in SPARQL
            lang(string): the language of the query e.g. SPARQL
            endpoint(string): the endpoint url to use
            title(string): the header/title of the query - defaults to the name
            description(string): the description of the query - defaults to ""
            prefixes(list): list of prefixes to be resolved
            tryItUrl(str): the url of a "tryit" webpage
            formats(list): key:formatName specifications of ValueFormatters to be applied
            debug(boolean): true if debug mode should be switched on
        '''
        self.name = name
        self.query = query
        self.lang = lang
        self.endpoint = endpoint
        self.title = name if title is None else title
        self.description = "" if description is None else description
        self.prefixes = prefixes
        self.debug = debug
        self.tryItUrl = tryItUrl
        self.formats = formats
        self.formatCallBacks = []

    def __str__(self):
        '''
        Returns:
            str: one "key:value" line for each of my attributes that is not None
        '''
        return "\n".join(f"{key}:{value}" for key, value in self.__dict__.items() if value is not None)

    def addFormatCallBack(self, callback):
        '''
        add the given callback to my list of format callbacks

        Args:
            callback: a callable invoked as callback(record,key,value,tablefmt)
                by preFormatWithCallBacks
        '''
        self.formatCallBacks.append(callback)

    def preFormatWithCallBacks(self, lod, tablefmt: str):
        '''
        run the configured call backs to pre-format the given list of dicts for the given tableformat

        Args:
            lod(list): the list of dicts to handle
            tablefmt(str): the table format (according to tabulate) to apply
        '''
        for record in lod:
            # iterate over a snapshot of the keys so that callbacks may safely modify the record
            for key in list(record):
                value = record[key]
                if value is None:
                    continue
                for formatCallBack in self.formatCallBacks:
                    formatCallBack(record, key, value, tablefmt)

    def formatWithValueFormatters(self, lod, tablefmt: str):
        '''
        format the given list of Dicts with the ValueFormatters

        Args:
            lod(list): the list of dicts to handle - modified in place
            tablefmt(str): the table format to apply
        '''
        # is there anything to do?
        if self.formats is None:
            # no
            return
        # get the value Formatters that might apply here
        valueFormatters = ValueFormatter.getFormats()
        formatsToApply = {}
        for valueFormatSpec in self.formats:
            # e.g. president:wikidata
            parts = valueFormatSpec.split(":")
            if len(parts) < 2:
                # a specification without a format name can not be applied
                continue
            keytoformat = parts[0]
            formatName = parts[1]
            if formatName in valueFormatters:
                formatsToApply[keytoformat] = valueFormatters[formatName]
        for record in lod:
            for keytoformat, valueFormatter in formatsToApply.items():
                if keytoformat == "*":
                    # format all key values
                    for key in record:
                        valueFormatter.applyFormat(record, key, tablefmt)
                elif keytoformat in record:
                    # or just a selected one
                    valueFormatter.applyFormat(record, keytoformat, tablefmt)

    def getTryItUrl(self, baseurl: str):
        '''
        return the "try it!" url for the given baseurl

        Args:
            baseurl(str): the baseurl to used

        Returns:
            str: the "try it!" url for the given query
        '''
        # a bare "import urllib" does not guarantee that the parse submodule is loaded
        from urllib.parse import quote
        # https://stackoverflow.com/a/9345102/1497139
        quoted = quote(self.query)
        quoted = f"#{quoted}"
        url = f"{baseurl}/{quoted}"
        return url

    def getLink(self, url: str, title: str, tablefmt: str) -> str:
        '''
        convert the given url and title to link markup for the given table format

        Args:
            url(str): the url to link to
            title(str): the title to show
            tablefmt(str): the table format to create the markup for

        Returns:
            str: github, mediawiki or latex link markup -
                for all other formats the title followed by the url
        '''
        if tablefmt == "github":
            return f"[{title}]({url})"
        if tablefmt == "mediawiki":
            return f"[{url} {title}]"
        if tablefmt == "latex":
            return f"\\href{{{url}}}{{{title}}}"
        return f"{title} {url}"

    def asYaml(self):
        '''
        Returns:
            str: the result of yaml.dump for this query
        '''
        yamlMarkup = yaml.dump(self)
        return yamlMarkup

    def asWikiSourceMarkup(self):
        '''
        convert me to Mediawiki markup for syntax highlighting using the "source" tag

        Returns:
            string: the Markup
        '''
        markup = f"<source lang='{self.lang}'>\n{self.query}\n</source>\n"
        return markup

    def asWikiMarkup(self, listOfDicts):
        '''
        convert the given listOfDicts result to MediaWiki markup

        Args:
            listOfDicts(list): the list of Dicts to convert to MediaWiki markup

        Returns:
            string: the markup
        '''
        if self.debug:
            print(listOfDicts)
        mwTable = MediaWikiTable()
        mwTable.fromListOfDicts(listOfDicts)
        markup = mwTable.asWikiMarkup()
        return markup

    def documentQueryResult(self, qlod: list, limit=None, tablefmt: str = "mediawiki", tryItUrl: str = None, withSourceCode=True, **kwArgs):
        '''
        document the given query results - note that a copy of the whole list is going to be created for being able to format

        Args:
            qlod: the list of dicts result
            limit(int): the maximum number of records to display in result tabulate
            tablefmt(str): the table format to use
            tryItUrl: the "try it!" url to show - defaults to my tryItUrl
            withSourceCode(bool): if True document the source code
            kwArgs: additional keyword arguments passed on to tabulate

        Return:
            QueryResultDocumentation: the documentation for the given parameters
                - its asText method delivers the tabular text
        '''
        sourceCode = self.query
        tryItMarkup = ""
        sourceCodeHeader = ""
        resultHeader = ""
        title = self.title
        # format a deep copy so that the records of the caller stay untouched
        lod = copy.deepcopy(qlod if limit is None else qlod[:limit])
        self.preFormatWithCallBacks(lod, tablefmt=tablefmt)
        self.formatWithValueFormatters(lod, tablefmt=tablefmt)
        result = tabulate(lod, headers="keys", tablefmt=tablefmt, **kwArgs)
        if tryItUrl is None and hasattr(self, 'tryItUrl'):
            tryItUrl = self.tryItUrl
        if tablefmt == "github":
            title = f"## {self.title}"
            resultHeader = "## result"
        elif tablefmt == "mediawiki":
            title = f"== {self.title} =="
            resultHeader = "=== result ==="
        elif tablefmt == "latex":
            resultHeader = ""
            result = "\n".join([
                "\\begin{table}",
                f"\\caption{{{self.title}}}",
                f"\\label{{tab:{self.name}}}",
                f"{result}",
                "\\end{table}",
                "",
            ])
        else:
            title = f"{self.title}"
            resultHeader = "result:"
        if withSourceCode:
            if tryItUrl is not None:
                # without a base url there is nothing sensible to link to
                tryItUrlEncoded = self.getTryItUrl(tryItUrl)
                tryItMarkup = self.getLink(tryItUrlEncoded, "try it!", tablefmt)
            if tablefmt == "github":
                sourceCodeHeader = "### query"
                sourceCode = f"```{self.lang}\n{self.query}\n```"
            elif tablefmt == "mediawiki":
                sourceCodeHeader = "=== query ==="
                sourceCode = f"<source lang='{self.lang}'>\n{self.query}\n</source>\n"
            elif tablefmt == "latex":
                sourceCodeHeader = f"see query listing \\ref{{listing:{self.name}}} and result table \\ref{{tab:{self.name}}}"
                sourceCode = "\n".join([
                    "\\begin{listing}[ht]",
                    f"\\caption{{{self.title}}}",
                    f"\\label{{listing:{self.name}}}",
                    f"\\begin{{minted}}{{{self.lang.lower()}}}",
                    f"{self.query}",
                    "\\end{minted}",
                    f"{tryItMarkup}",
                    "\\end{listing}",
                    "",
                ])
            else:
                sourceCodeHeader = "query:"
                sourceCode = f"{self.query}"
        if self.lang != "sparql":
            # "try it!" links are only offered for SPARQL queries
            tryItMarkup = ""
        queryResultDocumentation = QueryResultDocumentation(query=self, title=title, tablefmt=tablefmt, tryItMarkup=tryItMarkup, sourceCodeHeader=sourceCodeHeader, sourceCode=sourceCode, resultHeader=resultHeader, result=result)
        return queryResultDocumentation
class QueryManager(object):
    '''
    manages pre packaged Queries
    '''

    def __init__(self, lang: str = None, debug=False, queriesPath=None):
        '''
        Constructor

        Args:
            lang(string): the language to use for the queries sql or sparql - defaults to sql
            debug(boolean): True if debug information should be shown
            queriesPath(str): the path of the yaml file to read the queries from
                - passed on to getQueries
        '''
        if lang is None:
            lang = 'sql'
        self.queriesByName = {}
        self.lang = lang
        self.debug = debug
        queries = QueryManager.getQueries(queriesPath=queriesPath)
        for name, queryDict in queries.items():
            if self.lang not in queryDict:
                # no variant of this query for my language
                continue
            queryText = queryDict.pop(self.lang)
            # drop not needed query variants - the remaining entries are
            # handed to the Query constructor as keyword arguments
            for qformat in ['sparql', 'sql']:
                queryDict.pop(qformat, None)
            query = Query(name=name, query=queryText, lang=self.lang, **queryDict, debug=self.debug)
            self.queriesByName[name] = query

    @staticmethod
    def getQueries(queriesPath=None):
        '''
        get the queries for the given queries Path

        Args:
            queriesPath(str): the path of the yaml file to read - resolved
                via YamlPath.getPaths for "queries.yaml"

        Returns:
            dict: the query records by name - entries of files read later
                replace entries of the same name read earlier
        '''
        queriesPaths = YamlPath.getPaths("queries.yaml", queriesPath)
        queries = {}
        for queriesPath in queriesPaths:
            with open(queriesPath, 'r') as stream:
                lqueries = yaml.safe_load(stream)
            # an empty yaml document is loaded as None
            if not lqueries:
                continue
            for key in lqueries:
                queries[key] = lqueries[key]
        return queries
class EndpointManager(object):
    """
    manages a set of SPARQL endpoints
    """

    @staticmethod
    def getEndpoints(endpointPath=None):
        '''
        get the endpoints for the given endpoint Path

        Args:
            endpointPath(str): the path of the yaml file to read - resolved
                via YamlPath.getPaths for "endpoints.yaml"

        Returns:
            dict: the Endpoints by name - entries of files read later
                replace entries of the same name read earlier
        '''
        endpointPaths = YamlPath.getPaths("endpoints.yaml", endpointPath)
        endpoints = {}
        for lEndpointPath in endpointPaths:
            with open(lEndpointPath, 'r') as stream:
                endpointRecords = yaml.safe_load(stream)
            # an empty yaml document is loaded as None
            if not endpointRecords:
                continue
            for name, record in endpointRecords.items():
                endpoint = Endpoint()
                # the key of the record is used as the name of the endpoint
                endpoint.fromDict({"name": name, **record})
                endpoints[name] = endpoint
        return endpoints

    @staticmethod
    def getEndpointNames(endpointPath=None) -> list:
        """
        Returns a list of all available endpoint names

        Args:
            endpointPath(str): the path of the yaml file to read - passed on to getEndpoints
        """
        endpoints = EndpointManager.getEndpoints(endpointPath)
        return list(endpoints.keys())
class Endpoint(JSONAble):
    """
    a query endpoint
    """

    @staticmethod
    def getSamples():
        '''
        Returns:
            list: a list with a single sample record describing the wikidata endpoint
        '''
        wikidataSample = {
            "name": "wikidata",
            "lang": "sparql",
            "endpoint": "https://query.wikidata.org/sparql",
            "method": "POST",
            "prefixes": "PREFIX bd: <http://www.bigdata.com/rdf#>\nPREFIX cc: <http://creativecommons.org/ns#>"
        }
        return [wikidataSample]

    def __init__(self):
        '''
        constructor for setting defaults
        '''
        self.method = "POST"
        self.lang = "SPARQL"

    def __str__(self):
        '''
        Returns:
            str: a string representation of this Endpoint
        '''
        # name and endpoint are not set by the constructor - they have to be
        # assigned before this representation can be created
        # NOTE(review): the name is shown twice - confirm whether that is intended
        return f"{self.name}({self.name}):{self.endpoint}({self.method})"