Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pdfkit/pdfkit.py : 17%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2import re
3import subprocess
4import sys
5from .source import Source
6from .configuration import Configuration
7import io
8import codecs
try:
    # Python 2.x and 3.x support for checking string types.
    # A bare name lookup is used instead of ``assert basestring``: asserts
    # are stripped under ``python -O``, which would skip the NameError and
    # leave ``basestring`` undefined on Python 3.
    basestring
except NameError:
    basestring = str
class PDFKit(object):
    """
    Main class that does all generation routine.

    :param url_or_file: str - either a URL, a path to a file or a string containing HTML
                       to convert
    :param type_: str - either 'url', 'file' or 'string'
    :param options: dict (optional) with wkhtmltopdf options, with or w/o '--'
    :param toc: dict (optional) - toc-specific wkhtmltopdf options, with or w/o '--'
    :param cover: str (optional) - url/filename with a cover html page
    :param css: str or list of str (optional) - path(s) of stylesheet file(s)
                whose contents are inlined into the HTML source
    :param configuration: (optional) instance of pdfkit.configuration.Configuration()
    :param cover_first: bool (optional) - when true the cover is placed before
                        the toc on the command line, otherwise after it
    """

    class ImproperSourceError(Exception):
        """Wrong source type for stylesheets"""

        def __init__(self, msg):
            self.msg = msg

        def __str__(self):
            return self.msg

    def __init__(self, url_or_file, type_, options=None, toc=None, cover=None,
                 css=None, configuration=None, cover_first=False):

        self.source = Source(url_or_file, type_)
        self.configuration = (Configuration() if configuration is None
                              else configuration)
        # The configured binary path may be bytes or str; normalise to text.
        try:
            self.wkhtmltopdf = self.configuration.wkhtmltopdf.decode('utf-8')
        except AttributeError:
            self.wkhtmltopdf = self.configuration.wkhtmltopdf

        # Options found in the HTML meta tags come first so that explicitly
        # passed options override them.
        self.options = dict()
        if self.source.isString():
            self.options.update(self._find_options_in_meta(url_or_file))

        if options is not None:
            self.options.update(options)

        self.toc = {} if toc is None else toc
        self.cover = cover
        self.cover_first = cover_first
        self.css = css
        self.stylesheets = []

    def _genargs(self, opts):
        """
        Generator of args parts based on options specification.

        Note: Empty parts will be filtered out at _command generator

        :param opts: dict {option name: value}
        :raises AssertionError: when a list/tuple option value is not a pair
                                of two non-empty items
        """
        for optkey, optval in self._normalize_options(opts):
            yield optkey

            if isinstance(optval, (list, tuple)):
                # Raised explicitly (instead of using ``assert``) so that the
                # validation still runs under ``python -O``.
                if not (len(optval) == 2 and optval[0] and optval[1]):
                    raise AssertionError(
                        'Option value can only be either a string or a (tuple, list) of 2 items')
                yield optval[0]
                yield optval[1]
            else:
                yield optval

    def _command(self, path=None):
        """
        Generator of all command parts

        :param path: output file path; when it evaluates to False the PDF is
                     sent to stdout ('-')

        NOTE: when ``self.css`` is set this calls ``_prepend_css``, which
        modifies ``self.source``.
        """
        if self.css:
            self._prepend_css(self.css)

        yield self.wkhtmltopdf

        for argpart in self._genargs(self.options):
            if argpart:
                yield argpart

        if self.cover and self.cover_first:
            yield 'cover'
            yield self.cover

        if self.toc:
            yield 'toc'
            for argpart in self._genargs(self.toc):
                if argpart:
                    yield argpart

        if self.cover and not self.cover_first:
            yield 'cover'
            yield self.cover

        # If the source is a string then we will pipe it into wkhtmltopdf
        # If the source is file-like then we will read from it and pipe it in
        if self.source.isString() or self.source.isFileObj():
            yield '-'
        else:
            if isinstance(self.source.source, basestring):
                yield self.source.to_s()
            else:
                for s in self.source.source:
                    yield s

        # If output_path evaluates to False append '-' to end of args
        # and wkhtmltopdf will pass generated PDF to stdout
        if path:
            yield path
        else:
            yield '-'

    def command(self, path=None):
        """Return the full wkhtmltopdf command line as a list of parts."""
        return list(self._command(path))

    def to_pdf(self, path=None):
        """
        Run wkhtmltopdf and produce the PDF.

        :param path: output file path; when it evaluates to False the PDF
                     data is returned instead of being written to a file
        :returns: the captured stdout of wkhtmltopdf when no path is given,
                  otherwise True
        :raises IOError: when wkhtmltopdf reports an error, exits with a
                         non-zero code, or the output file is missing/empty
        """
        args = self.command(path)

        result = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)

        # If the source is a string then we will pipe it into wkhtmltopdf.
        # If we want to add custom CSS to file then we read input file to
        # string and prepend css to it and then pass it to stdin.
        # This is a workaround for a bug in wkhtmltopdf (look closely in README)
        if self.source.isString() or (self.source.isFile() and self.css):
            stdin_data = self.source.to_s().encode('utf-8')
        elif self.source.isFileObj():
            stdin_data = self.source.source.read()
            # A file object opened in binary mode already yields bytes;
            # only text needs encoding.
            if not isinstance(stdin_data, bytes):
                stdin_data = stdin_data.encode('utf-8')
        else:
            stdin_data = None
        stdout, stderr = result.communicate(input=stdin_data)
        stderr = stderr or stdout
        try:
            stderr = stderr.decode('utf-8')
        except UnicodeDecodeError:
            stderr = ''
        exit_code = result.returncode

        if 'cannot connect to X server' in stderr:
            raise IOError('%s\n'
                          'You will need to run wkhtmltopdf within a "virtual" X server.\n'
                          'Go to the link below for more information\n'
                          'https://github.com/JazzCore/python-pdfkit/wiki/Using-wkhtmltopdf-without-X-server' % stderr)

        if 'Error' in stderr:
            raise IOError('wkhtmltopdf reported an error:\n' + stderr)

        if exit_code != 0:
            raise IOError("wkhtmltopdf exited with non-zero code {0}. error:\n{1}".format(exit_code, stderr))

        # Since wkhtmltopdf sends its output to stderr we will capture it
        # and properly send to stdout
        if '--quiet' not in args:
            sys.stdout.write(stderr)

        if not path:
            return stdout

        try:
            # Open in binary mode: a PDF is not text, and only the presence
            # of the first bytes (the '%PDF' signature) is checked.
            with open(path, 'rb') as f:
                signature = f.read(4)
        except IOError as e:
            raise IOError('Command failed: %s\n'
                          'Check wkhtmltopdf output without \'quiet\' option\n'
                          '%s ' % (' '.join(args), e))

        # Raised outside the ``try`` so this error is not caught and
        # re-wrapped by the handler above.
        if not signature:
            raise IOError('Command failed: %s\n'
                          'Check wkhtmltopdf output without \'quiet\' '
                          'option' % ' '.join(args))
        return True

    def _normalize_options(self, options):
        """ Generator of 2-tuples (option-key, option-value).
        When options spec is a list, generate a 2-tuples per list item.

        :param options: dict {option name: value}

        returns:
          iterator (option-key, option-value)
          - option names lower cased and prepended with
          '--' if necessary. Non-empty values cast to str; numeric zero is
          also cast to str so it is not mistaken for an empty value.
        """

        for key, value in list(options.items()):
            if '--' not in key:
                normalized_key = '--%s' % self._normalize_arg(key)
            else:
                normalized_key = self._normalize_arg(key)

            if isinstance(value, (list, tuple)):
                for optval in value:
                    yield (normalized_key, optval)
            else:
                # 0 / 0.0 are falsy but are real option values; without this
                # they would be dropped by the empty-part filter in _command.
                is_number = (isinstance(value, (int, float))
                             and not isinstance(value, bool))
                if value or is_number:
                    yield (normalized_key, str(value))
                else:
                    yield (normalized_key, value)

    def _normalize_arg(self, arg):
        """Return the option name lower cased."""
        return arg.lower()

    def _style_tag_for(self, stylesheet):
        """Wrap stylesheet text in a <style> tag."""
        return "<style>%s</style>" % stylesheet

    def _prepend_css(self, path):
        """
        Inline the contents of the given stylesheet file(s) into the source.

        :param path: a stylesheet path or a list of stylesheet paths
        :raises ImproperSourceError: when the source is a URL or a list

        A file source is read and replaced by a string source with the style
        tag inserted before '</head>'. A string source gets the style tag
        inserted before '</head>' if present, otherwise prepended.
        """
        if self.source.isUrl() or isinstance(self.source.source, list):
            raise self.ImproperSourceError('CSS files can be added only to a single '
                                           'file or string')

        if not isinstance(path, list):
            path = [path]

        css_data = []
        for p in path:
            with codecs.open(p, encoding="UTF-8") as f:
                css_data.append(f.read())
        css_data = "\n".join(css_data)

        if self.source.isFile():
            with codecs.open(self.source.to_s(), encoding="UTF-8") as f:
                inp = f.read()
            self.source = Source(
                inp.replace('</head>', self._style_tag_for(css_data) + '</head>'),
                'string')

        elif self.source.isString():
            if '</head>' in self.source.to_s():
                self.source.source = self.source.to_s().replace(
                    '</head>', self._style_tag_for(css_data) + '</head>')
            else:
                self.source.source = self._style_tag_for(css_data) + self.source.to_s()

    def _find_options_in_meta(self, content):
        """Reads 'content' and extracts options encoded in HTML meta tags

        :param content: str or file-like object - contains HTML to parse

        returns:
          dict: {config option: value}

        Meta tags whose name does not start with the configured prefix, or
        that carry no content attribute, are skipped.
        """
        if (isinstance(content, io.IOBase)
                or content.__class__.__name__ == 'StreamReaderWriter'):
            content = content.read()

        # Escape the prefix so it is always matched literally.
        prefix = re.escape(self.configuration.meta_tag_prefix)
        name_re = re.compile('name=["\']%s([^"\']*)' % prefix)
        content_re = re.compile('content=["\']([^"\']*)')

        found = {}

        for tag in re.findall('<meta [^>]*>', content):
            name_match = name_re.search(tag)
            if name_match is None:
                continue
            content_match = content_re.search(tag)
            if content_match is None:
                continue
            found[name_match.group(1)] = content_match.group(1)

        return found