Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/cardinal_pythonlib/pdf.py : 25%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#!/usr/bin/env python
2# cardinal_pythonlib/pdf.py
4"""
5===============================================================================
7 Original code copyright (C) 2009-2021 Rudolf Cardinal (rudolf@pobox.com).
9 This file is part of cardinal_pythonlib.
11 Licensed under the Apache License, Version 2.0 (the "License");
12 you may not use this file except in compliance with the License.
13 You may obtain a copy of the License at
15 https://www.apache.org/licenses/LICENSE-2.0
17 Unless required by applicable law or agreed to in writing, software
18 distributed under the License is distributed on an "AS IS" BASIS,
19 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 See the License for the specific language governing permissions and
21 limitations under the License.
23===============================================================================
25**Support functions to generate (and serve) PDFs.**
27"""
29import getpass
30import io
31import logging
32import os
33from pprint import pformat
34import shutil
35import sys
36import tempfile
37from typing import Any, Dict, Iterable, Union
39from cardinal_pythonlib.logs import get_brace_style_log_with_null_handler
40# noinspection PyProtectedMember
41from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter
42from semantic_version import Version
44# =============================================================================
45# Conditional/optional imports
46# =============================================================================
log = get_brace_style_log_with_null_handler(__name__)

# Each engine name stays bound to None unless its import succeeds below.
pdfkit = None
xhtml2pdf = None
weasyprint = None

# Preference 1
try:
    log.debug("trying pdfkit...")
    # noinspection PyPackageRequirements
    import pdfkit  # sudo apt-get install wkhtmltopdf; sudo pip install pdfkit
except ImportError:
    pdfkit = None
    log.debug("pdfkit: failed to load")
else:
    log.debug("pdfkit: loaded")

if pdfkit is None:
    # The fallback engines are only tried when pdfkit could not be imported.
    try:
        # noinspection PyPackageRequirements
        import xhtml2pdf  # pip install xhtml2pdf
        # noinspection PyPackageRequirements
        import xhtml2pdf.document  # pip install xhtml2pdf
    except ImportError:
        xhtml2pdf = None
        log.debug("xhtml2pdf: failed to load")
    else:
        log.debug("xhtml2pdf: loaded")

    try:
        log.debug("trying weasyprint...")
        # noinspection PyPackageRequirements
        import weasyprint
    except ImportError:
        weasyprint = None
        log.debug("weasyprint: failed to load")
    else:
        log.debug("weasyprint: loaded")
else:
    log.debug("pdfkit found, so skipping other PDF rendering engines")

# =============================================================================
# Onwards
# =============================================================================

# Importing this module fails outright if no rendering engine could be loaded.
if pdfkit is None and weasyprint is None and xhtml2pdf is None:
    raise RuntimeError("No PDF engine (xhtml2pdf, weasyprint, pdfkit) "
                       "available; can't load")
class Processors:
    """
    Names of the PDF rendering engines known to this module.

    The values are plain strings; functions in this module that take a
    ``processor`` argument expect one of them.
    """
    PDFKIT = "pdfkit"
    WEASYPRINT = "weasyprint"
    XHTML2PDF = "xhtml2pdf"
# Location of the wkhtmltopdf executable as reported by shutil.which()
# (None if it is not found).
_WKHTMLTOPDF_FILENAME = shutil.which("wkhtmltopdf")

# Default engine: the first one available, in order of preference.
_DEFAULT_PROCESSOR = (
    Processors.PDFKIT if pdfkit  # the best
    else Processors.WEASYPRINT if weasyprint  # imperfect tables
    else Processors.XHTML2PDF  # simple/slow
)
114# =============================================================================
115# PdfPlan
116# =============================================================================
class PdfPlan(object):
    """
    Class to describe a PDF on disk or the information required to create the
    PDF from HTML.
    """
    def __init__(self,
                 # HTML mode
                 is_html: bool = False,
                 html: str = None,
                 header_html: str = None,
                 footer_html: str = None,
                 wkhtmltopdf_filename: str = None,
                 wkhtmltopdf_options: Dict[str, Any] = None,
                 # Filename mode
                 is_filename: bool = False,
                 filename: str = None):
        """
        Args:
            is_html: use HTML mode?
            html: for HTML mode, the main HTML
            header_html: for HTML mode, an optional page header (in HTML)
            footer_html: for HTML mode, an optional page footer (in HTML)
            wkhtmltopdf_filename: filename of the ``wkhtmltopdf`` executable
            wkhtmltopdf_options: options for ``wkhtmltopdf``
            is_filename: use file mode?
            filename: for file mode, the filename of the existing PDF on disk

        Use either ``is_html`` or ``is_filename``, not both.

        Raises:
            AssertionError: if ``is_html`` and ``is_filename`` are both set or
                both unset
        """
        assert is_html != is_filename, "Specify is_html XOR is_filename"

        self.is_html = is_html
        # is_html options:
        self.html = html
        self.header_html = header_html
        self.footer_html = footer_html
        self.wkhtmltopdf_filename = wkhtmltopdf_filename
        self.wkhtmltopdf_options = wkhtmltopdf_options

        self.is_filename = is_filename
        # is_filename options
        self.filename = filename

    def add_to_writer(self,
                      writer: PdfFileWriter,
                      start_recto: bool = True) -> None:
        """
        Add the PDF described by this class to a PDF writer.

        In HTML mode the PDF is rendered via :func:`get_pdf_from_html` and
        appended from memory; in filename mode the file is read from disk.

        Args:
            writer: a :class:`PyPDF2.PdfFileWriter`
            start_recto: start a new right-hand page?

        """
        if self.is_html:
            pdf = get_pdf_from_html(
                html=self.html,
                header_html=self.header_html,
                footer_html=self.footer_html,
                wkhtmltopdf_filename=self.wkhtmltopdf_filename,
                wkhtmltopdf_options=self.wkhtmltopdf_options)
            append_memory_pdf_to_writer(pdf, writer, start_recto=start_recto)
        elif self.is_filename:
            # Read the file fully and close it straight away, rather than
            # leaving an open handle behind. The reader is given an in-memory
            # copy because it may still be consulted when the writer is
            # eventually written out, which happens outside this method.
            with open(self.filename, 'rb') as infile:
                pdf_stream = io.BytesIO(infile.read())
            if start_recto and writer.getNumPages() % 2 != 0:
                writer.addBlankPage()
            writer.appendPagesFromReader(PdfFileReader(pdf_stream))
        else:
            raise AssertionError("PdfPlan: shouldn't get here!")
189# =============================================================================
190# Ancillary functions for PDFs
191# =============================================================================
def assert_processor_available(processor: str) -> None:
    """
    Check that the named PDF processor is valid and was successfully imported.

    Args:
        processor: a PDF processor type from :class:`Processors`

    Raises:
        AssertionError: if bad ``processor``
        RuntimeError: if requested processor is unavailable
    """
    known_processors = (Processors.XHTML2PDF,
                        Processors.WEASYPRINT,
                        Processors.PDFKIT)
    if processor not in known_processors:
        raise AssertionError("rnc_pdf.set_pdf_processor: invalid PDF processor"
                             " specified")

    # (processor name, imported module or None, complaint if it is missing)
    requirements = (
        (Processors.WEASYPRINT, weasyprint,
         "rnc_pdf: Weasyprint requested, but not available"),
        (Processors.XHTML2PDF, xhtml2pdf,
         "rnc_pdf: xhtml2pdf requested, but not available"),
        (Processors.PDFKIT, pdfkit,
         "rnc_pdf: pdfkit requested, but not available"),
    )
    for name, module, complaint in requirements:
        if processor == name and not module:
            raise RuntimeError(complaint)
def get_default_fix_pdfkit_encoding_bug() -> bool:
    """
    Decide whether the ``pdfkit`` encoding workaround should be on by default.

    Returns:
        ``True`` only if ``pdfkit`` was imported and its version is exactly
        0.5.0 (the specific buggy version); ``False`` otherwise.

    """
    # Auto-determine.
    if pdfkit is None:
        return False
    # noinspection PyUnresolvedReferences
    installed_version = Version(pdfkit.__version__)
    buggy_version = Version("0.5.0")
    return bool(installed_version == buggy_version)
def _write_temp_html(html: str, encoding: str) -> str:
    """
    Write HTML to a new temporary ``.html`` file and return its filename.

    The file descriptor is always closed; if writing fails, the temporary file
    is removed before the exception propagates. On success, the caller is
    responsible for deleting the file.
    """
    fd, filename = tempfile.mkstemp(suffix='.html')
    try:
        with os.fdopen(fd, 'wb') as f:
            f.write(html.encode(encoding))
    except Exception:
        os.remove(filename)
        raise
    return filename


def make_pdf_from_html(
        # Mandatory parameters:
        on_disk: bool,
        html: str,
        # Disk options:
        output_path: str = None,
        # Shared options:
        header_html: str = None,
        footer_html: str = None,
        wkhtmltopdf_filename: str = _WKHTMLTOPDF_FILENAME,
        wkhtmltopdf_options: Dict[str, Any] = None,
        file_encoding: str = "utf-8",
        debug_options: bool = False,
        debug_content: bool = False,
        debug_wkhtmltopdf_args: bool = True,
        fix_pdfkit_encoding_bug: bool = None,
        processor: str = _DEFAULT_PROCESSOR) -> Union[bytes, bool]:
    """
    Takes HTML and either returns a PDF in memory or makes one on disk.

    For preference, uses ``wkhtmltopdf`` (with ``pdfkit``):

    - faster than ``xhtml2pdf``
    - tables not buggy like ``Weasyprint``
    - however, doesn't support CSS Paged Media, so we have the
      ``header_html`` and ``footer_html`` options to allow you to pass
      appropriate HTML content to serve as the header/footer (rather than
      passing it within the main HTML).

    Args:
        on_disk: make file on disk (rather than returning it in memory)?

        html: main HTML

        output_path: if ``on_disk``, the output filename

        header_html: optional page header, as HTML

        footer_html: optional page footer, as HTML

        wkhtmltopdf_filename: filename of the ``wkhtmltopdf`` executable

        wkhtmltopdf_options: options for ``wkhtmltopdf``; the dictionary
            passed in is copied, not modified

        file_encoding: encoding to use when writing the header/footer to disk

        debug_options: log ``wkhtmltopdf`` config/options passed to ``pdfkit``?

        debug_content: log the main/header/footer HTML?

        debug_wkhtmltopdf_args: log the final command-line arguments that
            will be used by ``pdfkit`` when it calls ``wkhtmltopdf``?

        fix_pdfkit_encoding_bug: attempt to work around bug in e.g.
            ``pdfkit==0.5.0`` by encoding ``wkhtmltopdf_filename`` to UTF-8
            before passing it to ``pdfkit``? If you pass ``None`` here, then
            a default value is used, from
            :func:`get_default_fix_pdfkit_encoding_bug`.

        processor: a PDF processor type from :class:`Processors`

    Returns:
        if ``on_disk`` is false, the PDF binary as a ``bytes`` object;
        if ``on_disk`` is true, ``True`` for the ``xhtml2pdf`` and
        ``weasyprint`` engines, or whatever ``pdfkit``'s ``to_pdf()`` returns
        for the ``pdfkit`` engine

    Raises:
        AssertionError: if bad ``processor``
        RuntimeError: if requested processor is unavailable

    """
    # Work on a private copy: the header/footer temp-file entries added below
    # must not leak back into (and go stale in) a dictionary the caller reuses.
    wkhtmltopdf_options = dict(wkhtmltopdf_options or {})  # type: Dict[str, Any]  # noqa
    assert_processor_available(processor)

    if debug_content:
        log.debug("html: {}", html)
        log.debug("header_html: {}", header_html)
        log.debug("footer_html: {}", footer_html)

    if fix_pdfkit_encoding_bug is None:
        fix_pdfkit_encoding_bug = get_default_fix_pdfkit_encoding_bug()

    if processor == Processors.XHTML2PDF:

        if on_disk:
            with open(output_path, mode='wb') as outfile:
                # noinspection PyUnresolvedReferences
                xhtml2pdf.document.pisaDocument(html, outfile)
            return True
        else:
            memfile = io.BytesIO()
            # noinspection PyUnresolvedReferences
            xhtml2pdf.document.pisaDocument(html, memfile)
            # ... returns a document, but we don't use it, so we don't store it
            # to stop pychecker complaining
            # http://xhtml2pdf.appspot.com/static/pisa-en.html
            memfile.seek(0)
            return memfile.read()
            # https://stackoverflow.com/questions/3310584

    elif processor == Processors.WEASYPRINT:

        if on_disk:
            # Report success explicitly, as the xhtml2pdf branch does, rather
            # than passing on whatever write_pdf() returns when given a target.
            # noinspection PyUnresolvedReferences
            weasyprint.HTML(string=html).write_pdf(output_path)
            return True
        else:
            # http://ampad.de/blog/generating-pdfs-django/
            # noinspection PyUnresolvedReferences
            return weasyprint.HTML(string=html).write_pdf()

    elif processor == Processors.PDFKIT:

        # Config:
        if not wkhtmltopdf_filename:
            config = None
        else:
            if fix_pdfkit_encoding_bug:  # needs to be True for pdfkit==0.5.0
                log.debug("Attempting to fix bug in pdfkit (e.g. version 0.5.0)"
                          " by encoding wkhtmltopdf_filename to UTF-8")
                # noinspection PyUnresolvedReferences
                config = pdfkit.configuration(
                    wkhtmltopdf=wkhtmltopdf_filename.encode('utf-8'))
                # the bug is that pdfkit.pdfkit.PDFKit.__init__ will attempt to
                # decode the string in its configuration object;
                # https://github.com/JazzCore/python-pdfkit/issues/32
            else:
                # noinspection PyUnresolvedReferences
                config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_filename)

        # Temporary files that a subprocess can read:
        #   https://stackoverflow.com/questions/15169101
        # wkhtmltopdf requires its HTML files to have ".html" extensions:
        #   https://stackoverflow.com/questions/5776125
        h_filename = None
        f_filename = None
        try:
            if header_html:
                h_filename = _write_temp_html(header_html, file_encoding)
                wkhtmltopdf_options["header-html"] = h_filename
            if footer_html:
                f_filename = _write_temp_html(footer_html, file_encoding)
                wkhtmltopdf_options["footer-html"] = f_filename
            if debug_options:
                log.debug("wkhtmltopdf config: {!r}", config)
                log.debug("wkhtmltopdf_options: {}",
                          pformat(wkhtmltopdf_options))
            # noinspection PyUnresolvedReferences
            kit = pdfkit.pdfkit.PDFKit(html, 'string', configuration=config,
                                       options=wkhtmltopdf_options)

            # With "path=None", the to_pdf() function directly returns
            # stdout from a subprocess.Popen().communicate() call (see
            # pdfkit.py). Since universal_newlines is not set, stdout will
            # be bytes in Python 3.
            path = output_path if on_disk else None

            if debug_wkhtmltopdf_args:
                log.debug("Probable current user: {!r}", getpass.getuser())
                log.debug("wkhtmltopdf arguments will be: {!r}",
                          kit.command(path=path))

            return kit.to_pdf(path=path)

        finally:
            # The header/footer files are only needed while wkhtmltopdf runs.
            if h_filename:
                os.remove(h_filename)
            if f_filename:
                os.remove(f_filename)

    else:
        raise AssertionError("Unknown PDF engine")
def get_pdf_from_html(html: str,
                      header_html: str = None,
                      footer_html: str = None,
                      wkhtmltopdf_filename: str = _WKHTMLTOPDF_FILENAME,
                      wkhtmltopdf_options: Dict[str, Any] = None,
                      file_encoding: str = "utf-8",
                      debug_options: bool = False,
                      debug_content: bool = False,
                      debug_wkhtmltopdf_args: bool = True,
                      fix_pdfkit_encoding_bug: bool = None,
                      processor: str = _DEFAULT_PROCESSOR) -> bytes:
    """
    Converts HTML to a PDF held in memory.

    Thin wrapper around :func:`make_pdf_from_html` with ``on_disk=False``;
    all other arguments are passed through unchanged, so see that function
    for their meaning.

    Returns:
        the PDF binary as a ``bytes`` object
    """
    # In-memory mode, so the result is the PDF data itself.
    return make_pdf_from_html(
        html=html,
        on_disk=False,
        processor=processor,
        header_html=header_html,
        footer_html=footer_html,
        wkhtmltopdf_filename=wkhtmltopdf_filename,
        wkhtmltopdf_options=wkhtmltopdf_options,
        file_encoding=file_encoding,
        fix_pdfkit_encoding_bug=fix_pdfkit_encoding_bug,
        debug_options=debug_options,
        debug_content=debug_content,
        debug_wkhtmltopdf_args=debug_wkhtmltopdf_args,
    )
def pdf_from_html(html: str,
                  header_html: str = None,
                  footer_html: str = None,
                  wkhtmltopdf_filename: str = _WKHTMLTOPDF_FILENAME,
                  wkhtmltopdf_options: Dict[str, Any] = None,
                  file_encoding: str = "utf-8",
                  debug_options: bool = False,
                  debug_content: bool = False,
                  fix_pdfkit_encoding_bug: bool = True,
                  processor: str = _DEFAULT_PROCESSOR) -> bytes:
    """
    Legacy alias for :func:`get_pdf_from_html` (q.v.).

    Differences from the newer function: ``fix_pdfkit_encoding_bug`` defaults
    to ``True`` here (rather than being auto-detected), and there is no
    ``debug_wkhtmltopdf_args`` parameter, so the newer function's default for
    it applies.
    """
    pdf = get_pdf_from_html(
        html=html,
        header_html=header_html,
        footer_html=footer_html,
        wkhtmltopdf_filename=wkhtmltopdf_filename,
        wkhtmltopdf_options=wkhtmltopdf_options,
        file_encoding=file_encoding,
        debug_options=debug_options,
        debug_content=debug_content,
        fix_pdfkit_encoding_bug=fix_pdfkit_encoding_bug,
        processor=processor,
    )
    return pdf
def make_pdf_on_disk_from_html(
        html: str,
        output_path: str,
        header_html: str = None,
        footer_html: str = None,
        wkhtmltopdf_filename: str = _WKHTMLTOPDF_FILENAME,
        wkhtmltopdf_options: Dict[str, Any] = None,
        file_encoding: str = "utf-8",
        debug_options: bool = False,
        debug_content: bool = False,
        debug_wkhtmltopdf_args: bool = True,
        fix_pdfkit_encoding_bug: bool = None,
        processor: str = _DEFAULT_PROCESSOR) -> bool:
    """
    Converts HTML to a PDF and saves it as ``output_path``.

    Thin wrapper around :func:`make_pdf_from_html` with ``on_disk=True``;
    all other arguments are passed through unchanged, so see that function
    for their meaning.

    Returns:
        success?
    """
    # On-disk mode, so the result is a success indicator, not PDF data.
    return make_pdf_from_html(
        html=html,
        on_disk=True,
        output_path=output_path,
        processor=processor,
        header_html=header_html,
        footer_html=footer_html,
        wkhtmltopdf_filename=wkhtmltopdf_filename,
        wkhtmltopdf_options=wkhtmltopdf_options,
        file_encoding=file_encoding,
        fix_pdfkit_encoding_bug=fix_pdfkit_encoding_bug,
        debug_options=debug_options,
        debug_content=debug_content,
        debug_wkhtmltopdf_args=debug_wkhtmltopdf_args,
    )
def pdf_from_writer(writer: Union[PdfFileWriter, PdfFileMerger]) -> bytes:
    """
    Returns, as binary data, the PDF that a PyPDF2 writer or merger object
    produces when asked to write itself out.
    """
    buffer = io.BytesIO()
    writer.write(buffer)
    # getvalue() yields the whole buffer whatever the current position is.
    return buffer.getvalue()
def serve_pdf_to_stdout(pdf: bytes) -> None:
    """
    Serves a PDF to ``stdout`` (for web servers).

    Writes a ``Content-Type: application/pdf`` header and then the PDF to
    ``stdout``.

    Args:
        pdf: the PDF, as ``bytes``

    See:

    - https://stackoverflow.com/questions/312230/proper-mime-type-for-pdf-files
    - http://www.askapache.com/htaccess/pdf-cookies-headers-rewrites.html
    - https://stackoverflow.com/questions/2374427

    """
    # print("Content-type: text/plain\n")  # for debugging
    # The header goes through the text layer of stdout but the PDF goes
    # straight to the underlying binary buffer. Flush the text layer first;
    # otherwise, when stdout is not line-buffered (e.g. a pipe), the header
    # can still be pending when the PDF bytes are written, and so would be
    # emitted after them.
    print("Content-Type: application/pdf\n", flush=True)
    # https://stackoverflow.com/questions/908331/how-to-write-binary-data-to-stdout-in-python-3  # noqa
    sys.stdout.buffer.write(pdf)
    sys.stdout.buffer.flush()
def make_pdf_writer() -> PdfFileWriter:
    """
    Returns a newly constructed PyPDF2 writer object.
    """
    new_writer = PdfFileWriter()
    return new_writer
def append_memory_pdf_to_writer(input_pdf: bytes,
                                writer: PdfFileWriter,
                                start_recto: bool = True) -> None:
    """
    Adds every page of an in-memory PDF to a PyPDF2 writer.

    Does nothing if ``input_pdf`` is empty or ``None``.

    Args:
        input_pdf: the PDF, as ``bytes``
        writer: the writer
        start_recto: start a new right-hand page?
    """
    if not input_pdf:
        return
    # An odd page count so far means the next page would be a left-hand one;
    # pad with a blank so the result suits double-sided printing.
    needs_blank_page = start_recto and writer.getNumPages() % 2 != 0
    if needs_blank_page:
        writer.addBlankPage()
    reader = PdfFileReader(io.BytesIO(input_pdf))
    # map() is lazy, so each page is fetched and then added in turn.
    for page in map(reader.getPage, range(reader.numPages)):
        writer.addPage(page)
def append_pdf(input_pdf: bytes, output_writer: PdfFileWriter):
    """
    Legacy interface: appends an in-memory PDF to a pyPDF writer by calling
    :func:`append_memory_pdf_to_writer` with its default ``start_recto``.
    """
    append_memory_pdf_to_writer(input_pdf, output_writer)
578# =============================================================================
579# Serve concatenated PDFs
580# =============================================================================
581# Two ways in principle to do this:
582# (1) Load data from each PDF into memory; concatenate; serve the result.
583# (2) With each PDF on disk, create a temporary file (e.g. with pdftk),
584# serve the result (e.g. in one go), then delete the temporary file.
585# This may be more memory-efficient.
586# However, there can be problems:
587# https://stackoverflow.com/questions/7543452/how-to-launch-a-pdftk-subprocess-while-in-wsgi # noqa
588# Others' examples:
589# https://gist.github.com/zyegfryed/918403
590# https://gist.github.com/grantmcconnaughey/ce90a689050c07c61c96
591# https://stackoverflow.com/questions/3582414/removing-tmp-file-after-return-httpresponse-in-django # noqa
593# def append_disk_pdf_to_writer(filename, writer):
594# """Appends a PDF from disk to a pyPDF writer."""
595# if writer.getNumPages() % 2 != 0:
596# writer.addBlankPage()
597# # ... keeps final result suitable for double-sided printing
598# with open(filename, mode='rb') as infile:
599# reader = PdfFileReader(infile)
600# for page_num in range(reader.numPages):
601# writer.addPage(reader.getPage(page_num))
def get_concatenated_pdf_from_disk(filenames: Iterable[str],
                                   start_recto: bool = True) -> bytes:
    """
    Concatenates PDFs from disk and returns them as an in-memory binary PDF.

    Empty/``None`` entries in ``filenames`` are skipped. All input files are
    closed before this function returns.

    Args:
        filenames: iterable of filenames of PDFs to concatenate
        start_recto: start a new right-hand page for each new PDF?

    Returns:
        concatenated PDF, as ``bytes``

    """
    # https://stackoverflow.com/questions/17104926/pypdf-merging-multiple-pdf-files-into-one-pdf  # noqa
    # https://en.wikipedia.org/wiki/Recto_and_verso
    from contextlib import ExitStack  # local import; only needed here

    # Keep every input file open until the combined PDF has been written to
    # memory, then close them all (also if anything fails part-way through).
    with ExitStack() as open_files:
        if start_recto:
            writer = PdfFileWriter()
            for filename in filenames:
                if not filename:
                    continue
                if writer.getNumPages() % 2 != 0:
                    # Pad so that the next PDF starts on a right-hand page.
                    writer.addBlankPage()
                infile = open_files.enter_context(open(filename, 'rb'))
                writer.appendPagesFromReader(PdfFileReader(infile))
            return pdf_from_writer(writer)
        else:
            merger = PdfFileMerger()
            for filename in filenames:
                if not filename:
                    continue
                infile = open_files.enter_context(open(filename, 'rb'))
                merger.append(infile)
            return pdf_from_writer(merger)
def get_concatenated_pdf_in_memory(
        pdf_plans: Iterable[PdfPlan],
        start_recto: bool = True) -> bytes:
    """
    Builds one PDF from a series of plans and returns it as binary data.

    Each plan adds itself, in order, to a single shared writer.

    Args:
        pdf_plans: iterable of :class:`PdfPlan` objects
        start_recto: start a new right-hand page for each new PDF?

    Returns:
        concatenated PDF, as ``bytes``

    """
    combined = PdfFileWriter()
    for plan in pdf_plans:
        plan.add_to_writer(combined, start_recto=start_recto)
    return pdf_from_writer(combined)
656# =============================================================================
657# Main -- to enable logging for imports, for debugging
658# =============================================================================
if __name__ == '__main__':
    # Running the file directly: set up basic logging and lower this module's
    # log threshold to DEBUG.
    logging.basicConfig()
    log.setLevel(logging.DEBUG)