Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/PyPDF2/merger.py : 12%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# vim: sw=4:expandtab:foldmethod=marker
2#
3# Copyright (c) 2006, Mathieu Fenniak
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are
8# met:
9#
10# * Redistributions of source code must retain the above copyright notice,
11# this list of conditions and the following disclaimer.
12# * Redistributions in binary form must reproduce the above copyright notice,
13# this list of conditions and the following disclaimer in the documentation
14# and/or other materials provided with the distribution.
15# * The name of the author may not be used to endorse or promote products
16# derived from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28# POSSIBILITY OF SUCH DAMAGE.
30from .generic import *
31from .utils import isString, str_
32from .pdf import PdfFileReader, PdfFileWriter
33from .pagerange import PageRange
34from sys import version_info
35if version_info < ( 3, 0 ):
36 from cStringIO import StringIO
37 StreamIO = StringIO
38else:
39 from io import BytesIO
40 from io import FileIO as file
41 StreamIO = BytesIO
class _MergedPage(object):
    """
    Internal record kept by PdfFileMerger for every page queued for merging.

    :param pagedata: the page object taken from the source document.
    :param src: the reader the page was obtained from.
    :param id: identifier assigned to the page by the merger; bookmarks and
        named destinations refer to the page through this value.
    """

    def __init__(self, pagedata, src, id):
        self.src = src
        self.pagedata = pagedata
        # Filled in by PdfFileMerger.write() once the page has been added to
        # the output document; None until then.
        self.out_pagedata = None
        self.id = id
class PdfFileMerger(object):
    """
    Initializes a PdfFileMerger object. PdfFileMerger merges multiple PDFs
    into a single PDF. It can concatenate, slice, insert, or any combination
    of the above.

    See the functions :meth:`merge()<merge>` (or :meth:`append()<append>`)
    and :meth:`write()<write>` for usage information.

    :param bool strict: Determines whether user should be warned of all
            problems and also causes some correctable problems to be fatal.
            Defaults to ``True``.
    """

    # For each group of destination fit types, the coordinate keys (in the
    # order they appear in the destination array) that follow the fit name.
    # '/Fit' and '/FitB' take no coordinates and are therefore not listed.
    _FIT_ARG_KEYS = (
        (('/FitH', '/FitBH'), ('/Top',)),
        (('/FitV', '/FitBV'), ('/Left',)),
        (('/XYZ',), ('/Left', '/Top', '/Zoom')),
        (('/FitR',), ('/Left', '/Bottom', '/Right', '/Top')),
    )

    def __init__(self, strict=True):
        self.inputs = []        # (stream, reader, opened_by_us) triples
        self.pages = []         # _MergedPage objects, in output order
        self.output = PdfFileWriter()
        self.bookmarks = []     # nested lists of bookmark objects
        self.named_dests = []
        self.id_count = 0       # next id handed out to a merged page
        self.strict = strict

    def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True):
        """
        Merges the pages from the given file into the output file at the
        specified page number.

        :param int position: The *page number* to insert this file. File will
            be inserted after the given number.

        :param fileobj: A File Object or an object that supports the standard read
            and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.

        :param str bookmark: Optionally, you may specify a bookmark to be applied at
            the beginning of the included file by supplying the text of the bookmark.

        :param pages: can be a :ref:`Page Range <page-range>` or a ``(start, stop[, step])`` tuple
            to merge only the specified range of pages from the source
            document into the output document.

        :param bool import_bookmarks: You may prevent the source document's bookmarks
            from being imported by specifying this as ``False``.

        :raises TypeError: if ``pages`` is neither ``None``, a PageRange nor
            a tuple.
        """
        # Recorded in self.inputs; True means the stream was created in this
        # method and must therefore be closed by close().
        my_file = False

        # If the fileobj parameter is a string, assume it is a path
        # and create a file object at that location. If it is a file,
        # copy the file's contents into a BytesIO (or StreamIO) stream object; if
        # it is a PdfFileReader, copy that reader's stream into a
        # BytesIO (or StreamIO) stream.
        # If fileobj is none of the above types, it is not modified
        decryption_key = None
        if isString(fileobj):
            fileobj = file(fileobj, 'rb')
            my_file = True
        elif isinstance(fileobj, file):
            fileobj.seek(0)
            filecontent = fileobj.read()
            fileobj = StreamIO(filecontent)
            my_file = True
        elif isinstance(fileobj, PdfFileReader):
            # Read the key from the reader *before* fileobj is rebound to the
            # in-memory copy; the copy never carries a _decryption_key.
            if hasattr(fileobj, '_decryption_key'):
                decryption_key = fileobj._decryption_key
            orig_tell = fileobj.stream.tell()
            fileobj.stream.seek(0)
            filecontent = StreamIO(fileobj.stream.read())
            fileobj.stream.seek(orig_tell)  # reset the stream to its original location
            fileobj = filecontent
            my_file = True

        # Create a new PdfFileReader instance using the stream
        # (either file or BytesIO or StringIO) created above
        try:
            pdfr = PdfFileReader(fileobj, strict=self.strict)
        except Exception:
            # The stream is not yet tracked in self.inputs, so close() would
            # never see it; release it here if we created it.
            if my_file:
                fileobj.close()
            raise
        if decryption_key is not None:
            pdfr._decryption_key = decryption_key

        # Find the range of pages to merge.
        if pages is None:
            pages = (0, pdfr.getNumPages())
        elif isinstance(pages, PageRange):
            pages = pages.indices(pdfr.getNumPages())
        elif not isinstance(pages, tuple):
            raise TypeError('"pages" must be a tuple of (start, stop[, step])')

        srcpages = []
        if bookmark:
            # The bookmark points at the id the first merged page will get.
            bookmark = Bookmark(TextStringObject(bookmark), NumberObject(self.id_count), NameObject('/Fit'))

        outline = []
        if import_bookmarks:
            outline = pdfr.getOutlines()
            outline = self._trim_outline(pdfr, outline, pages)

        if bookmark:
            # The imported outline becomes the child list of the new bookmark.
            self.bookmarks += [bookmark, outline]
        else:
            self.bookmarks += outline

        dests = pdfr.namedDestinations
        dests = self._trim_dests(pdfr, dests, pages)
        self.named_dests += dests

        # Gather all the pages that are going to be merged
        for i in range(*pages):
            page_id = self.id_count
            self.id_count += 1
            srcpages.append(_MergedPage(pdfr.getPage(i), pdfr, page_id))

        self._associate_dests_to_pages(srcpages)
        self._associate_bookmarks_to_pages(srcpages)

        # Slice to insert the pages at the specified position
        self.pages[position:position] = srcpages

        # Keep track of our input files so we can close them later
        self.inputs.append((fileobj, pdfr, my_file))

    def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True):
        """
        Identical to the :meth:`merge()<merge>` method, but assumes you want to concatenate
        all pages onto the end of the file instead of specifying a position.

        :param fileobj: A File Object or an object that supports the standard read
            and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.

        :param str bookmark: Optionally, you may specify a bookmark to be applied at
            the beginning of the included file by supplying the text of the bookmark.

        :param pages: can be a :ref:`Page Range <page-range>` or a ``(start, stop[, step])`` tuple
            to merge only the specified range of pages from the source
            document into the output document.

        :param bool import_bookmarks: You may prevent the source document's bookmarks
            from being imported by specifying this as ``False``.
        """
        self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks)

    def write(self, fileobj):
        """
        Writes all data that has been merged to the given output file.

        :param fileobj: Output file. Can be a filename or any kind of
            file-like object.
        """
        my_file = False
        if isString(fileobj):
            fileobj = file(fileobj, 'wb')
            my_file = True

        try:
            # Add pages to the PdfFileWriter and remember, for each page, a
            # reference to the page object that was just appended to the
            # writer's page tree.
            for page in self.pages:
                self.output.addPage(page.pagedata)
                page.out_pagedata = self.output.getReference(self.output._pages.getObject()["/Kids"][-1].getObject())

            # Once all pages are added, create bookmarks to point at those pages
            self._write_dests()
            self._write_bookmarks()

            # Write the output to the file
            self.output.write(fileobj)
        finally:
            # Only close what this method opened, and do so even on failure.
            if my_file:
                fileobj.close()

    def close(self):
        """
        Shuts all file descriptors (input and output) and clears all memory
        usage.
        """
        self.pages = []
        for fo, pdfr, mine in self.inputs:
            # Streams supplied by the caller are left open.
            if mine:
                fo.close()

        self.inputs = []
        self.output = None

    def addMetadata(self, infos):
        """
        Add custom metadata to the output.

        :param dict infos: a Python dictionary where each key is a field
            and each value is your new metadata.
            Example: ``{u'/Title': u'My title'}``
        """
        self.output.addMetadata(infos)

    def setPageLayout(self, layout):
        """
        Set the page layout

        :param str layout: The page layout to be used

        Valid layouts are:
             /NoLayout        Layout explicitly not specified
             /SinglePage      Show one page at a time
             /OneColumn       Show one column at a time
             /TwoColumnLeft   Show pages in two columns, odd-numbered pages on the left
             /TwoColumnRight  Show pages in two columns, odd-numbered pages on the right
             /TwoPageLeft     Show two pages at a time, odd-numbered pages on the left
             /TwoPageRight    Show two pages at a time, odd-numbered pages on the right
        """
        self.output.setPageLayout(layout)

    def setPageMode(self, mode):
        """
        Set the page mode.

        :param str mode: The page mode to use.

        Valid modes are:
            /UseNone         Do not show outlines or thumbnails panels
            /UseOutlines     Show outlines (aka bookmarks) panel
            /UseThumbs       Show page thumbnails panel
            /FullScreen      Fullscreen view
            /UseOC           Show Optional Content Group (OCG) panel
            /UseAttachments  Show attachments panel
        """
        self.output.setPageMode(mode)

    @staticmethod
    def _points_into_range(pdf, item, pages):
        """
        Return True if ``item['/Page']`` is one of the pages of ``pdf``
        selected by the ``(start, stop[, step])`` tuple ``pages``.
        """
        for j in range(*pages):
            if pdf.getPage(j).getObject() == item['/Page'].getObject():
                return True
        return False

    def _trim_dests(self, pdf, dests, pages):
        """
        Removes any named destinations that are not a part of the specified
        page set.

        Kept destinations have their ``/Page`` entry replaced by the resolved
        page object. Returns the kept destinations as a list.
        """
        new_dests = []
        for k, o in list(dests.items()):
            if self._points_into_range(pdf, o, pages):
                o[NameObject('/Page')] = o['/Page'].getObject()
                # Internal consistency check: the mapping key is expected to
                # be the destination's own title.
                assert str_(k) == str_(o['/Title'])
                new_dests.append(o)
        return new_dests

    def _trim_outline(self, pdf, outline, pages):
        """
        Removes any outline/bookmark entries that are not a part of the
        specified page set.

        Nested lists are trimmed recursively. When a nested list keeps at
        least one entry but the entry preceding it was dropped, that
        preceding entry is re-added so the kept children still have a header.
        """
        new_outline = []
        prev_header_added = True
        for i, o in enumerate(outline):
            if isinstance(o, list):
                sub = self._trim_outline(pdf, o, pages)
                if sub:
                    if not prev_header_added:
                        new_outline.append(outline[i - 1])
                    new_outline.append(sub)
            else:
                prev_header_added = False
                if self._points_into_range(pdf, o, pages):
                    o[NameObject('/Page')] = o['/Page'].getObject()
                    new_outline.append(o)
                    prev_header_added = True
        return new_outline

    def _write_dests(self):
        """
        Point every named destination at its output page and add it to the
        writer. Destinations whose page id matches no merged page are skipped.
        """
        for v in self.named_dests:
            if '/Page' not in v:
                continue
            for p in self.pages:
                if p.id == v['/Page']:
                    v[NameObject('/Page')] = p.out_pagedata
                    self.output.addNamedDestinationObject(v)
                    break

    def _pop_fit_args(self, b):
        """
        Remove the fit-type specific coordinate entries from bookmark ``b``
        and return them, in destination-array order, as FloatObjects.

        A coordinate that is missing or is a NullObject is emitted as
        ``FloatObject(0)``.
        """
        args = []
        fit_type = b['/Type']
        for fit_types, keys in self._FIT_ARG_KEYS:
            if fit_type not in fit_types:
                continue
            for key in keys:
                if key in b:
                    value = b[key]
                    # Only delete keys that exist; an unconditional del would
                    # raise KeyError for a bookmark lacking the coordinate.
                    del b[key]
                else:
                    value = None
                if value is None or isinstance(value, NullObject):
                    args.append(FloatObject(0))
                else:
                    args.append(FloatObject(value))
            break
        return args

    def _write_bookmarks(self, bookmarks=None, parent=None):
        """
        Add the collected bookmarks to the writer.

        :param bookmarks: list of bookmarks (nested lists are children of the
            bookmark added just before them); defaults to ``self.bookmarks``.
        :param parent: value passed to the writer's ``addBookmarkDict`` as
            the parent of every bookmark in ``bookmarks``.
        """
        if bookmarks is None:
            bookmarks = self.bookmarks

        last_added = None
        for b in bookmarks:
            if isinstance(b, list):
                self._write_bookmarks(b, last_added)
                continue

            page_found = False
            if '/Page' in b:
                for p in self.pages:
                    if p.id == b['/Page']:
                        # Destination array: page id, fit type, then the
                        # coordinates that fit type requires.
                        args = [NumberObject(p.id), NameObject(b['/Type'])]
                        args.extend(self._pop_fit_args(b))
                        b[NameObject('/A')] = DictionaryObject({NameObject('/S'): NameObject('/GoTo'), NameObject('/D'): ArrayObject(args)})
                        page_found = True
                        break
            if page_found:
                del b['/Page'], b['/Type']
                last_added = self.output.addBookmarkDict(b, parent)

    def _associate_dests_to_pages(self, pages):
        """
        Replace the ``/Page`` entry of every not-yet-resolved named
        destination with the id of the matching page in ``pages``.

        :raises ValueError: if a destination matches none of ``pages``.
        """
        for nd in self.named_dests:
            target = nd['/Page']

            # Already resolved to a page id by an earlier merge.
            if isinstance(target, NumberObject):
                continue

            pageno = None
            for p in pages:
                if target.getObject() == p.pagedata.getObject():
                    pageno = p.id

            if pageno is None:
                raise ValueError("Unresolved named destination '%s'" % (nd['/Title'],))
            nd[NameObject('/Page')] = NumberObject(pageno)

    def _associate_bookmarks_to_pages(self, pages, bookmarks=None):
        """
        Replace the ``/Page`` entry of every not-yet-resolved bookmark with
        the id of the matching page in ``pages``; recurses into nested lists.

        :raises ValueError: if a bookmark matches none of ``pages``.
        """
        if bookmarks is None:
            bookmarks = self.bookmarks

        for b in bookmarks:
            if isinstance(b, list):
                self._associate_bookmarks_to_pages(pages, b)
                continue

            target = b['/Page']

            # Already resolved to a page id by an earlier merge.
            if isinstance(target, NumberObject):
                continue

            pageno = None
            for p in pages:
                if target.getObject() == p.pagedata.getObject():
                    pageno = p.id

            if pageno is None:
                raise ValueError("Unresolved bookmark '%s'" % (b['/Title'],))
            b[NameObject('/Page')] = NumberObject(pageno)

    def findBookmark(self, bookmark, root=None):
        """
        Locate a bookmark in the (nested) bookmark list.

        :param bookmark: a bookmark object, or the title of one.
        :param root: list to search; defaults to ``self.bookmarks``.
        :return: list of indices leading from ``root`` to the bookmark, or
            ``None`` if it is not found.
        """
        if root is None:
            root = self.bookmarks

        for i, b in enumerate(root):
            if isinstance(b, list):
                res = self.findBookmark(bookmark, b)
                if res:
                    return [i] + res
            elif b == bookmark or b['/Title'] == bookmark:
                return [i]

        return None

    def addBookmark(self, title, pagenum, parent=None):
        """
        Add a bookmark to this PDF file.

        :param str title: Title to use for this bookmark.
        :param int pagenum: Page number this bookmark will point to.
        :param parent: A reference to a parent bookmark to create nested
            bookmarks. May also be an index path as returned by
            :meth:`findBookmark()<findBookmark>`.
        :return: the newly created bookmark.
        :raises TypeError: if ``parent`` is given but cannot be found (the
            lookup yields ``None``, which cannot be indexed).
        """
        if parent is None:
            iloc = None
        elif isinstance(parent, list):
            iloc = parent
        else:
            iloc = self.findBookmark(parent)

        dest = Bookmark(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))

        if parent is None:
            self.bookmarks.append(dest)
            return dest

        # Walk down to the list that contains the parent bookmark.
        bmparent = self.bookmarks
        for i in iloc[:-1]:
            bmparent = bmparent[i]

        # Children live in the list immediately following their parent;
        # create that list if it does not exist yet.
        npos = iloc[-1] + 1
        if npos < len(bmparent) and isinstance(bmparent[npos], list):
            bmparent[npos].append(dest)
        else:
            bmparent.insert(npos, [dest])
        return dest

    def addNamedDestination(self, title, pagenum):
        """
        Add a destination to the output.

        :param str title: Title to use
        :param int pagenum: Page number this destination points at.
        """
        dest = Destination(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))
        self.named_dests.append(dest)
class OutlinesObject(list):
    """
    A list that mirrors an outline tree: the list and the tree object are
    updated together by :meth:`remove` and :meth:`removeAll`.

    :param pdf: the document object; used by :meth:`add` to look up pages
        and register new objects.
    :param tree: the outline tree; must provide ``addChild``,
        ``removeChild`` and ``children``.
    :param parent: stored on the instance; not used by this class itself.
    """

    def __init__(self, pdf, tree, parent=None):
        list.__init__(self)
        self.tree = tree
        self.pdf = pdf
        self.parent = parent

    def remove(self, index):
        """
        Remove the entry at position ``index`` from both this list and the
        tree. Note: unlike ``list.remove``, the argument is an index.
        """
        obj = self[index]
        del self[index]
        self.tree.removeChild(obj)

    def add(self, title, pagenum):
        """
        Create a bookmark titled ``title`` that jumps to page ``pagenum`` and
        add it as a child of the tree.

        The new bookmark is added to the tree only; it is not appended to
        this list.
        """
        pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum]
        action = DictionaryObject()
        action.update({
            NameObject('/D'): ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]),
            NameObject('/S'): NameObject('/GoTo')
        })
        actionRef = self.pdf._addObject(action)
        bookmark = TreeObject()

        bookmark.update({
            NameObject('/A'): actionRef,
            NameObject('/Title'): createStringObject(title),
        })

        self.pdf._addObject(bookmark)

        self.tree.addChild(bookmark)

    def removeAll(self):
        """
        Remove every child from the tree, dropping one list entry per child
        removed.
        """
        # Snapshot the children first: the tree is modified while we iterate.
        for child in list(self.tree.children()):
            self.tree.removeChild(child)
            # add() does not append to the list, so the list can be shorter
            # than the tree; an unguarded pop() would raise IndexError.
            if self:
                self.pop()