Coverage for privacyforms_pdf / extractor.py: 96%
325 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-07 14:38 +0100
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-07 14:38 +0100
1"""PDF Form Extractor module using pypdf."""
3from __future__ import annotations
5import json
6from pathlib import Path
7from typing import TYPE_CHECKING, Any, cast
9from pydantic import BaseModel, ConfigDict, Field
10from pypdf import PdfReader, PdfWriter
12if TYPE_CHECKING:
13 from pypdf.generic import ArrayObject
class PDFFormError(Exception):
    """Root of the exception hierarchy for PDF form failures.

    The more specific errors defined in this module derive from this class.
    """
class PDFFormNotFoundError(PDFFormError):
    """Signal that a PDF contains no form fields."""
class FieldNotFoundError(PDFFormError):
    """Signal that a requested field does not exist in the form."""
class FormValidationError(PDFFormError):
    """Signal that supplied form data failed validation.

    Attributes:
        message: Summary message passed to the constructor.
        errors: Individual validation problems (empty list when none given).
    """

    def __init__(self, message: str, errors: list[str] | None = None) -> None:
        """Store the summary message together with the detailed errors.

        Args:
            message: Error message.
            errors: List of specific validation errors.
        """
        super().__init__(message)
        self.message = message
        self.errors = errors if errors else []

    def __str__(self) -> str:  # noqa: D105
        if not self.errors:
            return self.message
        # One "- " bullet per detailed error, each on its own line.
        bullet_lines = "\n- ".join(self.errors)
        return f"{self.message}\n- {bullet_lines}"
class FieldGeometry(BaseModel):
    """Position and size of a PDF form field widget.

    Attributes:
        page: 1-based page number where field appears.
        rect: Bounding box as (x1, y1, x2, y2) in PDF points (1/72 inch).
        x: Left coordinate (derived from ``rect``).
        y: Bottom coordinate in the PDF coordinate system (derived from ``rect``).
        width: Field width in points (derived from ``rect``).
        height: Field height in points (derived from ``rect``).
        units: Unit of measurement; only emitted by ``model_dump`` and
            always "pt" for points.
    """

    page: int
    rect: tuple[float, float, float, float]

    @property
    def x(self) -> float:
        """Left coordinate."""
        left, _, _, _ = self.rect
        return left

    @property
    def y(self) -> float:
        """Bottom coordinate (PDF coordinate system)."""
        _, bottom, _, _ = self.rect
        return bottom

    @property
    def width(self) -> float:
        """Field width in points."""
        left, _, right, _ = self.rect
        return right - left

    @property
    def height(self) -> float:
        """Field height in points."""
        _, bottom, _, top = self.rect
        return top - bottom

    def model_dump(self, **kwargs: Any) -> dict[str, Any]:  # noqa: ARG002
        """Convert to dictionary for JSON serialization.

        Any keyword arguments are accepted for signature compatibility and
        ignored; the same keys are always produced.

        Returns:
            Dictionary with page, rect, x, y, width, height, units.
        """
        payload: dict[str, Any] = {"page": self.page, "rect": list(self.rect)}
        payload.update(
            x=self.x,
            y=self.y,
            width=self.width,
            height=self.height,
            units="pt",
        )
        return payload
class PDFField(BaseModel):
    """Unified PDF form field model with geometry and all field properties.

    Attributes:
        name: The name of the field.
        id: The unique identifier of the field.
        field_type: The type of the form field (e.g., 'textfield', 'checkbox').
            Can be populated as ``type`` (alias) or ``field_type``.
        value: The current value of the field.
        pages: List of pages where this field appears.
        locked: Whether the field is locked.
        geometry: Optional geometry information (position and size).
        format: Date format for datefield types.
        options: Available options for radiobuttongroup, combobox, listbox types.
    """

    name: str
    id: str
    field_type: str = Field(..., alias="type")
    value: str | bool = ""
    pages: list[int] = []
    locked: bool = False
    geometry: FieldGeometry | None = None
    format: str | None = None
    options: list[str] = []

    model_config = ConfigDict(populate_by_name=True)

    def model_dump(self, **kwargs: Any) -> dict[str, Any]:
        """Serialize field to dictionary, including geometry if present.

        The nested geometry is replaced with ``FieldGeometry.model_dump()``
        so the derived values that method adds (x, y, width, height, units)
        appear in the output.

        Args:
            **kwargs: Forwarded unchanged to ``BaseModel.model_dump``.

        Returns:
            Dictionary representation of the field.
        """
        data = super().model_dump(**kwargs)
        # Only touch the key when the base dump kept it; otherwise a caller's
        # include/exclude/exclude_none request would be silently undone.
        if "geometry" in data:
            data["geometry"] = None if self.geometry is None else self.geometry.model_dump()
        return data
149class FormField:
150 """Represents a single form field (legacy dataclass, use PDFField instead).
152 Attributes:
153 field_type: The type of the form field (e.g., 'textfield', 'checkbox').
154 pages: List of pages where this field appears.
155 id: The unique identifier of the field.
156 name: The name of the field.
157 value: The current value of the field.
158 locked: Whether the field is locked.
159 """
161 def __init__(
162 self,
163 field_type: str,
164 pages: list[int],
165 id: str, # noqa: A002
166 name: str,
167 value: str | bool,
168 locked: bool,
169 ) -> None:
170 """Initialize FormField.
172 Args:
173 field_type: The type of the form field.
174 pages: List of pages where this field appears.
175 id: The unique identifier of the field.
176 name: The name of the field.
177 value: The current value of the field.
178 locked: Whether the field is locked.
179 """
180 self.field_type = field_type
181 self.pages = pages
182 self.id = id
183 self.name = name
184 self.value = value
185 self.locked = locked
187 def __repr__(self) -> str:
188 """Return string representation."""
189 return f"FormField(field_type='{self.field_type}', name='{self.name}', id='{self.id}')"
191 def __eq__(self, other: object) -> bool:
192 """Check equality with another FormField."""
193 if not isinstance(other, FormField):
194 return NotImplemented
195 return (
196 self.field_type == other.field_type
197 and self.pages == other.pages
198 and self.id == other.id
199 and self.name == other.name
200 and self.value == other.value
201 and self.locked == other.locked
202 )
class PDFFormData:
    """Result of extracting a form from one PDF file.

    Attributes:
        source: Path to the source PDF file.
        pdf_version: Version of the PDF.
        has_form: Whether the PDF contains a form.
        fields: List of PDF fields (PDFField objects).
        raw_data: The raw data from pypdf (not part of the serialized output).
    """

    def __init__(
        self,
        source: Path,
        pdf_version: str,
        has_form: bool,
        fields: list[PDFField],
        raw_data: dict[str, Any],
    ) -> None:
        """Keep the extraction results on the instance.

        Args:
            source: Path to the source PDF file.
            pdf_version: Version of the PDF.
            has_form: Whether the PDF contains a form.
            fields: List of PDFField objects.
            raw_data: The raw data from pypdf.
        """
        self.source, self.pdf_version = source, pdf_version
        self.has_form = has_form
        self.fields = fields
        self.raw_data = raw_data

    def to_json(self) -> str:
        """Serialize form data to JSON string.

        Returns:
            The ``to_dict`` mapping rendered as JSON with a 2-space indent.
        """
        return json.dumps(self.to_dict(), indent=2)

    def to_dict(self) -> dict[str, Any]:
        """Serialize form data to dictionary.

        Returns:
            Dictionary with the keys ``source`` (as a string),
            ``pdf_version``, ``has_form`` and ``fields`` (each field's
            ``model_dump()`` result).
        """
        dumped_fields = []
        for field in self.fields:
            dumped_fields.append(field.model_dump())
        return {
            "source": str(self.source),
            "pdf_version": self.pdf_version,
            "has_form": self.has_form,
            "fields": dumped_fields,
        }
class PDFFormExtractor:
    """Extracts form information from PDF files using pypdf.

    This class provides methods to extract form data from PDF files.
    It uses pypdf for all operations including form extraction and filling.

    Example:
        >>> extractor = PDFFormExtractor()
        >>> form_data = extractor.extract("form.pdf")
        >>> for field in form_data.fields:
        ...     print(f"{field.name}: {field.value}")
        ...     if field.geometry:
        ...         print(f"  Position: ({field.geometry.x}, {field.geometry.y})")
    """

    # /Ff field-flag bits from the PDF specification; bit position n is 1 << (n - 1).
    _FF_RADIO = 1 << 15  # bit 16: button field is a set of radio buttons
    _FF_COMBO = 1 << 17  # bit 18: choice field is a combo box
    _FF_EDIT = 1 << 18  # bit 19: combo box has an editable text box

    # Lower-cased /V spellings interpreted as checked / unchecked.
    _ON_TOKENS = frozenset({"/yes", "yes", "/on", "on", "1"})
    _OFF_TOKENS = frozenset({"/off", "off", "no", "0"})

    # /FT values whose /V holds user text and is never an on/off state.
    _TEXTUAL_FIELD_TYPES = ("/Tx", "/Ch", "/Sig")

    def __init__(
        self,
        timeout_seconds: float = 30.0,
        extract_geometry: bool = True,
    ) -> None:
        """Initialize the extractor.

        Args:
            timeout_seconds: Timeout for operations (kept for API compatibility).
            extract_geometry: Whether to extract field geometry information.
        """
        self._timeout_seconds = timeout_seconds
        self._extract_geometry = extract_geometry

    @staticmethod
    def _get_field_type(field: dict[str, Any]) -> str:
        """Determine field type from pypdf field data.

        Args:
            field: Field dictionary from pypdf.

        Returns:
            One of "textfield", "checkbox", "radiobuttongroup", "combobox",
            "listbox" or "signature". Anything unrecognised is reported as
            "textfield".
        """
        ft = field.get("/FT")
        if ft is None:
            # Fall back to the generic /Type entry when /FT is missing.
            ft = field.get("/Type")

        flags = field.get("/Ff", 0)
        if not isinstance(flags, int):
            flags = 0

        if ft == "/Btn":
            # A radio group is marked by the Radio flag; an /Opt array is
            # also accepted as evidence. Everything else is a checkbox.
            if flags & PDFFormExtractor._FF_RADIO or "/Opt" in field:
                return "radiobuttongroup"
            return "checkbox"
        if ft == "/Ch":
            # Combo is 0x20000. The Edit bit (0x40000) is only meaningful on
            # combo boxes and was the bit tested historically, so it is still
            # accepted to keep earlier classifications stable.
            if flags & (PDFFormExtractor._FF_COMBO | PDFFormExtractor._FF_EDIT):
                return "combobox"
            return "listbox"
        if ft == "/Sig":
            return "signature"

        # "/Tx" and every unknown type.
        return "textfield"

    @staticmethod
    def _get_field_value(field: dict[str, Any]) -> str | bool:
        """Extract value from pypdf field data.

        On/off spellings such as "/Yes" or "/Off" become booleans, except for
        fields explicitly typed as text, choice or signature: there the value
        is user text (for example "1" or "no") and is returned unchanged.

        Args:
            field: Field dictionary from pypdf.

        Returns:
            Field value (string or boolean for checkboxes); "" when the
            field has no /V entry.
        """
        value = field.get("/V")
        if value is None:
            return ""

        if isinstance(value, str):
            token: Any = value
            text: str = value
        elif hasattr(value, "name"):
            # Non-string object exposing a ``name`` attribute.
            token = value.name
            text = str(token)
        else:
            return str(value)

        if field.get("/FT") not in PDFFormExtractor._TEXTUAL_FIELD_TYPES:
            lowered = token.lower()
            if lowered in PDFFormExtractor._ON_TOKENS:
                return True
            if lowered in PDFFormExtractor._OFF_TOKENS:
                return False
        return text

    @staticmethod
    def _get_field_options(field: dict[str, Any]) -> list[str]:
        """Extract options for choice/radio fields.

        Args:
            field: Field dictionary from pypdf.

        Returns:
            List of option strings: labels from /Opt when present, otherwise
            the de-duplicated normal-appearance state names of the /Kids
            widgets (excluding "/Off"), otherwise an empty list.
        """
        options = field.get("/Opt", [])
        if options:
            labels: list[str] = []
            for opt in options:
                # Options can be text or [export_value, label].
                if isinstance(opt, list) and len(opt) >= 2:
                    labels.append(str(opt[1]))
                elif isinstance(opt, list) and len(opt) == 1:
                    labels.append(str(opt[0]))
                else:
                    labels.append(str(opt))
            return labels

        # For radio buttons, the states live on the kid widgets.
        kids = field.get("/Kids", [])
        if kids:
            states: list[str] = []
            for kid in kids:
                kid_obj = kid.get_object() if hasattr(kid, "get_object") else kid
                if not kid_obj or "/AP" not in kid_obj:
                    continue
                appearance = kid_obj["/AP"]
                if "/N" not in appearance:
                    continue
                states.extend(
                    str(state)
                    for state in appearance["/N"].keys()
                    if str(state).lower() != "/off"
                )
            # Deduplicate while preserving order.
            return list(dict.fromkeys(states))

        return []

    def has_form(self, pdf_path: str | Path) -> bool:
        """Check if a PDF contains a form.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            True if the PDF contains a form, False otherwise.

        Raises:
            FileNotFoundError: If the path does not exist or is not a file.
        """
        pdf_path = Path(pdf_path)
        self._validate_pdf_path(pdf_path)
        return bool(PdfReader(str(pdf_path)).get_fields())

    def extract(self, pdf_path: str | Path) -> PDFFormData:
        """Extract form data from a PDF file.

        This method extracts form data from the PDF using pypdf and
        parses it into a structured format. If extract_geometry is True,
        field positions and sizes will also be extracted.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            PDFFormData containing all form information with PDFField objects.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
            PDFFormNotFoundError: If the PDF does not contain a form.
        """
        pdf_path = Path(pdf_path)
        self._validate_pdf_path(pdf_path)

        reader = PdfReader(str(pdf_path))
        fields = reader.get_fields()
        if not fields:
            raise PDFFormNotFoundError(f"PDF does not contain a form: {pdf_path}")

        # Pages and geometry for every widget, gathered in one pass.
        widget_info = self._extract_widgets_info(reader)

        pdf_fields: list[PDFField] = []
        for field_counter, (field_name, field_data) in enumerate(fields.items(), start=1):
            pages, geometry = widget_info.get(field_name, ([], None))
            pdf_fields.append(
                PDFField(
                    name=field_name,
                    id=str(field_counter),  # ids are 1-based positions in get_fields()
                    type=self._get_field_type(field_data),
                    value=self._get_field_value(field_data),
                    pages=pages if pages else [1],  # no widget found: report page 1
                    locked=False,  # pypdf doesn't directly expose locked state
                    geometry=geometry if self._extract_geometry else None,
                    format=None,  # Date format extraction would require additional parsing
                    options=self._get_field_options(field_data),
                )
            )

        # Build raw data structure for compatibility.
        raw_data = self._build_raw_data_structure(pdf_fields, str(pdf_path))

        # Get PDF version from header (e.g., "%PDF-1.7" -> "1.7").
        if hasattr(reader, "pdf_header"):
            pdf_version = reader.pdf_header.replace("%PDF-", "")
        else:
            pdf_version = "unknown"

        return PDFFormData(
            source=pdf_path,
            pdf_version=pdf_version,
            has_form=len(pdf_fields) > 0,
            fields=pdf_fields,
            raw_data=raw_data,
        )

    def _get_field_pages(self, reader: PdfReader, field_name: str) -> list[int]:
        """Find which pages contain the field widget (legacy).

        Args:
            reader: PdfReader instance.
            field_name: Name of the field.

        Returns:
            List of 1-based page numbers where field appears; [1] when no
            widget with that name is found.
        """
        pages, _ = self._extract_widgets_info(reader).get(field_name, ([1], None))
        return pages

    def _extract_geometry_from_pdf(self, reader: PdfReader) -> dict[str, FieldGeometry]:
        """Extract field geometry from PDF using pypdf (legacy).

        Args:
            reader: PdfReader instance.

        Returns:
            Dictionary mapping field names to FieldGeometry; fields without
            geometry are omitted.
        """
        return {
            name: geometry
            for name, (_, geometry) in self._extract_widgets_info(reader).items()
            if geometry is not None
        }

    @staticmethod
    def _parse_widget_rect(rect: Any, page_num: int) -> FieldGeometry | None:
        """Turn a widget /Rect into a FieldGeometry, or None if unusable."""
        if not rect:
            return None
        try:
            x0, y0, x1, y1 = (float(coord) for coord in rect)
        except (TypeError, ValueError):
            # Wrong number of coordinates or non-numeric entries.
            return None
        return FieldGeometry(page=page_num, rect=(x0, y0, x1, y1))

    def _extract_widgets_info(
        self, reader: PdfReader
    ) -> dict[str, tuple[list[int], FieldGeometry | None]]:
        """Scan all pages once to find widget pages and geometry.

        The scan is best-effort: an annotation that cannot be read is
        skipped, and a malformed /Rect only costs the geometry, not the
        page information.

        Args:
            reader: PdfReader instance.

        Returns:
            Dictionary mapping field names to (pages_list, geometry_object).
        """
        # NOTE(review): only the widget's own /T is used as the key. Widgets
        # that take their name from a /Parent entry are not matched here;
        # confirm whether hierarchical forms need support.
        info: dict[str, tuple[list[int], FieldGeometry | None]] = {}

        for page_num, page in enumerate(reader.pages, start=1):
            if "/Annots" not in page:
                continue

            for annot_ref in cast("ArrayObject", page["/Annots"]):
                try:
                    annot = (
                        annot_ref.get_object() if hasattr(annot_ref, "get_object") else annot_ref
                    )
                    if annot.get("/Subtype") != "/Widget":
                        continue

                    t_value = annot.get("/T")
                    if not t_value:
                        continue

                    field_name = (
                        str(t_value)
                        if isinstance(t_value, str)
                        else str(getattr(t_value, "name", t_value))
                    )
                    rect = annot.get("/Rect")
                except Exception:  # noqa: BLE001, S112
                    # Deliberately broad: one unreadable annotation must not
                    # abort extraction of the remaining widgets.
                    continue

                geometry = self._parse_widget_rect(rect, page_num)

                if field_name not in info:
                    info[field_name] = ([page_num], geometry)
                    continue

                pages, existing_geom = info[field_name]
                if page_num not in pages:
                    pages.append(page_num)
                # Keep the first geometry if multiple exist (current limitation).
                if existing_geom is None:
                    info[field_name] = (pages, geometry)

        return info

    def _build_raw_data_structure(self, fields: list[PDFField], source: str) -> dict[str, Any]:
        """Build raw data structure for export.

        Args:
            fields: List of PDFField objects.
            source: Source PDF path.

        Returns:
            Dictionary with a "header" entry and a one-element "forms" list
            whose entry groups the fields by field type.
        """
        by_type: dict[str, list[dict[str, Any]]] = {
            "textfield": [],
            "datefield": [],
            "checkbox": [],
            "radiobuttongroup": [],
            "combobox": [],
            "listbox": [],
            "signature": [],
        }
        option_types = ("radiobuttongroup", "combobox", "listbox")

        for field in fields:
            field_entry: dict[str, Any] = {
                "pages": field.pages,
                "id": field.id,
                "name": field.name,
                "value": field.value,
                "locked": field.locked,
            }

            # Type-specific attributes.
            if field.field_type == "datefield" and field.format:
                field_entry["format"] = field.format
            if field.options and field.field_type in option_types:
                field_entry["options"] = field.options

            # Unknown types are filed under "textfield".
            bucket = field.field_type if field.field_type in by_type else "textfield"
            by_type[bucket].append(field_entry)

        return {
            "header": {"source": source, "version": "pypdf"},
            "forms": [by_type],
        }

    def extract_to_json(self, pdf_path: str | Path, output_path: str | Path) -> None:
        """Extract form data and save it to a JSON file.

        The output JSON will contain the unified PDFField representation
        with geometry information if available.

        Args:
            pdf_path: Path to the PDF file.
            output_path: Path where the JSON output should be saved.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
            PDFFormNotFoundError: If the PDF does not contain a form.
        """
        pdf_path = Path(pdf_path)
        output_path = Path(output_path)
        self._validate_pdf_path(pdf_path)

        form_data = self.extract(pdf_path)

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(form_data.to_dict(), f, indent=2)

    def list_fields(self, pdf_path: str | Path) -> list[PDFField]:
        """List all form fields in a PDF.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            List of PDFField objects.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
            PDFFormNotFoundError: If the PDF does not contain a form.
        """
        return self.extract(pdf_path).fields

    def get_field_value(self, pdf_path: str | Path, field_name: str) -> str | bool | None:
        """Get the value of a specific form field.

        Args:
            pdf_path: Path to the PDF file.
            field_name: Name of the field to retrieve.

        Returns:
            The field value, or None if the field is not found.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
        """
        for field in self.list_fields(pdf_path):
            if field.name == field_name:
                return field.value
        return None

    def get_field_by_id(self, pdf_path: str | Path, field_id: str) -> PDFField | None:
        """Get a form field by its ID.

        Args:
            pdf_path: Path to the PDF file.
            field_id: ID of the field to retrieve.

        Returns:
            The PDFField object, or None if the field is not found.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
        """
        return next(
            (field for field in self.list_fields(pdf_path) if field.id == field_id),
            None,
        )

    def get_field_by_name(self, pdf_path: str | Path, field_name: str) -> PDFField | None:
        """Get a form field by its name.

        Args:
            pdf_path: Path to the PDF file.
            field_name: Name of the field to retrieve.

        Returns:
            The PDFField object, or None if the field is not found.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
        """
        return next(
            (field for field in self.list_fields(pdf_path) if field.name == field_name),
            None,
        )

    @staticmethod
    def _check_form_data(
        fields: list[Any],
        form_data: dict[str, Any],
        *,
        strict: bool,
        allow_extra_fields: bool,
    ) -> list[str]:
        """Compare submitted values with the extracted fields; return error messages."""
        errors: list[str] = []
        fields_by_name = {field.name: field for field in fields}

        for field_name, value in form_data.items():
            field = fields_by_name.get(field_name)
            if field is None:
                # Unknown names are only an error when extras are disallowed.
                if not allow_extra_fields:
                    errors.append(f"Field not found in form: '{field_name}'")
                continue

            # Known fields are type-checked regardless of allow_extra_fields.
            if field.field_type == "checkbox" and not isinstance(value, bool):
                errors.append(
                    f"Field '{field_name}': checkbox value must be boolean, "
                    f"got {type(value).__name__}"
                )

        if strict:
            provided_names = set(form_data)
            errors.extend(
                f"Required field not provided: '{field.name}'"
                for field in fields
                if field.name not in provided_names
            )

        return errors

    def validate_form_data(
        self,
        pdf_path: str | Path,
        form_data: dict[str, Any],
        *,
        strict: bool = False,
        allow_extra_fields: bool = False,
    ) -> list[str]:
        """Validate form data against PDF form fields.

        This method validates that the provided form data (simple key:value format)
        matches the structure and field types of the PDF form.

        Args:
            pdf_path: Path to the PDF file.
            form_data: The form data to validate (simple format: {"Field Name": value}).
            strict: If True, also checks that all form fields are provided.
            allow_extra_fields: If True, names missing from the form are not
                reported. Values for fields that do exist are still checked.

        Returns:
            List of validation error messages (empty if valid).

        Raises:
            FileNotFoundError: If the PDF file does not exist.
        """
        pdf_path = Path(pdf_path)
        self._validate_pdf_path(pdf_path)

        try:
            form_data_obj = self.extract(pdf_path)
        except PDFFormNotFoundError:
            return ["PDF does not contain a form"]

        return self._check_form_data(
            form_data_obj.fields,
            form_data,
            strict=strict,
            allow_extra_fields=allow_extra_fields,
        )

    def fill_form(
        self,
        pdf_path: str | Path,
        form_data: dict[str, Any],
        output_path: str | Path | None = None,
        *,
        validate: bool = True,
    ) -> Path:
        """Fill a PDF form with data.

        This method accepts form data in simple key:value format where keys are
        field names and values are the values to fill.

        Args:
            pdf_path: Path to the PDF file containing the form.
            form_data: The form data to fill (format: {"Field Name": value}).
            output_path: Optional output path. If not provided, the input PDF
                is modified in place.
            validate: If True, validates form data before filling.

        Returns:
            Path to the filled PDF (output_path or pdf_path if no output specified).

        Raises:
            FileNotFoundError: If the PDF file does not exist.
            FormValidationError: If validation fails and validate=True.
            PDFFormNotFoundError: If the PDF does not contain a form.

        Example:
            >>> form_data = {"Candidate Name": "John Smith", "Full time": True}
            >>> extractor.fill_form("form.pdf", form_data, "filled.pdf")
        """
        pdf_path = Path(pdf_path)
        self._validate_pdf_path(pdf_path)

        if not self.has_form(pdf_path):
            raise PDFFormNotFoundError(f"PDF does not contain a form: {pdf_path}")

        if validate:
            errors = self.validate_form_data(pdf_path, form_data)
            if errors:
                raise FormValidationError("Form data validation failed", errors)

        reader = PdfReader(str(pdf_path))
        writer = PdfWriter()

        # Copy all pages and form fields.
        writer.append(reader)

        # pypdf expects string values; booleans are written as /Yes or /Off.
        # NOTE(review): assumes the checkbox "on" state is named /Yes; confirm
        # for forms whose widgets use a different state name.
        field_values = {
            field_name: (("/Yes" if value else "/Off") if isinstance(value, bool) else str(value))
            for field_name, value in form_data.items()
        }

        if field_values:
            # Applied page by page so the widgets on every page get the values.
            for page in writer.pages:
                writer.update_page_form_field_values(page, field_values)

        output_file = Path(output_path) if output_path else pdf_path
        with open(output_file, "wb") as f:
            writer.write(f)

        return output_file

    def fill_form_from_json(
        self,
        pdf_path: str | Path,
        json_path: str | Path,
        output_path: str | Path | None = None,
        *,
        validate: bool = True,
    ) -> Path:
        """Fill a PDF form with data from a JSON file.

        The JSON file should contain simple key:value pairs where keys are
        field names and values are the values to fill.

        Args:
            pdf_path: Path to the PDF file containing the form.
            json_path: Path to the JSON file with form data.
            output_path: Optional output path. If not provided, the input PDF
                is modified in place.
            validate: If True, validates form data before filling.

        Returns:
            Path to the filled PDF.

        Raises:
            FileNotFoundError: If any file does not exist.
            FormValidationError: If the JSON document is not an object, or if
                validation fails and validate=True.
        """
        pdf_path = Path(pdf_path)
        json_path = Path(json_path)

        self._validate_pdf_path(pdf_path)
        if not json_path.exists():
            raise FileNotFoundError(f"JSON file not found: {json_path}")
        if not json_path.is_file():
            raise FileNotFoundError(f"Path is not a file: {json_path}")

        with open(json_path, encoding="utf-8") as f:
            form_data = json.load(f)

        # A list or scalar cannot be mapped onto field names; report it as a
        # validation problem instead of failing later on ``.items()``.
        if not isinstance(form_data, dict):
            raise FormValidationError(
                "Form data validation failed",
                [
                    "JSON document must be an object mapping field names to values, "
                    f"got {type(form_data).__name__}"
                ],
            )

        return self.fill_form(pdf_path, form_data, output_path, validate=validate)

    def _validate_pdf_path(self, pdf_path: Path) -> None:
        """Validate that the PDF path exists and is a file.

        Args:
            pdf_path: Path to validate.

        Raises:
            FileNotFoundError: If the path does not exist or is not a file.
        """
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        if not pdf_path.is_file():
            raise FileNotFoundError(f"Path is not a file: {pdf_path}")
def get_available_geometry_backends() -> list[str]:
    """Name the geometry extraction backends this module can use.

    Returns:
        A new list on every call. For the pypdf version this is always
        ["pypdf"].
    """
    backends: list[str] = ["pypdf"]
    return backends
def has_geometry_support() -> bool:
    """Report whether field geometry can be extracted.

    Returns:
        Always True (pypdf always supports geometry extraction).
    """
    return True
# Deprecated aliases kept for backwards compatibility; they will be removed in
# a future version. Code written against the earlier pdfcpu-based releases can
# keep catching these names: all three are bound to PDFFormError.
PDFCPUError = PDFFormError
PDFCPUNotFoundError = PDFFormError
PDFCPUExecutionError = PDFFormError