Coverage for privacyforms_pdf / extractor.py: 96%

325 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-07 14:38 +0100

1"""PDF Form Extractor module using pypdf.""" 

2 

3from __future__ import annotations 

4 

5import json 

6from pathlib import Path 

7from typing import TYPE_CHECKING, Any, cast 

8 

9from pydantic import BaseModel, ConfigDict, Field 

10from pypdf import PdfReader, PdfWriter 

11 

12if TYPE_CHECKING: 

13 from pypdf.generic import ArrayObject 

14 

15 

class PDFFormError(Exception):
    """Root of the exception hierarchy for PDF form handling."""


class PDFFormNotFoundError(PDFFormError):
    """Signals that a PDF does not contain any form."""


class FieldNotFoundError(PDFFormError):
    """Signals that a requested field is absent from the form."""


class FormValidationError(PDFFormError):
    """Signals that supplied form data failed validation.

    Attributes:
        message: Summary of the failure.
        errors: Individual validation problems (possibly empty).
    """

    def __init__(self, message: str, errors: list[str] | None = None) -> None:
        """Record the summary message and the detailed problems.

        Args:
            message: Error message.
            errors: List of specific validation errors; ``None`` or an
                empty list is stored as a fresh empty list.
        """
        super().__init__(message)
        self.message = message
        self.errors = errors if errors else []

    def __str__(self) -> str:  # noqa: D105
        if not self.errors:
            return self.message
        # One "- item" bullet per detailed error, each on its own line.
        return "\n- ".join([self.message, *self.errors])

52 

53 

class FieldGeometry(BaseModel):
    """Position and size of a PDF form field widget.

    Attributes:
        page: 1-based page number where field appears.
        rect: Bounding box as (x1, y1, x2, y2) in PDF points (1/72 inch).
        x: Left coordinate (first element of ``rect``).
        y: Bottom coordinate in the PDF coordinate system (second element).
        width: ``x2 - x1`` in points.
        height: ``y2 - y1`` in points.
        units: Unit of measurement (always "pt" in the serialized form).
    """

    page: int
    rect: tuple[float, float, float, float]

    @property
    def x(self) -> float:
        """Left coordinate."""
        left, _bottom, _right, _top = self.rect
        return left

    @property
    def y(self) -> float:
        """Bottom coordinate (PDF coordinate system)."""
        _left, bottom, _right, _top = self.rect
        return bottom

    @property
    def width(self) -> float:
        """Field width in points."""
        left, _bottom, right, _top = self.rect
        return right - left

    @property
    def height(self) -> float:
        """Field height in points."""
        _left, bottom, _right, top = self.rect
        return top - bottom

    def model_dump(self, **kwargs: Any) -> dict[str, Any]:  # noqa: ARG002
        """Build the JSON-friendly dictionary for this geometry.

        Keyword arguments are accepted for signature compatibility with
        ``BaseModel.model_dump`` but are not used.

        Returns:
            Dictionary with page, rect, x, y, width, height, units.
        """
        left, bottom, right, top = self.rect
        return {
            "page": self.page,
            "rect": [left, bottom, right, top],
            "x": left,
            "y": bottom,
            "width": right - left,
            "height": top - bottom,
            "units": "pt",
        }

105 

106 

class PDFField(BaseModel):
    """Unified PDF form field model with geometry and all field properties.

    Attributes:
        name: The name of the field.
        id: The unique identifier of the field.
        field_type: The type of the form field (e.g., 'textfield', 'checkbox').
            Can be populated either as ``field_type`` or via its alias ``type``.
        value: The current value of the field.
        pages: List of pages where this field appears.
        locked: Whether the field is locked.
        geometry: Optional geometry information (position and size).
        format: Date format for datefield types.
        options: Available options for radiobuttongroup, combobox, listbox types.
    """

    name: str
    id: str
    field_type: str = Field(..., alias="type")
    value: str | bool = ""
    # default_factory makes the "fresh list per instance" intent explicit.
    pages: list[int] = Field(default_factory=list)
    locked: bool = False
    geometry: FieldGeometry | None = None
    format: str | None = None
    options: list[str] = Field(default_factory=list)

    model_config = ConfigDict(populate_by_name=True)

    def model_dump(self, **kwargs: Any) -> dict[str, Any]:
        """Serialize field to dictionary, including geometry if present.

        The geometry entry is replaced by ``FieldGeometry.model_dump()`` so
        the derived values (x, y, width, height, units) are included.

        Args:
            **kwargs: Forwarded unchanged to ``BaseModel.model_dump``.

        Returns:
            Dictionary representation of the field.
        """
        data = super().model_dump(**kwargs)
        # Only touch the key when pydantic kept it: the caller may have
        # dropped it through include/exclude/exclude_none, and re-adding it
        # unconditionally would silently ignore that request.
        if "geometry" in data:
            data["geometry"] = None if self.geometry is None else self.geometry.model_dump()
        return data

147 

148 

class FormField:
    """Plain container for one form field (legacy; prefer PDFField).

    Attributes:
        field_type: The type of the form field (e.g., 'textfield', 'checkbox').
        pages: List of pages where this field appears.
        id: The unique identifier of the field.
        name: The name of the field.
        value: The current value of the field.
        locked: Whether the field is locked.
    """

    def __init__(
        self,
        field_type: str,
        pages: list[int],
        id: str,  # noqa: A002
        name: str,
        value: str | bool,
        locked: bool,
    ) -> None:
        """Store the given field properties unchanged.

        Args:
            field_type: The type of the form field.
            pages: List of pages where this field appears.
            id: The unique identifier of the field.
            name: The name of the field.
            value: The current value of the field.
            locked: Whether the field is locked.
        """
        self.field_type, self.pages, self.id = field_type, pages, id
        self.name, self.value, self.locked = name, value, locked

    def __repr__(self) -> str:
        """Return a short representation showing type, name and id."""
        details = f"field_type='{self.field_type}', name='{self.name}', id='{self.id}'"
        return f"FormField({details})"

    def __eq__(self, other: object) -> bool:
        """Compare all six attributes with another FormField."""
        if not isinstance(other, FormField):
            return NotImplemented
        compared = ("field_type", "pages", "id", "name", "value", "locked")
        return all(getattr(self, attr) == getattr(other, attr) for attr in compared)

203 

204 

class PDFFormData:
    """Container for the result of extracting a PDF form.

    Attributes:
        source: Path to the source PDF file.
        pdf_version: Version of the PDF.
        has_form: Whether the PDF contains a form.
        fields: List of PDF fields (PDFField objects).
        raw_data: The raw data from pypdf.
    """

    def __init__(
        self,
        source: Path,
        pdf_version: str,
        has_form: bool,
        fields: list[PDFField],
        raw_data: dict[str, Any],
    ) -> None:
        """Store the extracted values unchanged.

        Args:
            source: Path to the source PDF file.
            pdf_version: Version of the PDF.
            has_form: Whether the PDF contains a form.
            fields: List of PDFField objects.
            raw_data: The raw data from pypdf.
        """
        self.source, self.pdf_version = source, pdf_version
        self.has_form, self.fields = has_form, fields
        self.raw_data = raw_data

    def to_json(self) -> str:
        """Serialize form data to JSON string.

        ``raw_data`` is not part of the output.

        Returns:
            JSON string (2-space indented) of the same mapping that
            ``to_dict`` produces.
        """
        payload = self.to_dict()
        return json.dumps(payload, indent=2)

    def to_dict(self) -> dict[str, Any]:
        """Serialize form data to dictionary.

        ``raw_data`` is not part of the output.

        Returns:
            Dictionary with ``source`` (as string), ``pdf_version``,
            ``has_form`` and the ``model_dump()`` of every field.
        """
        dumped_fields = []
        for item in self.fields:
            dumped_fields.append(item.model_dump())
        return {
            "source": str(self.source),
            "pdf_version": self.pdf_version,
            "has_form": self.has_form,
            "fields": dumped_fields,
        }

265 

266 

class PDFFormExtractor:
    """Extracts form information from PDF files using pypdf.

    This class provides methods to extract form data from PDF files.
    It uses pypdf for all operations including form extraction and filling.

    Example:
        >>> extractor = PDFFormExtractor()
        >>> form_data = extractor.extract("form.pdf")
        >>> for field in form_data.fields:
        ...     print(f"{field.name}: {field.value}")
        ...     if field.geometry:
        ...         print(f"  Position: ({field.geometry.x}, {field.geometry.y})")
    """

    # Field-flag (/Ff) masks. The PDF specification numbers bits from 1, so
    # "bit 16" (Radio) is 1 << 15 and "bit 18" (Combo) is 1 << 17.
    _FF_RADIO = 1 << 15
    _FF_COMBO = 1 << 17

    # Button states (compared case-insensitively) mapped to booleans.
    _TRUE_STATES = frozenset({"/yes", "yes", "/on", "on", "1"})
    _FALSE_STATES = frozenset({"/off", "off", "no", "0"})

    def __init__(
        self,
        timeout_seconds: float = 30.0,
        extract_geometry: bool = True,
    ) -> None:
        """Initialize the extractor.

        Args:
            timeout_seconds: Timeout for operations (kept for API compatibility).
            extract_geometry: Whether to extract field geometry information.
        """
        self._timeout_seconds = timeout_seconds
        self._extract_geometry = extract_geometry

    @staticmethod
    def _get_field_type(field: dict[str, Any]) -> str:
        """Determine field type from pypdf field data.

        Args:
            field: Field dictionary from pypdf.

        Returns:
            One of "textfield", "checkbox", "radiobuttongroup", "combobox",
            "listbox" or "signature". Unknown types fall back to "textfield".
        """
        ft = field.get("/FT")
        if ft is None:
            # Fall back to the generic type entry when /FT is absent.
            ft = field.get("/Type")

        flags = field.get("/Ff", 0)
        if not isinstance(flags, int):
            flags = 0

        if ft == "/Btn":
            # A radio group is marked by the Radio flag; "/Opt" is kept as an
            # additional indicator for files that only provide that entry.
            if "/Opt" in field or flags & PDFFormExtractor._FF_RADIO:
                return "radiobuttongroup"
            return "checkbox"
        if ft == "/Ch":
            # Choice field: the Combo flag separates combo boxes from list boxes.
            return "combobox" if flags & PDFFormExtractor._FF_COMBO else "listbox"
        if ft == "/Sig":
            return "signature"

        # "/Tx" and anything unrecognised are reported as text fields.
        return "textfield"

    @staticmethod
    def _get_field_value(field: dict[str, Any]) -> str | bool:
        """Extract value from pypdf field data.

        Button states such as "/Yes" or "/Off" are converted to booleans.
        The conversion is applied when the field is a button ("/FT" is
        "/Btn") or when "/FT" is missing; for any other field type the text
        is returned untouched so that a text value like "1" or "no" is not
        mistaken for a checkbox state.

        Args:
            field: Field dictionary from pypdf.

        Returns:
            Field value (string or boolean for checkboxes); "" when no value.
        """
        value = field.get("/V")
        if value is None:
            return ""

        if isinstance(value, str):
            text: Any = value
        elif hasattr(value, "name"):
            # Objects exposing their text through a ``name`` attribute.
            text = value.name
        else:
            return str(value)

        ft = field.get("/FT")
        if ft is None or ft == "/Btn":
            state = text.lower()
            if state in PDFFormExtractor._TRUE_STATES:
                return True
            if state in PDFFormExtractor._FALSE_STATES:
                return False

        return value if isinstance(value, str) else str(text)

    @staticmethod
    def _get_field_options(field: dict[str, Any]) -> list[str]:
        """Extract options for choice/radio fields.

        "/Opt" is used when present; otherwise the appearance-state names of
        the "/Kids" widgets (excluding "/Off") are collected.

        Args:
            field: Field dictionary from pypdf.

        Returns:
            List of option strings (empty when none are found).
        """
        options = field.get("/Opt", [])
        if options:
            labels: list[str] = []
            for opt in options:
                # Options can be text or [export_value, label]; prefer the label.
                if isinstance(opt, list) and opt:
                    labels.append(str(opt[1] if len(opt) >= 2 else opt[0]))
                else:
                    labels.append(str(opt))
            return labels

        kids = field.get("/Kids", [])
        if not kids:
            return []

        states: list[str] = []
        for kid in kids:
            kid_obj = kid.get_object() if hasattr(kid, "get_object") else kid
            if not kid_obj or "/AP" not in kid_obj:
                continue
            appearance = kid_obj["/AP"]
            if "/N" not in appearance:
                continue
            normal = appearance["/N"]
            if not hasattr(normal, "keys"):
                # Not a mapping of state names; nothing to collect.
                continue
            states.extend(str(n) for n in normal.keys() if str(n).lower() != "/off")
        # Deduplicate while preserving order.
        return list(dict.fromkeys(states))

    def has_form(self, pdf_path: str | Path) -> bool:
        """Check if a PDF contains a form.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            True if the PDF contains a form, False otherwise.

        Raises:
            FileNotFoundError: If the path does not exist or is not a file.
        """
        pdf_path = Path(pdf_path)
        self._validate_pdf_path(pdf_path)
        return bool(PdfReader(str(pdf_path)).get_fields())

    def extract(self, pdf_path: str | Path) -> PDFFormData:
        """Extract form data from a PDF file.

        This method extracts form data from the PDF using pypdf and
        parses it into a structured format. If extract_geometry is True,
        field positions and sizes will also be extracted.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            PDFFormData containing all form information with PDFField objects.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
            PDFFormNotFoundError: If the PDF does not contain a form.
        """
        pdf_path = Path(pdf_path)
        self._validate_pdf_path(pdf_path)

        reader = PdfReader(str(pdf_path))

        fields = reader.get_fields()
        if not fields:
            raise PDFFormNotFoundError(f"PDF does not contain a form: {pdf_path}")

        # Pages and geometry for every widget, gathered in a single pass.
        widget_info = self._extract_widgets_info(reader)

        pdf_fields: list[PDFField] = []
        for field_counter, (field_name, field_data) in enumerate(fields.items(), start=1):
            found_pages, found_geometry = widget_info.get(field_name, ([], None))
            pdf_fields.append(
                PDFField(
                    name=field_name,
                    id=str(field_counter),
                    type=self._get_field_type(field_data),
                    value=self._get_field_value(field_data),
                    # Fields without a located widget default to page 1.
                    pages=found_pages or [1],
                    locked=False,  # locked state is not extracted
                    geometry=found_geometry if self._extract_geometry else None,
                    format=None,  # date format extraction would need extra parsing
                    options=self._get_field_options(field_data),
                )
            )

        # Build raw data structure for compatibility
        raw_data = self._build_raw_data_structure(pdf_fields, str(pdf_path))

        # Get PDF version from header (e.g., "%PDF-1.7" -> "1.7")
        header = getattr(reader, "pdf_header", None)
        pdf_version = header.replace("%PDF-", "") if isinstance(header, str) else "unknown"

        return PDFFormData(
            source=pdf_path,
            pdf_version=pdf_version,
            has_form=len(pdf_fields) > 0,
            fields=pdf_fields,
            raw_data=raw_data,
        )

    def _get_field_pages(self, reader: PdfReader, field_name: str) -> list[int]:
        """Find which pages contain the field widget (legacy).

        Args:
            reader: PdfReader instance.
            field_name: Name of the field.

        Returns:
            List of 1-based page numbers where field appears; ``[1]`` when
            no widget with that name is found.
        """
        widget_info = self._extract_widgets_info(reader)
        return widget_info.get(field_name, ([1], None))[0]

    def _extract_geometry_from_pdf(self, reader: PdfReader) -> dict[str, FieldGeometry]:
        """Extract field geometry from PDF using pypdf (legacy).

        Args:
            reader: PdfReader instance.

        Returns:
            Dictionary mapping field names to FieldGeometry; fields without
            geometry are omitted.
        """
        widget_info = self._extract_widgets_info(reader)
        return {name: info[1] for name, info in widget_info.items() if info[1] is not None}

    @staticmethod
    def _qualified_widget_name(annot: Any) -> str:
        """Build the dotted field name of a widget by walking its /Parent chain.

        A widget may carry no "/T" of its own (e.g. the kids of a radio
        group) or only the last part of a hierarchical name. Joining the
        "/T" entries from the root ancestor down with "." gives a name that
        can be compared with the fully qualified field names.

        Args:
            annot: Widget annotation dictionary.

        Returns:
            Dotted field name, or "" when no "/T" entry exists in the chain.
        """
        parts: list[str] = []
        visited: set[int] = set()
        node = annot
        # ``visited`` protects against malformed files with cyclic /Parent links.
        while node is not None and hasattr(node, "get") and id(node) not in visited:
            visited.add(id(node))
            title = node.get("/T")
            if hasattr(title, "get_object"):
                title = title.get_object()
            if title:
                parts.append(
                    str(title) if isinstance(title, str) else str(getattr(title, "name", title))
                )
            parent = node.get("/Parent")
            node = parent.get_object() if hasattr(parent, "get_object") else parent
        return ".".join(reversed(parts))

    def _extract_widgets_info(
        self, reader: PdfReader
    ) -> dict[str, tuple[list[int], FieldGeometry | None]]:
        """Scan all pages once to find widget pages and geometry.

        Widgets that cannot be read are skipped (best effort) and the
        problem is logged at debug level.

        Args:
            reader: PdfReader instance.

        Returns:
            Dictionary mapping field names to (pages_list, geometry_object).
            When a field has several widgets, the first geometry found is kept.
        """
        import logging

        log = logging.getLogger(__name__)
        info: dict[str, tuple[list[int], FieldGeometry | None]] = {}

        for page_num, page in enumerate(reader.pages, start=1):
            if "/Annots" not in page:
                continue

            annots = cast("ArrayObject", page["/Annots"])
            for annot_ref in annots:
                try:
                    annot = (
                        annot_ref.get_object() if hasattr(annot_ref, "get_object") else annot_ref
                    )

                    # Only widget annotations belong to form fields.
                    if annot.get("/Subtype") != "/Widget":
                        continue

                    field_name = self._qualified_widget_name(annot)
                    if not field_name:
                        continue

                    geometry = None
                    rect = annot.get("/Rect")
                    if rect:
                        x0, y0, x1, y1 = (float(coord) for coord in rect)
                        geometry = FieldGeometry(page=page_num, rect=(x0, y0, x1, y1))
                except Exception:  # noqa: BLE001
                    # Deliberately best effort: one broken annotation must not
                    # abort the whole scan, but leave a trace for debugging.
                    log.debug("Skipping unreadable annotation on page %d", page_num, exc_info=True)
                    continue

                known = info.get(field_name)
                if known is None:
                    info[field_name] = ([page_num], geometry)
                    continue

                pages, existing_geom = known
                if page_num not in pages:
                    pages.append(page_num)
                # Keep the first geometry if multiple exist (current limitation).
                if existing_geom is None:
                    info[field_name] = (pages, geometry)

        return info

    def _build_raw_data_structure(self, fields: list[PDFField], source: str) -> dict[str, Any]:
        """Build raw data structure for export.

        Args:
            fields: List of PDFField objects.
            source: Source PDF path.

        Returns:
            Dictionary with a ``header`` and one ``forms`` entry whose lists
            are organized by field type. Fields of an unknown type are
            placed in the "textfield" list.
        """
        by_type: dict[str, list[dict[str, Any]]] = {
            "textfield": [],
            "datefield": [],
            "checkbox": [],
            "radiobuttongroup": [],
            "combobox": [],
            "listbox": [],
            "signature": [],
        }
        types_with_options = ("radiobuttongroup", "combobox", "listbox")

        for field in fields:
            field_entry: dict[str, Any] = {
                "pages": field.pages,
                "id": field.id,
                "name": field.name,
                "value": field.value,
                "locked": field.locked,
            }

            # Type-specific attributes.
            if field.field_type == "datefield" and field.format:
                field_entry["format"] = field.format
            if field.options and field.field_type in types_with_options:
                field_entry["options"] = field.options

            by_type.get(field.field_type, by_type["textfield"]).append(field_entry)

        return {
            "header": {"source": source, "version": "pypdf"},
            "forms": [by_type],
        }

    def extract_to_json(self, pdf_path: str | Path, output_path: str | Path) -> None:
        """Extract form data and save it to a JSON file.

        The output JSON will contain the unified PDFField representation
        with geometry information if available.

        Args:
            pdf_path: Path to the PDF file.
            output_path: Path where the JSON output should be saved.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
            PDFFormNotFoundError: If the PDF does not contain a form.
        """
        output_path = Path(output_path)

        # extract() validates the input path itself.
        form_data = self.extract(pdf_path)

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(form_data.to_dict(), f, indent=2)

    def list_fields(self, pdf_path: str | Path) -> list[PDFField]:
        """List all form fields in a PDF.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            List of PDFField objects.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
            PDFFormNotFoundError: If the PDF does not contain a form.
        """
        return self.extract(pdf_path).fields

    def get_field_value(self, pdf_path: str | Path, field_name: str) -> str | bool | None:
        """Get the value of a specific form field.

        Args:
            pdf_path: Path to the PDF file.
            field_name: Name of the field to retrieve.

        Returns:
            The field value, or None if the field is not found.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
            PDFFormNotFoundError: If the PDF does not contain a form.
        """
        field = self.get_field_by_name(pdf_path, field_name)
        return None if field is None else field.value

    def get_field_by_id(self, pdf_path: str | Path, field_id: str) -> PDFField | None:
        """Get a form field by its ID.

        Args:
            pdf_path: Path to the PDF file.
            field_id: ID of the field to retrieve.

        Returns:
            The first PDFField with that ID, or None if the field is not found.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
            PDFFormNotFoundError: If the PDF does not contain a form.
        """
        return next((f for f in self.list_fields(pdf_path) if f.id == field_id), None)

    def get_field_by_name(self, pdf_path: str | Path, field_name: str) -> PDFField | None:
        """Get a form field by its name.

        Args:
            pdf_path: Path to the PDF file.
            field_name: Name of the field to retrieve.

        Returns:
            The first PDFField with that name, or None if the field is not found.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
            PDFFormNotFoundError: If the PDF does not contain a form.
        """
        return next((f for f in self.list_fields(pdf_path) if f.name == field_name), None)

    def validate_form_data(
        self,
        pdf_path: str | Path,
        form_data: dict[str, Any],
        *,
        strict: bool = False,
        allow_extra_fields: bool = False,
    ) -> list[str]:
        """Validate form data against PDF form fields.

        This method validates that the provided form data (simple key:value format)
        matches the structure and field types of the PDF form.

        Args:
            pdf_path: Path to the PDF file.
            form_data: The form data to validate (simple format: {"Field Name": value}).
            strict: If True, also checks that all form fields are provided.
            allow_extra_fields: If True, allows fields not present in the form.
                Values for fields that do exist are still type-checked.

        Returns:
            List of validation error messages (empty if valid).

        Raises:
            FileNotFoundError: If the PDF file does not exist.
        """
        pdf_path = Path(pdf_path)
        self._validate_pdf_path(pdf_path)

        try:
            form_data_obj = self.extract(pdf_path)
        except PDFFormNotFoundError:
            return ["PDF does not contain a form"]

        errors: list[str] = []
        fields_by_name = {f.name: f for f in form_data_obj.fields}

        for field_name, value in form_data.items():
            field = fields_by_name.get(field_name)
            if field is None:
                # The flag only controls whether unknown names are reported;
                # it must not switch off type checks for known fields.
                if not allow_extra_fields:
                    errors.append(f"Field not found in form: '{field_name}'")
                continue

            # Validate value type matches field type
            if field.field_type == "checkbox" and not isinstance(value, bool):
                errors.append(
                    f"Field '{field_name}': checkbox value must be boolean, "
                    f"got {type(value).__name__}"
                )

        # In strict mode, check all form fields are provided
        if strict:
            provided_names = set(form_data)
            errors.extend(
                f"Required field not provided: '{field.name}'"
                for field in form_data_obj.fields
                if field.name not in provided_names
            )

        return errors

    def fill_form(
        self,
        pdf_path: str | Path,
        form_data: dict[str, Any],
        output_path: str | Path | None = None,
        *,
        validate: bool = True,
    ) -> Path:
        """Fill a PDF form with data.

        This method accepts form data in simple key:value format where keys are
        field names and values are the values to fill.

        Args:
            pdf_path: Path to the PDF file containing the form.
            form_data: The form data to fill (format: {"Field Name": value}).
            output_path: Optional output path. If not provided, the input PDF
                is modified in place.
            validate: If True, validates form data before filling.

        Returns:
            Path to the filled PDF (output_path or pdf_path if no output specified).

        Raises:
            FileNotFoundError: If the PDF file does not exist.
            FormValidationError: If validation fails and validate=True.
            PDFFormNotFoundError: If the PDF does not contain a form.

        Example:
            >>> form_data = {"Candidate Name": "John Smith", "Full time": True}
            >>> extractor.fill_form("form.pdf", form_data, "filled.pdf")
        """
        pdf_path = Path(pdf_path)
        self._validate_pdf_path(pdf_path)

        if not self.has_form(pdf_path):
            raise PDFFormNotFoundError(f"PDF does not contain a form: {pdf_path}")

        if validate:
            errors = self.validate_form_data(pdf_path, form_data)
            if errors:
                raise FormValidationError("Form data validation failed", errors)

        reader = PdfReader(str(pdf_path))
        writer = PdfWriter()

        # Copy all pages and form fields
        writer.append(reader)

        # Values are passed as strings; booleans become the /Yes and /Off
        # checkbox states.
        field_values = {
            field_name: ("/Yes" if value else "/Off") if isinstance(value, bool) else str(value)
            for field_name, value in form_data.items()
        }

        if field_values:
            # Apply the values page by page so that widgets on every page
            # are updated.
            for page in writer.pages:
                writer.update_page_form_field_values(
                    page,
                    field_values,
                )

        output_file = Path(output_path) if output_path else pdf_path
        with open(output_file, "wb") as f:
            writer.write(f)

        return output_file

    def fill_form_from_json(
        self,
        pdf_path: str | Path,
        json_path: str | Path,
        output_path: str | Path | None = None,
        *,
        validate: bool = True,
    ) -> Path:
        """Fill a PDF form with data from a JSON file.

        The JSON file should contain simple key:value pairs where keys are
        field names and values are the values to fill.

        Args:
            pdf_path: Path to the PDF file containing the form.
            json_path: Path to the JSON file with form data.
            output_path: Optional output path. If not provided, the input PDF
                is modified in place.
            validate: If True, validates form data before filling.

        Returns:
            Path to the filled PDF.

        Raises:
            FileNotFoundError: If any file does not exist.
            FormValidationError: If the JSON top level is not an object, or if
                validation fails and validate=True.
            PDFFormNotFoundError: If the PDF does not contain a form.
        """
        pdf_path = Path(pdf_path)
        json_path = Path(json_path)

        self._validate_pdf_path(pdf_path)
        if not json_path.exists():
            raise FileNotFoundError(f"JSON file not found: {json_path}")
        if not json_path.is_file():
            raise FileNotFoundError(f"Path is not a file: {json_path}")

        with open(json_path, encoding="utf-8") as f:
            form_data = json.load(f)

        # Anything but a JSON object cannot be interpreted as name -> value
        # pairs; report that clearly instead of failing later on ``.items()``.
        if not isinstance(form_data, dict):
            raise FormValidationError(
                "Form data JSON must be an object mapping field names to values",
                [f"got top-level {type(form_data).__name__}"],
            )

        return self.fill_form(pdf_path, form_data, output_path, validate=validate)

    def _validate_pdf_path(self, pdf_path: Path) -> None:
        """Validate that the PDF path exists and is a file.

        Args:
            pdf_path: Path to validate.

        Raises:
            FileNotFoundError: If the path does not exist or is not a file.
        """
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        if not pdf_path.is_file():
            raise FileNotFoundError(f"Path is not a file: {pdf_path}")

944 

945 

def get_available_geometry_backends() -> list[str]:
    """Report which geometry backends can be used.

    Returns:
        A new list of backend names on every call.
        For pypdf version, always returns ["pypdf"].
    """
    backends = ["pypdf"]
    return backends

954 

955 

def has_geometry_support() -> bool:
    """Tell whether a geometry extraction backend is available.

    Returns:
        Always True in this pypdf-based implementation.
    """
    supported = True
    return supported

963 

964 

# Deprecated backwards-compatibility aliases, to be removed in a future version.
# Code written against the earlier pdfcpu-based versions imported these names;
# all three now refer to the same base exception class.
PDFCPUError = PDFCPUNotFoundError = PDFCPUExecutionError = PDFFormError