Coverage for src/epublib/__init__.py: 97%

228 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-07 12:07 -0300

1from collections.abc import Generator 

2from pathlib import Path 

3from typing import IO, Any, Callable, Literal, TypedDict, overload, override 

4from zipfile import is_zipfile 

5 

6import bs4 

7 

8from epublib.create import EPUBCreator 

9from epublib.exceptions import ClosedEPUBError, EPUBError, NotEPUBError 

10from epublib.identifier import EPUBId 

11from epublib.nav.reset import ( 

12 create_landmarks, 

13 create_page_list, 

14 reset_landmarks, 

15 reset_page_list, 

16 reset_toc, 

17) 

18from epublib.nav.resource import NavigationDocument 

19from epublib.ncx.reset import generate_ncx, reset_ncx 

20from epublib.ncx.resource import NCXFile 

21from epublib.package.guide import BookGuide 

22from epublib.package.manifest import ( 

23 BookManifest, 

24 ManifestItem, 

25 detect_manifest_properties, 

26) 

27from epublib.package.metadata import BookMetadata, ValuedMetadataItem 

28from epublib.package.resource import PackageDocument 

29from epublib.package.spine import BookSpine, SpineItemRef 

30from epublib.parse import parse 

31from epublib.resources import ( 

32 ContentDocument, 

33 Resource, 

34 XMLResource, 

35) 

36from epublib.resources.manager import ( 

37 AudioManager, 

38 ContentDocumentManager, 

39 FontsManager, 

40 ImagesManager, 

41 PublicationResourceManager, 

42 ResourceIdentifier, 

43 ResourceManager, 

44 ScriptsManager, 

45 StylesManager, 

46 VideoManager, 

47) 

48from epublib.source import ( 

49 DirectorySink, 

50 DirectorySource, 

51 SinkProtocol, 

52 SourceProtocol, 

53 ZipFile, 

54) 

55from epublib.util import get_epublib_version 

56 

57 

class ManagerDict(TypedDict, total=False):
    """
    Typed mapping from a manager name to the corresponding manager.

    Declared with ``total=False``, so every key is optional. ``EPUB``
    uses it as the type of ``EPUB._managers``, which starts out empty
    and is filled in by ``EPUB._get_manager``.
    """

    documents: ContentDocumentManager
    images: ImagesManager
    scripts: ScriptsManager
    styles: StylesManager
    fonts: FontsManager
    audios: AudioManager
    videos: VideoManager
    publication_resources: PublicationResourceManager

67 

68 

69class EPUB: 

70 """ 

71 The main class for reading, writing, and manipulating EPUB files. 

72 """ 

73 

74 def __init__( 

75 self, 

76 file: IO[bytes] | str | Path | None = None, 

77 generator_tag: bool = True, 

78 ) -> None: 

79 self.source: SourceProtocol 

80 

81 if file is None: 

82 self.source = ZipFile( 

83 EPUBCreator(add_generator_tag=generator_tag).to_file() 

84 ) 

85 elif is_zipfile(file): 

86 self.source = ZipFile(file) 

87 elif (isinstance(file, str) or isinstance(file, Path)) and Path(file).is_dir(): 

88 self.source = DirectorySource(file) 

89 else: 

90 raise NotEPUBError(f"file '{file}' is not ZIP nor folder") 

91 

92 self.container_file: XMLResource 

93 self.package_document: PackageDocument 

94 self._resources: list[Resource] 

95 self.container_file, self.package_document, self._resources = parse(self.source) 

96 self.resources: ResourceManager = ResourceManager( 

97 self._resources, 

98 container_file=self.container_file, 

99 package_document=self.package_document, 

100 nav_getter=lambda: self.nav, 

101 ncx_getter=lambda: self.ncx, 

102 ) 

103 

104 self.original_path: Path | None = ( 

105 Path(file) if isinstance(file, str) or isinstance(file, Path) else None 

106 ) 

107 

108 self._managers: ManagerDict = {} 

109 

110 if generator_tag: 

111 self.add_generator_tag() 

112 

113 def close(self): 

114 for resource in self.resources: 

115 resource.close() 

116 self.source.close() 

117 

118 @property 

119 def closed(self) -> bool: 

120 return self.source.closed 

121 

122 def _check_closed(self, msg: str = "EPUB is already closed"): 

123 if self.closed: 

124 raise ClosedEPUBError(msg) 

125 

126 def __enter__(self): 

127 return self 

128 

129 def __exit__(self, *args: Any): # type: ignore[Any] 

130 self.close() 

131 

132 def add_generator_tag(self): 

133 """Add a generator meta tag to the metadata.""" 

134 

135 generator = self.metadata.get("generator") 

136 if not generator: 

137 generator = self.metadata.add_opf("generator", "Edited with epublib") 

138 

139 version = get_epublib_version() 

140 version_item = self.metadata.get("epublib version") 

141 if not version_item and version: 

142 __ = self.metadata.add_opf("epublib version", version) 

143 

144 def remove_generator_tag(self): 

145 """Remove the epublib generator tag of the metadata, if any.""" 

146 

147 generator = self.metadata.get("generator") 

148 if ( 

149 generator 

150 and isinstance(generator, ValuedMetadataItem) 

151 and "epublib" in generator.value 

152 ): 

153 self.metadata.remove_item(generator) 

154 

155 version_item = self.metadata.get("epublib version") 

156 if version_item: 

157 self.metadata.remove_item(version_item) 

158 

159 def write_to_sink(self, out: SinkProtocol): 

160 """Write this epub to a sink""" 

161 

162 self._check_closed("trying to write closed EPUB") 

163 

164 for resource in self.resources: 

165 out.writestr(resource.zipinfo, resource.get_content(cache=False)) 

166 

167 def write(self, output_file: IO[bytes] | str | Path) -> None: 

168 """Write this epub to a zip file""" 

169 

170 with ZipFile(output_file, mode="w") as out_zip: 

171 self.write_to_sink(out_zip) 

172 

173 def write_to_folder(self, folder: str | Path): 

174 """Write this epub to a folder ('unzipped')""" 

175 

176 if not Path(folder).is_dir(): 

177 raise EPUBError(f"Path '{folder}' is not a directory") 

178 

179 out = DirectorySink(folder) 

180 self.write_to_sink(out) 

181 

182 @overload 

183 def _get_manager(self, name: Literal["documents"]) -> ContentDocumentManager: ... 

184 @overload 

185 def _get_manager(self, name: Literal["images"]) -> ImagesManager: ... 

186 @overload 

187 def _get_manager(self, name: Literal["scripts"]) -> ScriptsManager: ... 

188 @overload 

189 def _get_manager(self, name: Literal["styles"]) -> StylesManager: ... 

190 @overload 

191 def _get_manager(self, name: Literal["fonts"]) -> FontsManager: ... 

192 @overload 

193 def _get_manager(self, name: Literal["audios"]) -> AudioManager: ... 

194 @overload 

195 def _get_manager(self, name: Literal["videos"]) -> VideoManager: ... 

196 @overload 

197 def _get_manager( 

198 self, name: Literal["publication_resources"] 

199 ) -> PublicationResourceManager: ... 

200 def _get_manager( 

201 self, 

202 name: Literal[ 

203 "documents", 

204 "images", 

205 "scripts", 

206 "styles", 

207 "fonts", 

208 "audios", 

209 "videos", 

210 "publication_resources", 

211 ], 

212 ): 

213 class ManagerKwargs(TypedDict): 

214 resources: list[Resource] 

215 container_file: XMLResource 

216 package_document: PackageDocument 

217 nav_getter: Callable[[], NavigationDocument] 

218 ncx_getter: Callable[[], NCXFile | None] 

219 

220 kwargs: ManagerKwargs = { 

221 "resources": self._resources, 

222 "container_file": self.container_file, 

223 "package_document": self.package_document, 

224 "nav_getter": lambda: self.nav, 

225 "ncx_getter": lambda: self.ncx, 

226 } 

227 

228 if name not in self._managers: 

229 match name: 

230 case "documents": 

231 self._managers[name] = ContentDocumentManager(**kwargs) 

232 case "images": 

233 self._managers[name] = ImagesManager(**kwargs) 

234 case "scripts": 

235 self._managers[name] = ScriptsManager(**kwargs) 

236 case "styles": 

237 self._managers[name] = StylesManager(**kwargs) 

238 case "fonts": 

239 self._managers[name] = FontsManager(**kwargs) 

240 case "audios": 

241 self._managers[name] = AudioManager(**kwargs) 

242 case "videos": 

243 self._managers[name] = VideoManager(**kwargs) 

244 case "publication_resources": 

245 self._managers[name] = PublicationResourceManager(**kwargs) 

246 

247 return self._managers[name] # type: ignore[reportReturnType] 

248 

249 @property 

250 def documents(self) -> ContentDocumentManager: 

251 """ 

252 Manage all content documents (XHTML or SVG) in this EPUB 

253 """ 

254 return self._get_manager("documents") 

255 

256 @property 

257 def images(self) -> ImagesManager: 

258 """ 

259 Manage all image resources in this EPUB 

260 """ 

261 return self._get_manager("images") 

262 

263 @property 

264 def scripts(self) -> ScriptsManager: 

265 """ 

266 Manage all JavaScript resources in this EPUB 

267 """ 

268 return self._get_manager("scripts") 

269 

270 @property 

271 def styles(self) -> StylesManager: 

272 """ 

273 Manage all CSS resources in this EPUB 

274 """ 

275 return self._get_manager("styles") 

276 

277 @property 

278 def fonts(self) -> FontsManager: 

279 """ 

280 Manage all font resources in this EPUB 

281 """ 

282 

283 return self._get_manager("fonts") 

284 

285 @property 

286 def audios(self) -> AudioManager: 

287 """ 

288 Manage all font resources in this EPUB 

289 """ 

290 

291 return self._get_manager("audios") 

292 

293 @property 

294 def videos(self) -> VideoManager: 

295 """ 

296 Manage all font resources in this EPUB 

297 """ 

298 

299 return self._get_manager("videos") 

300 

301 @property 

302 def publication_resources(self) -> PublicationResourceManager: 

303 """ 

304 Manage all publication resources (XHTML or SVG) in this EPUB 

305 """ 

306 return self._get_manager("publication_resources") 

307 

308 def rename_id( 

309 self, 

310 old: Resource | ResourceIdentifier, 

311 new: EPUBId, 

312 ) -> None: 

313 """ 

314 Rename a manifest identifier. Look for references for updating 

315 it in the spine items, the cover-image metadata tag, and the toc 

316 attribute of the spine element. Using this function is not 

317 recommended, as there may be other references to the old id that 

318 will become outdated. 

319 """ 

320 

321 if not isinstance(old, ManifestItem): 

322 manifest_item = self.manifest.get(old) 

323 else: 

324 manifest_item = old 

325 

326 if not manifest_item: 

327 raise EPUBError(f"Can't rename '{old}: not in manifest") 

328 

329 old_id = manifest_item.id 

330 

331 existing = self.manifest.get(new) 

332 if existing: 

333 raise EPUBError(f"Can't rename to already existing id '{new}' ({existing})") 

334 

335 # cover-image in metadata 

336 cover = self.metadata.get("cover", ValuedMetadataItem) 

337 if cover and cover.value == old: 

338 cover.value = new 

339 

340 # spine tag 

341 if self.spine.tag.attrs["toc"] == old_id: 

342 self.spine.tag.attrs["toc"] = new 

343 

344 spine_item = self.spine.get(old_id) 

345 if spine_item: 

346 spine_item.idref = new 

347 

348 manifest_item.id = new 

349 

350 def get_spine_item( 

351 self, 

352 resource: Resource | ResourceIdentifier, 

353 ) -> SpineItemRef | None: 

354 """Get spine item associated with a resource or filename""" 

355 if isinstance(resource, Resource): 

356 resource = resource.filename 

357 

358 epub_id = self.resources.ri_to_id(resource) 

359 if epub_id: 

360 return self.spine.get(epub_id) 

361 return None 

362 

363 def get_spine_position( 

364 self, 

365 resource: Resource | ResourceIdentifier, 

366 ) -> int | None: 

367 """Get the 0-indexed position of a resource in the spine""" 

368 

369 if isinstance(resource, Resource): 

370 resource = resource.filename 

371 

372 epub_id = self.resources.ri_to_id(resource) 

373 if epub_id: 

374 return self.spine.get_position(epub_id) 

375 return None 

376 

377 def update_manifest_properties(self) -> None: 

378 """ 

379 Update manifest properties by detecting them from the resources 

380 See https://www.w3.org/TR/epub-33/#sec-item-resource-properties 

381 """ 

382 

383 for item in self.manifest.items: 

384 resource = self.resources.get(item.filename, XMLResource) 

385 if resource: 

386 for prop in ["mathml", "remote-resources", "scripted", "switch"]: 

387 item.remove_property(prop) 

388 

389 for property in detect_manifest_properties(resource.soup): 

390 item.add_property(property) 

391 

392 def reset_toc( 

393 self, 

394 targets_selector: str | None = "h1, h2, h3, h4, h5, h6", 

395 include_filenames: bool = False, 

396 spine_only: bool = True, 

397 reset_ncx: bool | None = None, 

398 resource_class: type[Resource] = ContentDocument, 

399 title: str | None = None, 

400 ): 

401 """ 

402 Reset the table of contents in the navigation document by 

403 detecting targets in content documents. May replace any 

404 existing TOC. 

405 """ 

406 return reset_toc( 

407 self, 

408 targets_selector, 

409 include_filenames, 

410 spine_only, 

411 reset_ncx, 

412 resource_class, 

413 title, 

414 ) 

415 

416 def reset_page_list( 

417 self, 

418 id_format: str = "page_{page}", 

419 label_format: str = "{page}", 

420 pagebreak_selector: str = '[role="doc-pagebreak"], [epub|type="pagebreak"]', 

421 reset_ncx: bool | None = None, 

422 ): 

423 """ 

424 Reset the page list in the navigation document by detecting 

425 pagebreaks in content documents. Will replace any existing page 

426 list. 

427 """ 

428 return reset_page_list( 

429 self, 

430 id_format, 

431 label_format, 

432 pagebreak_selector, 

433 reset_ncx, 

434 ) 

435 

436 def create_page_list( 

437 self, 

438 id_format: str = "page_{page}", 

439 label_format: str = "{page}", 

440 pagebreak_selector: str = '[role="doc-pagebreak"], [epub|type="pagebreak"]', 

441 reset_ncx: bool | None = None, 

442 ): 

443 """ 

444 Create new page list in the navigation document by detecting 

445 pagebreaks in content documents. Will raise an error if a page 

446 list already exists. 

447 """ 

448 return create_page_list( 

449 self, 

450 id_format, 

451 label_format, 

452 pagebreak_selector, 

453 reset_ncx, 

454 ) 

455 

456 def reset_landmarks( 

457 self, 

458 include_toc: bool = True, 

459 targets_selector: str | None = None, 

460 ): 

461 """ 

462 Reset the landmarks in the navigation document by detecting 

463 targets in content documents, and optionally including the TOC. 

464 Will replace existing landmarks. 

465 """ 

466 

467 return reset_landmarks(self, include_toc, targets_selector) 

468 

469 def create_landmarks( 

470 self, 

471 include_toc: bool = True, 

472 targets_selector: str | None = None, 

473 ): 

474 """ 

475 Create landmarks in the navigation document by detecting 

476 targets in content documents, and optionally including the TOC. 

477 Will raise error if landmarks already exist. 

478 """ 

479 

480 return create_landmarks(self, include_toc, targets_selector) 

481 

482 def generate_ncx(self, filename: str | Path | None = None) -> NCXFile: 

483 return generate_ncx(self, filename) 

484 

485 def reset_ncx(self) -> NCXFile: 

486 return reset_ncx(self, self.ncx) 

487 

488 def select(self, selector: str) -> Generator[tuple[Resource, bs4.Tag]]: 

489 """ 

490 Select elements matching a CSS selector in all content documents. 

491 

492 Yields tuples of (resource, tag), where resource is the content 

493 document containing the tag. 

494 """ 

495 

496 for document in self.documents.filter(XMLResource): 

497 for tag in document.soup.select(selector): 

498 yield (document, tag) 

499 

500 @property 

501 def base_dir(self): 

502 """ 

503 The base directory for the resources in this EPUB. This is an 

504 holistic property, and the spec does not define it. There may be 

505 more than one base directory in an EPUB. This is the one 

506 containing the package document. 

507 """ 

508 

509 return Path(self.package_document.filename).parent 

510 

511 @property 

512 def manifest(self) -> BookManifest: 

513 return self.package_document.manifest 

514 

515 @property 

516 def metadata(self) -> BookMetadata: 

517 return self.package_document.metadata 

518 

519 @property 

520 def spine(self) -> BookSpine: 

521 return self.package_document.spine 

522 

523 @property 

524 def guide(self) -> BookGuide | None: 

525 return self.package_document.guide 

526 

527 @property 

528 def nav(self) -> NavigationDocument: 

529 nav = self.resources.get(self.manifest.nav.filename, NavigationDocument) 

530 if not nav: 

531 raise EPUBError("no navigation document found in EPUB") 

532 return nav 

533 

534 @property 

535 def ncx(self) -> NCXFile | None: 

536 return next(self.resources.filter(NCXFile), None) 

537 

538 @override 

539 def __repr__(self) -> str: 

540 return f"{self.__class__.__name__}(title='{self.metadata.title or id(self)}')"