Coverage for src/epublib/__init__.py: 97%
228 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-07 12:07 -0300
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-07 12:07 -0300
1from collections.abc import Generator
2from pathlib import Path
3from typing import IO, Any, Callable, Literal, TypedDict, overload, override
4from zipfile import is_zipfile
6import bs4
8from epublib.create import EPUBCreator
9from epublib.exceptions import ClosedEPUBError, EPUBError, NotEPUBError
10from epublib.identifier import EPUBId
11from epublib.nav.reset import (
12 create_landmarks,
13 create_page_list,
14 reset_landmarks,
15 reset_page_list,
16 reset_toc,
17)
18from epublib.nav.resource import NavigationDocument
19from epublib.ncx.reset import generate_ncx, reset_ncx
20from epublib.ncx.resource import NCXFile
21from epublib.package.guide import BookGuide
22from epublib.package.manifest import (
23 BookManifest,
24 ManifestItem,
25 detect_manifest_properties,
26)
27from epublib.package.metadata import BookMetadata, ValuedMetadataItem
28from epublib.package.resource import PackageDocument
29from epublib.package.spine import BookSpine, SpineItemRef
30from epublib.parse import parse
31from epublib.resources import (
32 ContentDocument,
33 Resource,
34 XMLResource,
35)
36from epublib.resources.manager import (
37 AudioManager,
38 ContentDocumentManager,
39 FontsManager,
40 ImagesManager,
41 PublicationResourceManager,
42 ResourceIdentifier,
43 ResourceManager,
44 ScriptsManager,
45 StylesManager,
46 VideoManager,
47)
48from epublib.source import (
49 DirectorySink,
50 DirectorySource,
51 SinkProtocol,
52 SourceProtocol,
53 ZipFile,
54)
55from epublib.util import get_epublib_version
class ManagerDict(TypedDict, total=False):
    """
    Cache of the specialised resource managers created so far.

    Declared with ``total=False`` so that every key is optional: a key
    is only present once the corresponding manager has been stored.
    """

    # One entry per kind of manager; the key is the lookup name.
    documents: ContentDocumentManager
    images: ImagesManager
    scripts: ScriptsManager
    styles: StylesManager
    fonts: FontsManager
    audios: AudioManager
    videos: VideoManager
    publication_resources: PublicationResourceManager
69class EPUB:
70 """
71 The main class for reading, writing, and manipulating EPUB files.
72 """
74 def __init__(
75 self,
76 file: IO[bytes] | str | Path | None = None,
77 generator_tag: bool = True,
78 ) -> None:
79 self.source: SourceProtocol
81 if file is None:
82 self.source = ZipFile(
83 EPUBCreator(add_generator_tag=generator_tag).to_file()
84 )
85 elif is_zipfile(file):
86 self.source = ZipFile(file)
87 elif (isinstance(file, str) or isinstance(file, Path)) and Path(file).is_dir():
88 self.source = DirectorySource(file)
89 else:
90 raise NotEPUBError(f"file '{file}' is not ZIP nor folder")
92 self.container_file: XMLResource
93 self.package_document: PackageDocument
94 self._resources: list[Resource]
95 self.container_file, self.package_document, self._resources = parse(self.source)
96 self.resources: ResourceManager = ResourceManager(
97 self._resources,
98 container_file=self.container_file,
99 package_document=self.package_document,
100 nav_getter=lambda: self.nav,
101 ncx_getter=lambda: self.ncx,
102 )
104 self.original_path: Path | None = (
105 Path(file) if isinstance(file, str) or isinstance(file, Path) else None
106 )
108 self._managers: ManagerDict = {}
110 if generator_tag:
111 self.add_generator_tag()
113 def close(self):
114 for resource in self.resources:
115 resource.close()
116 self.source.close()
118 @property
119 def closed(self) -> bool:
120 return self.source.closed
122 def _check_closed(self, msg: str = "EPUB is already closed"):
123 if self.closed:
124 raise ClosedEPUBError(msg)
126 def __enter__(self):
127 return self
129 def __exit__(self, *args: Any): # type: ignore[Any]
130 self.close()
132 def add_generator_tag(self):
133 """Add a generator meta tag to the metadata."""
135 generator = self.metadata.get("generator")
136 if not generator:
137 generator = self.metadata.add_opf("generator", "Edited with epublib")
139 version = get_epublib_version()
140 version_item = self.metadata.get("epublib version")
141 if not version_item and version:
142 __ = self.metadata.add_opf("epublib version", version)
144 def remove_generator_tag(self):
145 """Remove the epublib generator tag of the metadata, if any."""
147 generator = self.metadata.get("generator")
148 if (
149 generator
150 and isinstance(generator, ValuedMetadataItem)
151 and "epublib" in generator.value
152 ):
153 self.metadata.remove_item(generator)
155 version_item = self.metadata.get("epublib version")
156 if version_item:
157 self.metadata.remove_item(version_item)
159 def write_to_sink(self, out: SinkProtocol):
160 """Write this epub to a sink"""
162 self._check_closed("trying to write closed EPUB")
164 for resource in self.resources:
165 out.writestr(resource.zipinfo, resource.get_content(cache=False))
167 def write(self, output_file: IO[bytes] | str | Path) -> None:
168 """Write this epub to a zip file"""
170 with ZipFile(output_file, mode="w") as out_zip:
171 self.write_to_sink(out_zip)
173 def write_to_folder(self, folder: str | Path):
174 """Write this epub to a folder ('unzipped')"""
176 if not Path(folder).is_dir():
177 raise EPUBError(f"Path '{folder}' is not a directory")
179 out = DirectorySink(folder)
180 self.write_to_sink(out)
182 @overload
183 def _get_manager(self, name: Literal["documents"]) -> ContentDocumentManager: ...
184 @overload
185 def _get_manager(self, name: Literal["images"]) -> ImagesManager: ...
186 @overload
187 def _get_manager(self, name: Literal["scripts"]) -> ScriptsManager: ...
188 @overload
189 def _get_manager(self, name: Literal["styles"]) -> StylesManager: ...
190 @overload
191 def _get_manager(self, name: Literal["fonts"]) -> FontsManager: ...
192 @overload
193 def _get_manager(self, name: Literal["audios"]) -> AudioManager: ...
194 @overload
195 def _get_manager(self, name: Literal["videos"]) -> VideoManager: ...
196 @overload
197 def _get_manager(
198 self, name: Literal["publication_resources"]
199 ) -> PublicationResourceManager: ...
200 def _get_manager(
201 self,
202 name: Literal[
203 "documents",
204 "images",
205 "scripts",
206 "styles",
207 "fonts",
208 "audios",
209 "videos",
210 "publication_resources",
211 ],
212 ):
213 class ManagerKwargs(TypedDict):
214 resources: list[Resource]
215 container_file: XMLResource
216 package_document: PackageDocument
217 nav_getter: Callable[[], NavigationDocument]
218 ncx_getter: Callable[[], NCXFile | None]
220 kwargs: ManagerKwargs = {
221 "resources": self._resources,
222 "container_file": self.container_file,
223 "package_document": self.package_document,
224 "nav_getter": lambda: self.nav,
225 "ncx_getter": lambda: self.ncx,
226 }
228 if name not in self._managers:
229 match name:
230 case "documents":
231 self._managers[name] = ContentDocumentManager(**kwargs)
232 case "images":
233 self._managers[name] = ImagesManager(**kwargs)
234 case "scripts":
235 self._managers[name] = ScriptsManager(**kwargs)
236 case "styles":
237 self._managers[name] = StylesManager(**kwargs)
238 case "fonts":
239 self._managers[name] = FontsManager(**kwargs)
240 case "audios":
241 self._managers[name] = AudioManager(**kwargs)
242 case "videos":
243 self._managers[name] = VideoManager(**kwargs)
244 case "publication_resources":
245 self._managers[name] = PublicationResourceManager(**kwargs)
247 return self._managers[name] # type: ignore[reportReturnType]
249 @property
250 def documents(self) -> ContentDocumentManager:
251 """
252 Manage all content documents (XHTML or SVG) in this EPUB
253 """
254 return self._get_manager("documents")
256 @property
257 def images(self) -> ImagesManager:
258 """
259 Manage all image resources in this EPUB
260 """
261 return self._get_manager("images")
263 @property
264 def scripts(self) -> ScriptsManager:
265 """
266 Manage all JavaScript resources in this EPUB
267 """
268 return self._get_manager("scripts")
270 @property
271 def styles(self) -> StylesManager:
272 """
273 Manage all CSS resources in this EPUB
274 """
275 return self._get_manager("styles")
277 @property
278 def fonts(self) -> FontsManager:
279 """
280 Manage all font resources in this EPUB
281 """
283 return self._get_manager("fonts")
285 @property
286 def audios(self) -> AudioManager:
287 """
288 Manage all font resources in this EPUB
289 """
291 return self._get_manager("audios")
293 @property
294 def videos(self) -> VideoManager:
295 """
296 Manage all font resources in this EPUB
297 """
299 return self._get_manager("videos")
301 @property
302 def publication_resources(self) -> PublicationResourceManager:
303 """
304 Manage all publication resources (XHTML or SVG) in this EPUB
305 """
306 return self._get_manager("publication_resources")
308 def rename_id(
309 self,
310 old: Resource | ResourceIdentifier,
311 new: EPUBId,
312 ) -> None:
313 """
314 Rename a manifest identifier. Look for references for updating
315 it in the spine items, the cover-image metadata tag, and the toc
316 attribute of the spine element. Using this function is not
317 recommended, as there may be other references to the old id that
318 will become outdated.
319 """
321 if not isinstance(old, ManifestItem):
322 manifest_item = self.manifest.get(old)
323 else:
324 manifest_item = old
326 if not manifest_item:
327 raise EPUBError(f"Can't rename '{old}: not in manifest")
329 old_id = manifest_item.id
331 existing = self.manifest.get(new)
332 if existing:
333 raise EPUBError(f"Can't rename to already existing id '{new}' ({existing})")
335 # cover-image in metadata
336 cover = self.metadata.get("cover", ValuedMetadataItem)
337 if cover and cover.value == old:
338 cover.value = new
340 # spine tag
341 if self.spine.tag.attrs["toc"] == old_id:
342 self.spine.tag.attrs["toc"] = new
344 spine_item = self.spine.get(old_id)
345 if spine_item:
346 spine_item.idref = new
348 manifest_item.id = new
350 def get_spine_item(
351 self,
352 resource: Resource | ResourceIdentifier,
353 ) -> SpineItemRef | None:
354 """Get spine item associated with a resource or filename"""
355 if isinstance(resource, Resource):
356 resource = resource.filename
358 epub_id = self.resources.ri_to_id(resource)
359 if epub_id:
360 return self.spine.get(epub_id)
361 return None
363 def get_spine_position(
364 self,
365 resource: Resource | ResourceIdentifier,
366 ) -> int | None:
367 """Get the 0-indexed position of a resource in the spine"""
369 if isinstance(resource, Resource):
370 resource = resource.filename
372 epub_id = self.resources.ri_to_id(resource)
373 if epub_id:
374 return self.spine.get_position(epub_id)
375 return None
377 def update_manifest_properties(self) -> None:
378 """
379 Update manifest properties by detecting them from the resources
380 See https://www.w3.org/TR/epub-33/#sec-item-resource-properties
381 """
383 for item in self.manifest.items:
384 resource = self.resources.get(item.filename, XMLResource)
385 if resource:
386 for prop in ["mathml", "remote-resources", "scripted", "switch"]:
387 item.remove_property(prop)
389 for property in detect_manifest_properties(resource.soup):
390 item.add_property(property)
392 def reset_toc(
393 self,
394 targets_selector: str | None = "h1, h2, h3, h4, h5, h6",
395 include_filenames: bool = False,
396 spine_only: bool = True,
397 reset_ncx: bool | None = None,
398 resource_class: type[Resource] = ContentDocument,
399 title: str | None = None,
400 ):
401 """
402 Reset the table of contents in the navigation document by
403 detecting targets in content documents. May replace any
404 existing TOC.
405 """
406 return reset_toc(
407 self,
408 targets_selector,
409 include_filenames,
410 spine_only,
411 reset_ncx,
412 resource_class,
413 title,
414 )
416 def reset_page_list(
417 self,
418 id_format: str = "page_{page}",
419 label_format: str = "{page}",
420 pagebreak_selector: str = '[role="doc-pagebreak"], [epub|type="pagebreak"]',
421 reset_ncx: bool | None = None,
422 ):
423 """
424 Reset the page list in the navigation document by detecting
425 pagebreaks in content documents. Will replace any existing page
426 list.
427 """
428 return reset_page_list(
429 self,
430 id_format,
431 label_format,
432 pagebreak_selector,
433 reset_ncx,
434 )
436 def create_page_list(
437 self,
438 id_format: str = "page_{page}",
439 label_format: str = "{page}",
440 pagebreak_selector: str = '[role="doc-pagebreak"], [epub|type="pagebreak"]',
441 reset_ncx: bool | None = None,
442 ):
443 """
444 Create new page list in the navigation document by detecting
445 pagebreaks in content documents. Will raise an error if a page
446 list already exists.
447 """
448 return create_page_list(
449 self,
450 id_format,
451 label_format,
452 pagebreak_selector,
453 reset_ncx,
454 )
456 def reset_landmarks(
457 self,
458 include_toc: bool = True,
459 targets_selector: str | None = None,
460 ):
461 """
462 Reset the landmarks in the navigation document by detecting
463 targets in content documents, and optionally including the TOC.
464 Will replace existing landmarks.
465 """
467 return reset_landmarks(self, include_toc, targets_selector)
469 def create_landmarks(
470 self,
471 include_toc: bool = True,
472 targets_selector: str | None = None,
473 ):
474 """
475 Create landmarks in the navigation document by detecting
476 targets in content documents, and optionally including the TOC.
477 Will raise error if landmarks already exist.
478 """
480 return create_landmarks(self, include_toc, targets_selector)
482 def generate_ncx(self, filename: str | Path | None = None) -> NCXFile:
483 return generate_ncx(self, filename)
485 def reset_ncx(self) -> NCXFile:
486 return reset_ncx(self, self.ncx)
488 def select(self, selector: str) -> Generator[tuple[Resource, bs4.Tag]]:
489 """
490 Select elements matching a CSS selector in all content documents.
492 Yields tuples of (resource, tag), where resource is the content
493 document containing the tag.
494 """
496 for document in self.documents.filter(XMLResource):
497 for tag in document.soup.select(selector):
498 yield (document, tag)
500 @property
501 def base_dir(self):
502 """
503 The base directory for the resources in this EPUB. This is an
504 holistic property, and the spec does not define it. There may be
505 more than one base directory in an EPUB. This is the one
506 containing the package document.
507 """
509 return Path(self.package_document.filename).parent
511 @property
512 def manifest(self) -> BookManifest:
513 return self.package_document.manifest
515 @property
516 def metadata(self) -> BookMetadata:
517 return self.package_document.metadata
519 @property
520 def spine(self) -> BookSpine:
521 return self.package_document.spine
523 @property
524 def guide(self) -> BookGuide | None:
525 return self.package_document.guide
527 @property
528 def nav(self) -> NavigationDocument:
529 nav = self.resources.get(self.manifest.nav.filename, NavigationDocument)
530 if not nav:
531 raise EPUBError("no navigation document found in EPUB")
532 return nav
534 @property
535 def ncx(self) -> NCXFile | None:
536 return next(self.resources.filter(NCXFile), None)
538 @override
539 def __repr__(self) -> str:
540 return f"{self.__class__.__name__}(title='{self.metadata.title or id(self)}')"