Coverage for src/epublib/resources.py: 91%
286 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-09-14 18:23 -0300
« prev ^ index » next coverage.py v7.9.2, created at 2025-09-14 18:23 -0300
1import io
2from mimetypes import guess_file_type
3from pathlib import Path
4from typing import IO, cast, override
5from zipfile import ZipInfo
7import bs4
9from epublib.identifier import EPUBId
10from epublib.source import zip_info_now
12from .exceptions import EPUBError
13from .mediatype import Category, MediaType
14from .nav import LandmarksRoot, PageListRoot, TocRoot
15from .nav.util import LandmarkEntryData, PageBreakData, TOCEntryData
16from .package.manifest import BookManifest, ManifestItem, detect_manifest_properties
17from .package.metadata import BookMetadata
18from .package.spine import BookSpine
19from .soup import PackageDocumentSoup
20from .util import get_absolute_href, get_relative_href
23def info_to_zipinfo(info: ZipInfo | str | Path) -> ZipInfo:
24 if isinstance(info, ZipInfo):
25 return info
27 return ZipInfo(filename=str(info), date_time=zip_info_now())
30class Resource:
31 """Base class for all resources (i.e. files) in an EPUB file."""
33 def __init__(self, file: IO[bytes] | bytes, info: ZipInfo | str | Path) -> None:
34 self.zipinfo: ZipInfo = info_to_zipinfo(info)
35 self._file: IO[bytes] | None = (
36 io.BytesIO(file) if isinstance(file, bytes) else file
37 )
38 self._content: bytes | None = None
39 self._closed: bool = False
41 @classmethod
42 def from_path(cls, filename: str | Path, location: str | Path):
43 file = open(filename, "rb")
44 zipinfo = ZipInfo.from_file(filename, location, strict_timestamps=False)
45 return cls(file, zipinfo)
47 @override
48 def __repr__(self) -> str:
49 return f"{self.__class__.__name__}({self.filename})"
51 def on_content_change(self):
52 pass
54 @property
55 def filename(self):
56 return self.zipinfo.filename
58 @filename.setter
59 def filename(self, value: str):
60 self._set_filename(value)
62 def _set_filename(self, value: str):
63 self.zipinfo.filename = value
65 @property
66 def content(self) -> bytes:
67 self.check_closed()
68 if self._content is None:
69 if self._file is None:
70 return b""
71 self._content = self._file.read()
72 __ = self._file.seek(0)
73 return self._content
75 @content.setter
76 def content(self, value: bytes):
77 self.check_closed()
78 self._set_content(value)
80 def _set_content(self, value: bytes, content_change: bool = True):
81 self._content = value
82 if content_change:
83 self.on_content_change()
85 def free(self):
86 del self._content
87 self._content = None
88 self.on_content_change()
90 def get_title(self):
91 return self.filename
93 def check_closed(self):
94 if self._closed:
95 raise EPUBError(f"Using resource {self.filename} after closing")
97 def close(self):
98 self.free()
99 if self._file is not None:
100 self._file.close()
101 self._file = None
103 def href_to_filename[T: (str, Path)](self, href: T) -> T:
104 return get_absolute_href(self.filename, href)
107class XMLResource[S: bs4.BeautifulSoup = bs4.BeautifulSoup](Resource):
108 """A resource that is an XML file."""
110 soup_class: type[S] = bs4.BeautifulSoup # type: ignore[reportAssignmentType]
112 def __init__(self, file: IO[bytes] | bytes, info: ZipInfo | str | Path) -> None:
113 super().__init__(file, info)
114 self._soup: None | S = None
116 @property
117 def soup(self) -> S:
118 if self._soup is None:
119 self._soup = self.soup_class(self.content, "xml")
120 return self._soup
122 @soup.setter
123 def soup(self, value: S):
124 self._set_soup(value)
126 def _set_soup(self, value: S):
127 self._soup = value
129 @property
130 @override
131 def content(self):
132 if self._soup is not None:
133 self._set_content(self._soup.encode(), content_change=False)
134 return super().content
136 @content.setter
137 def content(self, value: bytes):
138 super()._set_content(value)
140 @override
141 def on_content_change(self):
142 super().on_content_change()
143 del self._soup
144 self._soup = None
146 @override
147 def get_title(self):
148 if self.soup.title and self.soup.title.string:
149 return self.soup.title.string
150 return super().get_title()
class PackageDocument(XMLResource[PackageDocumentSoup]):
    """The package document of the EPUB file, sometimes known as the 'content.opf' file."""

    soup_class: type[PackageDocumentSoup] = PackageDocumentSoup

    def __init__(self, file: IO[bytes] | bytes, info: ZipInfo | str | Path) -> None:
        super().__init__(file, info)
        # Wrappers around parts of the soup, built on first access.
        self._manifest: BookManifest | None = None
        self._metadata: BookMetadata | None = None
        self._spine: BookSpine | None = None

    @property
    def manifest(self):
        """The ``BookManifest`` built from the soup's manifest (cached)."""
        cached = self._manifest
        if cached is None:
            cached = self._manifest = BookManifest(self.soup.manifest, self.filename)
        return cached

    @property
    def metadata(self):
        """The ``BookMetadata`` built from the soup's metadata (cached)."""
        cached = self._metadata
        if cached is None:
            cached = self._metadata = BookMetadata(self.soup.metadata)
        return cached

    @property
    def spine(self):
        """The ``BookSpine`` built from the soup's spine (cached)."""
        cached = self._spine
        if cached is None:
            cached = self._spine = BookSpine(self.soup.spine)
        return cached

    def remove(self, filename: str):
        """Remove the manifest item for *filename*, and its spine item if the
        spine has one for the same id."""
        manifest_item = self.manifest[filename]
        spine_entry = self.spine.get(manifest_item.id)
        if spine_entry:
            self.spine.remove_item(spine_entry)
        self.manifest.remove_item(manifest_item)

    def on_soup_change(self):
        """Forget the cached manifest, metadata and spine wrappers."""
        del self._manifest, self._metadata, self._spine
        self._manifest = self._metadata = self._spine = None

    @override
    def on_content_change(self):
        """Invalidate the soup, then the wrappers derived from it."""
        super().on_content_change()
        self.on_soup_change()
class PublicationResource(Resource):
    """
    A resource that contributes to the logic and rendering of the publication.

    This includes resources like the package document, content documents (XHTML),
    CSS stylesheets, audio, video, images, fonts, and scripts.
    """

    def __init__(
        self,
        file: IO[bytes] | bytes,
        info: ZipInfo | str | Path,
        media_type: MediaType | str | None = None,
    ) -> None:
        """Create the resource; when *media_type* is omitted it is guessed
        from the filename, and ``EPUBError`` is raised if that fails."""
        super().__init__(file, info)

        resolved = media_type
        if resolved is None:
            resolved, _encoding = guess_file_type(self.zipinfo.filename)
            if resolved is None:
                raise EPUBError(
                    f"Cannot determine media type of {self.zipinfo.filename}"
                )

        self.media_type: MediaType | str = MediaType.coalesce(resolved)

    @property
    def is_foreign(self):
        """Whether the media type is held as a plain string."""
        return isinstance(self.media_type, str)

    @property
    def category(self):
        """``Category.FOREIGN`` for string media types, otherwise the
        media type's own category."""
        kind = self.media_type
        return Category.FOREIGN if isinstance(kind, str) else kind.category

    @classmethod
    def from_resource(cls, other: Resource, media_type: str | MediaType | None = None):
        """Build an instance of this class over *other*'s file object and
        ``ZipInfo``; raises ``EPUBError`` if *other* has no open file."""
        source = other._file
        if source is None or other._closed:
            raise EPUBError(f"Using resource {other} after closing")

        return cls(source, other.zipinfo, media_type)
246class ContentDocument[S: bs4.BeautifulSoup = bs4.BeautifulSoup]( # type: ignore[reportUnsafeMultipleInheritance]
247 PublicationResource,
248 XMLResource[S],
249):
250 """
251 A publication resource referenced from the spine or a manifest fallback
252 chain that conforms to either the XHTML or SVG content document definitions.
253 """
255 @override
256 def get_title(self):
257 if self.soup.h1 and self.soup.h1.string:
258 return self.soup.h1.string
259 return super().get_title()
class NavigationDocument(ContentDocument):
    """
    A specialization of the XHTML content document that contains human- and
    machine-readable global navigation information.

    The ``toc``, ``page_list`` and ``landmarks`` roots are looked up in the
    soup on first access and cached; each is ``None`` while the document has
    no matching ``nav`` element.
    """

    def __init__(
        self,
        file: IO[bytes] | bytes,
        info: ZipInfo | str | Path,
        media_type: MediaType | str,
    ) -> None:
        super().__init__(file, info, media_type)
        self._toc: TocRoot | None = None
        self._page_list: PageListRoot | None = None
        self._landmarks: LandmarksRoot | None = None

    def add_to_toc(
        self,
        filename: str,
        title: str,
        position: int | None = None,
        fragment: str | None = None,
    ):
        """Add an entry pointing at *filename* to the table of contents.

        The href is made relative to this document, with ``#fragment``
        appended when *fragment* is given. A table of contents root is
        created if the document does not have one yet. Returns whatever the
        root's ``add_item`` returns.
        """
        href = get_relative_href(self.filename, filename) + (
            f"#{fragment}" if fragment is not None else ""
        )

        toc = self.toc
        if toc is None:
            toc = self._toc = TocRoot(None, self.soup, self.filename)

        return toc.add_item(href=href, title=title, position=position)

    @property
    def toc(self):
        """The table of contents root, or ``None`` if there is none."""
        if self._toc is None:
            tag = self.soup.select_one('nav[epub|type="toc"]')
            if tag:
                self._toc = TocRoot(tag, self.soup, self.filename)
        return self._toc

    @property
    def page_list(self):
        """The page list root, or ``None`` if there is none."""
        if self._page_list is None:
            tag = self.soup.select_one('nav[epub|type="page-list"]')
            if tag:
                self._page_list = PageListRoot(tag, self.soup, self.filename)
        return self._page_list

    @property
    def landmarks(self):
        """The landmarks root, or ``None`` if there is none."""
        if self._landmarks is None:
            tag = self.soup.select_one('nav[epub|type="landmarks"]')
            if tag:
                self._landmarks = LandmarksRoot(tag, self.soup, self.filename)
        return self._landmarks

    # The reset_* methods bind the root to a local and test it against None
    # explicitly, so they do not depend on the root object's truth value.

    def reset_page_list(self, pagebreaks: list[PageBreakData]):
        """Reset the page list with *pagebreaks*, creating the root first if
        the document has none."""
        page_list = self.page_list
        if page_list is None:
            page_list = self._page_list = PageListRoot(
                None, self.soup, self.filename
            )

        page_list.reset(pagebreaks)

    def reset_toc(self, entries: list[TOCEntryData]):
        """Reset the table of contents with *entries*, creating the root
        first if the document has none."""
        toc = self.toc
        if toc is None:
            toc = self._toc = TocRoot(None, self.soup, self.filename)

        toc.reset(entries)

    def reset_landmarks(self, entries: list[LandmarkEntryData]):
        """Reset the landmarks with *entries*, creating the root first if the
        document has none."""
        landmarks = self.landmarks
        if landmarks is None:
            landmarks = self._landmarks = LandmarksRoot(
                None, self.soup, self.filename
            )

        landmarks.reset(entries)

    def remove(self, filename: str):
        """Ask each navigation root that is present to remove *filename*."""
        if self.toc:
            self.toc.remove(filename)
        if self.landmarks:
            self.landmarks.remove(filename)
        if self.page_list:
            self.page_list.remove(filename)

    def on_soup_change(self):
        """Forget the cached navigation roots so they are looked up again."""
        self._toc = None
        self._page_list = None
        self._landmarks = None

    @override
    def on_content_change(self):
        """Invalidate the soup, then the roots derived from it."""
        super().on_content_change()
        self.on_soup_change()
def resource_to_manifest_item(
    resource: Resource,
    package: PackageDocument,
    identifier: EPUBId | str | None = None,
    media_type: str | MediaType | None = None,
    fallback: str | None = None,
    media_overlay: str | None = None,
    is_nav: bool = False,
    is_cover: bool = False,
    properties: list[str] | None = None,
    detect_properties: bool = True,
):
    """Build the ``ManifestItem`` describing *resource* inside *package*.

    Args:
        resource: The resource to describe.
        package: The package document whose manifest the item is meant for;
            the item's href is made relative to its filename.
        identifier: Id for the item. When omitted, a new one is requested
            from the manifest based on the resource's filename.
        media_type: Media type for the item. When omitted, it is taken from
            the resource if it is a ``PublicationResource``, otherwise
            guessed from the filename.
        fallback: Passed through to the item.
        media_overlay: Passed through to the item.
        is_nav: Add the ``nav`` property.
        is_cover: Add the ``cover-image`` property.
        properties: Initial properties. The list passed in is never modified.
        detect_properties: For content documents, also add the properties
            reported by ``detect_manifest_properties``.

    Returns:
        The new ``ManifestItem``.

    Raises:
        EPUBError: If no media type can be determined.
        AssertionError: If *identifier* is already used in the manifest.
    """
    href = get_relative_href(
        relative_to=package.filename,
        absolute_href=resource.filename,
    )
    name = resource.filename

    if identifier is None:
        identifier = package.manifest.get_new_id(resource.filename)
    else:
        # NOTE(review): this check disappears under ``python -O``; it is kept
        # as an assert so the exception type seen by callers does not change.
        assert package.manifest.get(identifier) is None, (
            f"Identifier '{identifier}' is already used in the manifest"
        )

    if media_type is None:
        media_type = (
            resource.media_type
            if isinstance(resource, PublicationResource)
            else guess_file_type(resource.filename)[0]
        )

    if not media_type:
        raise EPUBError(f"Can't determine media type of file {resource.filename}")

    if detect_properties or is_nav or is_cover:
        # Work on a copy so the caller's list is left untouched.
        collected: list[str] = list(properties) if properties is not None else []

        if detect_properties and isinstance(resource, ContentDocument):
            collected.extend(
                detect_manifest_properties(
                    cast(ContentDocument[bs4.BeautifulSoup], resource).soup
                )
            )

        if is_nav:
            collected.append("nav")

        if is_cover:
            collected.append("cover-image")

        # De-duplicate while keeping first-seen order, so the result is the
        # same on every run (set iteration order is not).
        properties = list(dict.fromkeys(collected))

    return ManifestItem(
        name=name,
        id=EPUBId(identifier),
        media_type=str(media_type),
        _href=href,
        media_overlay=media_overlay,
        fallback=fallback,
        properties=properties,
        manifest_filename=package.filename,
    )
def create_resource(
    file: IO[bytes] | bytes,
    info: ZipInfo | str | Path,
    media_type: MediaType | str | None = None,
    is_nav: bool = False,
):
    """Create the most specific resource object for a file.

    Args:
        file: Binary file object or raw bytes with the resource's content.
        info: ``ZipInfo`` or filename of the resource inside the archive.
        media_type: Media type of the resource. When omitted it is derived
            from the filename; a string is passed through
            ``MediaType.coalesce`` before the resource class is chosen.
        is_nav: Whether the resource is the navigation document.

    Returns:
        A plain ``Resource`` when no media type is known, for anything under
        ``META-INF`` and for the ``mimetype`` file; a ``NavigationDocument``
        or ``ContentDocument`` for XHTML and SVG; otherwise a
        ``PublicationResource``.

    Raises:
        EPUBError: If *is_nav* is true but the media type is neither XHTML
            nor SVG.
    """
    zipinfo = info_to_zipinfo(info)

    if media_type is None:
        media_type = MediaType.from_filename(zipinfo.filename)
    else:
        # A media type given as a string would never match the identity
        # checks below; coalesce it first, as PublicationResource does.
        media_type = MediaType.coalesce(media_type)

    if (
        media_type is None
        # Slice instead of indexing: an empty filename has no parts.
        or Path(zipinfo.filename).parts[:1] == ("META-INF",)
        or zipinfo.filename == "mimetype"
    ):
        return Resource(file, zipinfo)

    if media_type is MediaType.IMAGE_SVG or media_type is MediaType.XHTML:
        if is_nav:
            return NavigationDocument(file, zipinfo, media_type)
        return ContentDocument(file, zipinfo, media_type)

    if is_nav:
        raise EPUBError(
            f"Found media type of '{zipinfo.filename}' to be "
            f"'{media_type}', which is incompatible with argument "
            "'is_nav=True'. Only XHTML or SVG documents can be the "
            "navigation document"
        )

    return PublicationResource(file, zipinfo, media_type)
def create_resource_from_path(
    path: str | Path,
    info: ZipInfo | str | Path | None = None,
    media_type: MediaType | str | None = None,
    is_nav: bool = False,
):
    """Create a resource from the file at *path* on disk.

    Args:
        path: Location of the file; it is opened in binary mode and stays
            open for the lifetime of the returned resource.
        info: ``ZipInfo`` or archive name for the resource. Defaults to the
            final component of *path*.
        media_type: Forwarded to ``create_resource``.
        is_nav: Forwarded to ``create_resource``.

    Returns:
        The resource built by ``create_resource``.
    """
    file = open(path, "rb")
    try:
        if info is None:
            info = Path(path).name

        zipinfo = (
            info
            if isinstance(info, ZipInfo)
            else ZipInfo.from_file(path, info, strict_timestamps=False)
        )

        return create_resource(file, zipinfo, media_type, is_nav)
    except BaseException:
        # Nothing took ownership of the handle; do not leak it.
        file.close()
        raise