Coverage for src/epublib/resources.py: 91%

286 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-09-14 18:23 -0300

1import io 

2from mimetypes import guess_file_type 

3from pathlib import Path 

4from typing import IO, cast, override 

5from zipfile import ZipInfo 

6 

7import bs4 

8 

9from epublib.identifier import EPUBId 

10from epublib.source import zip_info_now 

11 

12from .exceptions import EPUBError 

13from .mediatype import Category, MediaType 

14from .nav import LandmarksRoot, PageListRoot, TocRoot 

15from .nav.util import LandmarkEntryData, PageBreakData, TOCEntryData 

16from .package.manifest import BookManifest, ManifestItem, detect_manifest_properties 

17from .package.metadata import BookMetadata 

18from .package.spine import BookSpine 

19from .soup import PackageDocumentSoup 

20from .util import get_absolute_href, get_relative_href 

21 

22 

23def info_to_zipinfo(info: ZipInfo | str | Path) -> ZipInfo: 

24 if isinstance(info, ZipInfo): 

25 return info 

26 

27 return ZipInfo(filename=str(info), date_time=zip_info_now()) 

28 

29 

30class Resource: 

31 """Base class for all resources (i.e. files) in an EPUB file.""" 

32 

33 def __init__(self, file: IO[bytes] | bytes, info: ZipInfo | str | Path) -> None: 

34 self.zipinfo: ZipInfo = info_to_zipinfo(info) 

35 self._file: IO[bytes] | None = ( 

36 io.BytesIO(file) if isinstance(file, bytes) else file 

37 ) 

38 self._content: bytes | None = None 

39 self._closed: bool = False 

40 

41 @classmethod 

42 def from_path(cls, filename: str | Path, location: str | Path): 

43 file = open(filename, "rb") 

44 zipinfo = ZipInfo.from_file(filename, location, strict_timestamps=False) 

45 return cls(file, zipinfo) 

46 

47 @override 

48 def __repr__(self) -> str: 

49 return f"{self.__class__.__name__}({self.filename})" 

50 

51 def on_content_change(self): 

52 pass 

53 

54 @property 

55 def filename(self): 

56 return self.zipinfo.filename 

57 

58 @filename.setter 

59 def filename(self, value: str): 

60 self._set_filename(value) 

61 

62 def _set_filename(self, value: str): 

63 self.zipinfo.filename = value 

64 

65 @property 

66 def content(self) -> bytes: 

67 self.check_closed() 

68 if self._content is None: 

69 if self._file is None: 

70 return b"" 

71 self._content = self._file.read() 

72 __ = self._file.seek(0) 

73 return self._content 

74 

75 @content.setter 

76 def content(self, value: bytes): 

77 self.check_closed() 

78 self._set_content(value) 

79 

80 def _set_content(self, value: bytes, content_change: bool = True): 

81 self._content = value 

82 if content_change: 

83 self.on_content_change() 

84 

85 def free(self): 

86 del self._content 

87 self._content = None 

88 self.on_content_change() 

89 

90 def get_title(self): 

91 return self.filename 

92 

93 def check_closed(self): 

94 if self._closed: 

95 raise EPUBError(f"Using resource {self.filename} after closing") 

96 

97 def close(self): 

98 self.free() 

99 if self._file is not None: 

100 self._file.close() 

101 self._file = None 

102 

103 def href_to_filename[T: (str, Path)](self, href: T) -> T: 

104 return get_absolute_href(self.filename, href) 

105 

106 

107class XMLResource[S: bs4.BeautifulSoup = bs4.BeautifulSoup](Resource): 

108 """A resource that is an XML file.""" 

109 

110 soup_class: type[S] = bs4.BeautifulSoup # type: ignore[reportAssignmentType] 

111 

112 def __init__(self, file: IO[bytes] | bytes, info: ZipInfo | str | Path) -> None: 

113 super().__init__(file, info) 

114 self._soup: None | S = None 

115 

116 @property 

117 def soup(self) -> S: 

118 if self._soup is None: 

119 self._soup = self.soup_class(self.content, "xml") 

120 return self._soup 

121 

122 @soup.setter 

123 def soup(self, value: S): 

124 self._set_soup(value) 

125 

126 def _set_soup(self, value: S): 

127 self._soup = value 

128 

129 @property 

130 @override 

131 def content(self): 

132 if self._soup is not None: 

133 self._set_content(self._soup.encode(), content_change=False) 

134 return super().content 

135 

136 @content.setter 

137 def content(self, value: bytes): 

138 super()._set_content(value) 

139 

140 @override 

141 def on_content_change(self): 

142 super().on_content_change() 

143 del self._soup 

144 self._soup = None 

145 

146 @override 

147 def get_title(self): 

148 if self.soup.title and self.soup.title.string: 

149 return self.soup.title.string 

150 return super().get_title() 

151 

152 

class PackageDocument(XMLResource[PackageDocumentSoup]):
    """The package document of the EPUB file, sometimes known as the 'content.opf' file.

    ``manifest``, ``metadata`` and ``spine`` are wrappers built on demand from
    the soup and cached until the content changes.
    """

    soup_class: type[PackageDocumentSoup] = PackageDocumentSoup

    def __init__(self, file: IO[bytes] | bytes, info: ZipInfo | str | Path) -> None:
        super().__init__(file, info)
        self._manifest: BookManifest | None = None
        self._metadata: BookMetadata | None = None
        self._spine: BookSpine | None = None

    @property
    def manifest(self):
        """The ``BookManifest`` built from ``soup.manifest`` (cached)."""
        cached = self._manifest
        if cached is None:
            cached = self._manifest = BookManifest(self.soup.manifest, self.filename)
        return cached

    @property
    def metadata(self):
        """The ``BookMetadata`` built from ``soup.metadata`` (cached)."""
        cached = self._metadata
        if cached is None:
            cached = self._metadata = BookMetadata(self.soup.metadata)
        return cached

    @property
    def spine(self):
        """The ``BookSpine`` built from ``soup.spine`` (cached)."""
        cached = self._spine
        if cached is None:
            cached = self._spine = BookSpine(self.soup.spine)
        return cached

    def remove(self, filename: str):
        """Remove *filename*'s manifest item and, if present, its spine item."""
        manifest_entry = self.manifest[filename]
        spine_entry = self.spine.get(manifest_entry.id)
        if spine_entry:
            self.spine.remove_item(spine_entry)
        self.manifest.remove_item(manifest_entry)

    def on_soup_change(self):
        """Forget the cached manifest, metadata and spine wrappers."""
        self._manifest = None
        self._metadata = None
        self._spine = None

    @override
    def on_content_change(self):
        super().on_content_change()
        self.on_soup_change()

201 

202 

class PublicationResource(Resource):
    """
    A resource that contributes to the logic and rendering of the publication.

    This includes resources like the package document, content documents (XHTML),
    CSS stylesheets, audio, video, images, fonts, and scripts.
    """

    def __init__(
        self,
        file: IO[bytes] | bytes,
        info: ZipInfo | str | Path,
        media_type: MediaType | str | None = None,
    ) -> None:
        super().__init__(file, info)

        if media_type is None:
            # Fall back to guessing the type from the archive filename.
            guessed = guess_file_type(self.zipinfo.filename)[0]
            if guessed is None:
                raise EPUBError(
                    f"Cannot determine media type of {self.zipinfo.filename}"
                )
            media_type = guessed

        self.media_type: MediaType | str = MediaType.coalesce(media_type)

    @property
    def is_foreign(self):
        """True when ``media_type`` is held as a plain string."""
        return isinstance(self.media_type, str)

    @property
    def category(self):
        """``Category.FOREIGN`` for string media types, else the type's category."""
        media_type = self.media_type
        return Category.FOREIGN if isinstance(media_type, str) else media_type.category

    @classmethod
    def from_resource(cls, other: Resource, media_type: str | MediaType | None = None):
        """Build an instance of *cls* on top of *other*'s file and ``ZipInfo``.

        Raises:
            EPUBError: if *other* is closed or no longer has a file.
        """
        source = other._file
        if other._closed or source is None:
            raise EPUBError(f"Using resource {other} after closing")

        return cls(source, other.zipinfo, media_type)

244 

245 

246class ContentDocument[S: bs4.BeautifulSoup = bs4.BeautifulSoup]( # type: ignore[reportUnsafeMultipleInheritance] 

247 PublicationResource, 

248 XMLResource[S], 

249): 

250 """ 

251 A publication resource referenced from the spine or a manifest fallback 

252 chain that conforms to either the XHTML or SVG content document definitions. 

253 """ 

254 

255 @override 

256 def get_title(self): 

257 if self.soup.h1 and self.soup.h1.string: 

258 return self.soup.h1.string 

259 return super().get_title() 

260 

261 

class NavigationDocument(ContentDocument):
    """
    A specialization of the XHTML content document that contains human- and
    machine-readable global navigation information.

    The ``toc``, ``page_list`` and ``landmarks`` roots are looked up in the
    soup on demand and cached; the cache is dropped when the content changes.
    """

    def __init__(
        self,
        file: IO[bytes] | bytes,
        info: ZipInfo | str | Path,
        media_type: MediaType | str,
    ) -> None:
        super().__init__(file, info, media_type)
        self._toc: TocRoot | None = None
        self._page_list: PageListRoot | None = None
        self._landmarks: LandmarksRoot | None = None

    def add_to_toc(
        self,
        filename: str,
        title: str,
        position: int | None = None,
        fragment: str | None = None,
    ):
        """Add a table-of-contents entry pointing at *filename*.

        The href is *filename* made relative to this document, with
        ``#fragment`` appended when *fragment* is given. A TOC root is created
        if the document has none. Returns whatever ``TocRoot.add_item`` returns.
        """
        href = get_relative_href(self.filename, filename) + (
            f"#{fragment}" if fragment is not None else ""
        )

        toc = self.toc
        if toc is None:
            toc = self._toc = TocRoot(None, self.soup, self.filename)

        return toc.add_item(href=href, title=title, position=position)

    @property
    def toc(self):
        """The ``TocRoot`` for ``nav[epub|type="toc"]``, or ``None`` if absent."""
        if self._toc is None:
            tag = self.soup.select_one('nav[epub|type="toc"]')
            if tag:
                self._toc = TocRoot(tag, self.soup, self.filename)
        return self._toc

    @property
    def page_list(self):
        """The ``PageListRoot`` for ``nav[epub|type="page-list"]``, or ``None``."""
        if self._page_list is None:
            tag = self.soup.select_one('nav[epub|type="page-list"]')
            if tag:
                self._page_list = PageListRoot(tag, self.soup, self.filename)
        return self._page_list

    @property
    def landmarks(self):
        """The ``LandmarksRoot`` for ``nav[epub|type="landmarks"]``, or ``None``."""
        if self._landmarks is None:
            tag = self.soup.select_one('nav[epub|type="landmarks"]')
            if tag:
                self._landmarks = LandmarksRoot(tag, self.soup, self.filename)
        return self._landmarks

    def reset_page_list(self, pagebreaks: list[PageBreakData]):
        """Reset the page list to *pagebreaks*, creating the root if needed."""
        page_list = self.page_list
        if page_list is None:
            page_list = self._page_list = PageListRoot(None, self.soup, self.filename)

        # Narrowed through a local rather than a truthiness ``assert``: an
        # assert disappears under ``-O`` and would misfire on a falsy root.
        page_list.reset(pagebreaks)

    def reset_toc(self, entries: list[TOCEntryData]):
        """Reset the table of contents to *entries*, creating the root if needed."""
        toc = self.toc
        if toc is None:
            toc = self._toc = TocRoot(None, self.soup, self.filename)

        toc.reset(entries)

    def reset_landmarks(self, entries: list[LandmarkEntryData]):
        """Reset the landmarks to *entries*, creating the root if needed."""
        landmarks = self.landmarks
        if landmarks is None:
            landmarks = self._landmarks = LandmarksRoot(None, self.soup, self.filename)

        landmarks.reset(entries)

    def remove(self, filename: str):
        """Ask each existing navigation root to remove *filename*."""
        if self.toc:
            self.toc.remove(filename)
        if self.landmarks:
            self.landmarks.remove(filename)
        if self.page_list:
            self.page_list.remove(filename)

    def on_soup_change(self):
        """Forget the cached navigation roots."""
        self._toc = None
        self._page_list = None
        self._landmarks = None

    @override
    def on_content_change(self):
        super().on_content_change()
        self.on_soup_change()

361 

362 

def resource_to_manifest_item(
    resource: Resource,
    package: PackageDocument,
    identifier: EPUBId | str | None = None,
    media_type: str | MediaType | None = None,
    fallback: str | None = None,
    media_overlay: str | None = None,
    is_nav: bool = False,
    is_cover: bool = False,
    properties: list[str] | None = None,
    detect_properties: bool = True,
):
    """Build the ``ManifestItem`` describing *resource* inside *package*.

    Args:
        resource: The resource to describe.
        package: The package document whose manifest the item is meant for;
            the item's href is made relative to its filename.
        identifier: Manifest id to use. When omitted, one is requested from
            ``package.manifest.get_new_id``.
        media_type: Media type to record. When omitted, the resource's own
            ``media_type`` is used for publication resources, otherwise it is
            guessed from the filename.
        fallback: Passed through to the item.
        media_overlay: Passed through to the item.
        is_nav: Add the ``nav`` property.
        is_cover: Add the ``cover-image`` property.
        properties: Extra manifest properties. The list is never modified.
        detect_properties: For content documents, also add the properties
            reported by ``detect_manifest_properties``.

    Raises:
        EPUBError: if no media type can be determined.
    """
    href = get_relative_href(
        relative_to=package.filename,
        absolute_href=resource.filename,
    )
    name = resource.filename

    if identifier is None:
        identifier = package.manifest.get_new_id(resource.filename)
    else:
        # NOTE: kept as an assert so callers see the same exception type;
        # be aware it is skipped when Python runs with -O.
        assert package.manifest.get(identifier) is None, (
            f"Identifier '{identifier}' is already used in the manifest"
        )

    if media_type is None:
        media_type = (
            resource.media_type
            if isinstance(resource, PublicationResource)
            else guess_file_type(resource.filename)[0]
        )

    if not media_type:
        raise EPUBError(f"Can't determine media type of file {resource.filename}")

    if detect_properties or is_nav or is_cover:
        # Work on a copy so the caller's list is left untouched.
        properties = list(properties) if properties is not None else []

        if detect_properties and isinstance(resource, ContentDocument):
            properties += detect_manifest_properties(
                cast(ContentDocument[bs4.BeautifulSoup], resource).soup
            )

        if is_nav:
            properties.append("nav")

        if is_cover:
            properties.append("cover-image")

        # De-duplicate while keeping first-seen order (a set would shuffle it).
        properties = list(dict.fromkeys(properties))

    return ManifestItem(
        name=name,
        id=EPUBId(identifier),
        media_type=str(media_type),
        _href=href,
        media_overlay=media_overlay,
        fallback=fallback,
        properties=properties,
        manifest_filename=package.filename,
    )

424 

425 

def create_resource(
    file: IO[bytes] | bytes,
    info: ZipInfo | str | Path,
    media_type: MediaType | str | None = None,
    is_nav: bool = False,
):
    """Create the most specific resource class for a file.

    Args:
        file: The file's bytes or a binary stream over them.
        info: ``ZipInfo`` (or filename) locating the file in the archive.
        media_type: Media type of the file. When omitted it is derived from
            the filename with ``MediaType.from_filename``.
        is_nav: Whether the file is the navigation document.

    Returns:
        A plain ``Resource`` when no media type is known, or for ``mimetype``
        and anything under ``META-INF``; a ``NavigationDocument`` or
        ``ContentDocument`` for XHTML/SVG; otherwise a ``PublicationResource``.

    Raises:
        EPUBError: if *is_nav* is set for a file that is neither XHTML nor SVG.
    """
    zipinfo = info_to_zipinfo(info)

    if media_type is None:
        media_type = MediaType.from_filename(zipinfo.filename)

    if (
        media_type is None
        # Slice instead of index so an empty filename cannot raise IndexError.
        or Path(zipinfo.filename).parts[:1] == ("META-INF",)
        or zipinfo.filename == "mimetype"
    ):
        return Resource(file, zipinfo)

    # Normalise before the identity checks below, so a media type passed as
    # a plain string is treated like its ``MediaType`` member. This is the
    # same call ``PublicationResource.__init__`` applies to its argument.
    # NOTE(review): assumes ``coalesce`` maps known strings to members.
    media_type = MediaType.coalesce(media_type)

    if media_type is MediaType.IMAGE_SVG or media_type is MediaType.XHTML:
        if is_nav:
            return NavigationDocument(file, zipinfo, media_type)
        return ContentDocument(file, zipinfo, media_type)

    if is_nav:
        raise EPUBError(
            f"Found media type of '{zipinfo.filename}' to be "
            f"'{media_type}', which is incompatible with argument "
            "'is_nav=True'. Only XHTML or SVG documents can be the "
            "navigation document"
        )

    return PublicationResource(file, zipinfo, media_type)

458 

459 

def create_resource_from_path(
    path: str | Path,
    info: ZipInfo | str | Path | None = None,
    media_type: MediaType | str | None = None,
    is_nav: bool = False,
):
    """Create a resource from the file at *path* on disk.

    Args:
        path: Location of the file on disk.
        info: ``ZipInfo`` or archive name for the resource. Defaults to the
            final component of *path*. A non-``ZipInfo`` value is turned into
            one with ``ZipInfo.from_file``.
        media_type: Forwarded to ``create_resource``.
        is_nav: Forwarded to ``create_resource``.

    Returns:
        Whatever ``create_resource`` returns; that resource owns the opened
        file. If creation fails, the file is closed before the error
        propagates.
    """
    file = open(path, "rb")

    try:
        if info is None:
            info = Path(path).name

        zipinfo = info

        if not isinstance(info, ZipInfo):
            zipinfo = ZipInfo.from_file(path, info, strict_timestamps=False)

        return create_resource(file, zipinfo, media_type, is_nav)
    except BaseException:
        # No resource took ownership of the handle, so release it here.
        file.close()
        raise