Coverage for src/epublib/__init__.py: 95%

168 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-18 16:07 -0300

1from collections.abc import Generator 

2from pathlib import Path 

3from typing import IO, Any, cast, override 

4from zipfile import ZipFile, is_zipfile 

5 

6from epublib.create import EPUBCreator 

7from epublib.exceptions import ClosedEPUBError, EPUBError, NotEPUBError 

8from epublib.identifier import EPUBId 

9from epublib.mediatype import Category, MediaType 

10from epublib.nav.reset import ( 

11 create_landmarks, 

12 create_page_list, 

13 create_toc, 

14 reset_landmarks, 

15 reset_page_list, 

16 reset_toc, 

17) 

18from epublib.nav.resource import NavigationDocument 

19from epublib.ncx.reset import generate_ncx, reset_ncx 

20from epublib.ncx.resource import NCXFile 

21from epublib.package.manifest import ( 

22 BookManifest, 

23 ManifestItem, 

24 detect_manifest_properties, 

25) 

26from epublib.package.metadata import BookMetadata, ValuedMetadataItem 

27from epublib.package.resource import PackageDocument 

28from epublib.package.spine import BookSpine, SpineItemRef 

29from epublib.parse import parse 

30from epublib.resources import ( 

31 ContentDocument, 

32 PublicationResource, 

33 Resource, 

34 XMLResource, 

35) 

36from epublib.resources.manager import ResourceIdentifier, ResourceManager 

37from epublib.source import ( 

38 DirectorySink, 

39 DirectorySource, 

40 SinkProtocol, 

41 SourceProtocol, 

42) 

43from epublib.util import get_epublib_version 

44 

45 

class EPUB:
    """
    The main class for reading, writing, and manipulating EPUB files.

    An instance wraps a source (a ZIP archive or an unzipped directory),
    the container file and package document returned by ``parse``, and a
    ``ResourceManager`` holding the parsed resources. It can be used as a
    context manager; leaving the ``with`` block calls ``close()``.
    """

    def __init__(
        self,
        file: IO[bytes] | str | Path | None = None,
        generator_tag: bool = True,
    ) -> None:
        """
        Open an existing EPUB, or create a new one when ``file`` is None.

        ``file`` may be a path or binary file object accepted by
        ``zipfile.is_zipfile``, or a path to a directory. When
        ``generator_tag`` is true, ``add_generator_tag`` is called after
        parsing.

        Raises ``NotEPUBError`` if ``file`` is neither a ZIP archive nor
        a directory.
        """
        self.source: SourceProtocol

        if file is None:
            self.source = ZipFile(
                EPUBCreator(add_generator_tag=generator_tag).to_file()
            )
        elif is_zipfile(file):
            self.source = ZipFile(file)
        elif isinstance(file, (str, Path)) and Path(file).is_dir():
            self.source = DirectorySource(file)
        else:
            raise NotEPUBError(f"file '{file}' is not ZIP nor folder")

        self._closed: bool = False
        self.container_file: XMLResource
        self.package_document: PackageDocument
        try:
            self.container_file, self.package_document, resources = parse(
                self.source
            )
        except BaseException:
            # The source was opened above; do not leak it when the
            # archive turns out not to be parseable.
            self._close_source()
            raise

        self.resources: ResourceManager = ResourceManager(
            resources,
            container_file=self.container_file,
            package_document=self.package_document,
            # Getters are lazy because nav/ncx are looked up through
            # self.resources, which is still being constructed here.
            nav_getter=lambda: self.nav,
            ncx_getter=lambda: self.ncx,
        )

        self.original_path: Path | None = (
            Path(file) if isinstance(file, (str, Path)) else None
        )

        if generator_tag:
            self.add_generator_tag()

    def _close_source(self) -> None:
        """Close the underlying source if it exposes a ``close`` method."""
        # NOTE(review): SourceProtocol is not visible here, so ``close`` is
        # looked up defensively; ZipFile has it, DirectorySource may not.
        closer = getattr(self.source, "close", None)
        if callable(closer):
            closer()

    def close(self) -> None:
        """Close every resource and the underlying source, and mark closed."""
        for resource in self.resources:
            resource.close()
        # The ZipFile is always created by __init__, so it is ours to close.
        self._close_source()
        self._closed = True

    def is_closed(self) -> bool:
        """Return whether ``close()`` has been called on this EPUB."""
        return self._closed

    def _check_closed(self, msg: str = "EPUB is already closed") -> None:
        """Raise ``ClosedEPUBError`` with ``msg`` if this EPUB is closed."""
        if self._closed:
            raise ClosedEPUBError(msg)

    def __enter__(self):
        return self

    def __exit__(self, *args: Any):  # type: ignore[Any]
        self.close()

    def add_generator_tag(self) -> None:
        """Add a generator meta tag to the metadata."""

        if not self.metadata.get("generator"):
            __ = self.metadata.add("generator", "Edited with epublib")

        version = get_epublib_version()
        # Only record the version when one is available and not yet present.
        if version and not self.metadata.get("epublib version"):
            __ = self.metadata.add("epublib version", version)

    def remove_generator_tag(self) -> None:
        """Remove the epublib generator tag of the metadata, if any."""

        generator = self.metadata.get("generator")
        # Only drop generator tags that mention epublib; others are kept.
        if (
            generator
            and isinstance(generator, ValuedMetadataItem)
            and "epublib" in generator.value
        ):
            self.metadata.remove_item(generator)

        version_item = self.metadata.get("epublib version")
        if version_item:
            self.metadata.remove_item(version_item)

    def write_to_sink(self, out: SinkProtocol) -> None:
        """
        Write this epub to a sink.

        Raises ``ClosedEPUBError`` if this EPUB is already closed.
        """

        self._check_closed("trying to write closed EPUB")

        for resource in self.resources:
            out.writestr(resource.zipinfo, resource.content)
            resource.free()

    def write(self, output_file: IO[bytes] | str | Path) -> None:
        """
        Write this epub to a zip file.

        Raises ``ClosedEPUBError`` if this EPUB is already closed.
        """

        # Check before opening: mode "w" would truncate the destination
        # even though nothing can be written from a closed EPUB.
        self._check_closed("trying to write closed EPUB")

        # The context manager closes the archive, which is what writes the
        # ZIP central directory; without it the file may be left incomplete.
        with ZipFile(output_file, mode="w") as out_zip:
            self.write_to_sink(out_zip)

    def write_to_folder(self, folder: str | Path) -> None:
        """
        Write this epub to a folder ('unzipped').

        Raises ``EPUBError`` if ``folder`` is not an existing directory.
        """

        if not Path(folder).is_dir():
            raise EPUBError(f"Path '{folder}' is not a directory")

        self.write_to_sink(DirectorySink(folder))

    def documents(self) -> Generator[ContentDocument]:
        """
        Retrieve all content documents (XHTML or SVG) from this EPUB
        """

        yield from self.resources.filter(ContentDocument)

    def images(self) -> Generator[PublicationResource]:
        """
        Retrieve all image resources from this EPUB
        """

        yield from self.resources.filter(Category.IMAGE)

    def scripts(self) -> Generator[PublicationResource]:
        """
        Retrieve all JavaScript resources from this EPUB
        """

        return (
            resource
            for resource in self.resources.filter(Category.OTHER)
            if cast(MediaType, resource.media_type).is_js()
        )

    def styles(self) -> Generator[PublicationResource]:
        """
        Retrieve all CSS resources from this EPUB
        """

        return (
            resource
            for resource in self.resources.filter(Category.STYLE)
            if cast(MediaType, resource.media_type).is_css()
        )

    def get_spine_item(
        self,
        resource: Resource | ResourceIdentifier,
    ) -> SpineItemRef | None:
        """
        Get spine item associated with a resource or filename.

        Returns None when the resource has no manifest item.
        """

        # An id can be looked up in the spine directly.
        if isinstance(resource, EPUBId):
            return self.spine.get(resource)

        manifest_item = (
            resource
            if isinstance(resource, ManifestItem)
            else self.manifest.get(resource)
        )
        if not manifest_item:
            return None

        return self.spine.get(manifest_item.id)

    def rename_id(
        self,
        old: Resource | ResourceIdentifier,
        new: EPUBId,
    ) -> None:
        """
        Rename a manifest identifier. Look for references for updating
        it in the spine items, the cover-image metadata tag, and the toc
        attribute of the spine element. Using this function is not
        recommended, as there may be other references to the old id that
        will become outdated.

        Raises ``EPUBError`` if ``old`` is not in the manifest or if
        ``new`` is already used by a manifest item.
        """

        manifest_item = (
            old if isinstance(old, ManifestItem) else self.manifest.get(old)
        )
        if not manifest_item:
            raise EPUBError(f"Can't rename '{old}': not in manifest")

        old_id = manifest_item.id

        existing = self.manifest.get(new)
        if existing:
            raise EPUBError(f"Can't rename to already existing id '{new}' ({existing})")

        # cover-image in metadata: only retarget it when it actually points
        # at the id being renamed, otherwise an unrelated cover is clobbered.
        cover = self.metadata.get("cover-image")
        if cover and getattr(cover, "value", None) == old_id:
            cover.value = new

        # spine tag: the toc attribute is optional, so do not index it.
        if self.spine.tag.attrs.get("toc") == old_id:
            self.spine.tag.attrs["toc"] = new

        spine_item = self.spine.get(old_id)
        if spine_item:
            spine_item.idref = new

        manifest_item.id = new

    def get_spine_position(
        self,
        resource: Resource | ResourceIdentifier,
    ) -> int | None:
        """
        Get the 0-indexed position of a resource in the spine.

        Returns None when the resource has no manifest item.
        """

        if isinstance(resource, EPUBId):
            return self.spine.get_position(resource)

        manifest_item = (
            resource
            if isinstance(resource, ManifestItem)
            else self.manifest.get(resource)
        )
        if not manifest_item:
            return None

        return self.spine.get_position(manifest_item.id)

    def update_manifest_properties(self) -> None:
        """
        Update manifest properties by detecting them from the resources
        See https://www.w3.org/TR/epub-33/#sec-item-resource-properties

        Detected properties are merged with the ones already present on
        each item; only items backed by an ``XMLResource`` are touched.
        """

        for item in self.manifest.items:
            resource = self.resources.get(item.name, XMLResource)
            if not resource:
                continue

            current = item.properties if item.properties is not None else []
            detected = detect_manifest_properties(resource.soup)
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the written package document does not vary between runs.
            item.properties = list(dict.fromkeys(current + detected))

    def reset_toc(
        self,
        targets_selector: str | None = None,
        include_filenames: bool = False,
        spine_only: bool = False,  # ensures correct ordering
        reset_ncx: bool | None = None,
        resource_class: type[Resource] = ContentDocument,
    ):
        """
        Reset the table of contents in the navigation document by
        detecting targets in content documents. May replace any
        existing TOC.
        """
        return reset_toc(
            self,
            targets_selector,
            include_filenames,
            spine_only,
            reset_ncx,
            resource_class,
        )

    def create_toc(
        self,
        targets_selector: str | None = None,
        include_filenames: bool = False,
        spine_only: bool = False,  # ensures correct ordering
        reset_ncx: bool | None = None,
        resource_class: type[Resource] = ContentDocument,
    ):
        """
        Create a new table of contents in the navigation document by
        detecting targets in content documents. Will raise an error if
        a TOC already exists.
        """
        return create_toc(
            self,
            targets_selector,
            include_filenames,
            spine_only,
            reset_ncx,
            resource_class,
        )

    def reset_page_list(
        self,
        id_format: str = "page_{page}",
        label_format: str = "{page}",
        pagebreak_selector: str = '[role="doc-pagebreak"], [epub|type="pagebreak"]',
        reset_ncx: bool | None = None,
    ):
        """
        Reset the page list in the navigation document by detecting
        pagebreaks in content documents. Will replace any existing page
        list.
        """
        return reset_page_list(
            self,
            id_format,
            label_format,
            pagebreak_selector,
            reset_ncx,
        )

    def create_page_list(
        self,
        id_format: str = "page_{page}",
        label_format: str = "{page}",
        pagebreak_selector: str = '[role="doc-pagebreak"], [epub|type="pagebreak"]',
        reset_ncx: bool | None = None,
    ):
        """
        Create new page list in the navigation document by detecting
        pagebreaks in content documents. Will raise an error if a page
        list already exists.
        """
        return create_page_list(
            self,
            id_format,
            label_format,
            pagebreak_selector,
            reset_ncx,
        )

    def reset_landmarks(
        self,
        include_toc: bool = True,
        targets_selector: str | None = None,
    ):
        """
        Reset the landmarks in the navigation document by detecting
        targets in content documents, and optionally including the TOC.
        Will replace existing landmarks.
        """

        return reset_landmarks(self, include_toc, targets_selector)

    def create_landmarks(
        self,
        include_toc: bool = True,
        targets_selector: str | None = None,
    ):
        """
        Create landmarks in the navigation document by detecting
        targets in content documents, and optionally including the TOC.
        Will raise error if landmarks already exist.
        """

        return create_landmarks(self, include_toc, targets_selector)

    def generate_ncx(self, filename: str | Path | None = None) -> NCXFile:
        """Delegate to ``epublib.ncx.reset.generate_ncx`` for this EPUB."""
        return generate_ncx(self, filename)

    def reset_ncx(self, ncx: NCXFile | None = None) -> NCXFile:
        """Delegate to ``epublib.ncx.reset.reset_ncx`` for this EPUB."""
        return reset_ncx(self, ncx)

    @property
    def base_dir(self) -> Path:
        """
        The base directory for the resources in this EPUB. The spec does
        not define such a directory, and there may be more than one base
        directory in an EPUB. This is the one containing the package
        document.
        """

        return Path(self.package_document.filename).parent

    @property
    def manifest(self) -> BookManifest:
        """The manifest of the package document."""
        return self.package_document.manifest

    @property
    def metadata(self) -> BookMetadata:
        """The metadata of the package document."""
        return self.package_document.metadata

    @property
    def spine(self) -> BookSpine:
        """The spine of the package document."""
        return self.package_document.spine

    @property
    def nav(self):
        """
        The navigation document referenced by the manifest, or None when
        the manifest declares no nav item.
        """
        return (
            self.resources.get(self.manifest.nav.filename, NavigationDocument)
            if self.manifest.nav
            else None
        )

    @property
    def ncx(self) -> NCXFile | None:
        """The first NCX file among the resources, or None if there is none."""
        return next(self.resources.filter(NCXFile), None)

    @override
    def __repr__(self) -> str:
        # Fall back to the object id so untitled books stay distinguishable.
        return f"{self.__class__.__name__}(title='{self.metadata.title or id(self)}')"

443 return f"{self.__class__.__name__}(title='{self.metadata.title or id(self)}')"