Coverage for src/epublib/util.py: 78%
101 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-18 16:07 -0300
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-18 16:07 -0300
1import enum
2import os.path
3import re
4import unicodedata
5from datetime import datetime, timezone
6from importlib.metadata import PackageNotFoundError, version
7from pathlib import Path
8from typing import cast, overload
10import bs4
12from epublib.exceptions import EPUBError
15def normalize_path[T: (str, Path)](path: T) -> T:
16 # Resolve ..'s
17 absolute = os.path.normpath(path)
18 if isinstance(path, Path):
19 return Path(absolute)
20 return absolute
23def get_absolute_href[T: (str, Path)](origin_href: str | Path, href: T) -> T:
24 path = Path(origin_href).parent / Path(href)
25 if isinstance(href, str):
26 return str(normalize_path(path))
27 return normalize_path(path)
30def get_relative_href[T: (str, Path)](relative_to: str | Path, absolute_href: T) -> T:
31 path = Path(absolute_href).relative_to(Path(relative_to).parent, walk_up=True)
33 if isinstance(absolute_href, str):
34 return str(path)
35 return path
38@overload
39def parse_int(value: str) -> int | None: ...
40@overload
41def parse_int(value: None) -> None: ...
44def parse_int(value: str | None):
45 """Lenient integer parsing"""
46 if value is None:
47 return None
49 value = "".join(filter(str.isdigit, value))
50 try:
51 return int(value)
52 except ValueError:
53 return None
def tag_ids(tag: bs4.Tag) -> set[str]:
    """Return the set of non-empty ``id`` values found by ``tag.select("[id]")``.

    Each attribute value is converted to a plain string with ``attr_to_str``.
    """
    # Filter on each matched element's own id. The previous filter looked at
    # the id of `tag` itself, which emptied the result whenever the root tag
    # had no id and never skipped matched elements with an empty id.
    return {attr_to_str(el["id"]) for el in tag.select("[id]") if el.get("id")}
def new_id(base: str, gone: set[str], add_to_gone: bool = True) -> str:
    """Return an id derived from ``base`` that is not already in ``gone``.

    ``base`` itself is preferred; otherwise ``base-1`` up to ``base-999``
    are tried in order. When ``add_to_gone`` is true the chosen id is added
    to ``gone`` (the set is mutated in place) so later calls skip it.

    Raises:
        EPUBError: if ``base`` and all 999 suffixed candidates are taken.
    """
    candidate = base
    suffix = 0
    while candidate in gone:
        suffix += 1
        if suffix >= 1000:
            raise EPUBError(f"Exhausted unique id possibilities for {base}")
        candidate = f"{base}-{suffix}"

    if add_to_gone:
        gone.add(candidate)
    return candidate
def new_id_in_tag(base: str, tag: bs4.Tag):
    """Return an id based on ``base`` that is absent from ``tag_ids(tag)``.

    The id set is computed fresh on each call and thrown away afterwards,
    so the chosen id is not recorded anywhere.
    """
    return new_id(base, tag_ids(tag), add_to_gone=False)
def get_content_document_title(soup: bs4.BeautifulSoup):
    """Pick a title for a content document.

    Candidates are tried in order and the first one with a non-empty
    ``.string`` wins: ``soup.h1``, then ``soup.title``, then the first
    result of ``soup.find(string=True)``. Returns ``""`` when none of them
    yields anything.
    """
    heading = soup.h1
    if heading and heading.string:
        return heading.string

    title = soup.title
    if title and title.string:
        return title.string

    # The cast only satisfies the type checker; find(string=True) searches
    # for string nodes rather than tags.
    first_text = cast(bs4.Tag, soup.find(string=True))
    if first_text and first_text.string:
        return first_text.string

    return ""
95def split_fragment(href: str) -> tuple[str, str | None]:
96 values = href.split("#", 1)
97 if len(values) < 1:
98 return "", None
99 if len(values) < 2:
100 return values[0], None
101 return values[0], values[1]
def strip_fragment(href: str) -> str:
    """Return ``href`` without its ``#fragment`` part, if it has one."""
    without_fragment, _fragment = split_fragment(href)
    return without_fragment
def slugify(value: str):
    """
    Turn ``value`` into a slug (adapted from django's utils.text).

    The text is NFKC-normalized and lowercased. Characters that are not
    word characters, whitespace, or hyphens are removed; non-ASCII word
    characters are kept. Each run of whitespace and/or hyphens becomes a
    single hyphen, and leading/trailing hyphens and underscores are
    stripped.
    """
    lowered = unicodedata.normalize("NFKC", value).lower()
    kept = re.sub(r"[^\w\s-]", "", lowered)
    hyphenated = re.sub(r"[-\s]+", "-", kept)
    return hyphenated.strip("-_")
@enum.unique
class ResolutionType(enum.Enum):
    """How a list-valued BeautifulSoup attribute is reduced to one string."""

    # Concatenate all items, separated by single spaces.
    JOIN = enum.auto()
    # Keep only the first item.
    FIRST = enum.auto()
129@overload
130def attr_to_str(
131 value: str | list[str],
132 resolution_type: ResolutionType = ResolutionType.JOIN,
133) -> str: ...
136@overload
137def attr_to_str(
138 value: str | list[str] | None,
139 resolution_type: ResolutionType = ResolutionType.JOIN,
140) -> str | None: ...
143def attr_to_str(
144 value: str | list[str] | None,
145 resolution_type: ResolutionType = ResolutionType.JOIN,
146) -> str | None:
147 if value is None:
148 return None
150 if isinstance(value, list):
151 match resolution_type:
152 case ResolutionType.JOIN:
153 return " ".join(value)
154 case ResolutionType.FIRST:
155 return value[0]
157 return value
def get_actual_tag_position(tag: bs4.Tag, position: int) -> int:
    """
    Translate a tag-only child position into an index among all children.

    Only the direct children of `tag` that are `bs4.Tag` instances are
    counted when locating the `position`-th child; the value returned is
    that child's index as reported by `tag.index`. When `position` is at or
    past the number of such children, the total number of children of
    `tag` is returned instead (i.e. the index just after the last child).
    """
    element_children = [
        child for child in tag.find_all(recursive=False) if isinstance(child, bs4.Tag)
    ]

    if position < len(element_children):
        return tag.index(element_children[position])

    return len(list(tag.children))
def datetime_to_str(dt: datetime) -> str:
    """Format ``dt`` as a UTC ISO-8601 timestamp ending in ``Z``.

    A naive ``dt`` is first made timezone-aware with ``astimezone()``
    (which treats it as system local time). The value is then converted to
    UTC and rendered with second precision, with the ``+00:00`` offset
    written as ``Z``.
    """
    aware = dt.astimezone() if dt.tzinfo is None else dt
    stamp = aware.astimezone(timezone.utc).isoformat(timespec="seconds")
    return stamp.replace("+00:00", "Z")
186def get_epublib_version() -> str | None:
187 try:
188 return version("epublib")
189 except PackageNotFoundError:
190 return None