Coverage for src/epublib/util.py: 98%
118 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-07 12:50 -0300
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-07 12:50 -0300
1import enum
2import operator
3import os.path
4import re
5import typing
6import unicodedata
7from datetime import datetime, timezone
8from functools import reduce
9from importlib.metadata import PackageNotFoundError, version
10from pathlib import Path
11from types import UnionType
12from typing import cast, overload
14import bs4
16from epublib.exceptions import EPUBError
17from epublib.identifier import EPUBId
20def normalize_path[T: (str, Path, str | Path)](path: T) -> T:
21 cls = type(path)
22 # Resolve ..'s
23 absolute = os.path.normpath(path)
24 return cls(absolute)
27def get_absolute_href[T: (str, Path, str | Path)](
28 origin_href: str | Path, href: T
29) -> T:
30 cls = type(href)
32 if str(href).startswith("#"):
33 path = Path(f"{origin_href}{href if href != '#' else ''}")
34 else:
35 path = Path(origin_href).parent / Path(href)
37 return cls(normalize_path(path))
40def get_relative_href[T: (str, Path, str | Path)](
41 relative_to: str | Path, absolute_href: T
42) -> T:
43 cls = type(absolute_href)
45 if strip_fragment(absolute_href) == strip_fragment(relative_to):
46 fragment = get_fragment(absolute_href)
47 path = Path(f"#{fragment if fragment is not None else ''}")
48 else:
49 path = Path(absolute_href).relative_to(Path(relative_to).parent, walk_up=True)
51 return cls(path)
54@overload
55def parse_int(value: str) -> int | None: ...
56@overload
57def parse_int(value: None) -> None: ...
60def parse_int(value: str | None):
61 """Lenient integer parsing"""
62 if value is None:
63 return None
65 value = "".join([val for val in value if val.isdigit() or val in "-."])
66 value = value.split(".", 1)[0] # Remove decimal part
67 try:
68 return int(value)
69 except ValueError:
70 return None
def tag_ids(tag: bs4.Tag):
    """
    Return the set of `id` attribute values (as strings) of every
    element returned by `tag.find_all(id=True)`.
    """
    found = set()
    for element in tag.find_all(id=True):
        found.add(attr_to_str(element["id"]))
    return found
def new_id(base: str | Path, gone: set[str], add_to_gone: bool = True) -> EPUBId:
    """
    Build an id derived from `base` that is not present in `gone`.

    `base` is first passed through `EPUBId.to_valid`. That value is used
    as-is when it is free; otherwise `-1`, `-2`, ... suffixes are tried
    in order. When `add_to_gone` is true the chosen id is added to
    `gone` before returning.

    Raises EPUBError if no candidate is free.
    """
    base = EPUBId.to_valid(str(base))

    def candidates():
        # The unsuffixed id first, then the numbered fallbacks.
        yield base
        for number in range(1, 1 << 16):
            yield f"{base}-{number}"

    for candidate in candidates():
        if candidate in gone:
            continue
        if add_to_gone:
            gone.add(candidate)
        return EPUBId(candidate)

    raise EPUBError(f"Exhausted unique id possibilities for {base}")
def new_id_in_tag(base: str | Path, tag: bs4.Tag) -> EPUBId:
    """
    Build an id derived from `base` that does not collide with any id
    collected by `tag_ids(tag)`.
    """
    # The id set is a throwaway computed here, so there is no point in
    # recording the new id in it.
    return new_id(base, tag_ids(tag), add_to_gone=False)
100def split_fragment[T: (str, Path, str | Path)](href: T) -> tuple[T, str | None]:
101 cls = type(href)
103 values = str(href).split("#", 1)
104 if len(values) < 2:
105 return cls(values[0]), None
106 return cls(values[0]), values[1]
109def strip_fragment[T: (str, Path, str | Path)](href: T) -> T:
110 return split_fragment(href)[0]
def get_fragment(href: str | Path) -> str | None:
    """Return the part of `href` after its first `#`, or None if there is no `#`."""
    _, fragment = split_fragment(str(href))
    return fragment
def slugify(value: str):
    """
    Adapted from django's utils.text

    Apply NFKC normalization and convert to lowercase. Remove characters
    that aren't word characters (Unicode letters, digits, underscores),
    whitespace, or hyphens. Convert runs of whitespace and/or dashes to
    single dashes. Finally strip leading and trailing dashes and
    underscores.
    """
    lowered = unicodedata.normalize("NFKC", value).lower()
    cleaned = re.sub(r"[^\w\s-]", "", lowered)
    dashed = re.sub(r"[-\s]+", "-", cleaned)
    return dashed.strip("-_")
@enum.unique
class ResolutionType(enum.Enum):
    """How a list of BeautifulSoup attribute values is collapsed into one string."""

    # Concatenate all the values, separated by single spaces.
    JOIN = enum.auto()
    # Keep only the first value of the list.
    FIRST = enum.auto()
138@overload
139def attr_to_str(
140 value: str | list[str],
141 resolution_type: ResolutionType = ResolutionType.JOIN,
142) -> str: ...
145@overload
146def attr_to_str(
147 value: str | list[str] | None,
148 resolution_type: ResolutionType = ResolutionType.JOIN,
149) -> str | None: ...
152def attr_to_str(
153 value: str | list[str] | None,
154 resolution_type: ResolutionType = ResolutionType.JOIN,
155) -> str | None:
156 if value is None:
157 return None
159 if isinstance(value, list):
160 match resolution_type:
161 case ResolutionType.JOIN:
162 return " ".join(value)
163 case ResolutionType.FIRST:
164 return value[0]
166 return value
def get_actual_tag_position(
    tag: bs4.Tag,
    position: int,
    name: str | None = None,
) -> int:
    """
    Translate a position counted among the direct child tags of `tag`
    into an index among all of its children.

    Only direct children returned by `tag.find_all(name,
    recursive=False)` are counted, so NavigableString children are
    skipped and, when `name` is given, so are tags with another name.
    The return value is the index, as given by `tag.index`, of the
    `position`-th such child. If `position` is past the last counted
    child, the total number of children of `tag` is returned instead
    (i.e. the index right after the last child).
    """
    candidates = list(tag.find_all(name, recursive=False))

    if position < len(candidates):
        return tag.index(candidates[position])

    # Out of bounds: point just past the last child.
    return len(list(tag.children))
def datetime_to_str(dt: datetime) -> str:
    """
    Format `dt` as an ISO 8601 UTC timestamp with second precision and
    a `Z` suffix, e.g. `2024-01-02T03:04:05Z`.

    A naive datetime is first made aware with `astimezone()`, which
    treats it as system local time.
    """
    aware = dt if dt.tzinfo is not None else dt.astimezone()
    stamp = aware.astimezone(timezone.utc).isoformat(timespec="seconds")
    return stamp.replace("+00:00", "Z")
200def get_epublib_version() -> str | None:
201 try:
202 return version("epublib")
203 except PackageNotFoundError:
204 return None
207def strip_type_parameters[T: UnionType | object](typ: type[T]) -> type[T]:
208 """
209 Strip parameters of type hints, making them suitable for usage
210 with isinstance and issubclass checks.
211 """
212 origin: type[T] = typing.get_origin(typ) or typ
214 if origin is UnionType or origin is typing.Union: # type: ignore[reportDeprecated]
215 if origin is typing.Union: # type: ignore[reportDeprecated]
216 origin = typ
218 origin = cast(
219 type[T],
220 operator.or_(
221 *(
222 strip_type_parameters(arg)
223 for arg in cast(tuple[type[T], ...], typing.get_args(typ))
224 )
225 ),
226 )
227 elif origin is typing.Literal:
228 types: set[type[T]] = {
229 type(option) for option in cast(tuple[type[T], ...], typing.get_args(typ))
230 }
231 if len(types) == 1:
232 origin = types.pop()
233 else:
234 origin = cast(type[T], reduce(operator.or_, types))
236 return origin
239def remove_optional_type[T: UnionType | object](typ: T) -> T:
240 """
241 Return the first type from list of types in a UnionType that is not
242 NoneType. This make the union ready for usage with issubclass.
243 """
244 if not isinstance(typ, UnionType):
245 return typ
247 return next(arg for arg in typing.get_args(typ) if arg is not None) # type: ignore[reportAny]