Coverage for src/epublib/util.py: 98%

118 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-07 12:50 -0300

1import enum 

2import operator 

3import os.path 

4import re 

5import typing 

6import unicodedata 

7from datetime import datetime, timezone 

8from functools import reduce 

9from importlib.metadata import PackageNotFoundError, version 

10from pathlib import Path 

11from types import UnionType 

12from typing import cast, overload 

13 

14import bs4 

15 

16from epublib.exceptions import EPUBError 

17from epublib.identifier import EPUBId 

18 

19 

20def normalize_path[T: (str, Path, str | Path)](path: T) -> T: 

21 cls = type(path) 

22 # Resolve ..'s 

23 absolute = os.path.normpath(path) 

24 return cls(absolute) 

25 

26 

27def get_absolute_href[T: (str, Path, str | Path)]( 

28 origin_href: str | Path, href: T 

29) -> T: 

30 cls = type(href) 

31 

32 if str(href).startswith("#"): 

33 path = Path(f"{origin_href}{href if href != '#' else ''}") 

34 else: 

35 path = Path(origin_href).parent / Path(href) 

36 

37 return cls(normalize_path(path)) 

38 

39 

40def get_relative_href[T: (str, Path, str | Path)]( 

41 relative_to: str | Path, absolute_href: T 

42) -> T: 

43 cls = type(absolute_href) 

44 

45 if strip_fragment(absolute_href) == strip_fragment(relative_to): 

46 fragment = get_fragment(absolute_href) 

47 path = Path(f"#{fragment if fragment is not None else ''}") 

48 else: 

49 path = Path(absolute_href).relative_to(Path(relative_to).parent, walk_up=True) 

50 

51 return cls(path) 

52 

53 

54@overload 

55def parse_int(value: str) -> int | None: ... 

56@overload 

57def parse_int(value: None) -> None: ... 

58 

59 

60def parse_int(value: str | None): 

61 """Lenient integer parsing""" 

62 if value is None: 

63 return None 

64 

65 value = "".join([val for val in value if val.isdigit() or val in "-."]) 

66 value = value.split(".", 1)[0] # Remove decimal part 

67 try: 

68 return int(value) 

69 except ValueError: 

70 return None 

71 

72 

def tag_ids(tag: bs4.Tag):
    """
    Return the set of ``id`` attribute values of every element yielded
    by ``tag.find_all(id=True)``, each converted with ``attr_to_str``.
    """
    found = set()
    for element in tag.find_all(id=True):
        found.add(attr_to_str(element["id"]))
    return found

75 

76 

def new_id(base: str | Path, gone: set[str], add_to_gone: bool = True) -> EPUBId:
    """
    Build an id derived from ``base`` that is not already in ``gone``.

    ``base`` is first passed through ``EPUBId.to_valid``. The plain
    result is tried first, then ``<base>-1``, ``<base>-2``, ... up to
    ``<base>-65535``. The first candidate missing from ``gone`` is
    returned as an ``EPUBId``; when ``add_to_gone`` is true it is also
    added to ``gone`` (the set is mutated in place).

    Raises ``EPUBError`` if every candidate is already taken.
    """
    base = EPUBId.to_valid(str(base))

    def candidates():
        # Unsuffixed form first, then the numbered variants.
        yield base
        for suffix in range(1, 1 << 16):
            yield f"{base}-{suffix}"

    for candidate in candidates():
        if candidate in gone:
            continue
        if add_to_gone:
            gone.add(candidate)
        return EPUBId(candidate)

    raise EPUBError(f"Exhausted unique id possibilities for {base}")

93 

94 

def new_id_in_tag(base: str | Path, tag: bs4.Tag) -> EPUBId:
    """
    Build an id derived from ``base`` that does not clash with any id
    collected by ``tag_ids(tag)``.

    The collected set is a throwaway, so the new id is not recorded
    in it.
    """
    return new_id(base, tag_ids(tag), add_to_gone=False)

98 

99 

100def split_fragment[T: (str, Path, str | Path)](href: T) -> tuple[T, str | None]: 

101 cls = type(href) 

102 

103 values = str(href).split("#", 1) 

104 if len(values) < 2: 

105 return cls(values[0]), None 

106 return cls(values[0]), values[1] 

107 

108 

109def strip_fragment[T: (str, Path, str | Path)](href: T) -> T: 

110 return split_fragment(href)[0] 

111 

112 

def get_fragment(href: str | Path) -> str | None:
    """
    Return the text after the first ``#`` in ``href``, or ``None`` when
    ``href`` has no ``#``.
    """
    _, fragment = split_fragment(str(href))
    return fragment

115 

116 

def slugify(value: str):
    """
    Turn ``value`` into a slug (adapted from django's utils.text).

    The text is NFKC-normalized and lowercased; Unicode word characters
    are kept (there is no conversion to ASCII). Every character that is
    not a word character, whitespace or a hyphen is removed, each run
    of whitespace and/or hyphens becomes a single hyphen, and leading
    and trailing hyphens and underscores are stripped.
    """
    normalized = unicodedata.normalize("NFKC", value)
    cleaned = re.sub(r"[^\w\s-]", "", normalized.lower())
    hyphenated = re.sub(r"[-\s]+", "-", cleaned)
    return hyphenated.strip("-_")

129 

130 

class ResolutionType(enum.Enum):
    """How a list of BeautifulSoup attribute values is reduced to one string."""

    # Join all values with a single space.
    JOIN = enum.auto()
    # Keep only the first value.
    FIRST = enum.auto()

136 

137 

@overload
def attr_to_str(
    value: str | list[str],
    resolution_type: ResolutionType = ResolutionType.JOIN,
) -> str: ...


@overload
def attr_to_str(
    value: str | list[str] | None,
    resolution_type: ResolutionType = ResolutionType.JOIN,
) -> str | None: ...


def attr_to_str(
    value: str | list[str] | None,
    resolution_type: ResolutionType = ResolutionType.JOIN,
) -> str | None:
    """
    Reduce an attribute value to a single string.

    - ``None`` is passed through as ``None``.
    - A plain string is returned unchanged.
    - A list is reduced according to ``resolution_type``: ``JOIN``
      joins the items with a single space, ``FIRST`` returns the first
      item. An empty list yields ``""`` under both strategies.
    """
    if value is None:
        return None

    if not isinstance(value, list):
        return value

    if resolution_type is ResolutionType.FIRST:
        # An empty list has no first item; return "" (as JOIN does for
        # an empty list) instead of raising IndexError.
        return value[0] if value else ""

    # JOIN. Also the fallback, so a list is never returned as-is.
    return " ".join(value)

167 

168 

def get_actual_tag_position(
    tag: bs4.Tag,
    position: int,
    name: str | None = None,
) -> int:
    """
    Translate a position among the tag children of ``tag`` into an
    index among all of its children.

    The candidates are the direct children returned by
    ``tag.find_all(name, recursive=False)``. The return value is the
    index, within ``tag``, of the ``position``-th candidate. If
    ``position`` is past the last candidate, the total number of
    children of ``tag`` is returned (i.e. the index just after the
    last child).
    """
    candidates = list(tag.find_all(name, recursive=False))

    if position < len(candidates):
        return tag.index(candidates[position])

    # Out of bounds: point just past the last child.
    return len(tag.contents)

189 

190 

def datetime_to_str(dt: datetime) -> str:
    """
    Format ``dt`` as a UTC timestamp of the form ``YYYY-MM-DDTHH:MM:SSZ``.

    A naive datetime is first interpreted in the system's local
    timezone. The value is then converted to UTC and rendered with
    second precision, with the ``+00:00`` offset written as ``Z``.
    """
    aware = dt.astimezone() if dt.tzinfo is None else dt
    stamp = aware.astimezone(timezone.utc).isoformat(timespec="seconds")
    return stamp.replace("+00:00", "Z")

198 

199 

200def get_epublib_version() -> str | None: 

201 try: 

202 return version("epublib") 

203 except PackageNotFoundError: 

204 return None 

205 

206 

207def strip_type_parameters[T: UnionType | object](typ: type[T]) -> type[T]: 

208 """ 

209 Strip parameters of type hints, making them suitable for usage 

210 with isinstance and issubclass checks. 

211 """ 

212 origin: type[T] = typing.get_origin(typ) or typ 

213 

214 if origin is UnionType or origin is typing.Union: # type: ignore[reportDeprecated] 

215 if origin is typing.Union: # type: ignore[reportDeprecated] 

216 origin = typ 

217 

218 origin = cast( 

219 type[T], 

220 operator.or_( 

221 *( 

222 strip_type_parameters(arg) 

223 for arg in cast(tuple[type[T], ...], typing.get_args(typ)) 

224 ) 

225 ), 

226 ) 

227 elif origin is typing.Literal: 

228 types: set[type[T]] = { 

229 type(option) for option in cast(tuple[type[T], ...], typing.get_args(typ)) 

230 } 

231 if len(types) == 1: 

232 origin = types.pop() 

233 else: 

234 origin = cast(type[T], reduce(operator.or_, types)) 

235 

236 return origin 

237 

238 

239def remove_optional_type[T: UnionType | object](typ: T) -> T: 

240 """ 

241 Return the first type from list of types in a UnionType that is not 

242 NoneType. This make the union ready for usage with issubclass. 

243 """ 

244 if not isinstance(typ, UnionType): 

245 return typ 

246 

247 return next(arg for arg in typing.get_args(typ) if arg is not None) # type: ignore[reportAny]