Coverage for src/epublib/util.py: 78%

101 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-18 16:07 -0300

1import enum 

2import os.path 

3import re 

4import unicodedata 

5from datetime import datetime, timezone 

6from importlib.metadata import PackageNotFoundError, version 

7from pathlib import Path 

8from typing import cast, overload 

9 

10import bs4 

11 

12from epublib.exceptions import EPUBError 

13 

14 

15def normalize_path[T: (str, Path)](path: T) -> T: 

16 # Resolve ..'s 

17 absolute = os.path.normpath(path) 

18 if isinstance(path, Path): 

19 return Path(absolute) 

20 return absolute 

21 

22 

23def get_absolute_href[T: (str, Path)](origin_href: str | Path, href: T) -> T: 

24 path = Path(origin_href).parent / Path(href) 

25 if isinstance(href, str): 

26 return str(normalize_path(path)) 

27 return normalize_path(path) 

28 

29 

30def get_relative_href[T: (str, Path)](relative_to: str | Path, absolute_href: T) -> T: 

31 path = Path(absolute_href).relative_to(Path(relative_to).parent, walk_up=True) 

32 

33 if isinstance(absolute_href, str): 

34 return str(path) 

35 return path 

36 

37 

38@overload 

39def parse_int(value: str) -> int | None: ... 

40@overload 

41def parse_int(value: None) -> None: ... 

42 

43 

44def parse_int(value: str | None): 

45 """Lenient integer parsing""" 

46 if value is None: 

47 return None 

48 

49 value = "".join(filter(str.isdigit, value)) 

50 try: 

51 return int(value) 

52 except ValueError: 

53 return None 

54 

55 

def tag_ids(tag: bs4.Tag) -> set[str]:
    """Collect the ``id`` values of the elements matched by ``[id]`` in ``tag``.

    Elements whose ``id`` attribute is empty are skipped.

    NOTE(review): ``select`` presumably only searches below ``tag``, so the
    id of ``tag`` itself would not be part of the result -- confirm.
    """
    # The filter must inspect each matched element. Testing the outer ``tag``
    # instead made the result empty whenever the root itself had no id.
    return {
        attr_to_str(element["id"])
        for element in tag.select("[id]")
        if element.get("id")
    }

58 

59 

def new_id(base: str, gone: set[str], add_to_gone: bool = True) -> str:
    """Return an identifier derived from ``base`` that is not in ``gone``.

    ``base`` itself is preferred; otherwise ``base-1``, ``base-2``, ... up to
    ``base-999`` are tried in order. When ``add_to_gone`` is true the chosen
    identifier is added to ``gone`` before returning.

    Raises:
        EPUBError: if ``base`` and all 999 suffixed candidates are taken.
    """
    candidate = base
    suffix = 0
    while candidate in gone:
        suffix += 1
        if suffix >= 1000:
            raise EPUBError(f"Exhausted unique id possibilities for {base}")
        candidate = f"{base}-{suffix}"

    if add_to_gone:
        gone.add(candidate)
    return candidate

74 

75 

def new_id_in_tag(base: str, tag: bs4.Tag):
    """Return an id derived from ``base`` that is not among ``tag_ids(tag)``.

    The id set is computed on the fly and discarded, so the chosen id is not
    recorded anywhere (``add_to_gone`` is disabled).
    """
    return new_id(base, tag_ids(tag), add_to_gone=False)

79 

80 

def get_content_document_title(soup: bs4.BeautifulSoup):
    """Pick a title for a content document.

    Candidates are tried in order: the first ``h1``, the ``title`` element,
    then the first string node of the document. The ``.string`` of the first
    candidate that exists and has a truthy ``.string`` is returned; if none
    qualifies, the empty string is returned.
    """
    # Lookups are wrapped in callables so later candidates are only searched
    # for when the earlier ones did not produce a title.
    lookups = (
        lambda: soup.h1,
        lambda: soup.title,
        lambda: cast(bs4.Tag, soup.find(string=True)),
    )
    for lookup in lookups:
        node = lookup()
        if node and node.string:
            return node.string

    return ""

93 

94 

95def split_fragment(href: str) -> tuple[str, str | None]: 

96 values = href.split("#", 1) 

97 if len(values) < 1: 

98 return "", None 

99 if len(values) < 2: 

100 return values[0], None 

101 return values[0], values[1] 

102 

103 

def strip_fragment(href: str) -> str:
    """Return ``href`` without its fragment, as split by ``split_fragment``."""
    without_fragment, _ = split_fragment(href)
    return without_fragment

106 

107 

def slugify(value: str):
    """
    Turn ``value`` into a slug. Adapted from django's utils.text.

    The text is NFKC-normalized and lowercased; characters that are not
    word characters, whitespace, or hyphens are removed; each run of
    whitespace and/or hyphens becomes a single hyphen; finally leading and
    trailing hyphens and underscores are stripped. Non-ASCII word
    characters are kept.
    """
    normalized = unicodedata.normalize("NFKC", value).lower()
    cleaned = re.sub(r"[^\w\s-]", "", normalized)
    dashed = re.sub(r"[-\s]+", "-", cleaned)
    return dashed.strip("-_")

120 

121 

class ResolutionType(enum.Enum):
    """How a list of BeautifulSoup attribute values is reduced to one string."""

    # Concatenate every value of the list, separated by single spaces.
    JOIN = 1
    # Keep only the first value of the list.
    FIRST = 2

127 

128 

@overload
def attr_to_str(
    value: str | list[str],
    resolution_type: ResolutionType = ResolutionType.JOIN,
) -> str: ...


@overload
def attr_to_str(
    value: str | list[str] | None,
    resolution_type: ResolutionType = ResolutionType.JOIN,
) -> str | None: ...


def attr_to_str(
    value: str | list[str] | None,
    resolution_type: ResolutionType = ResolutionType.JOIN,
) -> str | None:
    """Reduce an attribute value to a single string.

    Args:
        value: A plain string, a list of strings, or ``None``.
        resolution_type: How a list is reduced. ``JOIN`` (the default) joins
            the items with single spaces; ``FIRST`` keeps the first item.

    Returns:
        ``None`` for ``None``; a non-list value unchanged; otherwise the
        reduced list. An empty list yields ``""`` for both strategies.
    """
    if value is None:
        return None

    if not isinstance(value, list):
        return value

    if resolution_type is ResolutionType.FIRST:
        # An empty list used to raise IndexError here; return "" so FIRST
        # agrees with what JOIN produces for an empty list.
        return value[0] if value else ""

    # JOIN. Returning a string unconditionally also guarantees that a list
    # can never fall through and be returned as-is.
    return " ".join(value)

158 

159 

def get_actual_tag_position(tag: bs4.Tag, position: int) -> int:
    """
    Translate an index among the element children of ``tag`` into an index
    among all of its children.

    Only direct children that are ``bs4.Tag`` instances are counted when
    locating the ``position``-th child, so ``NavigableString`` children are
    disregarded. The returned value is ``tag.index(...)`` of that child.
    If ``position`` is at or past the number of element children, the total
    number of children is returned (i.e. the index of the last child + 1).
    """
    element_children = [
        child for child in tag.find_all(recursive=False) if isinstance(child, bs4.Tag)
    ]

    if position < len(element_children):
        return tag.index(element_children[position])

    return len(list(tag.children))

175 

176 

def datetime_to_str(dt: datetime) -> str:
    """Format ``dt`` as a UTC ISO 8601 timestamp with a ``Z`` suffix.

    A datetime without ``tzinfo`` is first given a timezone through
    ``astimezone()`` with no argument. The value is then converted to UTC and
    rendered with second precision, with the ``+00:00`` offset replaced by
    ``Z``.
    """
    aware = dt.astimezone() if dt.tzinfo is None else dt
    stamp = aware.astimezone(timezone.utc).isoformat(timespec="seconds")
    return stamp.replace("+00:00", "Z")

184 

185 

186def get_epublib_version() -> str | None: 

187 try: 

188 return version("epublib") 

189 except PackageNotFoundError: 

190 return None