Coverage for src/atlus/atlus.py: 88%

162 statements  

« prev     ^ index     » next       coverage.py v7.6.0, created at 2024-07-29 19:46 -0400

1"""Functions and tools to process the raw address strings.""" 

2 

3from collections import Counter 

4from typing import Union, List, Dict, Tuple 

5from pydantic import ValidationError 

6import usaddress 

7import regex 

8 

9from .objects import Address 

10from .resources import ( 

11 street_expand, 

12 direction_expand, 

13 name_expand, 

14 state_expand, 

15 saint_comp, 

16 abbr_join_comp, 

17 dir_fill_comp, 

18 sr_comp, 

19 usa_comp, 

20 paren_comp, 

21 grid_comp, 

22 post_comp, 

23 street_comp, 

24) 

25 

toss_tags: List[str] = [
    "Recipient",
    "IntersectionSeparator",
    "LandmarkName",
    "USPSBoxGroupID",
    "USPSBoxGroupType",
    "USPSBoxID",
    "USPSBoxType",
    "OccupancyType",
]
"""`usaddress` labels that are discarded from parsed results."""

37 

osm_mapping: Dict[str, str] = {
    # the house number and its optional prefix/suffix share one OSM tag
    **dict.fromkeys(
        ("AddressNumber", "AddressNumberPrefix", "AddressNumberSuffix"),
        "addr:housenumber",
    ),
    # every component of the street name is folded into `addr:street`
    **dict.fromkeys(
        (
            "StreetName",
            "StreetNamePreDirectional",
            "StreetNamePreModifier",
            "StreetNamePreType",
            "StreetNamePostDirectional",
            "StreetNamePostModifier",
            "StreetNamePostType",
        ),
        "addr:street",
    ),
    "OccupancyIdentifier": "addr:unit",
    "PlaceName": "addr:city",
    "StateName": "addr:state",
    "ZipCode": "addr:postcode",
}
"""Lookup table translating `usaddress` labels into OSM `addr:*` tags."""

55 

56 

def get_title(value: str, single_word: bool = False) -> str:
    """Title-case a string that is written entirely in capitals.

    Strings that are not all upper-case are returned untouched. An all-caps
    string is only converted when it contains a space, unless `single_word`
    is set, in which case a lone word is converted too.

    ```python
    >>> get_title("PALM BEACH")
    # "Palm Beach"
    >>> get_title("BOSTON")
    # "BOSTON"
    >>> get_title("BOSTON", single_word=True)
    # "Boston"
    ```

    Args:
        value: String to fix.
        single_word: Convert the string even when it has no spaces.

    Returns:
        str: Fixed string.
    """
    if not value.isupper():
        return value
    if single_word or " " in value:
        # `str.title` flattens "MCHENRY" to "Mchenry"; mc_replace restores the
        # capital after the "Mc" prefix.
        return mc_replace(value.title())
    return value

79 

80 

def us_replace(value: str) -> str:
    """Normalize the spellings "U.S.", "U. S." and "U S " to "US".

    Each spelling is only rewritten when the "U" starts a word, so that a
    word merely ending in "U" followed by "S" (e.g. "PERU S MAIN ST") is
    left alone.

    ```python
    >>> us_replace("U.S. Route 15")
    # "US Route 15"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # dotted forms: "U.S." and "U. S."
    value = regex.sub(r"\bU\. ?S\.", "US", value)
    # spaced form; the trailing space is kept so the next word stays separate
    return regex.sub(r"\bU S ", "US ", value)

96 

97 

def mc_replace(value: str) -> str:
    """Fix string containing improperly formatted Mc- prefix.

    The letter following the first "Mc" in every whitespace-separated word is
    upper-cased. Runs of whitespace are collapsed to single spaces because the
    string is split and re-joined.

    ```python
    >>> mc_replace("Fort Mchenry")
    # "Fort McHenry"
    >>> mc_replace("Mckinley-Jones Road")
    # "McKinley-Jones Road"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    words = []
    for word in value.split():
        head, prefix, rest = word.partition("Mc")
        if rest.isupper():
            # "McDONALD" -> "McDonald": an all-caps remainder is title-cased.
            rest = rest.capitalize()
        else:
            # Only touch the first letter; str.capitalize() would lower-case
            # the rest and break names such as "Mckinley-Jones".
            rest = rest[:1].upper() + rest[1:]
        words.append(head + prefix + rest)
    return " ".join(words)

117 

118 

def ord_replace(value: str) -> str:
    """Lower-case the suffix of ordinals such as "3Rd" or "1ST".

    ```python
    >>> ord_replace("3Rd St. NW")
    # "3rd St. NW"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # digits followed by a two-letter suffix whose first letter is upper-case
    ordinal_pattern = r"(\b\d+[SNRT][tTdDhH]\b)"
    return regex.sub(ordinal_pattern, lower_match, value)

134 

135 

def name_street_expand(match: regex.Match) -> str:
    """Replace a matched name or street-type abbreviation with its long form.

    The first capture group is upper-cased, stripped of trailing periods and
    looked up in `name_expand` merged with `street_expand` (entries of
    `street_expand` win on duplicate keys).

    Args:
        match (regex.Match): Match whose first group holds the abbreviation.

    Returns:
        str: Title-cased expansion.

    Raises:
        ValueError: If the first group is empty once periods are stripped.
        KeyError: If the abbreviation is in neither lookup table.
    """
    key = match.group(1).upper().rstrip(".")
    if not key:
        raise ValueError
    expansions = {**name_expand, **street_expand}
    return expansions[key].title()

149 

150 

def direct_expand(match: regex.Match) -> str:
    """Replace a matched directional abbreviation with its long form.

    The first capture group is upper-cased, has all periods removed and is
    looked up in `direction_expand`.

    Args:
        match (regex.Match): Match whose first group holds the abbreviation.

    Returns:
        str: Title-cased expansion.

    Raises:
        ValueError: If the first group is empty once periods are removed.
        KeyError: If the abbreviation is not in `direction_expand`.
    """
    key = match.group(1).upper().replace(".", "")
    if not key:
        raise ValueError
    return direction_expand[key].title()

164 

165 

def cap_match(match: regex.Match) -> str:
    """Join all capture groups, drop periods and upper-case the result.

    Args:
        match (regex.Match): Match whose groups should be combined.

    Returns:
        str: Upper-cased string without periods.
    """
    joined = "".join(match.groups())
    return joined.replace(".", "").upper()

176 

177 

def lower_match(match: regex.Match) -> str:
    """Return the first capture group of a match in lower case.

    Used as the replacement callback for improperly cased ordinals.

    Args:
        match (regex.Match): Match whose first group should be lower-cased.

    Returns:
        str: Lower-cased text of the first group.
    """
    matched_text = match.group(1)
    return matched_text.lower()

188 

189 

def grid_match(match_str: regex.Match) -> str:
    """Return the whole match upper-cased with its spaces removed.

    Used as the replacement callback when cleaning grid addresses.
    """
    pieces = match_str.group(0).split(" ")
    return "".join(pieces).upper()

193 

194 

def abbrs(value: str) -> str:
    """Run the common abbreviation-expansion steps over a string.

    The steps are order-dependent and are applied exactly as listed in the
    body: casing fixes first, then "Saint", street/name abbreviations,
    directionals, and finally the clean-up passes.

    ```python
    >>> abbrs("St. Francis")
    # "Saint Francis"
    >>> abbrs("E St.")
    # "E Street"
    >>> abbrs("E Sewell St")
    # "East Sewell Street"
    ```

    Args:
        value (str): String to expand.

    Returns:
        str: Expanded string.
    """
    # casing and spelling fixes that the later patterns rely on
    text = get_title(value)
    text = mc_replace(text)
    text = us_replace(text)
    text = ord_replace(text)

    # a likely 'St' becomes 'Saint'
    text = saint_comp.sub("Saint", text)

    # common street and word abbreviations
    text = abbr_join_comp.sub(name_street_expand, text)

    # directionals
    text = dir_fill_comp.sub(direct_expand, text)

    # 'US' is normalized a second time, after the expansions above
    text = us_replace(text)

    # shortened street descriptors are written in capitals
    text = regex.sub(r"\b(C[rh]|S[rh]|[FR]m|Us)\b", cap_match, text)

    # drop periods still attached to words of two or more letters
    text = regex.sub(r"([a-zA-Z]{2,})\.", r"\1", text)

    # expand 'SR' if no other street types
    text = sr_comp.sub("State Route", text)

    return text.strip(" .")

253 

254 

def remove_br_unicode(old: str) -> str:
    """Clean the input string before sending it to the parser.

    HTML line breaks (`<br>`, `<br/>`, `<br />`, in any letter case) are
    turned into commas so the parts they separate stay delimited, and every
    non-ASCII character is then dropped. Plain newline characters are ASCII
    and are therefore left in place.

    Args:
        old (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    # self-closing slash is optional so that a bare "<br>" is handled too
    old = regex.sub(r"(?i)<br\s*/?>", ",", old)
    # keep only code points in the ASCII range
    return regex.sub(r"[^\x00-\x7F]", "", old)

266 

267 

def clean_address(address_string: str) -> str:
    """Normalize a raw address string before it is handed to the parser.

    The string goes through `remove_br_unicode`, is trimmed of surrounding
    spaces, commas and periods, and then has every `usa_comp` and
    `paren_comp` match deleted. Finally each `grid_comp` match is rewritten
    by `grid_match`.

    Args:
        address_string (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    # NOTE(review): as rendered here this replace swaps a space for a space,
    # i.e. it is a no-op; it presumably collapsed double spaces in the
    # original source - confirm against the un-rendered file.
    trimmed = remove_br_unicode(address_string).replace(" ", " ").strip(" ,.")
    # presumably removes a trailing country name - pattern lives in .resources
    without_country = usa_comp.sub("", trimmed)
    # presumably removes parenthesized remarks - pattern lives in .resources
    without_parens = paren_comp.sub("", without_country)
    return grid_comp.sub(grid_match, without_parens)

282 

283 

def help_join(tags, keep: List[str]) -> str:
    """Join the values of the wanted fields with single spaces.

    Values are emitted in the iteration order of `tags`, not in the order of
    `keep`.
    """
    return " ".join(value for field, value in tags.items() if field in keep)

288 

289 

def addr_street(tags: Dict[str, str]) -> str:
    """Build the street field from every street-name component in `tags`."""
    street_fields = [
        "StreetName",
        "StreetNamePreDirectional",
        "StreetNamePreModifier",
        "StreetNamePreType",
        "StreetNamePostDirectional",
        "StreetNamePostModifier",
        "StreetNamePostType",
    ]
    return help_join(tags, street_fields)

304 

305 

def addr_housenumber(tags: Dict[str, str]) -> str:
    """Build the housenumber field from the number and its prefix/suffix."""
    number_fields = ["AddressNumberPrefix", "AddressNumber", "AddressNumberSuffix"]
    return help_join(tags, number_fields)

311 

312 

def _combine_consecutive_tuples(
    tuples_list: List[Tuple[str, str]]
) -> List[Tuple[str, str]]:
    """Merge neighbouring `(value, tag)` pairs that carry the same tag.

    Values of a run of equal tags are joined with single spaces, skipping
    empty values. A pair that has no equal neighbour keeps its value
    untouched, and pairs with an empty tag are dropped.
    """
    # First pass: collect the values of each run of identical tags.
    runs: list = []
    for value, tag in tuples_list:
        if runs and runs[-1][0] == tag:
            runs[-1][1].append(value)
        else:
            runs.append((tag, [value]))

    # Second pass: collapse every run into a single pair.
    merged = []
    for tag, values in runs:
        if not tag:
            continue
        if len(values) == 1:
            merged.append((values[0], tag))
        else:
            merged.append((" ".join(v for v in values if v), tag))
    return merged

333 

334 

def _manual_join(parsed: List[tuple]) -> Tuple[Dict[str, str], List[Union[str, None]]]:
    """Build OSM address tags from parsed pairs, dropping repeated labels.

    Pairs whose label is in `toss_tags` are ignored. A label that occurs more
    than once is not used; its OSM tag (looked up in `osm_mapping`) is
    reported in the second return value and that whole OSM field is left out
    of the result.

    Args:
        parsed: `(value, label)` pairs.

    Returns:
        The non-empty OSM tags and the list of OSM tags that were removed.
    """
    kept = [pair for pair in parsed if pair[1] not in toss_tags]
    label_counts = Counter([pair[1] for pair in kept])

    unique_labels = [label for label, count in label_counts.items() if count == 1]
    unique_fields: Dict[str, str] = {
        pair[1]: pair[0] for pair in kept if pair[1] in unique_labels
    }
    removed = [
        osm_mapping.get(label) for label, count in label_counts.items() if count > 1
    ]

    # OSM tag -> how to derive it from the unique fields, in output order
    builders = (
        ("addr:street", addr_street),
        ("addr:housenumber", addr_housenumber),
        ("addr:unit", lambda fields: fields.get("OccupancyIdentifier")),
        ("addr:city", lambda fields: fields.get("PlaceName")),
        ("addr:state", lambda fields: fields.get("StateName")),
        ("addr:postcode", lambda fields: fields.get("ZipCode")),
    )

    joined: Dict[str, Union[str, None]] = {}
    for osm_tag, build in builders:
        if osm_tag not in removed:
            joined[osm_tag] = build(unique_fields)

    return {tag: value for tag, value in joined.items() if value}, removed

358 

359 

def collapse_list(seq: list) -> list:
    """Drop repeated items from a list, keeping the first occurrence of each.

    ```python
    >>> collapse_list(["foo", "bar", "foo"])
    # ["foo", "bar"]
    ```

    Args:
        seq (list): The list to collapse; items must be hashable.

    Returns:
        list: The collapsed list, in original order.
    """
    already_seen = set()
    unique_items = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items

377 

378 

def split_unit(address_string: str) -> Dict[str, str]:
    """Split a trailing unit off a house number, if present.

    ```python
    >>> split_unit("123 A")
    # {"addr:unit": "A", "addr:housenumber": "123"}
    >>> split_unit("123")
    # {"addr:housenumber": "123"}
    >>> split_unit("N123")
    # {"addr:housenumber": "N123"}
    ```

    Args:
        address_string (str): The house number, possibly followed by a unit.

    Returns:
        Dict[str, str]: `addr:housenumber`, plus `addr:unit` when a unit
            follows the leading digits.
    """
    address_string = address_string.strip(" ")
    if not any(char.isalpha() for char in address_string):
        return {"addr:housenumber": address_string}

    # index of the first character that is not part of the leading number
    digits_end = next(
        (idx for idx, char in enumerate(address_string) if not char.isdigit()),
        len(address_string),
    )
    if digits_end == 0:
        # No leading number to split on (e.g. "N123"): keep the value whole
        # rather than emitting an empty house number.
        return {"addr:housenumber": address_string}

    # The remainder always holds at least one letter (checked above), so the
    # unit can never be empty once the separators are stripped.
    unit = address_string[digits_end:].lstrip(" -,/")
    return {"addr:unit": unit, "addr:housenumber": address_string[:digits_end]}

399 

400 

def remove_prefix(text: str, prefix: str) -> str:
    """Return `text` without a leading `prefix` (helper for Python 3.8).

    The text is returned unchanged when it does not start with `prefix`.
    """
    return text[len(prefix) :] if text.startswith(prefix) else text

406 

407 

def get_address(
    address_string: str,
) -> Tuple[Dict[str, Union[str, int]], List[Union[str, None]]]:
    """Process address strings.

    ```python
    >>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
    # {"addr:housenumber": "345", "addr:street": "Maple Road",
    "addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
    >>> get_address("777 Strawberry St.")[0]
    # {"addr:housenumber": "777", "addr:street": "Strawberry Street"}
    >>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
    >>> address[0]
    # {"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
    >>> address[1]
    # ["addr:unit"]
    ```

    Args:
        address_string (str): The address string to process.

    Returns:
        Tuple[Dict[str, Union[str, int]], List[Union[str, None]]]:
            The processed address string and the removed fields.

    Raises:
        pydantic.ValidationError: If the address still fails validation after
            the offending fields have been dropped.
    """
    try:
        cleaned = usaddress.tag(
            clean_address(address_string), tag_mapping=osm_mapping
        )[0]
        removed = []
    except usaddress.RepeatedLabelError as err:
        # The parser saw a label more than once: de-duplicate and join the
        # raw pairs by hand instead of giving up.
        collapsed = collapse_list(
            [(i[0].strip(" .,#"), i[1]) for i in err.parsed_string]
        )
        cleaned, removed = _manual_join(_combine_consecutive_tuples(collapsed))

    for toss in toss_tags:
        cleaned.pop(toss, None)

    if "addr:housenumber" in cleaned:
        # A unit found inside the house number overrides a parsed unit.
        cleaned = {**cleaned, **split_unit(cleaned["addr:housenumber"])}

    if "addr:street" in cleaned:
        street = abbrs(cleaned["addr:street"])
        cleaned["addr:street"] = street_comp.sub("Street", street).strip(".")

    if "addr:city" in cleaned:
        cleaned["addr:city"] = abbrs(get_title(cleaned["addr:city"], single_word=True))

    if "addr:state" in cleaned:
        old = cleaned["addr:state"].replace(".", "")
        if old.upper() in state_expand:
            # a key of state_expand: replace it with the mapped value
            cleaned["addr:state"] = state_expand[old.upper()]
        elif len(old) == 2 and old.upper() in state_expand.values():
            # already one of the mapped two-letter values: just fix the casing
            cleaned["addr:state"] = old.upper()

    if "addr:unit" in cleaned:
        # `remove_prefix` rather than `str.removeprefix`, which only exists on
        # Python 3.9+, so this stays consistent with the 3.8 helper above.
        cleaned["addr:unit"] = remove_prefix(cleaned["addr:unit"], "Space").strip(
            " #."
        )

    if "addr:postcode" in cleaned:
        # remove extraneous postcode digits
        cleaned["addr:postcode"] = post_comp.sub(
            r"\1", cleaned["addr:postcode"]
        ).replace(" ", "-")

    try:
        validated: Address = Address.model_validate(dict(cleaned))
    except ValidationError as err:
        # Drop every field the model rejected and validate once more. Errors
        # with an empty location are skipped rather than indexed.
        bad_fields: list = [
            each["loc"][0] for each in err.errors() if each.get("loc")
        ]
        cleaned_ret = {
            key: value for key, value in dict(cleaned).items() if key not in bad_fields
        }

        removed.extend(bad_fields)
        validated = Address.model_validate(cleaned_ret)

    return validated.model_dump(exclude_none=True, by_alias=True), removed

488 

489 

def get_phone(phone: str) -> str:
    """Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.

    An optional `+1`/`1` country code and an optionally parenthesized area
    code are accepted. The parts may be separated by spaces, periods or
    hyphens only.

    ```python
    >>> get_phone("2029009019")
    # "+1 202-900-9019"
    >>> get_phone("(202) 900-9019")
    # "+1 202-900-9019"
    >>> get_phone("202-900-901")
    # ValueError: Invalid phone number: 202-900-901
    ```

    Args:
        phone (str): The phone number to format.

    Returns:
        str: The formatted phone number.

    Raises:
        ValueError: If the phone number is invalid.
    """
    # The hyphen sits last in each class so it is a literal: written as
    # "[ -.]" it forms a range from space to "." that also admits characters
    # such as "#", "*" and ",". The optional "\)?" keeps "(+1) ..." accepted.
    phone_valid = regex.search(
        r"^\(?(?:\+? ?1?\)?[ .-]*)?(?:\(?(\d{3})\)?[ .-]*)(\d{3})[ .-]*(\d{4})$",
        phone,
    )
    if phone_valid:
        area, exchange, line = phone_valid.groups()
        return f"+1 {area}-{exchange}-{line}"
    raise ValueError(f"Invalid phone number: {phone}")