Coverage for src/atlus/atlus.py: 88%
162 statements
« prev ^ index » next coverage.py v7.6.0, created at 2024-07-29 19:46 -0400
« prev ^ index » next coverage.py v7.6.0, created at 2024-07-29 19:46 -0400
1"""Functions and tools to process the raw address strings."""
3from collections import Counter
4from typing import Union, List, Dict, Tuple
5from pydantic import ValidationError
6import usaddress
7import regex
9from .objects import Address
10from .resources import (
11 street_expand,
12 direction_expand,
13 name_expand,
14 state_expand,
15 saint_comp,
16 abbr_join_comp,
17 dir_fill_comp,
18 sr_comp,
19 usa_comp,
20 paren_comp,
21 grid_comp,
22 post_comp,
23 street_comp,
24)
toss_tags = [
    # free-text / contextual labels
    "Recipient",
    "IntersectionSeparator",
    "LandmarkName",
    # USPS box labels
    "USPSBoxGroupID",
    "USPSBoxGroupType",
    "USPSBoxID",
    "USPSBoxType",
    # occupancy type label (the identifier itself is kept via `osm_mapping`)
    "OccupancyType",
]
"""Labels produced by the `usaddress` package that are dropped from results."""
osm_mapping = {
    usaddress_field: osm_tag
    for osm_tag, usaddress_fields in (
        (
            "addr:housenumber",
            ("AddressNumber", "AddressNumberPrefix", "AddressNumberSuffix"),
        ),
        (
            "addr:street",
            (
                "StreetName",
                "StreetNamePreDirectional",
                "StreetNamePreModifier",
                "StreetNamePreType",
                "StreetNamePostDirectional",
                "StreetNamePostModifier",
                "StreetNamePostType",
            ),
        ),
        ("addr:unit", ("OccupancyIdentifier",)),
        ("addr:city", ("PlaceName",)),
        ("addr:state", ("StateName",)),
        ("addr:postcode", ("ZipCode",)),
    )
    for usaddress_field in usaddress_fields
}
"""Lookup from each `usaddress` label to the OSM tag it contributes to."""
def get_title(value: str, single_word: bool = False) -> str:
    """Title-case a string that is written entirely in capitals.

    Strings that are not fully upper-case are returned untouched. A fully
    upper-case string is converted only when it contains a space, or when
    `single_word` is set.

    ```python
    >>> get_title("PALM BEACH")
    # "Palm Beach"
    >>> get_title("BOSTON")
    # "BOSTON"
    >>> get_title("BOSTON", single_word=True)
    # "Boston"
    ```

    Args:
        value: String to fix.
        single_word: Convert the string even when it has no spaces.

    Returns:
        str: Fixed string.
    """
    if not value.isupper():
        return value
    if single_word or " " in value:
        # str.title() yields "Mchenry"; mc_replace restores the "McHenry" form
        return mc_replace(value.title())
    return value
def us_replace(value: str) -> str:
    """Normalize the spellings "U.S.", "U. S." and "U S " to "US".

    ```python
    >>> us_replace("U.S. Route 15")
    # "US Route 15"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # applied in this order; the spaced form keeps its trailing space
    substitutions = (
        ("U.S.", "US"),
        ("U. S.", "US"),
        ("U S ", "US "),
    )
    fixed = value
    for variant, canonical in substitutions:
        fixed = fixed.replace(variant, canonical)
    return fixed
def mc_replace(value: str) -> str:
    """Restore the capital letter that follows a "Mc" prefix.

    The string is split on whitespace, each word is fixed independently, and
    the words are re-joined with single spaces.

    ```python
    >>> mc_replace("Fort Mchenry")
    # "Fort McHenry"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """

    def _fix_word(word: str) -> str:
        # partition() leaves words without "Mc" unchanged (empty marker/tail)
        head, marker, tail = word.partition("Mc")
        return f"{head}{marker}{tail.capitalize()}"

    return " ".join(_fix_word(word) for word in value.split())
def ord_replace(value: str) -> str:
    """Lower-case the suffix of ordinals such as "3Rd" or "1ST".

    ```python
    >>> ord_replace("3Rd St. NW")
    # "3rd St. NW"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # digits followed by a two-letter suffix whose first letter is upper-case
    ordinal_pattern = r"(\b\d+[SNRT][tTdDhH]\b)"
    return regex.sub(ordinal_pattern, lambda found: found.group(1).lower(), value)
def name_street_expand(match: regex.Match) -> str:
    """Expand a matched street-type or name abbreviation.

    The first capture group is upper-cased and stripped of trailing periods,
    then looked up in `street_expand` and, failing that, in `name_expand`.
    `street_expand` therefore takes precedence when both define the key.

    Args:
        match (regex.Match): Match whose first group holds the abbreviation.

    Returns:
        str: Title-cased expansion.

    Raises:
        ValueError: If the captured abbreviation is empty.
        KeyError: If the abbreviation is in neither lookup table.
    """
    mat = match.group(1).upper().rstrip(".")
    if not mat:
        raise ValueError("Empty abbreviation match")
    # look up directly instead of merging both tables on every match
    if mat in street_expand:
        return street_expand[mat].title()
    return name_expand[mat].title()
def direct_expand(match: regex.Match) -> str:
    """Expand a matched directional abbreviation.

    The first capture group has its periods removed and is upper-cased before
    being looked up in `direction_expand`.

    Args:
        match (regex.Match): Match whose first group holds the abbreviation.

    Returns:
        str: Title-cased expansion.

    Raises:
        ValueError: If the captured abbreviation is empty.
    """
    key = match.group(1).replace(".", "").upper()
    if not key:
        raise ValueError
    return direction_expand[key].title()
def cap_match(match: regex.Match) -> str:
    """Upper-case the concatenated capture groups and drop any periods.

    Args:
        match (regex.Match): Match whose groups are joined.

    Returns:
        str: Upper-cased string without periods.
    """
    joined = "".join(match.groups())
    return joined.replace(".", "").upper()
def lower_match(match: regex.Match) -> str:
    """Return the first capture group in lower case.

    Args:
        match (regex.Match): Match whose first group is lower-cased.

    Returns:
        str: Lower-cased text of the first group.
    """
    return match[1].lower()
def grid_match(match_str: regex.Match) -> str:
    """Return the whole match upper-cased with its spaces removed."""
    return match_str.group(0).upper().replace(" ", "")
def abbrs(value: str) -> str:
    """Run the common abbreviation-expansion steps over a string.

    ```python
    >>> abbrs("St. Francis")
    # "Saint Francis"
    >>> abbrs("E St.")
    # "E Street"
    >>> abbrs("E Sewell St")
    # "East Sewell Street"
    ```

    Args:
        value (str): String to expand.

    Returns:
        str: Expanded string.
    """
    # casing fixes first: ALL-CAPS, "Mc" prefixes, "U.S." and ordinals
    expanded = get_title(value)
    expanded = mc_replace(expanded)
    expanded = us_replace(expanded)
    expanded = ord_replace(expanded)

    # change likely 'St' to 'Saint'
    expanded = saint_comp.sub("Saint", expanded)

    # expand common street and word abbreviations
    expanded = abbr_join_comp.sub(name_street_expand, expanded)

    # expand directionals
    expanded = dir_fill_comp.sub(direct_expand, expanded)

    # normalize 'US' again, after the expansions above
    expanded = us_replace(expanded)

    # uppercase shortened street descriptors
    expanded = regex.sub(r"\b(C[rh]|S[rh]|[FR]m|Us)\b", cap_match, expanded)

    # drop periods left after words of two or more letters
    expanded = regex.sub(r"([a-zA-Z]{2,})\.", r"\1", expanded)

    # expand 'SR' if no other street types
    expanded = sr_comp.sub("State Route", expanded)

    return expanded.strip(" .")
def remove_br_unicode(old: str) -> str:
    """Replace `<br/>` tags with commas and drop non-ASCII characters.

    Only the self-closing forms `<br/>` and `<br />` are replaced.

    Args:
        old (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    with_commas = regex.sub(r"<br ?/>", ",", old)
    ascii_only = regex.sub(r"[^\x00-\x7F\n\r\t]", "", with_commas)
    return ascii_only
def clean_address(address_string: str) -> str:
    """Normalize a raw address string before it is handed to the parser.

    Steps, in order:

    1. `remove_br_unicode`: `<br/>` tags become commas, non-ASCII is dropped.
    2. Runs of two or more spaces are collapsed to a single space.
    3. Leading/trailing spaces, commas and periods are stripped.
    4. Matches of `usa_comp` and `paren_comp` are removed (presumably a
       country suffix and parenthesised text; the patterns are defined in
       `resources`).
    5. Matches of `grid_comp` are rewritten by `grid_match` (spaces removed,
       upper-cased).

    Args:
        address_string (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    text = remove_br_unicode(address_string)
    # collapse repeated spaces, e.g. those left behind by removed characters
    text = regex.sub(r" {2,}", " ", text)
    text = usa_comp.sub("", text.strip(" ,."))
    text = paren_comp.sub("", text)
    return grid_comp.sub(grid_match, text)
def help_join(tags, keep: List[str]) -> str:
    """Join the values of `tags` whose keys are listed in `keep`.

    Values are joined with single spaces in the iteration order of `tags`,
    not in the order of `keep`.
    """
    return " ".join(value for field, value in tags.items() if field in keep)
def addr_street(tags: Dict[str, str]) -> str:
    """Build the street field from the street-related `usaddress` labels."""
    street_fields = [
        "StreetName",
        "StreetNamePreDirectional",
        "StreetNamePreModifier",
        "StreetNamePreType",
        "StreetNamePostDirectional",
        "StreetNamePostModifier",
        "StreetNamePostType",
    ]
    return help_join(tags, street_fields)
def addr_housenumber(tags: Dict[str, str]) -> str:
    """Build the housenumber field from the address-number `usaddress` labels."""
    number_fields = ["AddressNumberPrefix", "AddressNumber", "AddressNumberSuffix"]
    return help_join(tags, number_fields)
def _combine_consecutive_tuples(
    tuples_list: List[Tuple[str, str]]
) -> List[Tuple[str, str]]:
    """Merge neighbouring `(value, tag)` pairs that share the same tag.

    Values of a run of identical tags are joined with single spaces; empty
    values are skipped when joining.
    """
    merged = []
    run_tag = None
    run_value = None

    for piece, label in tuples_list:
        if label == run_tag:
            # same tag as the previous pair: extend the current run
            run_value = " ".join(part for part in (run_value, piece) if part)
            continue
        if run_tag:
            merged.append((run_value, run_tag))
        run_value, run_tag = piece, label

    # flush the final run
    if run_tag:
        merged.append((run_value, run_tag))

    return merged
def _manual_join(parsed: List[tuple]) -> Tuple[Dict[str, str], List[Union[str, None]]]:
    """Drop repeated labels and assemble OSM fields from the remaining ones.

    Pairs whose label is in `toss_tags` are ignored. A label that occurs more
    than once is not used; the OSM tag it maps to (via `osm_mapping`, `None`
    when unmapped) is reported in the second return value and that OSM field
    is left out of the result.

    Args:
        parsed: `(value, label)` pairs.

    Returns:
        The non-empty OSM fields, and the list of OSM tags that were removed.
    """
    kept = [pair for pair in parsed if pair[1] not in toss_tags]
    label_counts = Counter(pair[1] for pair in kept)
    unique_fields: Dict[str, str] = {
        pair[1]: pair[0] for pair in kept if label_counts[pair[1]] == 1
    }
    removed = [
        osm_mapping.get(label) for label, seen in label_counts.items() if seen > 1
    ]

    # evaluated lazily, in the order the fields appear in the result
    builders = (
        ("addr:street", lambda: addr_street(unique_fields)),
        ("addr:housenumber", lambda: addr_housenumber(unique_fields)),
        ("addr:unit", lambda: unique_fields.get("OccupancyIdentifier")),
        ("addr:city", lambda: unique_fields.get("PlaceName")),
        ("addr:state", lambda: unique_fields.get("StateName")),
        ("addr:postcode", lambda: unique_fields.get("ZipCode")),
    )

    joined: Dict[str, str] = {}
    for osm_tag, build in builders:
        if osm_tag in removed:
            continue
        built = build()
        if built:  # skip missing and empty fields
            joined[osm_tag] = built

    return joined, removed
def collapse_list(seq: list) -> list:
    """Drop repeated items from a list, keeping the first occurrence of each.

    ```python
    >>> collapse_list(["foo", "bar", "foo"])
    # ["foo", "bar"]
    ```

    Args:
        seq (list): The list to collapse; items must be hashable.

    Returns:
        list: The collapsed list, in original order.
    """
    # dict keys are unique and keep insertion order
    return list(dict.fromkeys(seq))
def split_unit(address_string: str) -> Dict[str, str]:
    """Split a trailing unit from a house number, if present.

    ```python
    >>> split_unit("123")
    # {"addr:housenumber": "123"}
    >>> split_unit("123-B")
    # {"addr:unit": "B", "addr:housenumber": "123"}
    >>> split_unit("N123W456")
    # {"addr:housenumber": "N123W456"}
    ```

    Args:
        address_string (str): The house-number text to split.

    Returns:
        Dict[str, str]: `addr:housenumber`, plus `addr:unit` when a unit
            follows the leading digits.
    """
    address_string = address_string.strip(" ")
    if not any(char.isalpha() for char in address_string):
        return {"addr:housenumber": address_string}

    # length of the leading run of digits
    split_at = 0
    while split_at < len(address_string) and address_string[split_at].isdigit():
        split_at += 1

    if split_at == 0:
        # no leading number: keep the value whole rather than emitting an
        # empty house number and treating everything as a unit
        return {"addr:housenumber": address_string}

    add_dict = {}
    unit = address_string[split_at:].lstrip(" -,/")
    if unit:
        add_dict["addr:unit"] = unit
    add_dict["addr:housenumber"] = address_string[:split_at]
    return add_dict
def remove_prefix(text: str, prefix: str) -> str:
    """Return `text` without a leading `prefix`; stand-in for Python 3.8."""
    return text[len(prefix) :] if text.startswith(prefix) else text
def get_address(
    address_string: str,
) -> Tuple[Dict[str, Union[str, int]], List[Union[str, None]]]:
    """Process address strings.

    ```python
    >>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
    # {"addr:housenumber": "345", "addr:street": "Maple Road",
    "addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
    >>> get_address("777 Strawberry St.")[0]
    # {"addr:housenumber": "777", "addr:street": "Strawberry Street"}
    >>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
    >>> address[0]
    # {"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
    >>> address[1]
    # ["addr:unit"]
    ```

    Args:
        address_string (str): The address string to process.

    Returns:
        Tuple[Dict[str, Union[str, int]], List[Union[str, None]]]:
            The processed address string and the removed fields.

    Raises:
        pydantic.ValidationError: If the address still fails validation after
            the fields reported as invalid have been removed.
    """
    removed: List[Union[str, None]]
    try:
        cleaned = usaddress.tag(
            clean_address(address_string), tag_mapping=osm_mapping
        )[0]
        removed = []
    except usaddress.RepeatedLabelError as err:
        # the parser saw a label twice: de-duplicate and join fields by hand
        collapsed = collapse_list(
            [(i[0].strip(" .,#"), i[1]) for i in err.parsed_string]
        )
        cleaned, removed = _manual_join(_combine_consecutive_tuples(collapsed))

    for toss in toss_tags:
        cleaned.pop(toss, None)

    if "addr:housenumber" in cleaned:
        cleaned = {**cleaned, **split_unit(cleaned["addr:housenumber"])}

    if "addr:street" in cleaned:
        street = abbrs(cleaned["addr:street"])
        cleaned["addr:street"] = street_comp.sub("Street", street).strip(".")

    if "addr:city" in cleaned:
        cleaned["addr:city"] = abbrs(get_title(cleaned["addr:city"], single_word=True))

    if "addr:state" in cleaned:
        old = cleaned["addr:state"].replace(".", "")
        state_upper = old.upper()
        if state_upper in state_expand:
            cleaned["addr:state"] = state_expand[state_upper]
        elif len(old) == 2 and state_upper in state_expand.values():
            cleaned["addr:state"] = state_upper

    if "addr:unit" in cleaned:
        # remove_prefix rather than str.removeprefix keeps Python 3.8 support
        cleaned["addr:unit"] = remove_prefix(cleaned["addr:unit"], "Space").strip(
            " #."
        )

    if "addr:postcode" in cleaned:
        # remove extraneous postcode digits
        cleaned["addr:postcode"] = post_comp.sub(
            r"\1", cleaned["addr:postcode"]
        ).replace(" ", "-")

    try:
        validated = Address.model_validate(dict(cleaned))
    except ValidationError as err:
        # drop the offending fields and validate once more; errors without a
        # location (empty `loc`) cannot be attributed to a field and are skipped
        bad_fields: list = [
            each["loc"][0] for each in err.errors() if each.get("loc")
        ]
        cleaned_ret = {
            key: value for key, value in cleaned.items() if key not in bad_fields
        }
        removed.extend(bad_fields)
        validated = Address.model_validate(cleaned_ret)

    return validated.model_dump(exclude_none=True, by_alias=True), removed
def get_phone(phone: str) -> str:
    """Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.

    An optional `+1`/`1` country prefix and optional parentheses around the
    area code are accepted. The digit groups may be separated by spaces,
    periods or hyphens only.

    ```python
    >>> get_phone("2029009019")
    # "+1 202-900-9019"
    >>> get_phone("(202) 900-9019")
    # "+1 202-900-9019"
    >>> get_phone("202-900-901")
    # ValueError: Invalid phone number: 202-900-901
    ```

    Args:
        phone (str): The phone number to format.

    Returns:
        str: The formatted phone number.

    Raises:
        ValueError: If the phone number is invalid.
    """
    # `[ .\-]` lists the separators explicitly; an unescaped `[ -.]` is a
    # character *range* from space to period and would also admit !"#$%&'()*+,
    phone_valid = regex.search(
        r"^\(?(?:\+? ?1?[ .\-]*)?(?:\(?(\d{3})\)?[ .\-]*)(\d{3})[ .\-]*(\d{4})$",
        phone,
    )
    if phone_valid:
        area, exchange, line = phone_valid.groups()
        return f"+1 {area}-{exchange}-{line}"
    raise ValueError(f"Invalid phone number: {phone}")