atlus.atlus
Functions and tools to process the raw address strings.
"""Functions and tools to process the raw address strings."""

from collections import Counter
from typing import OrderedDict
import usaddress
import regex
from .resources import (
    street_expand,
    direction_expand,
    name_expand,
    saints,
    state_expand,
)

toss_tags = [
    "Recipient",
    "IntersectionSeparator",
    "LandmarkName",
    "USPSBoxGroupID",
    "USPSBoxGroupType",
    "USPSBoxID",
    "USPSBoxType",
    "OccupancyType",
]
"""Tags from the `usaddress` package to remove."""

osm_mapping = {
    "AddressNumber": "addr:housenumber",
    "AddressNumberPrefix": "addr:housenumber",
    "AddressNumberSuffix": "addr:housenumber",
    "StreetName": "addr:street",
    "StreetNamePreDirectional": "addr:street",
    "StreetNamePreModifier": "addr:street",
    "StreetNamePreType": "addr:street",
    "StreetNamePostDirectional": "addr:street",
    "StreetNamePostModifier": "addr:street",
    "StreetNamePostType": "addr:street",
    "OccupancyIdentifier": "addr:unit",
    "PlaceName": "addr:city",
    "StateName": "addr:state",
    "ZipCode": "addr:postcode",
}
"""Mapping from `usaddress` fields to OSM tags."""

# Merged once at import time; used both to build the abbreviation regex and
# to look up expansions, instead of re-merging the dicts on every match.
_NAME_STREET_EXPAND = name_expand | street_expand


def get_title(value: str, single_word: bool = False) -> str:
    """Fix ALL-CAPS string.

    ```python
    >> get_title("PALM BEACH")
    # "Palm Beach"
    >> get_title("BOSTON")
    # "BOSTON"
    >> get_title("BOSTON", single_word=True)
    # "Boston"
    ```

    Args:
        value: String to fix.
        single_word: Whether the string should be fixed even if it is a single word.

    Returns:
        str: Fixed string.
    """
    if value.isupper() and (single_word or " " in value):
        return mc_replace(value.title())
    return value


def us_replace(value: str) -> str:
    """Fix string containing improperly formatted US.

    Rewrites `U.S.`, `U. S.` and `U S ` to `US`. The match must start at a
    word boundary so that a word that merely ends in "U" is left alone.

    ```python
    >> us_replace("U.S. Route 15")
    # "US Route 15"
    >> us_replace("BAYOU S MAIN")
    # "BAYOU S MAIN"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    return regex.sub(r"\bU(?:\. ?S\.| S(?= ))", "US", value)


def mc_replace(value: str) -> str:
    """Fix string containing improperly formatted Mc- prefix.

    Every `Mc` prefix in the string is fixed, not only the last one.

    ```python
    >> mc_replace("Fort Mchenry")
    # "Fort McHenry"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    return regex.sub(r"\bMc([a-z])", lambda m: "Mc" + m.group(1).upper(), value)


def ord_replace(value: str) -> str:
    """Fix string containing improperly capitalized ordinal.

    ```python
    >> ord_replace("3Rd St. NW")
    # "3rd St. NW"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    return regex.sub(r"(\b\d+[SNRT][tTdDhH]\b)", lower_match, value)


def name_street_expand(match: regex.Match) -> str:
    """Expand matched street type abbreviations.

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: Expanded string.

    Raises:
        ValueError: If the matched text is empty once the period is removed.
    """
    mat = match.group(1).upper().rstrip(".")
    if mat:
        return _NAME_STREET_EXPAND[mat].title()
    raise ValueError


def direct_expand(match: regex.Match) -> str:
    """Expand matched directional abbreviations.

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: Expanded string.

    Raises:
        ValueError: If the matched text is empty once periods are removed.
    """
    mat = match.group(1).upper().replace(".", "")
    if mat:
        return direction_expand[mat].title()
    raise ValueError


def cap_match(match: regex.Match) -> str:
    """Make matches uppercase.

    Only the text of the capture groups is kept, so the pattern this is used
    with must wrap the text to keep in a group.

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: Capitalized string with periods removed.
    """
    return "".join(match.groups()).upper().replace(".", "")


def lower_match(match: regex.Match) -> str:
    """Lower-case improperly cased ordinal values.

    Args:
        match (regex.Match): Matched ordinal.

    Returns:
        str: Lower-cased text of the first capture group.
    """
    return match.group(1).lower()


def grid_match(match_str: regex.Match) -> str:
    """Clean grid addresses."""
    return match_str.group(0).replace(" ", "").upper()


# pre-compile regex for speed
ABBR_JOIN = "|".join(_NAME_STREET_EXPAND)
abbr_join_comp = regex.compile(
    rf"(\b(?:{ABBR_JOIN})\b\.?)(?!')",
    flags=regex.IGNORECASE,
)

DIR_FILL = "|".join(r"\.?".join(list(abbr)) for abbr in direction_expand)
dir_fill_comp = regex.compile(
    rf"(?<!(?:^(?:Avenue) |[\.']))(\b(?:{DIR_FILL})\b\.?)(?!(?:\.?[a-zA-Z]| (?:Street|Avenue)))",
    flags=regex.IGNORECASE,
)

sr_comp = regex.compile(
    r"(\bS\.?R\b\.?)(?= \d+)",
    flags=regex.IGNORECASE,
)

saint_comp = regex.compile(
    rf"^(St\.?)(?= )|(\bSt\.?)(?= (?:{'|'.join(saints)}))",
    flags=regex.IGNORECASE,
)

street_comp = regex.compile(
    r"St\.?(?= [NESW]\.?[EW]?\.?)|(?<=\d[thndstr]{2} )St\.?\b|St\.?$"
)

post_comp = regex.compile(r"(\d{5})-?0{4}")

usa_comp = regex.compile(r",? (?:USA?|United States(?: of America)?|Canada)\b")

paren_comp = regex.compile(r" ?\(.*\)")

# match Wisconsin grid-style addresses: N65w25055, W249 N6620, etc.
grid_comp = regex.compile(
    r"\b([NnSs]\d{2,}\s*[EeWw]\d{2,}|[EeWw]\d{2,}\s*[NnSs]\d{2,})\b"
)


def abbrs(value: str) -> str:
    """Bundle most common abbreviation expansion functions.

    Args:
        value (str): String to expand.

    Returns:
        str: Expanded string.
    """
    value = ord_replace(us_replace(mc_replace(get_title(value))))

    # change likely 'St' to 'Saint'
    value = saint_comp.sub(
        "Saint",
        value,
    )

    # expand common street and word abbreviations
    value = abbr_join_comp.sub(
        name_street_expand,
        value,
    )

    # expand directionals
    value = dir_fill_comp.sub(
        direct_expand,
        value,
    )

    # normalize 'US'
    # The dots are escaped and the text is captured: cap_match() only keeps
    # capture groups, so an un-grouped pattern with wildcard dots would delete
    # the start of words such as "Upson".
    value = regex.sub(
        r"\b(U\.[Ss]\.)\B",
        cap_match,
        value,
    )

    # uppercase shortened street descriptors
    value = regex.sub(
        r"\b(C[rh]|S[rh]|[FR]m|Us)\b",
        cap_match,
        value,
    )

    # remove unremoved abbr periods
    value = regex.sub(
        r"([a-zA-Z]{2,})\.",
        r"\1",
        value,
    )

    # expand 'SR' if no other street types
    value = sr_comp.sub("State Route", value)
    return value.strip(" .")


def clean(old: str) -> str:
    """Clean the input string before sending to parser.

    HTML `<br/>` line breaks are replaced by commas and all non-ASCII
    characters are removed.

    Args:
        old (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    old = regex.sub(r"<br ?/>", ",", old)
    return regex.sub(r"[^\x00-\x7F\n\r\t]", "", old)  # remove unicode


def help_join(tags, keep: list[str]) -> str:
    """Help to join address fields.

    Args:
        tags: Mapping of field name to value.
        keep: Field names whose values should be joined.

    Returns:
        str: Values of the kept fields, in the mapping's order, joined by spaces.
    """
    tag_join: list[str] = [v for k, v in tags.items() if k in keep]
    return " ".join(tag_join)


def addr_street(tags: dict[str, str]) -> str:
    """Build the street field."""
    return help_join(
        tags,
        [
            "StreetName",
            "StreetNamePreDirectional",
            "StreetNamePreModifier",
            "StreetNamePreType",
            "StreetNamePostDirectional",
            "StreetNamePostModifier",
            "StreetNamePostType",
        ],
    )


def addr_housenumber(tags: dict[str, str]) -> str:
    """Build the housenumber field."""
    return help_join(
        tags, ["AddressNumberPrefix", "AddressNumber", "AddressNumberSuffix"]
    )


def _combine_consecutive_tuples(
    tuples_list: list[tuple[str, str]]
) -> list[tuple[str, str]]:
    """Join adjacent `usaddress` fields.

    Consecutive `(value, tag)` pairs that share a tag are merged into a single
    pair whose value is the space-joined, non-empty values.
    """
    combined_list = []
    current_tag = None
    current_value = None

    for value, tag in tuples_list:
        if tag != current_tag:
            if current_tag:
                combined_list.append((current_value, current_tag))
            current_value, current_tag = value, tag
        else:
            current_value = " ".join(i for i in [current_value, value] if i)

    if current_tag:
        combined_list.append((current_value, current_tag))

    return combined_list


def _manual_join(parsed: list[tuple]) -> tuple[dict[str, str], list[str | None]]:
    """Remove duplicates and join remaining fields.

    Args:
        parsed: `(value, tag)` pairs from the parser.

    Returns:
        The OSM tags that could be built from labels occurring exactly once,
        and the OSM tags (`None` for unmapped labels) of the repeated labels.
    """
    a = [i for i in parsed if i[1] not in toss_tags]
    counts = Counter([i[1] for i in a])
    ok_tags = [tag for tag, count in counts.items() if count == 1]
    ok_dict: dict[str, str] = {i[1]: i[0] for i in a if i[1] in ok_tags}
    removed = [osm_mapping.get(field) for field, count in counts.items() if count > 1]

    new_dict: dict[str, str | None] = {}
    if "addr:street" not in removed:
        new_dict["addr:street"] = addr_street(ok_dict)
    if "addr:housenumber" not in removed:
        new_dict["addr:housenumber"] = addr_housenumber(ok_dict)
    if "addr:unit" not in removed:
        new_dict["addr:unit"] = ok_dict.get("OccupancyIdentifier")
    if "addr:city" not in removed:
        new_dict["addr:city"] = ok_dict.get("PlaceName")
    if "addr:state" not in removed:
        new_dict["addr:state"] = ok_dict.get("StateName")
    if "addr:postcode" not in removed:
        new_dict["addr:postcode"] = ok_dict.get("ZipCode")

    # drop fields that ended up empty or missing
    return {k: v for k, v in new_dict.items() if v}, removed


def collapse_list(seq: list) -> list:
    """Remove duplicates in list while keeping order.

    ```python
    >> collapse_list(["foo", "bar", "foo"])
    # ["foo", "bar"]
    ```

    Args:
        seq (list): The list to collapse.

    Returns:
        list: The collapsed list.
    """
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]


def process(
    address_string: str,
) -> tuple[OrderedDict[str, str | int], list[str | None]]:
    """Process address strings.

    Args:
        address_string (str): The address string to process.

    Returns:
        tuple[OrderedDict[str, str | int], list[str | None]]:
            The processed address string and the removed fields.
    """
    address_string = clean(address_string)
    # collapse runs of spaces into a single space
    address_string = regex.sub(r" {2,}", " ", address_string).strip(" ,.")
    address_string = usa_comp.sub("", address_string)
    address_string = paren_comp.sub("", address_string)
    address_string = grid_comp.sub(grid_match, address_string)
    try:
        cleaned = usaddress.tag(address_string, tag_mapping=osm_mapping)[0]
        removed = []
    except usaddress.RepeatedLabelError as e:
        # the parser found a label more than once: de-duplicate and rebuild
        # the tags by hand, recording which fields had to be dropped
        collapsed = collapse_list([(i[0].strip(" .,#"), i[1]) for i in e.parsed_string])
        cleaned, removed = _manual_join(_combine_consecutive_tuples(collapsed))

    for toss in toss_tags:
        cleaned.pop(toss, None)

    if "addr:housenumber" in cleaned:
        # split a trailing letter suffix ("123A", "123-B") off into the unit
        suite = regex.match(r"([0-9]+)[- \/]?([a-zA-Z]+)", cleaned["addr:housenumber"])
        if suite:
            cleaned["addr:housenumber"] = suite.group(1)
            if "addr:unit" not in cleaned:
                cleaned["addr:unit"] = suite.group(2).upper()
            elif cleaned["addr:unit"] != suite.group(2).upper():
                # two conflicting units: keep neither
                cleaned.pop("addr:unit")
                removed.append("addr:unit")

    if "addr:street" in cleaned:
        street = abbrs(cleaned["addr:street"])
        cleaned["addr:street"] = street_comp.sub(
            "Street",
            street,
        ).strip(".")

    if "addr:city" in cleaned:
        cleaned["addr:city"] = abbrs(get_title(cleaned["addr:city"], single_word=True))

    if "addr:state" in cleaned:
        old = cleaned["addr:state"].replace(".", "")
        if old.upper() in state_expand:
            cleaned["addr:state"] = state_expand[old.upper()]
        elif len(old) == 2 and old.upper() in state_expand.values():
            cleaned["addr:state"] = old.upper()

    if "addr:unit" in cleaned:
        cleaned["addr:unit"] = cleaned["addr:unit"].removeprefix("Space").strip(" #.")

    if "addr:postcode" in cleaned:
        # remove extraneous postcode digits
        cleaned["addr:postcode"] = post_comp.sub(
            r"\1", cleaned["addr:postcode"]
        ).replace(" ", "-")

    return cleaned, removed


def phone_format(phone: str) -> str:
    """Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.

    ```python
    >> phone_format("2029009019")
    # "+1 202-900-9019"
    >> phone_format("(202) 900-9019")
    # "+1 202-900-9019"
    >> phone_format("202-900-901")
    # ValueError: Invalid phone number: 202-900-901
    ```

    Args:
        phone (str): The phone number to format.

    Returns:
        str: The formatted phone number.

    Raises:
        ValueError: If the phone number is invalid.
    """
    # Separators are space, period and hyphen. The hyphen is placed last in
    # the class so it is literal; "[ -.]" would be the range " " through ".",
    # which also admits characters such as "#", "$" and ",".
    phone_valid = regex.search(
        r"^\(?(?:\+? ?1?\)?[ .-]*)?(?:\(?(\d{3})\)?[ .-]*)(\d{3})[ .-]*(\d{4})$",
        phone,
    )
    if phone_valid:
        return (
            f"+1 {phone_valid.group(1)}-{phone_valid.group(2)}-{phone_valid.group(3)}"
        )
    raise ValueError(f"Invalid phone number: {phone}")
Mapping from `usaddress` fields to OSM tags.
def get_title(value: str, single_word: bool = False) -> str:
    """Convert an ALL-CAPS string to title case.

    Strings that are not entirely upper-case are returned untouched. A
    single upper-case word is only converted when `single_word` is set.

    ```python
    >> get_title("PALM BEACH")
    # "Palm Beach"
    >> get_title("BOSTON")
    # "BOSTON"
    >> get_title("BOSTON", single_word=True)
    # "Boston"
    ```

    Args:
        value: String to fix.
        single_word: Whether the string should be fixed even if it is a single word.

    Returns:
        str: Fixed string.
    """
    if not value.isupper():
        return value
    if not single_word and " " not in value:
        return value
    # title-casing flattens "MCHENRY" to "Mchenry"; restore the Mc- prefix
    return mc_replace(value.title())
Fix ALL-CAPS string.
>> get_title("PALM BEACH")
# "Palm Beach"
>> get_title("BOSTON")
# "BOSTON"
>> get_title("BOSTON", single_word=True)
# "Boston"
Arguments:
- value: String to fix.
- single_word: Whether the string should be fixed even if it is a single word.
Returns:
str: Fixed string.
def us_replace(value: str) -> str:
    """Fix string containing improperly formatted US.

    Rewrites `U.S.`, `U. S.` and `U S ` to `US`. The match must start at a
    word boundary, so a word that merely ends in "U" and is followed by a
    lone "S" is left alone.

    ```python
    >> us_replace("U.S. Route 15")
    # "US Route 15"
    >> us_replace("BAYOU S MAIN")
    # "BAYOU S MAIN"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    import re  # stdlib `re` is sufficient for this simple pattern

    # "U.S." / "U. S." are consumed whole; "U S" is only rewritten when a
    # space follows, and that space is kept.
    return re.sub(r"\bU(?:\. ?S\.| S(?= ))", "US", value)
Fix string containing improperly formatted US.
>> us_replace("U.S. Route 15")
# "US Route 15"
Arguments:
- value: String to fix.
Returns:
str: Fixed string.
def mc_replace(value: str) -> str:
    """Fix string containing improperly formatted Mc- prefix.

    The letter following every word-initial `Mc` is capitalized. All
    occurrences in the string are fixed, not only the last one.

    ```python
    >> mc_replace("Fort Mchenry")
    # "Fort McHenry"
    >> mc_replace("Mcdonald Mcgee Road")
    # "McDonald McGee Road"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    import re  # stdlib `re` is sufficient for this simple pattern

    return re.sub(r"\bMc([a-z])", lambda m: "Mc" + m.group(1).upper(), value)
Fix string containing improperly formatted Mc- prefix.
>> mc_replace("Fort Mchenry")
# "Fort McHenry"
Arguments:
- value: String to fix.
Returns:
str: Fixed string.
def ord_replace(value: str) -> str:
    """Lower-case the suffix of ordinals that were capitalized by mistake.

    ```python
    >> ord_replace("3Rd St. NW")
    # "3rd St. NW"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # digits followed by a two-letter suffix whose first letter is upper-case
    ordinal_pattern = r"(\b\d+[SNRT][tTdDhH]\b)"
    return regex.sub(ordinal_pattern, lambda found: found.group(1).lower(), value)
Fix string containing improperly capitalized ordinal.
>> ord_replace("3Rd St. NW")
# "3rd St. NW"
Arguments:
- value: String to fix.
Returns:
str: Fixed string.
def name_street_expand(match: regex.Match) -> str:
    """Replace a matched name or street-type abbreviation by its long form.

    Args:
        match (regex.Match): Match whose first group holds the abbreviation,
            optionally followed by a period.

    Returns:
        str: Title-cased expansion.

    Raises:
        ValueError: If the first group is empty once the period is removed.
    """
    abbreviation = match.group(1).upper().rstrip(".")
    if not abbreviation:
        raise ValueError
    expansions = name_expand | street_expand
    return expansions[abbreviation].title()
Expand matched street type abbreviations.
Arguments:
- match (regex.Match): Matched string.
Returns:
str: Expanded string.
def direct_expand(match: regex.Match) -> str:
    """Replace a matched directional abbreviation by its long form.

    Args:
        match (regex.Match): Match whose first group holds the abbreviation,
            possibly containing periods.

    Returns:
        str: Title-cased expansion.

    Raises:
        ValueError: If the first group is empty once periods are removed.
    """
    direction = match.group(1).upper().replace(".", "")
    if not direction:
        raise ValueError
    return direction_expand[direction].title()
Expand matched directional abbreviations.
Arguments:
- match (regex.Match): Matched string.
Returns:
str: Expanded string.
def cap_match(match: regex.Match) -> str:
    """Upper-case the captured text of a match and drop its periods.

    Only the capture groups are used, so text matched outside any group is
    not part of the result.

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: Capitalized string.
    """
    captured = "".join(match.groups())
    return captured.upper().replace(".", "")
Make matches uppercase.
Arguments:
- match (regex.Match): Matched string.
Returns:
str: Capitalized string.
def lower_match(match: regex.Match) -> str:
    """Lower-case improperly cased ordinal values.

    Args:
        match (regex.Match): Match whose first group holds the ordinal.

    Returns:
        str: Lower-cased text of the first group.
    """
    ordinal = match.group(1)
    return ordinal.lower()
Lower-case improperly cased ordinal values.
Arguments:
- match (regex.Match): Match whose first group holds the ordinal to lower-case.
Returns:
str: Fixed string.
def grid_match(match_str: regex.Match) -> str:
    """Normalize a grid address: remove its spaces and upper-case it."""
    whole = match_str.group(0)
    return whole.replace(" ", "").upper()
Clean grid addresses.
def abbrs(value: str) -> str:
    """Bundle most common abbreviation expansion functions.

    Args:
        value (str): String to expand.

    Returns:
        str: Expanded string.
    """
    value = ord_replace(us_replace(mc_replace(get_title(value))))

    # change likely 'St' to 'Saint'
    value = saint_comp.sub(
        "Saint",
        value,
    )

    # expand common street and word abbreviations
    value = abbr_join_comp.sub(
        name_street_expand,
        value,
    )

    # expand directionals
    value = dir_fill_comp.sub(
        direct_expand,
        value,
    )

    # normalize 'US'
    # The dots are escaped and the text is captured: cap_match() only keeps
    # capture groups, so an un-grouped pattern with wildcard dots would delete
    # the start of words such as "Upson".
    value = regex.sub(
        r"\b(U\.[Ss]\.)\B",
        cap_match,
        value,
    )

    # uppercase shortened street descriptors
    value = regex.sub(
        r"\b(C[rh]|S[rh]|[FR]m|Us)\b",
        cap_match,
        value,
    )

    # remove unremoved abbr periods
    value = regex.sub(
        r"([a-zA-Z]{2,})\.",
        r"\1",
        value,
    )

    # expand 'SR' if no other street types
    value = sr_comp.sub("State Route", value)
    return value.strip(" .")
Bundle most common abbreviation expansion functions.
Arguments:
- value (str): String to expand.
Returns:
str: Expanded string.
def clean(old: str) -> str:
    """Clean the input string before sending to parser.

    HTML line breaks are replaced by commas and all non-ASCII characters are
    removed. Newlines, carriage returns and tabs are kept.

    Args:
        old (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    # accept every common spelling of the tag: <br>, <br/>, <br />, <BR>
    old = regex.sub(r"<br ?/?>", ",", old, flags=regex.IGNORECASE)
    return regex.sub(r"[^\x00-\x7F]", "", old)  # remove non-ASCII characters
Clean the input string before sending to parser by replacing HTML line breaks with commas and removing non-ASCII characters.
Arguments:
- old (str): String to clean.
Returns:
str: Cleaned string.
def help_join(tags, keep: list[str]) -> str:
    """Join the values of selected address fields with spaces.

    Args:
        tags: Mapping of field name to value.
        keep: Field names whose values should be included.

    Returns:
        str: The kept values in the mapping's own order, separated by spaces.
    """
    return " ".join(value for key, value in tags.items() if key in keep)
Help to join address fields.
def addr_street(tags: dict[str, str]) -> str:
    """Build the street field from the street-related parser labels."""
    street_labels = [
        "StreetName",
        "StreetNamePreDirectional",
        "StreetNamePreModifier",
        "StreetNamePreType",
        "StreetNamePostDirectional",
        "StreetNamePostModifier",
        "StreetNamePostType",
    ]
    return help_join(tags, street_labels)
Build the street field.
def addr_housenumber(tags: dict[str, str]) -> str:
    """Build the housenumber field from the address-number parser labels."""
    number_labels = ["AddressNumberPrefix", "AddressNumber", "AddressNumberSuffix"]
    return help_join(tags, number_labels)
Build the housenumber field.
def collapse_list(seq: list) -> list:
    """Drop repeated items from a list, keeping the first of each.

    ```python
    >> collapse_list(["foo", "bar", "foo"])
    # ["foo", "bar"]
    ```

    Args:
        seq (list): The list to collapse.

    Returns:
        list: The collapsed list.
    """
    already_seen = set()
    unique = []
    for item in seq:
        if item not in already_seen:
            already_seen.add(item)
            unique.append(item)
    return unique
Remove duplicates in list while keeping order.
>> collapse_list(["foo", "bar", "foo"])
# ["foo", "bar"]
Arguments:
- seq (list): The list to collapse.
Returns:
list: The collapsed list.
def process(
    address_string: str,
) -> tuple[OrderedDict[str, str | int], list[str | None]]:
    """Process address strings.

    The string is cleaned, parsed with `usaddress` and each resulting OSM tag
    is normalized.

    Args:
        address_string (str): The address string to process.

    Returns:
        tuple[OrderedDict[str, str | int], list[str | None]]:
            The processed address string and the removed fields.
    """
    address_string = clean(address_string)
    # collapse runs of spaces into a single space
    address_string = regex.sub(r" {2,}", " ", address_string).strip(" ,.")
    address_string = usa_comp.sub("", address_string)
    address_string = paren_comp.sub("", address_string)
    address_string = grid_comp.sub(grid_match, address_string)
    try:
        # the string was already cleaned above
        cleaned = usaddress.tag(address_string, tag_mapping=osm_mapping)[0]
        removed = []
    except usaddress.RepeatedLabelError as e:
        # the parser found a label more than once: de-duplicate and rebuild
        # the tags by hand, recording which fields had to be dropped
        collapsed = collapse_list([(i[0].strip(" .,#"), i[1]) for i in e.parsed_string])
        cleaned, removed = _manual_join(_combine_consecutive_tuples(collapsed))

    for toss in toss_tags:
        cleaned.pop(toss, None)

    if "addr:housenumber" in cleaned:
        # split a trailing letter suffix ("123A", "123-B") off into the unit
        suite = regex.match(r"([0-9]+)[- \/]?([a-zA-Z]+)", cleaned["addr:housenumber"])
        if suite:
            cleaned["addr:housenumber"] = suite.group(1)
            if "addr:unit" not in cleaned:
                cleaned["addr:unit"] = suite.group(2).upper()
            elif cleaned["addr:unit"] != suite.group(2).upper():
                # two conflicting units: keep neither
                cleaned.pop("addr:unit")
                removed.append("addr:unit")

    if "addr:street" in cleaned:
        street = abbrs(cleaned["addr:street"])
        cleaned["addr:street"] = street_comp.sub(
            "Street",
            street,
        ).strip(".")

    if "addr:city" in cleaned:
        cleaned["addr:city"] = abbrs(get_title(cleaned["addr:city"], single_word=True))

    if "addr:state" in cleaned:
        old = cleaned["addr:state"].replace(".", "")
        if old.upper() in state_expand:
            cleaned["addr:state"] = state_expand[old.upper()]
        elif len(old) == 2 and old.upper() in state_expand.values():
            cleaned["addr:state"] = old.upper()

    if "addr:unit" in cleaned:
        cleaned["addr:unit"] = cleaned["addr:unit"].removeprefix("Space").strip(" #.")

    if "addr:postcode" in cleaned:
        # remove extraneous postcode digits
        cleaned["addr:postcode"] = post_comp.sub(
            r"\1", cleaned["addr:postcode"]
        ).replace(" ", "-")

    return cleaned, removed
Process address strings.
Arguments:
- address_string (str): The address string to process.
Returns:
tuple[OrderedDict[str, str | int], list[str | None]]: The processed address string and the removed fields.
def phone_format(phone: str) -> str:
    """Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.

    Digit groups may be separated by spaces, periods or hyphens. The country
    code `1` (optionally written `+1` or `(+1)`) and parentheses around the
    area code are accepted.

    ```python
    >> phone_format("2029009019")
    # "+1 202-900-9019"
    >> phone_format("(202) 900-9019")
    # "+1 202-900-9019"
    >> phone_format("202-900-901")
    # ValueError: Invalid phone number: 202-900-901
    ```

    Args:
        phone (str): The phone number to format.

    Returns:
        str: The formatted phone number.

    Raises:
        ValueError: If the phone number is invalid.
    """
    import re  # stdlib `re` is sufficient for this simple pattern

    # The hyphen is placed last in the separator class so it is literal;
    # "[ -.]" would be the range " " through ".", which also admits
    # characters such as "#", "$" and ",". The explicit "\)?" keeps "(+1)"
    # valid now that ")" is no longer swallowed by that range.
    phone_valid = re.search(
        r"^\(?(?:\+? ?1?\)?[ .-]*)?(?:\(?(\d{3})\)?[ .-]*)(\d{3})[ .-]*(\d{4})$",
        phone,
    )
    if phone_valid:
        return (
            f"+1 {phone_valid.group(1)}-{phone_valid.group(2)}-{phone_valid.group(3)}"
        )
    raise ValueError(f"Invalid phone number: {phone}")
Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.
>> phone_format("2029009019")
# "+1 202-900-9019"
>> phone_format("(202) 900-9019")
# "+1 202-900-9019"
>> phone_format("202-900-901")
# ValueError: Invalid phone number: 202-900-901
Arguments:
- phone (str): The phone number to format.
Returns:
str: The formatted phone number.
Raises:
- ValueError: If the phone number is invalid.