atlus.atlus
Functions and tools to process the raw address strings.
"""Functions and tools to process the raw address strings."""

from collections import Counter
from typing import Union, List, Dict, Tuple
from pydantic import ValidationError
import usaddress
import regex

from .objects import Address
from .resources import (
    street_expand,
    direction_expand,
    name_expand,
    state_expand,
    saint_comp,
    abbr_join_comp,
    dir_fill_comp,
    sr_comp,
    usa_comp,
    paren_comp,
    grid_comp,
    post_comp,
    street_comp,
)

toss_tags = [
    "Recipient",
    "IntersectionSeparator",
    "LandmarkName",
    "USPSBoxGroupID",
    "USPSBoxGroupType",
    "USPSBoxID",
    "USPSBoxType",
    "OccupancyType",
]
"""Tags from the `usaddress` package to remove."""

osm_mapping = {
    "AddressNumber": "addr:housenumber",
    "AddressNumberPrefix": "addr:housenumber",
    "AddressNumberSuffix": "addr:housenumber",
    "StreetName": "addr:street",
    "StreetNamePreDirectional": "addr:street",
    "StreetNamePreModifier": "addr:street",
    "StreetNamePreType": "addr:street",
    "StreetNamePostDirectional": "addr:street",
    "StreetNamePostModifier": "addr:street",
    "StreetNamePostType": "addr:street",
    "OccupancyIdentifier": "addr:unit",
    "PlaceName": "addr:city",
    "StateName": "addr:state",
    "ZipCode": "addr:postcode",
}
"""Mapping from `usaddress` fields to OSM tags."""


def get_title(value: str, single_word: bool = False) -> str:
    """Fix ALL-CAPS string.

    ```python
    >>> get_title("PALM BEACH")
    # "Palm Beach"
    >>> get_title("BOSTON")
    # "BOSTON"
    >>> get_title("BOSTON", single_word=True)
    # "Boston"
    ```

    Args:
        value: String to fix.
        single_word: Whether the string should be fixed even if it is a single word.

    Returns:
        str: Fixed string.
    """
    # Only all-upper-case input is touched; a lone word is left alone unless
    # the caller opts in with `single_word`.
    if (value.isupper() and " " in value) or (value.isupper() and single_word):
        return mc_replace(value.title())
    return value


def us_replace(value: str) -> str:
    """Fix string containing improperly formatted US.

    ```python
    >>> us_replace("U.S. Route 15")
    # "US Route 15"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # NOTE(review): these are plain substring replacements, so "U S " is also
    # rewritten when the "U" is the last letter of a longer word.
    return value.replace("U.S.", "US").replace("U. S.", "US").replace("U S ", "US ")


def mc_replace(value: str) -> str:
    """Fix string containing improperly formatted Mc- prefix.

    ```python
    >>> mc_replace("Fort Mchenry")
    # "Fort McHenry"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    words = []
    # `split()` without arguments also collapses runs of whitespace.
    for word in value.split():
        mc_match = word.partition("Mc")
        # NOTE(review): `capitalize()` lower-cases everything after the first
        # character of the remainder, not just upper-cases the first one.
        words.append(mc_match[0] + mc_match[1] + mc_match[2].capitalize())
    return " ".join(words)


def ord_replace(value: str) -> str:
    """Fix string containing improperly capitalized ordinal.

    ```python
    >>> ord_replace("3Rd St. NW")
    # "3rd St. NW"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # The first suffix letter must be upper-case for the pattern to match.
    return regex.sub(r"(\b\d+[SNRT][tTdDhH]\b)", lower_match, value)


def name_street_expand(match: regex.Match) -> str:
    """Expand matched street type abbreviations.

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: Expanded string.

    Raises:
        ValueError: If the first group is empty once periods are stripped.
        KeyError: If the abbreviation is in neither lookup table.
    """
    mat = match.group(1).upper().rstrip(".")
    if mat:
        # `street_expand` entries win over `name_expand` on duplicate keys.
        return ({**name_expand, **street_expand})[mat].title()
    raise ValueError


def direct_expand(match: regex.Match) -> str:
    """Expand matched directional abbreviations.

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: Expanded string.

    Raises:
        ValueError: If the first group is empty once periods are removed.
        KeyError: If the abbreviation is not in `direction_expand`.
    """
    mat = match.group(1).upper().replace(".", "")
    if mat:
        return direction_expand[mat].title()
    raise ValueError


def cap_match(match: regex.Match) -> str:
    """Make matches uppercase.

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: All captured groups joined, upper-cased, with periods removed.
    """
    return "".join(match.groups()).upper().replace(".", "")


def lower_match(match: regex.Match) -> str:
    """Lower-case improperly cased ordinal values.

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: The first captured group in lower case.
    """
    return match.group(1).lower()


def grid_match(match_str: regex.Match) -> str:
    """Clean grid addresses.

    Args:
        match_str (regex.Match): Matched string.

    Returns:
        str: The whole match with spaces removed, in upper case.
    """
    return match_str.group(0).replace(" ", "").upper()


def abbrs(value: str) -> str:
    """Bundle most common abbreviation expansion functions.

    ```python
    >>> abbrs("St. Francis")
    # "Saint Francis"
    >>> abbrs("E St.")
    # "E Street"
    >>> abbrs("E Sewell St")
    # "East Sewell Street"
    ```

    Args:
        value (str): String to expand.

    Returns:
        str: Expanded string.
    """
    value = ord_replace(us_replace(mc_replace(get_title(value))))

    # change likely 'St' to 'Saint'
    value = saint_comp.sub(
        "Saint",
        value,
    )

    # expand common street and word abbreviations
    value = abbr_join_comp.sub(
        name_street_expand,
        value,
    )

    # expand directionals
    value = dir_fill_comp.sub(
        direct_expand,
        value,
    )

    # normalize 'US'
    value = us_replace(value)

    # uppercase shortened street descriptors
    value = regex.sub(
        r"\b(C[rh]|S[rh]|[FR]m|Us)\b",
        cap_match,
        value,
    )

    # remove unremoved abbr periods
    value = regex.sub(
        r"([a-zA-Z]{2,})\.",
        r"\1",
        value,
    )

    # expand 'SR' if no other street types
    value = sr_comp.sub("State Route", value)
    return value.strip(" .")


def remove_br_unicode(old: str) -> str:
    """Replace `<br/>` tags with commas and remove non-ASCII characters.

    Args:
        old (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    # Only `<br/>` and `<br />` are matched; a bare `<br>` is left in place.
    old = regex.sub(r"<br ?/>", ",", old)
    return regex.sub(r"[^\x00-\x7F\n\r\t]", "", old)  # remove unicode


def clean_address(address_string: str) -> str:
    """Clean the input string before sending it to the parser.

    Runs `remove_br_unicode`, trims surrounding spaces, commas and periods,
    then applies the `usa_comp`, `paren_comp` and `grid_comp` patterns
    (presumably removing a country suffix and parenthesized text and
    normalizing grid-style addresses; confirm against `resources`).

    Args:
        address_string (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    # NOTE(review): as written, `.replace(" ", " ")` swaps a single space for
    # a single space and changes nothing; a double-space pattern may have
    # been intended. Confirm against the original file.
    address_string = usa_comp.sub(
        "", remove_br_unicode(address_string).replace(" ", " ").strip(" ,.")
    )
    address_string = paren_comp.sub("", address_string)
    return grid_comp.sub(grid_match, address_string)


def help_join(tags, keep: List[str]) -> str:
    """Help to join address fields.

    Values are joined in the iteration order of `tags`, not the order of
    `keep`.
    """
    tag_join: List[str] = [v for k, v in tags.items() if k in keep]
    return " ".join(tag_join)


def addr_street(tags: Dict[str, str]) -> str:
    """Build the street field."""
    return help_join(
        tags,
        [
            "StreetName",
            "StreetNamePreDirectional",
            "StreetNamePreModifier",
            "StreetNamePreType",
            "StreetNamePostDirectional",
            "StreetNamePostModifier",
            "StreetNamePostType",
        ],
    )


def addr_housenumber(tags: Dict[str, str]) -> str:
    """Build the housenumber field."""
    return help_join(
        tags, ["AddressNumberPrefix", "AddressNumber", "AddressNumberSuffix"]
    )


def _combine_consecutive_tuples(
    tuples_list: List[Tuple[str, str]]
) -> List[Tuple[str, str]]:
    """Join adjacent `usaddress` fields.

    Consecutive `(value, tag)` pairs that share a tag are merged into one
    pair whose value is the space-joined non-empty values.
    """
    combined_list = []
    current_tag = None
    current_value = None

    for value, tag in tuples_list:
        if tag != current_tag:
            # Tag changed: flush the run collected so far, then start a new one.
            if current_tag:
                combined_list.append((current_value, current_tag))
            current_value, current_tag = value, tag
        else:
            current_value = " ".join(i for i in [current_value, value] if i)

    # Flush the final run.
    if current_tag:
        combined_list.append((current_value, current_tag))

    return combined_list


def _manual_join(parsed: List[tuple]) -> Tuple[Dict[str, str], List[Union[str, None]]]:
    """Remove duplicates and join remaining fields.

    Args:
        parsed: `(value, tag)` pairs as produced by `usaddress`.

    Returns:
        The OSM-keyed fields built from tags that occur exactly once, and the
        OSM keys of the tags that occurred more than once.
    """
    parsed_clean = [i for i in parsed if i[1] not in toss_tags]
    counts = Counter([i[1] for i in parsed_clean])
    ok_tags = [tag for tag, count in counts.items() if count == 1]
    ok_dict: Dict[str, str] = {i[1]: i[0] for i in parsed_clean if i[1] in ok_tags}
    # `osm_mapping.get` yields None for tags that have no OSM equivalent.
    removed = [osm_mapping.get(field) for field, count in counts.items() if count > 1]

    new_dict: Dict[str, Union[str, None]] = {}
    if "addr:street" not in removed:
        new_dict["addr:street"] = addr_street(ok_dict)
    if "addr:housenumber" not in removed:
        new_dict["addr:housenumber"] = addr_housenumber(ok_dict)
    if "addr:unit" not in removed:
        new_dict["addr:unit"] = ok_dict.get("OccupancyIdentifier")
    if "addr:city" not in removed:
        new_dict["addr:city"] = ok_dict.get("PlaceName")
    if "addr:state" not in removed:
        new_dict["addr:state"] = ok_dict.get("StateName")
    if "addr:postcode" not in removed:
        new_dict["addr:postcode"] = ok_dict.get("ZipCode")

    # Drop keys whose value is empty or None.
    return {k: v for k, v in new_dict.items() if v}, removed


def collapse_list(seq: list) -> list:
    """Remove duplicates in list while keeping order.

    ```python
    >>> collapse_list(["foo", "bar", "foo"])
    # ["foo", "bar"]
    ```

    Args:
        seq (list): The list to collapse.

    Returns:
        list: The collapsed list.
    """
    seen = set()
    seen_add = seen.add
    # `seen_add` returns None, so the `or` both records and keeps new items.
    return [x for x in seq if not (x in seen or seen_add(x))]


def split_unit(address_string: str) -> Dict[str, str]:
    """Split unit from address string, if present.

    A string without any letters is returned whole as the house number;
    otherwise the leading digits become the house number and the rest the
    unit.
    """
    address_string = address_string.strip(" ")
    if not any(char.isalpha() for char in address_string):
        return {"addr:housenumber": address_string}

    add_dict = {}
    number = ""
    # Collect the leading run of digits.
    for char in address_string:
        if char.isdigit():
            number += char
        else:
            break

    unit = remove_prefix(address_string, number).lstrip(" -,/")
    if unit:
        add_dict["addr:unit"] = unit
    add_dict["addr:housenumber"] = number

    return add_dict


def remove_prefix(text: str, prefix: str) -> str:
    """Remove prefix from string for Python 3.8."""
    if text.startswith(prefix):
        return text[len(prefix) :]
    return text


def get_address(
    address_string: str,
) -> Tuple[Dict[str, Union[str, int]], List[Union[str, None]]]:
    """Process address strings.

    ```python
    >>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
    # {"addr:housenumber": "345", "addr:street": "Maple Road",
    "addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
    >>> get_address("777 Strawberry St.")[0]
    # {"addr:housenumber": "777", "addr:street": "Strawberry Street"}
    >>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
    >>> address[0]
    # {"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
    >>> address[1]
    # ["addr:unit"]
    ```

    Args:
        address_string (str): The address string to process.

    Returns:
        Tuple[Dict[str, Union[str, int]], List[Union[str, None]]]:
            The processed address string and the removed fields.
    """
    try:
        cleaned = usaddress.tag(clean_address(address_string), tag_mapping=osm_mapping)[
            0
        ]
        removed = []
    except usaddress.RepeatedLabelError as err:
        # The parser saw a label more than once: rebuild the fields by hand
        # and record which OSM keys had to be dropped.
        collapsed = collapse_list(
            [(i[0].strip(" .,#"), i[1]) for i in err.parsed_string]
        )
        cleaned, removed = _manual_join(_combine_consecutive_tuples(collapsed))

    for toss in toss_tags:
        cleaned.pop(toss, None)

    if "addr:housenumber" in cleaned:
        cleaned = {**cleaned, **split_unit(cleaned["addr:housenumber"])}

    if "addr:street" in cleaned:
        street = abbrs(cleaned["addr:street"])
        cleaned["addr:street"] = street_comp.sub(
            "Street",
            street,
        ).strip(".")

    if "addr:city" in cleaned:
        cleaned["addr:city"] = abbrs(get_title(cleaned["addr:city"], single_word=True))

    if "addr:state" in cleaned:
        old = cleaned["addr:state"].replace(".", "")
        if old.upper() in state_expand:
            cleaned["addr:state"] = state_expand[old.upper()]
        elif len(old) == 2 and old.upper() in list(state_expand.values()):
            cleaned["addr:state"] = old.upper()

    if "addr:unit" in cleaned:
        # NOTE(review): `str.removeprefix` needs Python 3.9+, while this
        # module defines `remove_prefix` for Python 3.8.
        cleaned["addr:unit"] = cleaned["addr:unit"].removeprefix("Space").strip(" #.")

    if "addr:postcode" in cleaned:
        # remove extraneous postcode digits
        cleaned["addr:postcode"] = post_comp.sub(
            r"\1", cleaned["addr:postcode"]
        ).replace(" ", "-")

    try:
        validated: Address = Address.model_validate(dict(cleaned))
    except ValidationError as err:
        # Drop every field the model rejected and validate what is left.
        bad_fields: list = [each.get("loc", [])[0] for each in err.errors()]
        cleaned_ret = dict(cleaned)
        for each in bad_fields:
            cleaned_ret.pop(each, None)

        removed.extend(bad_fields)
        validated: Address = Address.model_validate(cleaned_ret)

    return validated.model_dump(exclude_none=True, by_alias=True), removed


def get_phone(phone: str) -> str:
    """Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.

    ```python
    >>> get_phone("2029009019")
    # "+1 202-900-9019"
    >>> get_phone("(202) 900-9019")
    # "+1 202-900-9019"
    >>> get_phone("202-900-901")
    # ValueError: Invalid phone number: 202-900-901
    ```

    Args:
        phone (str): The phone number to format.

    Returns:
        str: The formatted phone number.

    Raises:
        ValueError: If the phone number is invalid.
    """
    # NOTE(review): inside a character class `[ -.]` is the range from space
    # to period, so it also accepts characters such as `(`, `)`, `+` and `,`.
    phone_valid = regex.search(
        r"^\(?(?:\+? ?1?[ -.]*)?(?:\(?(\d{3})\)?[ -.]*)(\d{3})[ -.]*(\d{4})$",
        phone,
    )
    if phone_valid:
        return (
            f"+1 {phone_valid.group(1)}-{phone_valid.group(2)}-{phone_valid.group(3)}"
        )
    raise ValueError(f"Invalid phone number: {phone}")
Mapping from `usaddress` fields to OSM tags.
def get_title(value: str, single_word: bool = False) -> str:
    """Convert an ALL-CAPS string to title case.

    ```python
    >>> get_title("PALM BEACH")
    # "Palm Beach"
    >>> get_title("BOSTON")
    # "BOSTON"
    >>> get_title("BOSTON", single_word=True)
    # "Boston"
    ```

    Args:
        value: String to fix.
        single_word: Whether a string without spaces should be fixed too.

    Returns:
        str: The title-cased string (with "Mc" prefixes repaired), or the
            input unchanged when it is not entirely upper case.
    """
    # Anything that is not fully upper case is assumed to be cased already.
    if not value.isupper():
        return value
    # A lone upper-case word is only converted when the caller opts in.
    if single_word or " " in value:
        return mc_replace(value.title())
    return value
Fix ALL-CAPS string.
>>> get_title("PALM BEACH")
# "Palm Beach"
>>> get_title("BOSTON")
# "BOSTON"
>>> get_title("BOSTON", single_word=True)
# "Boston"
Arguments:
- value: String to fix.
- single_word: Whether the string should be fixed even if it is a single word.
Returns:
str: Fixed string.
def us_replace(value: str) -> str:
    """Fix string containing improperly formatted US.

    ```python
    >>> us_replace("U.S. Route 15")
    # "US Route 15"
    >>> us_replace("U S Route 15")
    # "US Route 15"
    ```

    "U.S." and "U. S." are always rewritten to "US". A spaced-out "U S " is
    only joined when the "U" stands on its own, so a word that merely ends in
    "U" (for example "PERU S Main") is left untouched.

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    value = value.replace("U.S.", "US").replace("U. S.", "US")

    pieces = value.split("U S ")
    fixed = pieces[0]
    for piece in pieces[1:]:
        # A letter or digit right before the "U" means it is the tail of
        # another word, not the abbreviation, so the text is put back as is.
        separator = "U S " if fixed[-1:].isalnum() else "US "
        fixed += separator + piece
    return fixed
Fix string containing improperly formatted US.
>>> us_replace("U.S. Route 15")
# "US Route 15"
Arguments:
- value: String to fix.
Returns:
str: Fixed string.
def mc_replace(value: str) -> str:
    """Fix string containing improperly formatted Mc- prefix.

    ```python
    >>> mc_replace("Fort Mchenry")
    # "Fort McHenry"
    >>> mc_replace("McDonald-Smith Road")
    # "McDonald-Smith Road"
    ```

    Only the letter directly after the first "Mc" in each word is
    upper-cased; the rest of the word keeps its casing, so names that are
    already correct are not damaged. Runs of whitespace are collapsed to
    single spaces.

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    words = []
    for word in value.split():
        head, prefix, tail = word.partition("Mc")
        # `tail[:1]` is empty when the word has no "Mc" or ends with it.
        words.append(head + prefix + tail[:1].upper() + tail[1:])
    return " ".join(words)
Fix string containing improperly formatted Mc- prefix.
>>> mc_replace("Fort Mchenry")
# "Fort McHenry"
Arguments:
- value: String to fix.
Returns:
str: Fixed string.
def ord_replace(value: str) -> str:
    """Lower-case the suffix of a badly capitalized ordinal.

    ```python
    >>> ord_replace("3Rd St. NW")
    # "3rd St. NW"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # digits followed by a two-letter suffix whose first letter is upper case
    ordinal_pattern = r"(\b\d+[SNRT][tTdDhH]\b)"
    return regex.sub(ordinal_pattern, lambda found: found.group(1).lower(), value)
Fix string containing improperly capitalized ordinal.
>>> ord_replace("3Rd St. NW")
# "3rd St. NW"
Arguments:
- value: String to fix.
Returns:
str: Fixed string.
def name_street_expand(match: regex.Match) -> str:
    """Expand matched street type abbreviations.

    Args:
        match (regex.Match): Matched string; group 1 holds the abbreviation.

    Returns:
        str: Expanded string in title case.

    Raises:
        ValueError: If group 1 is empty once trailing periods are stripped.
        KeyError: If the abbreviation is in neither lookup table.
    """
    mat = match.group(1).upper().rstrip(".")
    if not mat:
        raise ValueError("matched abbreviation is empty")
    # Look the key up directly instead of merging both tables on every match;
    # `street_expand` keeps precedence over `name_expand` on duplicate keys.
    table = street_expand if mat in street_expand else name_expand
    return table[mat].title()
Expand matched street type abbreviations.
Arguments:
- match (regex.Match): Matched string.
Returns:
str: Expanded string.
def direct_expand(match: regex.Match) -> str:
    """Expand a matched directional abbreviation.

    Args:
        match (regex.Match): Matched string; group 1 holds the abbreviation.

    Returns:
        str: Expanded string in title case.

    Raises:
        ValueError: If group 1 is empty once periods are removed.
    """
    abbreviation = match.group(1).upper().replace(".", "")
    if not abbreviation:
        raise ValueError
    expanded = direction_expand[abbreviation]
    return expanded.title()
Expand matched directional abbreviations.
Arguments:
- match (regex.Match): Matched string.
Returns:
str: Expanded string.
def cap_match(match: regex.Match) -> str:
    """Upper-case a match.

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: All captured groups joined together, upper-cased, with periods
            removed.
    """
    joined = "".join(match.groups())
    shouted = joined.upper()
    return shouted.replace(".", "")
Make matches uppercase.
Arguments:
- match (regex.Match): Matched string.
Returns:
str: Capitalized string.
def lower_match(match: regex.Match) -> str:
    """Lower-case improperly cased ordinal values.

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: The first captured group in lower case.
    """
    captured = match.group(1)
    return captured.lower()
Lower-case improperly cased ordinal values.
Arguments:
- match (regex.Match): Matched string.
Returns:
str: The first captured group in lower case.
def grid_match(match_str: regex.Match) -> str:
    """Clean a grid address.

    Args:
        match_str (regex.Match): Matched string.

    Returns:
        str: The whole match with spaces removed, in upper case.
    """
    whole = match_str.group(0)
    compact = whole.replace(" ", "")
    return compact.upper()
Clean grid addresses.
def abbrs(value: str) -> str:
    """Run the common abbreviation expansions over a string.

    ```python
    >>> abbrs("St. Francis")
    # "Saint Francis"
    >>> abbrs("E St.")
    # "E Street"
    >>> abbrs("E Sewell St")
    # "East Sewell Street"
    ```

    Args:
        value (str): String to expand.

    Returns:
        str: Expanded string, stripped of surrounding spaces and periods.
    """
    # casing first: ALL-CAPS, "Mc" prefixes, "US" spellings, ordinals
    text = get_title(value)
    text = mc_replace(text)
    text = us_replace(text)
    text = ord_replace(text)

    # a likely 'St' becomes 'Saint'
    text = saint_comp.sub("Saint", text)
    # common street and word abbreviations
    text = abbr_join_comp.sub(name_street_expand, text)
    # directionals
    text = dir_fill_comp.sub(direct_expand, text)
    # 'US' again, after the expansions above
    text = us_replace(text)
    # shortened street descriptors go upper case
    text = regex.sub(r"\b(C[rh]|S[rh]|[FR]m|Us)\b", cap_match, text)
    # drop periods still trailing a word of two or more letters
    text = regex.sub(r"([a-zA-Z]{2,})\.", r"\1", text)
    # 'SR' is expanded when no other street type is present
    text = sr_comp.sub("State Route", text)
    return text.strip(" .")
Bundle most common abbreviation expansion functions.
>>> abbrs("St. Francis")
# "Saint Francis"
>>> abbrs("E St.")
# "E Street"
>>> abbrs("E Sewell St")
# "East Sewell Street"
Arguments:
- value (str): String to expand.
Returns:
str: Expanded string.
def remove_br_unicode(old: str) -> str:
    """Replace HTML line breaks with commas and remove non-ASCII characters.

    Every spelling of the break tag is handled: `<br>`, `<br/>`, `<br />`,
    in any letter case.

    Args:
        old (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    # `(?i)` ignores case; `\s*/?` allows optional whitespace and slash.
    old = regex.sub(r"(?i)<br\s*/?>", ",", old)
    return regex.sub(r"[^\x00-\x7F\n\r\t]", "", old)  # remove unicode
Clean the input string before sending it to the parser by replacing `<br/>` tags with commas and removing non-ASCII characters.
Arguments:
- old (str): String to clean.
Returns:
str: Cleaned string.
def clean_address(address_string: str) -> str:
    """Clean the input string before sending it to the parser.

    Replaces HTML line breaks and drops non-ASCII characters, collapses runs
    of spaces, trims surrounding spaces, commas and periods, and then applies
    the `usa_comp`, `paren_comp` and `grid_comp` patterns.

    Args:
        address_string (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    text = remove_br_unicode(address_string)
    # Collapse every run of spaces to one; the single-space-for-single-space
    # replace used before changed nothing.
    text = regex.sub(r" {2,}", " ", text).strip(" ,.")
    text = usa_comp.sub("", text)
    text = paren_comp.sub("", text)
    return grid_comp.sub(grid_match, text)
Clean the input string before sending to parser by removing newlines and unicode.
Arguments:
- address_string (str): String to clean.
Returns:
str: Cleaned string.
def help_join(tags, keep: List[str]) -> str:
    """Join the values of the selected address fields with spaces.

    Args:
        tags: Mapping of field name to value.
        keep: Field names whose values should be included.

    Returns:
        str: The kept values, in the iteration order of `tags`.
    """
    return " ".join(value for field, value in tags.items() if field in keep)
Help to join address fields.
def addr_street(tags: Dict[str, str]) -> str:
    """Build the street field from the street-related `usaddress` tags."""
    street_fields = [
        "StreetName",
        "StreetNamePreDirectional",
        "StreetNamePreModifier",
        "StreetNamePreType",
        "StreetNamePostDirectional",
        "StreetNamePostModifier",
        "StreetNamePostType",
    ]
    return help_join(tags, street_fields)
Build the street field.
def addr_housenumber(tags: Dict[str, str]) -> str:
    """Build the housenumber field from the address-number `usaddress` tags."""
    number_fields = ["AddressNumberPrefix", "AddressNumber", "AddressNumberSuffix"]
    return help_join(tags, number_fields)
Build the housenumber field.
def collapse_list(seq: list) -> list:
    """Remove duplicates from a list while keeping the original order.

    ```python
    >>> collapse_list(["foo", "bar", "foo"])
    # ["foo", "bar"]
    ```

    Args:
        seq (list): The list to collapse; its items must be hashable.

    Returns:
        list: The collapsed list, keeping the first occurrence of each item.
    """
    # dict keys are unique and remember insertion order
    return list(dict.fromkeys(seq))
Remove duplicates in list while keeping order.
>>> collapse_list(["foo", "bar", "foo"])
# ["foo", "bar"]
Arguments:
- seq (list): The list to collapse.
Returns:
list: The collapsed list.
def split_unit(address_string: str) -> Dict[str, str]:
    """Split a unit off a house number string, if one is present.

    A string without any letters is returned whole as the house number.
    Otherwise the leading digits become the house number and the remainder,
    minus leading separators, becomes the unit.
    """
    trimmed = address_string.strip(" ")
    has_letters = any(char.isalpha() for char in trimmed)
    if not has_letters:
        return {"addr:housenumber": trimmed}

    # find where the leading run of digits ends
    end = 0
    while end < len(trimmed) and trimmed[end].isdigit():
        end += 1
    number = trimmed[:end]

    result = {}
    unit = trimmed[end:].lstrip(" -,/")
    if unit:
        result["addr:unit"] = unit
    result["addr:housenumber"] = number
    return result
Split unit from address string, if present.
def remove_prefix(text: str, prefix: str) -> str:
    """Return `text` without a leading `prefix` (for Python 3.8).

    The text is returned unchanged when it does not start with the prefix.
    """
    return text[len(prefix) :] if text.startswith(prefix) else text
Remove prefix from string for Python 3.8.
def get_address(
    address_string: str,
) -> Tuple[Dict[str, Union[str, int]], List[Union[str, None]]]:
    """Process address strings.

    ```python
    >>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
    # {"addr:housenumber": "345", "addr:street": "Maple Road",
    "addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
    >>> get_address("777 Strawberry St.")[0]
    # {"addr:housenumber": "777", "addr:street": "Strawberry Street"}
    >>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
    >>> address[0]
    # {"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
    >>> address[1]
    # ["addr:unit"]
    ```

    Args:
        address_string (str): The address string to process.

    Returns:
        Tuple[Dict[str, Union[str, int]], List[Union[str, None]]]:
            The processed address string and the removed fields.
    """
    try:
        cleaned = usaddress.tag(clean_address(address_string), tag_mapping=osm_mapping)[
            0
        ]
        removed = []
    except usaddress.RepeatedLabelError as err:
        # The parser saw a label more than once: rebuild the fields by hand
        # and record which OSM keys had to be dropped.
        collapsed = collapse_list(
            [(i[0].strip(" .,#"), i[1]) for i in err.parsed_string]
        )
        cleaned, removed = _manual_join(_combine_consecutive_tuples(collapsed))

    for toss in toss_tags:
        cleaned.pop(toss, None)

    if "addr:housenumber" in cleaned:
        cleaned = {**cleaned, **split_unit(cleaned["addr:housenumber"])}

    if "addr:street" in cleaned:
        street = abbrs(cleaned["addr:street"])
        cleaned["addr:street"] = street_comp.sub(
            "Street",
            street,
        ).strip(".")

    if "addr:city" in cleaned:
        cleaned["addr:city"] = abbrs(get_title(cleaned["addr:city"], single_word=True))

    if "addr:state" in cleaned:
        old = cleaned["addr:state"].replace(".", "")
        state_key = old.upper()
        if state_key in state_expand:
            cleaned["addr:state"] = state_expand[state_key]
        elif len(old) == 2 and state_key in state_expand.values():
            cleaned["addr:state"] = state_key

    if "addr:unit" in cleaned:
        # Use the module's own helper so this keeps working on Python 3.8,
        # where `str.removeprefix` does not exist.
        cleaned["addr:unit"] = remove_prefix(cleaned["addr:unit"], "Space").strip(" #.")

    if "addr:postcode" in cleaned:
        # remove extraneous postcode digits
        cleaned["addr:postcode"] = post_comp.sub(
            r"\1", cleaned["addr:postcode"]
        ).replace(" ", "-")

    validated: Address
    try:
        validated = Address.model_validate(dict(cleaned))
    except ValidationError as err:
        # Drop every field the model rejected and validate what is left.
        bad_fields: list = [each.get("loc", [])[0] for each in err.errors()]
        cleaned_ret = dict(cleaned)
        for each in bad_fields:
            cleaned_ret.pop(each, None)

        removed.extend(bad_fields)
        validated = Address.model_validate(cleaned_ret)

    return validated.model_dump(exclude_none=True, by_alias=True), removed
Process address strings.
>>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
# {"addr:housenumber": "345", "addr:street": "Maple Road",
"addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
>>> get_address("777 Strawberry St.")[0]
# {"addr:housenumber": "777", "addr:street": "Strawberry Street"}
>>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
>>> address[0]
# {"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
>>> address[1]
# ["addr:unit"]
Arguments:
- address_string (str): The address string to process.
Returns:
Tuple[Dict[str, Union[str, int]], List[Union[str, None]]]: The processed address string and the removed fields.
def get_phone(phone: str) -> str:
    """Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.

    ```python
    >>> get_phone("2029009019")
    # "+1 202-900-9019"
    >>> get_phone("(202) 900-9019")
    # "+1 202-900-9019"
    >>> get_phone("202-900-901")
    # ValueError: Invalid phone number: 202-900-901
    ```

    Args:
        phone (str): The phone number to format.

    Returns:
        str: The formatted phone number.

    Raises:
        ValueError: If the phone number is invalid.
    """
    # NOTE(review): inside a character class `[ -.]` is the range from space
    # to period, so it also accepts characters such as `(`, `)`, `+` and `,`.
    found = regex.search(
        r"^\(?(?:\+? ?1?[ -.]*)?(?:\(?(\d{3})\)?[ -.]*)(\d{3})[ -.]*(\d{4})$",
        phone,
    )
    if not found:
        raise ValueError(f"Invalid phone number: {phone}")

    # the three capturing groups: area code, exchange, line number
    area, exchange, line = found.groups()
    return f"+1 {area}-{exchange}-{line}"
Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.
>>> get_phone("2029009019")
# "+1 202-900-9019"
>>> get_phone("(202) 900-9019")
# "+1 202-900-9019"
>>> get_phone("202-900-901")
# ValueError: Invalid phone number: 202-900-901
Arguments:
- phone (str): The phone number to format.
Returns:
str: The formatted phone number.
Raises:
- ValueError: If the phone number is invalid.