atlus.atlus

Functions and tools to process the raw address strings.

  1"""Functions and tools to process the raw address strings."""
  2
  3from collections import Counter
  4from typing import Union, List, Dict, Tuple
  5from pydantic import ValidationError
  6import usaddress
  7import regex
  8
  9from .objects import Address
 10from .resources import (
 11    street_expand,
 12    direction_expand,
 13    name_expand,
 14    state_expand,
 15    saint_comp,
 16    abbr_join_comp,
 17    dir_fill_comp,
 18    sr_comp,
 19    usa_comp,
 20    paren_comp,
 21    grid_comp,
 22    post_comp,
 23    street_comp,
 24)
 25
# `usaddress` labels that have no `addr:*` counterpart in `osm_mapping`.
# `_manual_join` drops parsed (value, label) pairs carrying one of these
# labels, and `get_address` pops them as keys from the tagged result.
toss_tags = [
    "Recipient",
    "IntersectionSeparator",
    "LandmarkName",
    "USPSBoxGroupID",
    "USPSBoxGroupType",
    "USPSBoxID",
    "USPSBoxType",
    "OccupancyType",
]
"""Tags from the `usaddress` package to remove."""
 37
# Passed to `usaddress.tag` as `tag_mapping` in `get_address`, and used by
# `_manual_join` to translate repeated `usaddress` labels into the OSM key
# that was dropped. Several labels deliberately share one OSM key (all
# number parts -> housenumber, all street parts -> street).
osm_mapping = {
    "AddressNumber": "addr:housenumber",
    "AddressNumberPrefix": "addr:housenumber",
    "AddressNumberSuffix": "addr:housenumber",
    "StreetName": "addr:street",
    "StreetNamePreDirectional": "addr:street",
    "StreetNamePreModifier": "addr:street",
    "StreetNamePreType": "addr:street",
    "StreetNamePostDirectional": "addr:street",
    "StreetNamePostModifier": "addr:street",
    "StreetNamePostType": "addr:street",
    "OccupancyIdentifier": "addr:unit",
    "PlaceName": "addr:city",
    "StateName": "addr:state",
    "ZipCode": "addr:postcode",
}
"""Mapping from `usaddress` fields to OSM tags."""
 55
 56
 57def get_title(value: str, single_word: bool = False) -> str:
 58    """Fix ALL-CAPS string.
 59
 60    ```python
 61    >>> get_title("PALM BEACH")
 62    # "Palm Beach"
 63    >>> get_title("BOSTON")
 64    # "BOSTON"
 65    >>> get_title("BOSTON", single_word=True)
 66    # "Boston"
 67    ```
 68
 69    Args:
 70        value: String to fix.
 71        single_word: Whether the string should be fixed even if it is a single word.
 72
 73    Returns:
 74        str: Fixed string.
 75    """
 76    if (value.isupper() and " " in value) or (value.isupper() and single_word):
 77        return mc_replace(value.title())
 78    return value
 79
 80
 81def us_replace(value: str) -> str:
 82    """Fix string containing improperly formatted US.
 83
 84    ```python
 85    >>> us_replace("U.S. Route 15")
 86    # "US Route 15"
 87    ```
 88
 89    Args:
 90        value: String to fix.
 91
 92    Returns:
 93        str: Fixed string.
 94    """
 95    return value.replace("U.S.", "US").replace("U. S.", "US").replace("U S ", "US ")
 96
 97
 98def mc_replace(value: str) -> str:
 99    """Fix string containing improperly formatted Mc- prefix.
100
101    ```python
102    >>> mc_replace("Fort Mchenry")
103    # "Fort McHenry"
104    ```
105
106    Args:
107        value: String to fix.
108
109    Returns:
110        str: Fixed string.
111    """
112    words = []
113    for word in value.split():
114        mc_match = word.partition("Mc")
115        words.append(mc_match[0] + mc_match[1] + mc_match[2].capitalize())
116    return " ".join(words)
117
118
def ord_replace(value: str) -> str:
    """Lower-case ordinal suffixes that were capitalised by mistake.

    A token is treated as an ordinal when it is a run of digits followed
    by an upper-case `S`, `N`, `R` or `T` and then one of `t`, `d`, `h`
    in either case (for example `1St`, `2ND`, `3Rd`, `4TH`).

    ```python
    >>> ord_replace("3Rd St. NW")
    # "3rd St. NW"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    ordinal_pattern = r"(\b\d+[SNRT][tTdDhH]\b)"
    return regex.sub(ordinal_pattern, lower_match, value)
134
135
def name_street_expand(match: regex.Match) -> str:
    """Expand a matched street-type or name abbreviation.

    The first capture group is upper-cased and stripped of trailing
    periods, then looked up in `street_expand` and, failing that, in
    `name_expand`. The expansion is returned in title case.

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: Expanded string.

    Raises:
        ValueError: If the normalised abbreviation is empty.
        KeyError: If the abbreviation is in neither lookup table.
    """
    abbreviation = match.group(1).upper().rstrip(".")
    if not abbreviation:
        raise ValueError
    # street_expand takes precedence when both tables define the key.
    table = street_expand if abbreviation in street_expand else name_expand
    return table[abbreviation].title()
149
150
def direct_expand(match: regex.Match) -> str:
    """Expand a matched directional abbreviation.

    The first capture group is upper-cased, all periods are removed and
    the result is looked up in `direction_expand`. The expansion is
    returned in title case.

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: Expanded string.

    Raises:
        ValueError: If the normalised abbreviation is empty.
        KeyError: If the abbreviation is not in `direction_expand`.
    """
    abbreviation = match.group(1).upper().replace(".", "")
    if not abbreviation:
        raise ValueError
    return direction_expand[abbreviation].title()
164
165
def cap_match(match: regex.Match) -> str:
    """Upper-case the captured text of a match and drop its periods.

    All capture groups are concatenated in order before conversion.

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: Capitalized string.
    """
    captured = "".join(match.groups())
    return captured.upper().replace(".", "")
176
177
def lower_match(match: regex.Match) -> str:
    """Lower-case the first capture group of a match.

    Used as the replacement callback in `ord_replace`.

    Args:
        match (regex.Match): Match whose first group is the text to fix.

    Returns:
        str: Fixed string.
    """
    captured = match.group(1)
    return captured.lower()
188
189
def grid_match(match_str: regex.Match) -> str:
    """Clean grid addresses.

    Returns the whole matched text upper-cased with every space removed.

    Args:
        match_str (regex.Match): Matched grid-style address fragment.

    Returns:
        str: Compacted, upper-case fragment.
    """
    matched_text = match_str.group(0)
    return matched_text.upper().replace(" ", "")
193
194
def abbrs(value: str) -> str:
    """Bundle most common abbreviation expansion functions.

    The steps run in a fixed order: case and prefix fixes first, then
    pattern-based expansions, then cleanup of leftover periods. The
    compiled patterns (`saint_comp`, `abbr_join_comp`, `dir_fill_comp`,
    `sr_comp`) come from `.resources` and are not defined in this module.

    ```python
    >>> abbrs("St. Francis")
    # "Saint Francis"
    >>> abbrs("E St.")
    # "E Street"
    >>> abbrs("E Sewell St")
    # "East Sewell Street"
    ```

    Args:
        value (str): String to expand.

    Returns:
        str: Expanded string.
    """
    # normalize casing first: ALL-CAPS -> title case, then repair 'Mc'
    # names, 'U.S.' spellings and ordinals such as '3Rd'
    value = ord_replace(us_replace(mc_replace(get_title(value))))

    # change likely 'St' to 'Saint'
    value = saint_comp.sub(
        "Saint",
        value,
    )

    # expand common street and word abbreviations
    value = abbr_join_comp.sub(
        name_street_expand,
        value,
    )

    # expand directionals
    value = dir_fill_comp.sub(
        direct_expand,
        value,
    )

    # normalize 'US' again, after the expansions above have rewritten the text
    value = us_replace(value)

    # uppercase shortened street descriptors: the title-cased two-letter
    # tokens Cr, Ch, Sr, Sh, Fm, Rm and Us become CR, CH, SR, SH, FM, RM, US
    value = regex.sub(
        r"\b(C[rh]|S[rh]|[FR]m|Us)\b",
        cap_match,
        value,
    )

    # remove unremoved abbr periods: only a period directly after a run of
    # two or more letters is dropped, so single-letter initials keep theirs
    value = regex.sub(
        r"([a-zA-Z]{2,})\.",
        r"\1",
        value,
    )

    # expand 'SR' if no other street types
    value = sr_comp.sub("State Route", value)
    # trim leftover spaces and periods at either end
    return value.strip(" .")
253
254
def remove_br_unicode(old: str) -> str:
    """Clean the input string before sending to parser.

    HTML line-break tags are replaced by commas so that the lines they
    separated stay distinct address parts, and every non-ASCII character
    is removed. All common spellings of the tag are recognised
    (`<br>`, `<br/>`, `<br />`, any letter case). Literal newline, tab
    and carriage-return characters are ASCII and are left in place.

    ```python
    >>> remove_br_unicode("123 Main St<br>Boston")
    # "123 Main St,Boston"
    ```

    Args:
        old (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    old = regex.sub(r"<br\s*/?>", ",", old, flags=regex.IGNORECASE)
    # anything outside the 7-bit ASCII range is dropped
    return regex.sub(r"[^\x00-\x7F]", "", old)
266
267
def clean_address(address_string: str) -> str:
    """Normalise a raw address string before it is handed to the parser.

    Steps, in order: run `remove_br_unicode`, replace double spaces with
    single ones, trim spaces, commas and periods from both ends, delete
    whatever `usa_comp` and `paren_comp` match, and rewrite `grid_comp`
    matches through `grid_match`. The three patterns are defined in
    `.resources`; judging by their names they target a country suffix,
    parenthesised text and grid-style house numbers (confirm there).

    Args:
        address_string (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    text = remove_br_unicode(address_string)
    text = text.replace("  ", " ").strip(" ,.")
    text = usa_comp.sub("", text)
    text = paren_comp.sub("", text)
    return grid_comp.sub(grid_match, text)
282
283
def help_join(tags, keep: List[str]) -> str:
    """Join the values of selected address fields with single spaces.

    Values appear in the iteration order of `tags`, not in the order of
    `keep`.

    Args:
        tags: Mapping of field name to field value.
        keep (List[str]): Field names whose values should be joined.

    Returns:
        str: The selected values separated by spaces.
    """
    return " ".join(text for field, text in tags.items() if field in keep)
288
289
def addr_street(tags: Dict[str, str]) -> str:
    """Build the street field.

    Joins every street-related `usaddress` component present in `tags`,
    in the order in which `tags` yields them.

    Args:
        tags (Dict[str, str]): Parsed components keyed by `usaddress` label.

    Returns:
        str: Space-separated street string (empty if no part is present).
    """
    street_parts = [
        "StreetName",
        "StreetNamePreDirectional",
        "StreetNamePreModifier",
        "StreetNamePreType",
        "StreetNamePostDirectional",
        "StreetNamePostModifier",
        "StreetNamePostType",
    ]
    return help_join(tags, street_parts)
304
305
def addr_housenumber(tags: Dict[str, str]) -> str:
    """Build the housenumber field.

    Joins the number prefix, number and number suffix found in `tags`,
    in the order in which `tags` yields them.

    Args:
        tags (Dict[str, str]): Parsed components keyed by `usaddress` label.

    Returns:
        str: Space-separated house number (empty if no part is present).
    """
    number_parts = ["AddressNumberPrefix", "AddressNumber", "AddressNumberSuffix"]
    return help_join(tags, number_parts)
311
312
def _combine_consecutive_tuples(
    tuples_list: List[Tuple[str, str]]
) -> List[Tuple[str, str]]:
    """Join adjacent `usaddress` fields.

    Neighbouring `(value, tag)` pairs that share a tag are merged into a
    single pair whose value is the space-joined non-empty values. Pairs
    whose tag is falsy are left out of the result.

    Args:
        tuples_list: Parsed `(value, tag)` pairs in their original order.

    Returns:
        List[Tuple[str, str]]: Pairs with consecutive duplicates merged.
    """
    merged: list = []
    for text, label in tuples_list:
        if merged and merged[-1][1] == label:
            # same tag as the previous pair: extend that pair's value
            previous_text = merged[-1][0]
            joined = " ".join(part for part in [previous_text, text] if part)
            merged[-1] = (joined, label)
        else:
            merged.append((text, label))

    return [pair for pair in merged if pair[1]]
333
334
def _manual_join(parsed: List[tuple]) -> Tuple[Dict[str, str], List[Union[str, None]]]:
    """Remove duplicates and join remaining fields.

    Pairs tagged with a label from `toss_tags` are ignored. Labels that
    occur exactly once are kept; labels that occur more than once are
    ambiguous, so the OSM key they map to is reported as removed and is
    not built at all.

    Args:
        parsed: `(value, label)` pairs from the `usaddress` parser.

    Returns:
        A tuple of the OSM-keyed address dict (only non-empty values) and
        the list of removed OSM keys. An entry is `None` when the repeated
        label has no entry in `osm_mapping`.
    """
    kept = [pair for pair in parsed if pair[1] not in toss_tags]
    label_counts = Counter(pair[1] for pair in kept)
    unique: Dict[str, str] = {
        pair[1]: pair[0] for pair in kept if label_counts[pair[1]] == 1
    }
    removed = [
        osm_mapping.get(label) for label, seen in label_counts.items() if seen > 1
    ]

    # one builder per OSM key, in the order the keys appear in the result
    builders = (
        ("addr:street", addr_street),
        ("addr:housenumber", addr_housenumber),
        ("addr:unit", lambda fields: fields.get("OccupancyIdentifier")),
        ("addr:city", lambda fields: fields.get("PlaceName")),
        ("addr:state", lambda fields: fields.get("StateName")),
        ("addr:postcode", lambda fields: fields.get("ZipCode")),
    )

    joined: Dict[str, str] = {}
    for osm_key, build in builders:
        if osm_key in removed:
            continue
        built = build(unique)
        if built:
            joined[osm_key] = built

    return joined, removed
358
359
def collapse_list(seq: list) -> list:
    """Drop repeated items from a list, keeping the first occurrence of each.

    Items must be hashable.

    ```python
    >>> collapse_list(["foo", "bar", "foo"])
    # ["foo", "bar"]
    ```

    Args:
        seq (list): The list to collapse.

    Returns:
        list: The collapsed list.
    """
    already_seen = set()
    unique_items = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items
377
378
def split_unit(address_string: str) -> Dict[str, str]:
    """Split unit from address string, if present.

    A string without any letters is returned whole as the house number.
    Otherwise the leading run of digits becomes the house number, and the
    rest (minus leading spaces, hyphens, commas and slashes) becomes the
    unit.

    Args:
        address_string (str): Raw house-number text.

    Returns:
        Dict[str, str]: `addr:housenumber` and, when found, `addr:unit`.
    """
    text = address_string.strip(" ")
    if all(not char.isalpha() for char in text):
        return {"addr:housenumber": text}

    # find where the leading digits stop
    digits_end = 0
    while digits_end < len(text) and text[digits_end].isdigit():
        digits_end += 1

    number = text[:digits_end]
    unit = text[digits_end:].lstrip(" -,/")

    result: Dict[str, str] = {}
    if unit:
        result["addr:unit"] = unit
    result["addr:housenumber"] = number
    return result
399
400
def remove_prefix(text: str, prefix: str) -> str:
    """Return `text` without a leading `prefix`, if it starts with one.

    Stand-in for `str.removeprefix`, which Python 3.8 does not provide.

    Args:
        text (str): String to trim.
        prefix (str): Prefix to remove.

    Returns:
        str: `text` without the prefix, or `text` unchanged.
    """
    return text[len(prefix) :] if text.startswith(prefix) else text
406
407
def get_address(
    address_string: str,
) -> Tuple[Dict[str, Union[str, int]], List[Union[str, None]]]:
    """Process address strings.

    The string is cleaned, tagged with `usaddress`, normalised field by
    field and finally validated through the `Address` model. Fields that
    cannot be resolved (repeated parser labels) or that fail validation
    are left out of the result and reported in the second return value.

    ```python
    >>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
    # {"addr:housenumber": "345", "addr:street": "Maple Road",
    "addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
    >>> get_address("777 Strawberry St.")[0]
    # {"addr:housenumber": "777", "addr:street": "Strawberry Street"}
    >>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
    >>> address[0]
    # {"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
    >>> address[1]
    # ["addr:unit"]
    ```

    Args:
        address_string (str): The address string to process.

    Returns:
        Tuple[Dict[str, Union[str, int]], List[Union[str, None]]]:
        The processed address string and the removed fields.

    Raises:
        pydantic.ValidationError: If the address is still invalid after
            the fields named in the first validation error were dropped.
    """
    try:
        cleaned = usaddress.tag(clean_address(address_string), tag_mapping=osm_mapping)[
            0
        ]
        removed = []
    except usaddress.RepeatedLabelError as err:
        # The parser saw the same label more than once. De-duplicate the
        # raw pairs and rebuild the address from the unambiguous fields.
        collapsed = collapse_list(
            [(i[0].strip(" .,#"), i[1]) for i in err.parsed_string]
        )
        cleaned, removed = _manual_join(_combine_consecutive_tuples(collapsed))

    # labels without an OSM equivalent keep their usaddress name; drop them
    for toss in toss_tags:
        cleaned.pop(toss, None)

    if "addr:housenumber" in cleaned:
        # a unit found inside the house number overrides any parsed unit
        cleaned = {**cleaned, **split_unit(cleaned["addr:housenumber"])}

    if "addr:street" in cleaned:
        street = abbrs(cleaned["addr:street"])
        cleaned["addr:street"] = street_comp.sub(
            "Street",
            street,
        ).strip(".")

    if "addr:city" in cleaned:
        cleaned["addr:city"] = abbrs(get_title(cleaned["addr:city"], single_word=True))

    if "addr:state" in cleaned:
        old = cleaned["addr:state"].replace(".", "")
        if old.upper() in state_expand:
            cleaned["addr:state"] = state_expand[old.upper()]
        elif len(old) == 2 and old.upper() in state_expand.values():
            cleaned["addr:state"] = old.upper()

    if "addr:unit" in cleaned:
        # use the module's own helper: str.removeprefix needs Python 3.9+
        cleaned["addr:unit"] = remove_prefix(cleaned["addr:unit"], "Space").strip(
            " #."
        )

    if "addr:postcode" in cleaned:
        # remove extraneous postcode digits
        cleaned["addr:postcode"] = post_comp.sub(
            r"\1", cleaned["addr:postcode"]
        ).replace(" ", "-")

    try:
        validated: Address = Address.model_validate(dict(cleaned))
    except ValidationError as err:
        # Drop every field the model rejected and validate what is left.
        # Errors without a location cannot be tied to a field; skip them
        # here and let the second validation raise if they persist.
        bad_fields: list = [
            each["loc"][0] for each in err.errors() if each.get("loc")
        ]
        cleaned_ret = dict(cleaned)
        for each in bad_fields:
            cleaned_ret.pop(each, None)

        removed.extend(bad_fields)
        validated = Address.model_validate(cleaned_ret)

    return validated.model_dump(exclude_none=True, by_alias=True), removed
488
489
def get_phone(phone: str) -> str:
    """Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.

    Accepts ten digits, optionally preceded by the country code `1` or
    `+1`, with the area code optionally in parentheses. Groups may be
    separated by spaces, periods or hyphens only.

    ```python
    >>> get_phone("2029009019")
    # "+1 202-900-9019"
    >>> get_phone("(202) 900-9019")
    # "+1 202-900-9019"
    >>> get_phone("202-900-901")
    # ValueError: Invalid phone number: 202-900-901
    ```

    Args:
        phone (str): The phone number to format.

    Returns:
        str: The formatted phone number.

    Raises:
        ValueError: If the phone number is invalid.
    """
    # The hyphen sits last in each character class so it is a literal.
    # Written as `[ -.]` it would be a range from space to period and
    # would let characters such as `#`, `$` or `,` pass as separators.
    phone_valid = regex.search(
        r"^\(?(?:\+? ?1?[ .-]*)?(?:\(?(\d{3})\)?[ .-]*)(\d{3})[ .-]*(\d{4})$",
        phone,
    )
    if phone_valid:
        return (
            f"+1 {phone_valid.group(1)}-{phone_valid.group(2)}-{phone_valid.group(3)}"
        )
    raise ValueError(f"Invalid phone number: {phone}")
toss_tags = ['Recipient', 'IntersectionSeparator', 'LandmarkName', 'USPSBoxGroupID', 'USPSBoxGroupType', 'USPSBoxID', 'USPSBoxType', 'OccupancyType']

Tags from the usaddress package to remove.

osm_mapping = {'AddressNumber': 'addr:housenumber', 'AddressNumberPrefix': 'addr:housenumber', 'AddressNumberSuffix': 'addr:housenumber', 'StreetName': 'addr:street', 'StreetNamePreDirectional': 'addr:street', 'StreetNamePreModifier': 'addr:street', 'StreetNamePreType': 'addr:street', 'StreetNamePostDirectional': 'addr:street', 'StreetNamePostModifier': 'addr:street', 'StreetNamePostType': 'addr:street', 'OccupancyIdentifier': 'addr:unit', 'PlaceName': 'addr:city', 'StateName': 'addr:state', 'ZipCode': 'addr:postcode'}

Mapping from usaddress fields to OSM tags.

def get_title(value: str, single_word: bool = False) -> str:
58def get_title(value: str, single_word: bool = False) -> str:
59    """Fix ALL-CAPS string.
60
61    ```python
62    >>> get_title("PALM BEACH")
63    # "Palm Beach"
64    >>> get_title("BOSTON")
65    # "BOSTON"
66    >>> get_title("BOSTON", single_word=True)
67    # "Boston"
68    ```
69
70    Args:
71        value: String to fix.
72        single_word: Whether the string should be fixed even if it is a single word.
73
74    Returns:
75        str: Fixed string.
76    """
77    if (value.isupper() and " " in value) or (value.isupper() and single_word):
78        return mc_replace(value.title())
79    return value

Fix ALL-CAPS string.

>>> get_title("PALM BEACH")
# "Palm Beach"
>>> get_title("BOSTON")
# "BOSTON"
>>> get_title("BOSTON", single_word=True)
# "Boston"
Arguments:
  • value: String to fix.
  • single_word: Whether the string should be fixed even if it is a single word.
Returns:

str: Fixed string.

def us_replace(value: str) -> str:
82def us_replace(value: str) -> str:
83    """Fix string containing improperly formatted US.
84
85    ```python
86    >>> us_replace("U.S. Route 15")
87    # "US Route 15"
88    ```
89
90    Args:
91        value: String to fix.
92
93    Returns:
94        str: Fixed string.
95    """
96    return value.replace("U.S.", "US").replace("U. S.", "US").replace("U S ", "US ")

Fix string containing improperly formatted US.

>>> us_replace("U.S. Route 15")
# "US Route 15"
Arguments:
  • value: String to fix.
Returns:

str: Fixed string.

def mc_replace(value: str) -> str:
 99def mc_replace(value: str) -> str:
100    """Fix string containing improperly formatted Mc- prefix.
101
102    ```python
103    >>> mc_replace("Fort Mchenry")
104    # "Fort McHenry"
105    ```
106
107    Args:
108        value: String to fix.
109
110    Returns:
111        str: Fixed string.
112    """
113    words = []
114    for word in value.split():
115        mc_match = word.partition("Mc")
116        words.append(mc_match[0] + mc_match[1] + mc_match[2].capitalize())
117    return " ".join(words)

Fix string containing improperly formatted Mc- prefix.

>>> mc_replace("Fort Mchenry")
# "Fort McHenry"
Arguments:
  • value: String to fix.
Returns:

str: Fixed string.

def ord_replace(value: str) -> str:
120def ord_replace(value: str) -> str:
121    """Fix string containing improperly capitalized ordinal.
122
123    ```python
124    >>> ord_replace("3Rd St. NW")
125    # "3rd St. NW"
126    ```
127
128    Args:
129        value: String to fix.
130
131    Returns:
132        str: Fixed string.
133    """
134    return regex.sub(r"(\b\d+[SNRT][tTdDhH]\b)", lower_match, value)

Fix string containing improperly capitalized ordinal.

>>> ord_replace("3Rd St. NW")
# "3rd St. NW"
Arguments:
  • value: String to fix.
Returns:

str: Fixed string.

def name_street_expand(match: _regex.Match) -> str:
137def name_street_expand(match: regex.Match) -> str:
138    """Expand matched street type abbreviations.
139
140    Args:
141        match (regex.Match): Matched string.
142
143    Returns:
144        str: Expanded string.
145    """
146    mat = match.group(1).upper().rstrip(".")
147    if mat:
148        return ({**name_expand, **street_expand})[mat].title()
149    raise ValueError

Expand matched street type abbreviations.

Arguments:
  • match (regex.Match): Matched string.
Returns:

str: Expanded string.

def direct_expand(match: _regex.Match) -> str:
152def direct_expand(match: regex.Match) -> str:
153    """Expand matched directional abbreviations.
154
155    Args:
156        match (regex.Match): Matched string.
157
158    Returns:
159        str: Expanded string.
160    """
161    mat = match.group(1).upper().replace(".", "")
162    if mat:
163        return direction_expand[mat].title()
164    raise ValueError

Expand matched directional abbreviations.

Arguments:
  • match (regex.Match): Matched string.
Returns:

str: Expanded string.

def cap_match(match: _regex.Match) -> str:
167def cap_match(match: regex.Match) -> str:
168    """Make matches uppercase.
169
170    Args:
171        match (regex.Match): Matched string.
172
173    Returns:
174        str: Capitalized string.
175    """
176    return "".join(match.groups()).upper().replace(".", "")

Make matches uppercase.

Arguments:
  • match (regex.Match): Matched string.
Returns:

str: Capitalized string.

def lower_match(match: _regex.Match) -> str:
179def lower_match(match: regex.Match) -> str:
180    """Lower-case improperly cased ordinal values.
181
182    Args:
183        value: String to fix.
184
185    Returns:
186        str: Fixed string.
187    """
188    return match.group(1).lower()

Lower-case improperly cased ordinal values.

Arguments:
  • match (regex.Match): Match whose first group is the ordinal to lower-case.
Returns:

str: Fixed string.

def grid_match(match_str: _regex.Match) -> str:
191def grid_match(match_str: regex.Match) -> str:
192    """Clean grid addresses."""
193    return match_str.group(0).replace(" ", "").upper()

Clean grid addresses.

def abbrs(value: str) -> str:
196def abbrs(value: str) -> str:
197    """Bundle most common abbreviation expansion functions.
198
199    ```python
200    >>> abbrs("St. Francis")
201    # "Saint Francis"
202    >>> abbrs("E St.")
203    # "E Street"
204    >>> abbrs("E Sewell St")
205    # "East Sewell Street"
206    ```
207
208    Args:
209        value (str): String to expand.
210
211    Returns:
212        str: Expanded string.
213    """
214    value = ord_replace(us_replace(mc_replace(get_title(value))))
215
216    # change likely 'St' to 'Saint'
217    value = saint_comp.sub(
218        "Saint",
219        value,
220    )
221
222    # expand common street and word abbreviations
223    value = abbr_join_comp.sub(
224        name_street_expand,
225        value,
226    )
227
228    # expand directionals
229    value = dir_fill_comp.sub(
230        direct_expand,
231        value,
232    )
233
234    # normalize 'US'
235    value = us_replace(value)
236
237    # uppercase shortened street descriptors
238    value = regex.sub(
239        r"\b(C[rh]|S[rh]|[FR]m|Us)\b",
240        cap_match,
241        value,
242    )
243
244    # remove unremoved abbr periods
245    value = regex.sub(
246        r"([a-zA-Z]{2,})\.",
247        r"\1",
248        value,
249    )
250
251    # expand 'SR' if no other street types
252    value = sr_comp.sub("State Route", value)
253    return value.strip(" .")

Bundle most common abbreviation expansion functions.

>>> abbrs("St. Francis")
# "Saint Francis"
>>> abbrs("E St.")
# "E Street"
>>> abbrs("E Sewell St")
# "East Sewell Street"
Arguments:
  • value (str): String to expand.
Returns:

str: Expanded string.

def remove_br_unicode(old: str) -> str:
256def remove_br_unicode(old: str) -> str:
257    """Clean the input string before sending to parser by removing newlines and unicode.
258
259    Args:
260        old (str): String to clean.
261
262    Returns:
263        str: Cleaned string.
264    """
265    old = regex.sub(r"<br ?/>", ",", old)
266    return regex.sub(r"[^\x00-\x7F\n\r\t]", "", old)  # remove unicode

Clean the input string before sending it to the parser by replacing HTML line-break tags with commas and removing non-ASCII characters.

Arguments:
  • old (str): String to clean.
Returns:

str: Cleaned string.

def clean_address(address_string: str) -> str:
269def clean_address(address_string: str) -> str:
270    """Clean the input string before sending to parser by removing newlines and unicode.
271
272    Args:
273        address_string (str): String to clean.
274
275    Returns:
276        str: Cleaned string.
277    """
278    address_string = usa_comp.sub(
279        "", remove_br_unicode(address_string).replace("  ", " ").strip(" ,.")
280    )
281    address_string = paren_comp.sub("", address_string)
282    return grid_comp.sub(grid_match, address_string)

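Clean the input string before sending it to the parser: run remove_br_unicode, collapse doubled spaces, trim surrounding spaces, commas and periods, then apply the usa_comp, paren_comp and grid_comp substitutions.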

Arguments:
  • address_string (str): String to clean.
Returns:

str: Cleaned string.

def help_join(tags, keep: List[str]) -> str:
285def help_join(tags, keep: List[str]) -> str:
286    """Help to join address fields."""
287    tag_join: List[str] = [v for k, v in tags.items() if k in keep]
288    return " ".join(tag_join)

Help to join address fields.

def addr_street(tags: Dict[str, str]) -> str:
291def addr_street(tags: Dict[str, str]) -> str:
292    """Build the street field."""
293    return help_join(
294        tags,
295        [
296            "StreetName",
297            "StreetNamePreDirectional",
298            "StreetNamePreModifier",
299            "StreetNamePreType",
300            "StreetNamePostDirectional",
301            "StreetNamePostModifier",
302            "StreetNamePostType",
303        ],
304    )

Build the street field.

def addr_housenumber(tags: Dict[str, str]) -> str:
307def addr_housenumber(tags: Dict[str, str]) -> str:
308    """Build the housenumber field."""
309    return help_join(
310        tags, ["AddressNumberPrefix", "AddressNumber", "AddressNumberSuffix"]
311    )

Build the housenumber field.

def collapse_list(seq: list) -> list:
361def collapse_list(seq: list) -> list:
362    """Remove duplicates in list while keeping order.
363
364    ```python
365    >>> collapse_list(["foo", "bar", "foo"])
366    # ["foo", "bar"]
367    ```
368
369    Args:
370        seq (list): The list to collapse.
371
372    Returns:
373        list: The collapsed list.
374    """
375    seen = set()
376    seen_add = seen.add
377    return [x for x in seq if not (x in seen or seen_add(x))]

Remove duplicates in list while keeping order.

>>> collapse_list(["foo", "bar", "foo"])
# ["foo", "bar"]
Arguments:
  • seq (list): The list to collapse.
Returns:

list: The collapsed list.

def split_unit(address_string: str) -> Dict[str, str]:
380def split_unit(address_string: str) -> Dict[str, str]:
381    """Split unit from address string, if present."""
382    address_string = address_string.strip(" ")
383    if not any(char.isalpha() for char in address_string):
384        return {"addr:housenumber": address_string}
385
386    add_dict = {}
387    number = ""
388    for char in address_string:
389        if char.isdigit():
390            number += char
391        else:
392            break
393
394    unit = remove_prefix(address_string, number).lstrip(" -,/")
395    if unit:
396        add_dict["addr:unit"] = unit
397    add_dict["addr:housenumber"] = number
398
399    return add_dict

Split unit from address string, if present.

def remove_prefix(text: str, prefix: str) -> str:
402def remove_prefix(text: str, prefix: str) -> str:
403    """Remove prefix from string for Python 3.8."""
404    if text.startswith(prefix):
405        return text[len(prefix) :]
406    return text

Remove a prefix from a string; a stand-in for str.removeprefix, which Python 3.8 does not provide.

def get_address( address_string: str) -> Tuple[Dict[str, Union[str, int]], List[Optional[str]]]:
409def get_address(
410    address_string: str,
411) -> Tuple[Dict[str, Union[str, int]], List[Union[str, None]]]:
412    """Process address strings.
413
414    ```python
415    >>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
416    # {"addr:housenumber": "345", "addr:street": "Maple Road",
417    "addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
418    >>> get_address("777 Strawberry St.")[0]
419    # {"addr:housenumber": "777", "addr:street": "Strawberry Street"}
420    >>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
421    >>> address[0]
422    # {"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
423    >>> address[1]
424    # ["addr:unit"]
425    ```
426
427    Args:
428        address_string (str): The address string to process.
429
430    Returns:
431        Tuple[Dict[str, Union[str, int]], List[Union[str, None]]]:
432        The processed address string and the removed fields.
433    """
434    try:
435        cleaned = usaddress.tag(clean_address(address_string), tag_mapping=osm_mapping)[
436            0
437        ]
438        removed = []
439    except usaddress.RepeatedLabelError as err:
440        collapsed = collapse_list(
441            [(i[0].strip(" .,#"), i[1]) for i in err.parsed_string]
442        )
443        cleaned, removed = _manual_join(_combine_consecutive_tuples(collapsed))
444
445    for toss in toss_tags:
446        cleaned.pop(toss, None)
447
448    if "addr:housenumber" in cleaned:
449        cleaned = {**cleaned, **split_unit(cleaned["addr:housenumber"])}
450
451    if "addr:street" in cleaned:
452        street = abbrs(cleaned["addr:street"])
453        cleaned["addr:street"] = street_comp.sub(
454            "Street",
455            street,
456        ).strip(".")
457
458    if "addr:city" in cleaned:
459        cleaned["addr:city"] = abbrs(get_title(cleaned["addr:city"], single_word=True))
460
461    if "addr:state" in cleaned:
462        old = cleaned["addr:state"].replace(".", "")
463        if old.upper() in state_expand:
464            cleaned["addr:state"] = state_expand[old.upper()]
465        elif len(old) == 2 and old.upper() in list(state_expand.values()):
466            cleaned["addr:state"] = old.upper()
467
468    if "addr:unit" in cleaned:
469        cleaned["addr:unit"] = cleaned["addr:unit"].removeprefix("Space").strip(" #.")
470
471    if "addr:postcode" in cleaned:
472        # remove extraneous postcode digits
473        cleaned["addr:postcode"] = post_comp.sub(
474            r"\1", cleaned["addr:postcode"]
475        ).replace(" ", "-")
476
477    try:
478        validated: Address = Address.model_validate(dict(cleaned))
479    except ValidationError as err:
480        bad_fields: list = [each.get("loc", [])[0] for each in err.errors()]
481        cleaned_ret = dict(cleaned)
482        for each in bad_fields:
483            cleaned_ret.pop(each, None)
484
485        removed.extend(bad_fields)
486        validated: Address = Address.model_validate(cleaned_ret)
487
488    return validated.model_dump(exclude_none=True, by_alias=True), removed

Process address strings.

>>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
# {"addr:housenumber": "345", "addr:street": "Maple Road",
"addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
>>> get_address("777 Strawberry St.")[0]
# {"addr:housenumber": "777", "addr:street": "Strawberry Street"}
>>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
>>> address[0]
# {"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
>>> address[1]
# ["addr:unit"]
Arguments:
  • address_string (str): The address string to process.
Returns:

Tuple[Dict[str, Union[str, int]], List[Union[str, None]]]: The processed address string and the removed fields.

def get_phone(phone: str) -> str:
491def get_phone(phone: str) -> str:
492    """Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.
493
494    ```python
495    >>> get_phone("2029009019")
496    # "+1 202-900-9019"
497    >>> get_phone("(202) 900-9019")
498    # "+1 202-900-9019"
499    >>> get_phone("202-900-901")
500    # ValueError: Invalid phone number: 202-900-901
501    ```
502
503    Args:
504        phone (str): The phone number to format.
505
506    Returns:
507        str: The formatted phone number.
508
509    Raises:
510        ValueError: If the phone number is invalid.
511    """
512    phone_valid = regex.search(
513        r"^\(?(?:\+? ?1?[ -.]*)?(?:\(?(\d{3})\)?[ -.]*)(\d{3})[ -.]*(\d{4})$",
514        phone,
515    )
516    if phone_valid:
517        return (
518            f"+1 {phone_valid.group(1)}-{phone_valid.group(2)}-{phone_valid.group(3)}"
519        )
520    raise ValueError(f"Invalid phone number: {phone}")

Format phone numbers to the US and Canadian standard format of +1 XXX-XXX-XXXX.

>>> get_phone("2029009019")
# "+1 202-900-9019"
>>> get_phone("(202) 900-9019")
# "+1 202-900-9019"
>>> get_phone("202-900-901")
# ValueError: Invalid phone number: 202-900-901
Arguments:
  • phone (str): The phone number to format.
Returns:

str: The formatted phone number.

Raises:
  • ValueError: If the phone number is invalid.