atlus.atlus

Functions and tools to process the raw address strings.

  1"""Functions and tools to process the raw address strings."""
  2
  3from collections import Counter
  4from typing import OrderedDict
  5import usaddress
  6import regex
  7from .resources import (
  8    street_expand,
  9    direction_expand,
 10    name_expand,
 11    saints,
 12    state_expand,
 13)
 14
# `usaddress` label names that are never carried into the output:
# `_manual_join` filters them out of the parsed pairs and `process` pops
# them from the tagged result.
toss_tags = [
    "Recipient",
    "IntersectionSeparator",
    "LandmarkName",
    "USPSBoxGroupID",
    "USPSBoxGroupType",
    "USPSBoxID",
    "USPSBoxType",
    "OccupancyType",
]
 25"""Tags from the `usaddress` package to remove."""
 26
# Several `usaddress` labels share one OSM key: every street component maps
# to "addr:street" and every number component to "addr:housenumber".
# `process` passes this dict to `usaddress.tag` as `tag_mapping`, and
# `_manual_join` uses it to name the OSM field of a repeated label.
osm_mapping = {
    "AddressNumber": "addr:housenumber",
    "AddressNumberPrefix": "addr:housenumber",
    "AddressNumberSuffix": "addr:housenumber",
    "StreetName": "addr:street",
    "StreetNamePreDirectional": "addr:street",
    "StreetNamePreModifier": "addr:street",
    "StreetNamePreType": "addr:street",
    "StreetNamePostDirectional": "addr:street",
    "StreetNamePostModifier": "addr:street",
    "StreetNamePostType": "addr:street",
    "OccupancyIdentifier": "addr:unit",
    "PlaceName": "addr:city",
    "StateName": "addr:state",
    "ZipCode": "addr:postcode",
}
"""Mapping from `usaddress` fields to OSM tags."""
 44
 45
 46def get_title(value: str, single_word: bool = False) -> str:
 47    """Fix ALL-CAPS string.
 48
 49    ```python
 50    >> get_title("PALM BEACH")
 51    # "Palm Beach"
 52    >> get_title("BOSTON")
 53    # "BOSTON"
 54    >> get_title("BOSTON", single_word=True)
 55    # "Boston"
 56    ```
 57
 58    Args:
 59        value: String to fix.
 60        single_word: Whether the string should be fixed even if it is a single word.
 61
 62    Returns:
 63        str: Fixed string.
 64    """
 65    if (value.isupper() and " " in value) or (value.isupper() and single_word):
 66        return mc_replace(value.title())
 67    return value
 68
 69
 70def us_replace(value: str) -> str:
 71    """Fix string containing improperly formatted US.
 72
 73    ```python
 74    >> us_replace("U.S. Route 15")
 75    # "US Route 15"
 76    ```
 77
 78    Args:
 79        value: String to fix.
 80
 81    Returns:
 82        str: Fixed string.
 83    """
 84    return value.replace("U.S.", "US").replace("U. S.", "US").replace("U S ", "US ")
 85
 86
 87def mc_replace(value: str) -> str:
 88    """Fix string containing improperly formatted Mc- prefix.
 89
 90    ```python
 91    >> mc_replace("Fort Mchenry")
 92    # "Fort McHenry"
 93    ```
 94
 95    Args:
 96        value: String to fix.
 97
 98    Returns:
 99        str: Fixed string.
100    """
101    mc_match = regex.search(r"(.*\bMc)([a-z])(.*)", value)
102    if mc_match:
103        return mc_match.group(1) + mc_match.group(2).title() + mc_match.group(3)
104    return value
105
106
def ord_replace(value: str) -> str:
    """Fix string containing improperly capitalized ordinal.

    A run of digits followed by a two-letter suffix (first letter one of
    S/N/R/T, second one of T/D/H, in any case) is lower-cased.

    ```python
    >> ord_replace("3Rd St. NW")
    # "3rd St. NW"
    >> ord_replace("1sT Ave")
    # "1st Ave"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # both suffix letters are matched in either case so mixed forms such as
    # "1sT" are normalized too; already-lower-case ordinals are unchanged
    return regex.sub(
        r"\b\d+[SsNnRrTt][TtDdHh]\b",
        lambda found: found.group(0).lower(),
        value,
    )
122
123
def name_street_expand(match: regex.Match) -> str:
    """Expand matched street type abbreviations.

    The first capture group is upper-cased and stripped of trailing periods,
    then looked up in `street_expand` and, failing that, in `name_expand`
    (the same precedence as `name_expand | street_expand`).

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: Expanded string, title-cased.

    Raises:
        ValueError: If the matched abbreviation is empty.
        KeyError: If the abbreviation is in neither lookup table.
    """
    abbr = match.group(1).upper().rstrip(".")
    if not abbr:
        raise ValueError("Matched abbreviation is empty.")
    # look the key up directly instead of merging both dicts on every match
    if abbr in street_expand:
        return street_expand[abbr].title()
    return name_expand[abbr].title()
137
138
def direct_expand(match: regex.Match) -> str:
    """Expand a matched directional abbreviation.

    The first capture group has its periods removed and is upper-cased
    before being looked up in `direction_expand`.

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: Expanded string, title-cased.

    Raises:
        ValueError: If nothing is left of the match once periods are removed.
    """
    key = match.group(1).replace(".", "").upper()
    if not key:
        raise ValueError
    return direction_expand[key].title()
152
153
def cap_match(match: regex.Match) -> str:
    """Make matches uppercase.

    The capture groups are concatenated, upper-cased and stripped of periods.
    Groups that did not participate in the match are skipped, and when the
    pattern defines no capture groups the whole match is used instead.

    Args:
        match (regex.Match): Matched string.

    Returns:
        str: Capitalized string.
    """
    groups = match.groups()
    if groups:
        text = "".join(part for part in groups if part is not None)
    else:
        # without this fallback a group-less pattern would erase its match
        text = match.group(0)
    return text.upper().replace(".", "")
164
165
def lower_match(match: regex.Match) -> str:
    """Lower-case the first capture group of a match.

    Used as the replacement callback for improperly cased ordinal values.

    Args:
        match (regex.Match): Match whose first capture group is lower-cased.

    Returns:
        str: Lower-cased text of the first capture group.
    """
    captured = match[1]
    return captured.lower()
176
177
def grid_match(match_str: regex.Match) -> str:
    """Clean grid addresses.

    Args:
        match_str (regex.Match): Matched grid-style address.

    Returns:
        str: The whole match with its spaces removed, upper-cased.
    """
    pieces = match_str.group(0).split(" ")
    return "".join(pieces).upper()
181
182
# pre-compile regex for speed
# Alternation of every abbreviation key in `name_expand` and `street_expand`.
ABBR_JOIN = "|".join(name_expand | street_expand)
# Captures a whole-word abbreviation plus an optional trailing period, unless
# an apostrophe follows; matching ignores case.
abbr_join_comp = regex.compile(
    rf"(\b(?:{ABBR_JOIN})\b\.?)(?!')",
    flags=regex.IGNORECASE,
)

# Every key of `direction_expand`, with an optional period allowed between
# its letters.
DIR_FILL = "|".join(r"\.?".join(list(abbr)) for abbr in direction_expand)
# Captures a directional abbreviation. The lookbehind rejects one preceded by
# a leading "Avenue ", a period or an apostrophe; the lookahead rejects one
# followed by a letter (optionally after a period) or by " Street"/" Avenue".
# NOTE(review): the lookbehind alternatives differ in width, which is
# presumably why `regex` is used here rather than the stdlib `re` -- confirm.
dir_fill_comp = regex.compile(
    rf"(?<!(?:^(?:Avenue) |[\.']))(\b(?:{DIR_FILL})\b\.?)(?!(?:\.?[a-zA-Z]| (?:Street|Avenue)))",
    flags=regex.IGNORECASE,
)

# "SR" / "S.R." (any case) when followed by a space and digits.
sr_comp = regex.compile(
    r"(\bS\.?R\b\.?)(?= \d+)",
    flags=regex.IGNORECASE,
)

# "St" / "St." at the very start followed by a space, or anywhere directly
# before one of the names listed in `saints`.
saint_comp = regex.compile(
    rf"^(St\.?)(?= )|(\bSt\.?)(?= (?:{'|'.join(saints)}))",
    flags=regex.IGNORECASE,
)

# Case-sensitive "St" / "St." that is followed by a directional letter,
# preceded by a digit plus two ordinal-suffix letters, or at the end.
street_comp = regex.compile(
    r"St\.?(?= [NESW]\.?[EW]?\.?)|(?<=\d[thndstr]{2} )St\.?\b|St\.?$"
)

# Five digits followed by an optional hyphen and four zeros; the five digits
# are captured.
post_comp = regex.compile(r"(\d{5})-?0{4}")

# Optional comma, a space and a country name. The pattern is not anchored to
# the end of the string, so it matches wherever the name appears.
usa_comp = regex.compile(r",? (?:USA?|United States(?: of America)?|Canada)\b")

# Optional space plus a parenthesised span; `.*` is greedy, so the span runs
# from the first "(" to the last ")".
paren_comp = regex.compile(r" ?\(.*\)")

# match Wisconsin grid-style addresses: N65w25055, W249 N6620, etc.
grid_comp = regex.compile(
    r"\b([NnSs]\d{2,}\s*[EeWw]\d{2,}|[EeWw]\d{2,}\s*[NnSs]\d{2,})\b"
)
220
221
def abbrs(value: str) -> str:
    """Bundle most common abbreviation expansion functions.

    Runs, in order: title-casing of all-caps input, Mc-/US/ordinal fixes,
    "St" -> "Saint", street and word abbreviation expansion, directional
    expansion, "U.s." -> "US", upper-casing of short route designators,
    removal of leftover abbreviation periods and "SR" -> "State Route".

    Args:
        value (str): String to expand.

    Returns:
        str: Expanded string, stripped of surrounding spaces and periods.
    """
    value = ord_replace(us_replace(mc_replace(get_title(value))))

    # change likely 'St' to 'Saint'
    value = saint_comp.sub("Saint", value)

    # expand common street and word abbreviations
    value = abbr_join_comp.sub(name_street_expand, value)

    # expand directionals
    value = dir_fill_comp.sub(direct_expand, value)

    # normalize 'US'; the periods are escaped so that only a literal
    # "U.S." / "U.s." matches and ordinary words such as "Unser" are kept
    value = regex.sub(r"\bU\.[Ss]\.\B", "US", value)

    # uppercase shortened street descriptors
    value = regex.sub(r"\b(C[rh]|S[rh]|[FR]m|Us)\b", cap_match, value)

    # remove unremoved abbr periods
    value = regex.sub(r"([a-zA-Z]{2,})\.", r"\1", value)

    # expand 'SR' if no other street types
    value = sr_comp.sub("State Route", value)
    return value.strip(" .")
275
276
def clean(old: str) -> str:
    """Clean the input string before sending to parser.

    HTML line breaks (`<br>`, `<br/>`, `<br />`, in any case) are replaced
    with commas and every non-ASCII character is removed. ASCII whitespace
    such as newlines and tabs is kept.

    Args:
        old (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    old = regex.sub(r"<br\s*/?>", ",", old, flags=regex.IGNORECASE)
    return regex.sub(r"[^\x00-\x7F]", "", old)  # remove non-ASCII characters
288
289
def help_join(tags, keep: list[str]) -> str:
    """Join the values of selected address fields with single spaces.

    Values are emitted in the iteration order of `tags`, not in the order
    of `keep`.

    Args:
        tags: Mapping of field name to value.
        keep (list[str]): Field names whose values should be joined.

    Returns:
        str: The kept values separated by spaces.
    """
    kept: list[str] = []
    for field, text in tags.items():
        if field in keep:
            kept.append(text)
    return " ".join(kept)
294
295
def addr_street(tags: dict[str, str]) -> str:
    """Build the street field.

    Args:
        tags (dict[str, str]): Parsed `usaddress` fields.

    Returns:
        str: The street-related values joined by spaces, in the order they
        appear in `tags`.
    """
    street_fields = [
        "StreetName",
        "StreetNamePreDirectional",
        "StreetNamePreModifier",
        "StreetNamePreType",
        "StreetNamePostDirectional",
        "StreetNamePostModifier",
        "StreetNamePostType",
    ]
    return help_join(tags, street_fields)
310
311
def addr_housenumber(tags: dict[str, str]) -> str:
    """Build the housenumber field.

    Args:
        tags (dict[str, str]): Parsed `usaddress` fields.

    Returns:
        str: The address-number values joined by spaces, in the order they
        appear in `tags`.
    """
    number_fields = ["AddressNumberPrefix", "AddressNumber", "AddressNumberSuffix"]
    return help_join(tags, number_fields)
317
318
def _combine_consecutive_tuples(
    tuples_list: list[tuple[str, str]]
) -> list[tuple[str, str]]:
    """Join adjacent `usaddress` fields.

    Neighbouring `(value, tag)` pairs that share a tag are merged into one
    pair whose value is the non-empty values joined by spaces. Pairs with a
    falsy tag are not emitted.

    Args:
        tuples_list (list[tuple[str, str]]): Parsed `(value, tag)` pairs.

    Returns:
        list[tuple[str, str]]: Pairs with consecutive duplicates merged.
    """
    merged = []
    run_value, run_tag = None, None

    for piece, label in tuples_list:
        if label == run_tag:
            # same tag as the running pair: extend its value
            run_value = " ".join(part for part in [run_value, piece] if part)
            continue
        # tag changed: flush the finished run and start a new one
        if run_tag:
            merged.append((run_value, run_tag))
        run_value, run_tag = piece, label

    if run_tag:
        merged.append((run_value, run_tag))

    return merged
339
340
def _manual_join(parsed: list[tuple]) -> tuple[dict[str, str], list[str | None]]:
    """Remove duplicates and join remaining fields.

    Pairs whose label is in `toss_tags` are ignored. Labels that occur more
    than once are reported (as their OSM key from `osm_mapping`) in the
    second return value, and the corresponding OSM field is left out of the
    result.

    Args:
        parsed (list[tuple]): `(value, label)` pairs from `usaddress`.

    Returns:
        tuple[dict[str, str], list[str | None]]: The non-empty OSM fields and
        the OSM keys that were removed because of repeated labels.
    """
    kept = [pair for pair in parsed if pair[1] not in toss_tags]
    label_counts = Counter([pair[1] for pair in kept])

    # labels seen exactly once are safe to use as-is
    unique: dict[str, str] = {
        pair[1]: pair[0] for pair in kept if label_counts[pair[1]] == 1
    }
    removed = [
        osm_mapping.get(label) for label, seen in label_counts.items() if seen > 1
    ]

    joined: dict[str, str | None] = {}
    if "addr:street" not in removed:
        joined["addr:street"] = addr_street(unique)
    if "addr:housenumber" not in removed:
        joined["addr:housenumber"] = addr_housenumber(unique)

    # the remaining OSM keys each come from a single `usaddress` label
    single_label_fields = (
        ("addr:unit", "OccupancyIdentifier"),
        ("addr:city", "PlaceName"),
        ("addr:state", "StateName"),
        ("addr:postcode", "ZipCode"),
    )
    for osm_key, label in single_label_fields:
        if osm_key not in removed:
            joined[osm_key] = unique.get(label)

    return {key: text for key, text in joined.items() if text}, removed
364
365
def collapse_list(seq: list) -> list:
    """Drop repeated items from a list, keeping the first occurrence of each.

    ```python
    >> collapse_list(["foo", "bar", "foo"])
    # ["foo", "bar"]
    ```

    Args:
        seq (list): The list to collapse; its items must be hashable.

    Returns:
        list: The collapsed list, in original order.
    """
    already_seen = set()
    unique = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique.append(item)
    return unique
383
384
def process(
    address_string: str,
) -> tuple[OrderedDict[str, str | int], list[str | None]]:
    """Process address strings.

    The string is cleaned, stripped of country names, parenthesised text and
    grid-address spacing, then tagged with `usaddress`. If `usaddress`
    reports repeated labels, the fields are joined manually and the
    ambiguous OSM keys are reported as removed. The resulting fields are
    then normalised one by one.

    Args:
        address_string (str): The address string to process.

    Returns:
        tuple[OrderedDict[str, str | int], list[str | None]]:
        The processed address string and the removed fields.
    """
    address_string = clean(address_string)
    # collapse every run of spaces, however long, to a single space
    address_string = regex.sub(r" {2,}", " ", address_string).strip(" ,.")
    address_string = usa_comp.sub("", address_string)
    address_string = paren_comp.sub("", address_string)
    address_string = grid_comp.sub(grid_match, address_string)
    try:
        cleaned = usaddress.tag(clean(address_string), tag_mapping=osm_mapping)[0]
        removed = []
    except usaddress.RepeatedLabelError as e:
        # fall back to joining the raw (value, label) pairs ourselves
        collapsed = collapse_list([(i[0].strip(" .,#"), i[1]) for i in e.parsed_string])
        cleaned, removed = _manual_join(_combine_consecutive_tuples(collapsed))

    for toss in toss_tags:
        cleaned.pop(toss, None)

    if "addr:housenumber" in cleaned:
        # split a trailing letter suffix ("12A", "12-A", "12 A") off into the unit
        suite = regex.match(r"([0-9]+)[- \/]?([a-zA-Z]+)", cleaned["addr:housenumber"])
        if suite:
            cleaned["addr:housenumber"] = suite.group(1)
            if "addr:unit" not in cleaned:
                cleaned["addr:unit"] = suite.group(2).upper()
            else:
                # a parsed unit that disagrees with the suffix is dropped
                if cleaned["addr:unit"] != suite.group(2).upper():
                    cleaned.pop("addr:unit")
                    removed += ["addr:unit"]

    if "addr:street" in cleaned:
        street = abbrs(cleaned["addr:street"])
        cleaned["addr:street"] = street_comp.sub(
            "Street",
            street,
        ).strip(".")

    if "addr:city" in cleaned:
        cleaned["addr:city"] = abbrs(get_title(cleaned["addr:city"], single_word=True))

    if "addr:state" in cleaned:
        old = cleaned["addr:state"].replace(".", "")
        if old.upper() in state_expand:
            cleaned["addr:state"] = state_expand[old.upper()]
        elif len(old) == 2 and old.upper() in state_expand.values():
            cleaned["addr:state"] = old.upper()

    if "addr:unit" in cleaned:
        cleaned["addr:unit"] = cleaned["addr:unit"].removeprefix("Space").strip(" #.")

    if "addr:postcode" in cleaned:
        # remove extraneous postcode digits
        cleaned["addr:postcode"] = post_comp.sub(
            r"\1", cleaned["addr:postcode"]
        ).replace(" ", "-")

    return cleaned, removed
450
451
def phone_format(phone: str) -> str:
    """Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.

    ```python
    >> phone_format("2029009019")
    # "+1 202-900-9019"
    >> phone_format("(202) 900-9019")
    # "+1 202-900-9019"
    >> phone_format("202-900-901")
    # ValueError: Invalid phone number: 202-900-901
    ```

    Args:
        phone (str): The phone number to format.

    Returns:
        str: The formatted phone number.

    Raises:
        ValueError: If the phone number is invalid.
    """
    # Optional "+1"-style prefix, then groups of 3, 3 and 4 digits.
    # Note: `[ -.]` is a character range from space to ".", so it also admits
    # characters such as "(", ")" and "+" between the digit groups.
    found = regex.search(
        r"^\(?(?:\+? ?1?[ -.]*)?(?:\(?(\d{3})\)?[ -.]*)(\d{3})[ -.]*(\d{4})$",
        phone,
    )
    if not found:
        raise ValueError(f"Invalid phone number: {phone}")

    area, exchange, subscriber = found.groups()
    return f"+1 {area}-{exchange}-{subscriber}"
toss_tags = ['Recipient', 'IntersectionSeparator', 'LandmarkName', 'USPSBoxGroupID', 'USPSBoxGroupType', 'USPSBoxID', 'USPSBoxType', 'OccupancyType']

Tags from the usaddress package to remove.

osm_mapping = {'AddressNumber': 'addr:housenumber', 'AddressNumberPrefix': 'addr:housenumber', 'AddressNumberSuffix': 'addr:housenumber', 'StreetName': 'addr:street', 'StreetNamePreDirectional': 'addr:street', 'StreetNamePreModifier': 'addr:street', 'StreetNamePreType': 'addr:street', 'StreetNamePostDirectional': 'addr:street', 'StreetNamePostModifier': 'addr:street', 'StreetNamePostType': 'addr:street', 'OccupancyIdentifier': 'addr:unit', 'PlaceName': 'addr:city', 'StateName': 'addr:state', 'ZipCode': 'addr:postcode'}

Mapping from usaddress fields to OSM tags.

def get_title(value: str, single_word: bool = False) -> str:
47def get_title(value: str, single_word: bool = False) -> str:
48    """Fix ALL-CAPS string.
49
50    ```python
51    >> get_title("PALM BEACH")
52    # "Palm Beach"
53    >> get_title("BOSTON")
54    # "BOSTON"
55    >> get_title("BOSTON", single_word=True)
56    # "Boston"
57    ```
58
59    Args:
60        value: String to fix.
61        single_word: Whether the string should be fixed even if it is a single word.
62
63    Returns:
64        str: Fixed string.
65    """
66    if (value.isupper() and " " in value) or (value.isupper() and single_word):
67        return mc_replace(value.title())
68    return value

Fix ALL-CAPS string.

>> get_title("PALM BEACH")
# "Palm Beach"
>> get_title("BOSTON")
# "BOSTON"
>> get_title("BOSTON", single_word=True)
# "Boston"
Arguments:
  • value: String to fix.
  • single_word: Whether the string should be fixed even if it is a single word.
Returns:

str: Fixed string.

def us_replace(value: str) -> str:
71def us_replace(value: str) -> str:
72    """Fix string containing improperly formatted US.
73
74    ```python
75    >> us_replace("U.S. Route 15")
76    # "US Route 15"
77    ```
78
79    Args:
80        value: String to fix.
81
82    Returns:
83        str: Fixed string.
84    """
85    return value.replace("U.S.", "US").replace("U. S.", "US").replace("U S ", "US ")

Fix string containing improperly formatted US.

>> us_replace("U.S. Route 15")
# "US Route 15"
Arguments:
  • value: String to fix.
Returns:

str: Fixed string.

def mc_replace(value: str) -> str:
 88def mc_replace(value: str) -> str:
 89    """Fix string containing improperly formatted Mc- prefix.
 90
 91    ```python
 92    >> mc_replace("Fort Mchenry")
 93    # "Fort McHenry"
 94    ```
 95
 96    Args:
 97        value: String to fix.
 98
 99    Returns:
100        str: Fixed string.
101    """
102    mc_match = regex.search(r"(.*\bMc)([a-z])(.*)", value)
103    if mc_match:
104        return mc_match.group(1) + mc_match.group(2).title() + mc_match.group(3)
105    return value

Fix string containing improperly formatted Mc- prefix.

>> mc_replace("Fort Mchenry")
# "Fort McHenry"
Arguments:
  • value: String to fix.
Returns:

str: Fixed string.

def ord_replace(value: str) -> str:
108def ord_replace(value: str) -> str:
109    """Fix string containing improperly capitalized ordinal.
110
111    ```python
112    >> ord_replace("3Rd St. NW")
113    # "3rd St. NW"
114    ```
115
116    Args:
117        value: String to fix.
118
119    Returns:
120        str: Fixed string.
121    """
122    return regex.sub(r"(\b\d+[SNRT][tTdDhH]\b)", lower_match, value)

Fix string containing improperly capitalized ordinal.

>> ord_replace("3Rd St. NW")
# "3rd St. NW"
Arguments:
  • value: String to fix.
Returns:

str: Fixed string.

def name_street_expand(match: _regex.Match) -> str:
125def name_street_expand(match: regex.Match) -> str:
126    """Expand matched street type abbreviations.
127
128    Args:
129        match (regex.Match): Matched string.
130
131    Returns:
132        str: Expanded string.
133    """
134    mat = match.group(1).upper().rstrip(".")
135    if mat:
136        return (name_expand | street_expand)[mat].title()
137    raise ValueError

Expand matched street type abbreviations.

Arguments:
  • match (regex.Match): Matched string.
Returns:

str: Expanded string.

def direct_expand(match: _regex.Match) -> str:
140def direct_expand(match: regex.Match) -> str:
141    """Expand matched directional abbreviations.
142
143    Args:
144        match (regex.Match): Matched string.
145
146    Returns:
147        str: Expanded string.
148    """
149    mat = match.group(1).upper().replace(".", "")
150    if mat:
151        return direction_expand[mat].title()
152    raise ValueError

Expand matched directional abbreviations.

Arguments:
  • match (regex.Match): Matched string.
Returns:

str: Expanded string.

def cap_match(match: _regex.Match) -> str:
155def cap_match(match: regex.Match) -> str:
156    """Make matches uppercase.
157
158    Args:
159        match (regex.Match): Matched string.
160
161    Returns:
162        str: Capitalized string.
163    """
164    return "".join(match.groups()).upper().replace(".", "")

Make matches uppercase.

Arguments:
  • match (regex.Match): Matched string.
Returns:

str: Capitalized string.

def lower_match(match: _regex.Match) -> str:
167def lower_match(match: regex.Match) -> str:
168    """Lower-case improperly cased ordinal values.
169
170    Args:
171        value: String to fix.
172
173    Returns:
174        str: Fixed string.
175    """
176    return match.group(1).lower()

Lower-case improperly cased ordinal values.

Arguments:
  • match (regex.Match): Match whose first capture group is lower-cased.
Returns:

str: Fixed string.

def grid_match(match_str: _regex.Match) -> str:
179def grid_match(match_str: regex.Match) -> str:
180    """Clean grid addresses."""
181    return match_str.group(0).replace(" ", "").upper()

Clean grid addresses.

ABBR_JOIN = 'ARPT|BLDG|CONF|CONV|CNTR|CTR|DWTN|INTL|FT|MT|MTN|SHPG|ACC|ALY|ANX|ARC|AV|AVE|BYU|BCH|BND|BLF|BLFS|BTM|BLVD|BR|BRG|BRK|BRKS|BG|BGS|BYP|CP|CY|CYN|CPE|CTRS|CIR|CIRS|CLF|CLFS|CLB|CMN|CMNS|COR|CORS|CRSE|CT|CTS|CV|CVS|CRK|CRES|CRST|CSWY|CURV|DL|DM|DV|DR|DRS|EST|EXPY|EXPWY|EXT|EXTS|FGR|FGRS|FLS|FLD|FLDS|FLT|FLTS|FRD|FRDS|FRST|FRG|FRGS|FRK|FRKS|FRY|FRYS|FOR|FORS|FWY|GD|GDN|GDNS|GTWY|GLN|GLNS|GN|GNS|GRN|GRNS|GRV|GRVS|HBR|HBRS|HGWY|HVN|HTS|HWY|HL|HLS|HOLW|INLT|IS|ISS|JCT|JCTS|KY|KYS|KNL|KNLS|LK|LKS|LNDG|LN|LGT|LGTS|LF|LCK|LCKS|LDG|LP|MNR|MNRS|MDW|MDWS|ML|MLS|MSN|MTWY|MTNS|NCK|ORCH|OPAS|PKY|PKWY|PSGE|PNE|PNES|PL|PLN|PLNS|PLZ|PT|PTS|PRT|PRTS|PR|PVT|RADL|RNCH|RPD|RPDS|RST|RDG|RDGS|RIV|RD|RDS|RT|RTE|SHL|SHLS|SHR|SHRS|SKWY|SPG|SPGS|SQ|SQS|STA|STRA|STRM|STS|SMT|SRVC|TER|TRWY|THFR|TRCE|TRAK|TRFY|TRL|TRLR|TUNL|TPKE|UPAS|UN|UNP|UNS|VIA|VIAS|VLY|VLYS|VW|VWS|VLG|VL|VIS|WK|WKWY|WY|WL|WLS|XING|XINGS|XRD|XRDS|YU'
abbr_join_comp = regex.Regex("(\\b(?:ARPT|BLDG|CONF|CONV|CNTR|CTR|DWTN|INTL|FT|MT|MTN|SHPG|ACC|ALY|ANX|ARC|AV|AVE|BYU|BCH|BND|BLF|BLFS|BTM|BLVD|BR|BRG|BRK|BRKS|BG|BGS|BYP|CP|CY|CYN|CPE|CTRS|CIR|CIRS|CLF|CLFS|CLB|CMN|CMNS|COR|CORS|CRSE|CT|CTS|CV|CVS|CRK|CRES|CRST|CSWY|CURV|DL|DM|DV|DR|DRS|EST|EXPY|EXPWY|EXT|EXTS|FGR|FGRS|FLS|FLD|FLDS|FLT|FLTS|FRD|FRDS|FRST|FRG|FRGS|FRK|FRKS|FRY|FRYS|FOR|FORS|FWY|GD|GDN|GDNS|GTWY|GLN|GLNS|GN|GNS|GRN|GRNS|GRV|GRVS|HBR|HBRS|HGWY|HVN|HTS|HWY|HL|HLS|HOLW|INLT|IS|ISS|JCT|JCTS|KY|KYS|KNL|KNLS|LK|LKS|LNDG|LN|LGT|LGTS|LF|LCK|LCKS|LDG|LP|MNR|MNRS|MDW|MDWS|ML|MLS|MSN|MTWY|MTNS|NCK|ORCH|OPAS|PKY|PKWY|PSGE|PNE|PNES|PL|PLN|PLNS|PLZ|PT|PTS|PRT|PRTS|PR|PVT|RADL|RNCH|RPD|RPDS|RST|RDG|RDGS|RIV|RD|RDS|RT|RTE|SHL|SHLS|SHR|SHRS|SKWY|SPG|SPGS|SQ|SQS|STA|STRA|STRM|STS|SMT|SRVC|TER|TRWY|THFR|TRCE|TRAK|TRFY|TRL|TRLR|TUNL|TPKE|UPAS|UN|UNP|UNS|VIA|VIAS|VLY|VLYS|VW|VWS|VLG|VL|VIS|WK|WKWY|WY|WL|WLS|XING|XINGS|XRD|XRDS|YU)\\b\\.?)(?!')", flags=regex.I | regex.V0)
DIR_FILL = 'N\\.?E|S\\.?E|N\\.?W|S\\.?W|N|E|S|W'
dir_fill_comp = regex.Regex("(?<!(?:^(?:Avenue) |[\\.']))(\\b(?:N\\.?E|S\\.?E|N\\.?W|S\\.?W|N|E|S|W)\\b\\.?)(?!(?:\\.?[a-zA-Z]| (?:Street|Avenue)))", flags=regex.I | regex.V0)
sr_comp = regex.Regex('(\\bS\\.?R\\b\\.?)(?= \\d+)', flags=regex.I | regex.V0)
saint_comp = regex.Regex('^(St\\.?)(?= )|(\\bSt\\.?)(?= (?:Abigail|Agatha|Agnes|Andrew|Anthony|Augustine|Bernadette|Brigid|Catherine|Charles|Christopher|Clare|Cloud|Dymphna|Elizabeth|Faustina|Felix|Francis|Gabriel,|George|Gerard|James|Joan|John|Joseph|Jude|Kateri|Louis|Lucie|Lucy|Luke|Maria|Mark|Martin|Mary|Maximilian|Michael|Monica|Padre|Patrick|Paul|Peter|Philomena|Raphael|Rita|Rose|Sebastian|Teresa|Therese|Thomas|Valentine|Victor|Vincent))', flags=regex.I | regex.V0)
street_comp = regex.Regex('St\\.?(?= [NESW]\\.?[EW]?\\.?)|(?<=\\d[thndstr]{2} )St\\.?\\b|St\\.?$', flags=regex.V0)
post_comp = regex.Regex('(\\d{5})-?0{4}', flags=regex.V0)
usa_comp = regex.Regex(',? (?:USA?|United States(?: of America)?|Canada)\\b', flags=regex.V0)
paren_comp = regex.Regex(' ?\\(.*\\)', flags=regex.V0)
grid_comp = regex.Regex('\\b([NnSs]\\d{2,}\\s*[EeWw]\\d{2,}|[EeWw]\\d{2,}\\s*[NnSs]\\d{2,})\\b', flags=regex.V0)
def abbrs(value: str) -> str:
223def abbrs(value: str) -> str:
224    """Bundle most common abbreviation expansion functions.
225
226    Args:
227        value (str): String to expand.
228
229    Returns:
230        str: Expanded string.
231    """
232    value = ord_replace(us_replace(mc_replace(get_title(value))))
233
234    # change likely 'St' to 'Saint'
235    value = saint_comp.sub(
236        "Saint",
237        value,
238    )
239
240    # expand common street and word abbreviations
241    value = abbr_join_comp.sub(
242        name_street_expand,
243        value,
244    )
245
246    # expand directionals
247    value = dir_fill_comp.sub(
248        direct_expand,
249        value,
250    )
251
252    # normalize 'US'
253    value = regex.sub(
254        r"\bU.[Ss].\B",
255        cap_match,
256        value,
257    )
258
259    # uppercase shortened street descriptors
260    value = regex.sub(
261        r"\b(C[rh]|S[rh]|[FR]m|Us)\b",
262        cap_match,
263        value,
264    )
265
266    # remove unremoved abbr periods
267    value = regex.sub(
268        r"([a-zA-Z]{2,})\.",
269        r"\1",
270        value,
271    )
272
273    # expand 'SR' if no other street types
274    value = sr_comp.sub("State Route", value)
275    return value.strip(" .")

Bundle most common abbreviation expansion functions.

Arguments:
  • value (str): String to expand.
Returns:

str: Expanded string.

def clean(old: str) -> str:
278def clean(old: str) -> str:
279    """Clean the input string before sending to parser by removing newlines and unicode.
280
281    Args:
282        old (str): String to clean.
283
284    Returns:
285        str: Cleaned string.
286    """
287    old = regex.sub(r"<br ?/>", ",", old)
288    return regex.sub(r"[^\x00-\x7F\n\r\t]", "", old)  # remove unicode

Clean the input string before sending it to the parser by replacing `<br/>` tags with commas and removing non-ASCII characters.

Arguments:
  • old (str): String to clean.
Returns:

str: Cleaned string.

def help_join(tags, keep: list[str]) -> str:
291def help_join(tags, keep: list[str]) -> str:
292    """Help to join address fields."""
293    tag_join: list[str] = [v for k, v in tags.items() if k in keep]
294    return " ".join(tag_join)

Help to join address fields.

def addr_street(tags: dict[str, str]) -> str:
297def addr_street(tags: dict[str, str]) -> str:
298    """Build the street field."""
299    return help_join(
300        tags,
301        [
302            "StreetName",
303            "StreetNamePreDirectional",
304            "StreetNamePreModifier",
305            "StreetNamePreType",
306            "StreetNamePostDirectional",
307            "StreetNamePostModifier",
308            "StreetNamePostType",
309        ],
310    )

Build the street field.

def addr_housenumber(tags: dict[str, str]) -> str:
313def addr_housenumber(tags: dict[str, str]) -> str:
314    """Build the housenumber field."""
315    return help_join(
316        tags, ["AddressNumberPrefix", "AddressNumber", "AddressNumberSuffix"]
317    )

Build the housenumber field.

def collapse_list(seq: list) -> list:
367def collapse_list(seq: list) -> list:
368    """Remove duplicates in list while keeping order.
369
370    ```python
371    >> collapse_list(["foo", "bar", "foo"])
372    # ["foo", "bar"]
373    ```
374
375    Args:
376        seq (list): The list to collapse.
377
378    Returns:
379        list: The collapsed list.
380    """
381    seen = set()
382    seen_add = seen.add
383    return [x for x in seq if not (x in seen or seen_add(x))]

Remove duplicates in list while keeping order.

>> collapse_list(["foo", "bar", "foo"])
# ["foo", "bar"]
Arguments:
  • seq (list): The list to collapse.
Returns:

list: The collapsed list.

def process( address_string: str) -> tuple[typing.OrderedDict[str, str | int], list[str | None]]:
386def process(
387    address_string: str,
388) -> tuple[OrderedDict[str, str | int], list[str | None]]:
389    """Process address strings.
390
391    Args:
392        address_string (str): The address string to process.
393
394    Returns:
395        tuple[OrderedDict[str, str | int], list[str | None]]:
396        The processed address string and the removed fields.
397    """
398    address_string = clean(address_string)
399    address_string = address_string.replace("  ", " ").strip(" ,.")
400    address_string = usa_comp.sub("", address_string)
401    address_string = paren_comp.sub("", address_string)
402    address_string = grid_comp.sub(grid_match, address_string)
403    try:
404        cleaned = usaddress.tag(clean(address_string), tag_mapping=osm_mapping)[0]
405        removed = []
406    except usaddress.RepeatedLabelError as e:
407        collapsed = collapse_list([(i[0].strip(" .,#"), i[1]) for i in e.parsed_string])
408        cleaned, removed = _manual_join(_combine_consecutive_tuples(collapsed))
409
410    for toss in toss_tags:
411        cleaned.pop(toss, None)
412
413    if "addr:housenumber" in cleaned:
414        suite = regex.match(r"([0-9]+)[- \/]?([a-zA-Z]+)", cleaned["addr:housenumber"])
415        if suite:
416            cleaned["addr:housenumber"] = suite.group(1)
417            if "addr:unit" not in cleaned:
418                cleaned["addr:unit"] = suite.group(2).upper()
419            else:
420                if cleaned["addr:unit"] != suite.group(2).upper():
421                    cleaned.pop("addr:unit")
422                    removed += ["addr:unit"]
423
424    if "addr:street" in cleaned:
425        street = abbrs(cleaned["addr:street"])
426        cleaned["addr:street"] = street_comp.sub(
427            "Street",
428            street,
429        ).strip(".")
430
431    if "addr:city" in cleaned:
432        cleaned["addr:city"] = abbrs(get_title(cleaned["addr:city"], single_word=True))
433
434    if "addr:state" in cleaned:
435        old = cleaned["addr:state"].replace(".", "")
436        if old.upper() in state_expand:
437            cleaned["addr:state"] = state_expand[old.upper()]
438        elif len(old) == 2 and old.upper() in list(state_expand.values()):
439            cleaned["addr:state"] = old.upper()
440
441    if "addr:unit" in cleaned:
442        cleaned["addr:unit"] = cleaned["addr:unit"].removeprefix("Space").strip(" #.")
443
444    if "addr:postcode" in cleaned:
445        # remove extraneous postcode digits
446        cleaned["addr:postcode"] = post_comp.sub(
447            r"\1", cleaned["addr:postcode"]
448        ).replace(" ", "-")
449
450    return cleaned, removed

Process address strings.

Arguments:
  • address_string (str): The address string to process.
Returns:

tuple[OrderedDict[str, str | int], list[str | None]]: The processed address string and the removed fields.

def phone_format(phone: str) -> str:
453def phone_format(phone: str) -> str:
454    """Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.
455
456    ```python
457    >> phone_format("2029009019")
458    # "+1 202-900-9019"
459    >> phone_format("(202) 900-9019")
460    # "+1 202-900-9019"
461    >> phone_format("202-900-901")
462    # ValueError: Invalid phone number: 202-900-901
463    ```
464
465    Args:
466        phone (str): The phone number to format.
467
468    Returns:
469        str: The formatted phone number.
470
471    Raises:
472        ValueError: If the phone number is invalid.
473    """
474    phone_valid = regex.search(
475        r"^\(?(?:\+? ?1?[ -.]*)?(?:\(?(\d{3})\)?[ -.]*)(\d{3})[ -.]*(\d{4})$",
476        phone,
477    )
478    if phone_valid:
479        return (
480            f"+1 {phone_valid.group(1)}-{phone_valid.group(2)}-{phone_valid.group(3)}"
481        )
482    raise ValueError(f"Invalid phone number: {phone}")

Format phone numbers to the US and Canadian standard format of +1 XXX-XXX-XXXX.

>> phone_format("2029009019")
# "+1 202-900-9019"
>> phone_format("(202) 900-9019")
# "+1 202-900-9019"
>> phone_format("202-900-901")
# ValueError: Invalid phone number: 202-900-901
Arguments:
  • phone (str): The phone number to format.
Returns:

str: The formatted phone number.

Raises:
  • ValueError: If the phone number is invalid.