atlus.resources
Hold info for the processing script.
"""Hold info for the processing script.

Static lookup tables (abbreviation expansions, saint names, unusable zip
prefixes) and the regular expressions that are pre-compiled from them.
"""

import regex

direction_expand = {
    "NE": "Northeast",
    "SE": "Southeast",
    "NW": "Northwest",
    "SW": "Southwest",
    "N": "North",
    "E": "East",
    "S": "South",
    "W": "West",
}
"""Compass direction abbreviations."""

name_expand = {
    "ARPT": "airport",
    "BLDG": "building",
    "CONF": "conference",
    "CONV": "convention",
    "CNTR": "center",
    "CTR": "center",
    "DWTN": "downtown",
    "INTL": "international",
    "FT": "fort",
    "MT": "mount",
    "MTN": "mountain",
    "SHPG": "shopping",
}
"""Common name abbreviations."""

state_expand = {
    "ALABAMA": "AL",
    "ALA": "AL",
    "ALASKA": "AK",
    "ALAS": "AK",
    "ARIZONA": "AZ",
    "ARIZ": "AZ",
    "ARKANSAS": "AR",
    "ARK": "AR",
    "CALIFORNIA": "CA",
    "CALIF": "CA",
    "CAL": "CA",
    "COLORADO": "CO",
    "COLO": "CO",
    "COL": "CO",
    "CONNECTICUT": "CT",
    "CONN": "CT",
    "DELAWARE": "DE",
    "DEL": "DE",
    "DISTRICT OF COLUMBIA": "DC",
    "FLORIDA": "FL",
    "FLA": "FL",
    "FLOR": "FL",
    "GEORGIA": "GA",
    "GA": "GA",
    "HAWAII": "HI",
    "IDAHO": "ID",
    "IDA": "ID",
    "ILLINOIS": "IL",
    "ILL": "IL",
    "INDIANA": "IN",
    "IND": "IN",
    "IOWA": "IA",
    "KANSAS": "KS",
    "KANS": "KS",
    "KAN": "KS",
    "KENTUCKY": "KY",
    "KEN": "KY",
    "KENT": "KY",
    "LOUISIANA": "LA",
    "MAINE": "ME",
    "MARYLAND": "MD",
    "MASSACHUSETTS": "MA",
    "MASS": "MA",
    "MICHIGAN": "MI",
    "MICH": "MI",
    "MINNESOTA": "MN",
    "MINN": "MN",
    "MISSISSIPPI": "MS",
    "MISS": "MS",
    "MISSOURI": "MO",
    "MONTANA": "MT",
    "MONT": "MT",
    "NEBRASKA": "NE",
    "NEBR": "NE",
    "NEB": "NE",
    "NEVADA": "NV",
    "NEV": "NV",
    "NEW HAMPSHIRE": "NH",
    "NEW JERSEY": "NJ",
    "NEW MEXICO": "NM",
    "N MEX": "NM",
    "NEW M": "NM",
    "NEW YORK": "NY",
    "NORTH CAROLINA": "NC",
    "NORTH DAKOTA": "ND",
    "N DAK": "ND",
    "OHIO": "OH",
    "OKLAHOMA": "OK",
    "OKLA": "OK",
    "OREGON": "OR",
    "OREG": "OR",
    "ORE": "OR",
    "PENNSYLVANIA": "PA",
    "PENN": "PA",
    "RHODE ISLAND": "RI",
    "SOUTH CAROLINA": "SC",
    "SOUTH DAKOTA": "SD",
    "S DAK": "SD",
    "TENNESSEE": "TN",
    "TENN": "TN",
    "TEXAS": "TX",
    "TEX": "TX",
    "UTAH": "UT",
    "VERMONT": "VT",
    "VIRGINIA": "VA",
    "WASHINGTON": "WA",
    "WASH": "WA",
    "WEST VIRGINIA": "WV",
    "W VA": "WV",
    "WISCONSIN": "WI",
    "WIS": "WI",
    "WISC": "WI",
    "WYOMING": "WY",
    "WYO": "WY",
    "ONTARIO": "ON",
    "QUEBEC": "QC",
    "NOVA SCOTIA": "NS",
    "NEW BRUNSWICK": "NB",
    "MANITOBA": "MB",
    "BRITISH COLUMBIA": "BC",
    "PRINCE EDWARD ISLAND": "PE",
    "PRINCE EDWARD": "PE",
    "SASKATCHEWAN": "SK",
    "ALBERTA": "AB",
    "NEWFOUNDLAND AND LABRADOR": "NL",
    "NEWFOUNDLAND & LABRADOR": "NL",
    "NEWFOUNDLAND": "NL",
    # Canada Post's symbol for Yukon is "YT" (previously mapped to "YK").
    "YUKON": "YT",
    "NUNAVUT": "NU",
    "NORTHWEST TERRITORIES": "NT",
    "NW TERRITORIES": "NT",
}
"""Map states to abbreviations."""

street_expand = {
    "ACC": "ACCESS",
    "ALY": "ALLEY",
    "ANX": "ANEX",
    "ARC": "ARCADE",
    "AV": "AVENUE",
    "AVE": "AVENUE",
    "BYU": "BAYOU",
    "BCH": "BEACH",
    "BND": "BEND",
    "BLF": "BLUFF",
    "BLFS": "BLUFFS",
    "BTM": "BOTTOM",
    "BLVD": "BOULEVARD",
    "BR": "BRANCH",
    "BRG": "BRIDGE",
    "BRK": "BROOK",
    "BRKS": "BROOKS",
    "BG": "BURG",
    "BGS": "BURGS",
    "BYP": "BYPASS",
    "CP": "CAMP",
    "CY": "KEY",
    "CYN": "CANYON",
    "CPE": "CAPE",
    "CTR": "CENTER",
    "CTRS": "CENTERS",
    "CIR": "CIRCLE",
    "CIRS": "CIRCLES",
    "CLF": "CLIFF",
    "CLFS": "CLIFFS",
    "CLB": "CLUB",
    "CMN": "COMMON",
    "CMNS": "COMMONS",
    "COR": "CORNER",
    "CORS": "CORNERS",
    "CRSE": "COURSE",
    "CT": "COURT",
    "CTS": "COURTS",
    "CV": "COVE",
    "CVS": "COVES",
    "CRK": "CREEK",
    "CRES": "CRESCENT",
    "CRST": "CREST",
    "CSWY": "CAUSEWAY",
    "CURV": "CURVE",
    "DL": "DALE",
    "DM": "DAM",
    "DV": "DIVIDE",
    "DR": "DRIVE",
    "DRS": "DRIVES",
    "EST": "ESTATE",
    "EXPY": "EXPRESSWAY",
    "EXPWY": "EXPRESSWAY",
    "EXT": "EXTENSION",
    "EXTS": "EXTENSIONS",
    "FGR": "FORGE",
    "FGRS": "FORGES",
    "FLS": "FALLS",
    "FLD": "FIELD",
    "FLDS": "FIELDS",
    "FLT": "FLAT",
    "FLTS": "FLATS",
    "FRD": "FORD",
    "FRDS": "FORDS",
    "FRST": "FOREST",
    "FRG": "FORGE",
    "FRGS": "FORGES",
    "FRK": "FORK",
    "FRKS": "FORKS",
    "FRY": "FERRY",
    "FRYS": "FERRYS",
    "FOR": "FORD",
    "FORS": "FORDS",
    "FT": "FORT",
    "FWY": "FREEWAY",
    "GD": "GRADE",
    "GDN": "GARDEN",
    "GDNS": "GARDENS",
    "GTWY": "GATEWAY",
    "GLN": "GLEN",
    "GLNS": "GLENS",
    "GN": "GREEN",
    "GNS": "GREENS",
    "GRN": "GREEN",
    "GRNS": "GREENS",
    "GRV": "GROVE",
    "GRVS": "GROVES",
    "HBR": "HARBOR",
    "HBRS": "HARBORS",
    "HGWY": "HIGHWAY",
    "HVN": "HAVEN",
    "HTS": "HEIGHTS",
    "HWY": "HIGHWAY",
    "HL": "HILL",
    "HLS": "HILLS",
    "HOLW": "HOLLOW",
    "INLT": "INLET",
    "IS": "ISLAND",
    "ISS": "ISLANDS",
    "JCT": "JUNCTION",
    "JCTS": "JUNCTIONS",
    "KY": "KEY",
    "KYS": "KEYS",
    "KNL": "KNOLL",
    "KNLS": "KNOLLS",
    "LK": "LAKE",
    "LKS": "LAKES",
    "LNDG": "LANDING",
    "LN": "LANE",
    "LGT": "LIGHT",
    "LGTS": "LIGHTS",
    "LF": "LOAF",
    "LCK": "LOCK",
    "LCKS": "LOCKS",
    "LDG": "LODGE",
    "LP": "LOOP",
    "MNR": "MANOR",
    "MNRS": "MANORS",
    "MDW": "MEADOW",
    "MDWS": "MEADOWS",
    "ML": "MILL",
    "MLS": "MILLS",
    "MSN": "MISSION",
    "MTWY": "MOTORWAY",
    "MT": "MOUNT",
    "MTN": "MOUNTAIN",
    "MTNS": "MOUNTAINS",
    "NCK": "NECK",
    "ORCH": "ORCHARD",
    "OPAS": "OVERPASS",
    "PKY": "PARKWAY",
    "PKWY": "PARKWAY",
    "PSGE": "PASSAGE",
    "PNE": "PINE",
    "PNES": "PINES",
    "PL": "PLACE",
    "PLN": "PLAIN",
    "PLNS": "PLAINS",
    "PLZ": "PLAZA",
    "PT": "POINT",
    "PTS": "POINTS",
    "PRT": "PORT",
    "PRTS": "PORTS",
    "PR": "PRAIRIE",
    "PVT": "PRIVATE",
    "RADL": "RADIAL",
    "RNCH": "RANCH",
    "RPD": "RAPID",
    "RPDS": "RAPIDS",
    "RST": "REST",
    "RDG": "RIDGE",
    "RDGS": "RIDGES",
    "RIV": "RIVER",
    "RD": "ROAD",
    "RDS": "ROADS",
    "RT": "ROUTE",
    "RTE": "ROUTE",
    "SHL": "SHOAL",
    "SHLS": "SHOALS",
    "SHR": "SHORE",
    "SHRS": "SHORES",
    "SKWY": "SKYWAY",
    "SPG": "SPRING",
    "SPGS": "SPRINGS",
    "SQ": "SQUARE",
    "SQS": "SQUARES",
    "STA": "STATION",
    "STRA": "STRAVENUE",
    "STRM": "STREAM",
    "STS": "STREETS",
    "SMT": "SUMMIT",
    "SRVC": "SERVICE",
    "TER": "TERRACE",
    "TRWY": "THROUGHWAY",
    "THFR": "THOROUGHFARE",
    "TRCE": "TRACE",
    "TRAK": "TRACK",
    "TRFY": "TRAFFICWAY",
    "TRL": "TRAIL",
    "TRLR": "TRAILER",
    "TUNL": "TUNNEL",
    "TPKE": "TURNPIKE",
    "UPAS": "UNDERPASS",
    "UN": "UNION",
    "UNP": "UNDERPASS",
    "UNS": "UNIONS",
    "VIA": "VIADUCT",
    "VIAS": "VIADUCTS",
    "VLY": "VALLEY",
    "VLYS": "VALLEYS",
    "VW": "VIEW",
    "VWS": "VIEWS",
    "VLG": "VILLAGE",
    "VL": "VILLE",
    "VIS": "VISTA",
    "WK": "WALK",
    "WKWY": "WALKWAY",
    "WY": "WAY",
    "WL": "WELL",
    "WLS": "WELLS",
    "XING": "CROSSING",
    "XINGS": "CROSSINGS",
    "XRD": "CROSSROAD",
    "XRDS": "CROSSROADS",
    "YU": "BAYOU",
}
"""Common street type abbreviations."""

saints = [
    "Abigail",
    "Agatha",
    "Agnes",
    "Andrew",
    "Anthony",
    "Augustine",
    "Bernadette",
    "Brigid",
    "Catherine",
    "Charles",
    "Christopher",
    "Clare",
    "Cloud",
    "Dymphna",
    "Elizabeth",
    "Faustina",
    "Felix",
    "Francis",
    # Was "Gabriel," (comma inside the string), which made the saint pattern
    # below require a literal comma after the name.
    "Gabriel",
    "George",
    "Gerard",
    "James",
    "Joan",
    "John",
    "Joseph",
    "Jude",
    "Kateri",
    "Louis",
    "Lucie",
    "Lucy",
    "Luke",
    "Maria",
    "Mark",
    "Martin",
    "Mary",
    "Maximilian",
    "Michael",
    "Monica",
    "Padre",
    "Patrick",
    "Paul",
    "Peter",
    "Philomena",
    "Raphael",
    "Rita",
    "Rose",
    "Sebastian",
    "Teresa",
    "Therese",
    "Thomas",
    "Valentine",
    "Victor",
    "Vincent",
]
"""Most common saint names."""

bad_zip_first_3 = [
    "001",
    "002",
    "003",
    "004",
    "213",
    "269",
    "343",
    "345",
    "348",
    "353",
    "419",
    "428",
    "429",
    "517",
    "518",
    "519",
    "529",
    "533",
    "536",
    "552",
    "568",
    "569",
    "578",
    "579",
    "589",
    "621",
    "632",
    "642",
    "643",
    "659",
    "663",
    "682",
    "694",
    "695",
    "696",
    "697",
    "698",
    "699",
    "702",
    "709",
    "715",
    "732",
    "742",
    "817",
    "818",
    "819",
    "839",
    "848",
    "849",
    "851",
    "854",
    "858",
    "861",
    "862",
    "866",
    "867",
    "868",
    "869",
    "876",
    "886",
    "887",
    "888",
    "892",
    "896",
    "899",
    "909",
    "929",
    "987",
]
"""Three-digit combinations that don't represent a zip code."""

# pre-compile regex for speed

# Alternation of every key in name_expand and street_expand; keys present in
# both (e.g. "CTR", "FT", "MT", "MTN") appear once, at their first position.
ABBR_JOIN = "|".join({**name_expand, **street_expand})
# Whole-word abbreviation with an optional trailing period, not followed by an
# apostrophe. Matching is case-insensitive.
# NOTE(review): short keys such as "FOR", "IS" and "EST" therefore also match
# ordinary words; confirm callers only apply this where that is acceptable.
abbr_join_comp = regex.compile(
    rf"(\b(?:{ABBR_JOIN})\b\.?)(?!')",
    flags=regex.IGNORECASE,
)

# Each direction key with an optional period between its letters
# ("NE" -> "N\.?E"), joined into one alternation.
DIR_FILL = "|".join(r"\.?".join(abbr) for abbr in direction_expand)
# Whole-word direction abbreviation with an optional trailing period. Skipped
# when preceded by "." or "'" or by "Avenue " at the start of the string, and
# when followed by a letter (optionally after a period) or by " Street" /
# " Avenue". The variable-width lookbehind requires the `regex` package.
dir_fill_comp = regex.compile(
    rf"(?<!(?:^(?:Avenue) |[\.']))(\b(?:{DIR_FILL})\b\.?)(?!(?:\.?[a-zA-Z]| (?:Street|Avenue)))",
    flags=regex.IGNORECASE,
)

# "SR" / "S.R." (optional trailing period) directly before a space and digits.
sr_comp = regex.compile(
    r"(\bS\.?R\b\.?)(?= \d+)",
    flags=regex.IGNORECASE,
)

# Saint names are escaped so an entry can never inject regex syntax.
_SAINT_JOIN = "|".join(regex.escape(saint) for saint in saints)
# "St" / "St." at the very start of the string before a space, or anywhere as
# a word when the next word starts with one of the names in `saints`.
saint_comp = regex.compile(
    rf"^(St\.?)(?= )|(\bSt\.?)(?= (?:{_SAINT_JOIN}))",
    flags=regex.IGNORECASE,
)

# Case-sensitive "St" / "St." in three positions: before a space and a compass
# letter, after a digit plus a two-letter ordinal-style suffix and a space, or
# at the end of the string.
street_comp = regex.compile(
    r"St\.?(?= [NESW]\.?[EW]?\.?)|(?<=\d[thndstr]{2} )St\.?\b|St\.?$"
)

# Five digits followed by an optional hyphen and "0000"; group 1 is the
# five-digit part.
post_comp = regex.compile(r"(\d{5})-?0{4}")

# Optional comma, a space, then a country name (case-sensitive).
usa_comp = regex.compile(r",? (?:USA?|United States(?: of America)?|Canada)\b")

# Optional leading space plus parenthesised text; greedy, so it spans from the
# first "(" to the last ")" in the string.
paren_comp = regex.compile(r" ?\(.*\)")

# match Wisconsin grid-style addresses: N65w25055, W249 N6620, etc.
grid_comp = regex.compile(
    r"\b([NnSs]\d{2,}\s*[EeWw]\d{2,}|[EeWw]\d{2,}\s*[NnSs]\d{2,})\b"
)
direction_expand =
{'NE': 'Northeast', 'SE': 'Southeast', 'NW': 'Northwest', 'SW': 'Southwest', 'N': 'North', 'E': 'East', 'S': 'South', 'W': 'West'}
Compass direction abbreviations.
name_expand =
{'ARPT': 'airport', 'BLDG': 'building', 'CONF': 'conference', 'CONV': 'convention', 'CNTR': 'center', 'CTR': 'center', 'DWTN': 'downtown', 'INTL': 'international', 'FT': 'fort', 'MT': 'mount', 'MTN': 'mountain', 'SHPG': 'shopping'}
Common name abbreviations.
state_expand =
{'ALABAMA': 'AL', 'ALA': 'AL', 'ALASKA': 'AK', 'ALAS': 'AK', 'ARIZONA': 'AZ', 'ARIZ': 'AZ', 'ARKANSAS': 'AR', 'ARK': 'AR', 'CALIFORNIA': 'CA', 'CALIF': 'CA', 'CAL': 'CA', 'COLORADO': 'CO', 'COLO': 'CO', 'COL': 'CO', 'CONNECTICUT': 'CT', 'CONN': 'CT', 'DELAWARE': 'DE', 'DEL': 'DE', 'DISTRICT OF COLUMBIA': 'DC', 'FLORIDA': 'FL', 'FLA': 'FL', 'FLOR': 'FL', 'GEORGIA': 'GA', 'GA': 'GA', 'HAWAII': 'HI', 'IDAHO': 'ID', 'IDA': 'ID', 'ILLINOIS': 'IL', 'ILL': 'IL', 'INDIANA': 'IN', 'IND': 'IN', 'IOWA': 'IA', 'KANSAS': 'KS', 'KANS': 'KS', 'KAN': 'KS', 'KENTUCKY': 'KY', 'KEN': 'KY', 'KENT': 'KY', 'LOUISIANA': 'LA', 'MAINE': 'ME', 'MARYLAND': 'MD', 'MASSACHUSETTS': 'MA', 'MASS': 'MA', 'MICHIGAN': 'MI', 'MICH': 'MI', 'MINNESOTA': 'MN', 'MINN': 'MN', 'MISSISSIPPI': 'MS', 'MISS': 'MS', 'MISSOURI': 'MO', 'MONTANA': 'MT', 'MONT': 'MT', 'NEBRASKA': 'NE', 'NEBR': 'NE', 'NEB': 'NE', 'NEVADA': 'NV', 'NEV': 'NV', 'NEW HAMPSHIRE': 'NH', 'NEW JERSEY': 'NJ', 'NEW MEXICO': 'NM', 'N MEX': 'NM', 'NEW M': 'NM', 'NEW YORK': 'NY', 'NORTH CAROLINA': 'NC', 'NORTH DAKOTA': 'ND', 'N DAK': 'ND', 'OHIO': 'OH', 'OKLAHOMA': 'OK', 'OKLA': 'OK', 'OREGON': 'OR', 'OREG': 'OR', 'ORE': 'OR', 'PENNSYLVANIA': 'PA', 'PENN': 'PA', 'RHODE ISLAND': 'RI', 'SOUTH CAROLINA': 'SC', 'SOUTH DAKOTA': 'SD', 'S DAK': 'SD', 'TENNESSEE': 'TN', 'TENN': 'TN', 'TEXAS': 'TX', 'TEX': 'TX', 'UTAH': 'UT', 'VERMONT': 'VT', 'VIRGINIA': 'VA', 'WASHINGTON': 'WA', 'WASH': 'WA', 'WEST VIRGINIA': 'WV', 'W VA': 'WV', 'WISCONSIN': 'WI', 'WIS': 'WI', 'WISC': 'WI', 'WYOMING': 'WY', 'WYO': 'WY', 'ONTARIO': 'ON', 'QUEBEC': 'QC', 'NOVA SCOTIA': 'NS', 'NEW BRUNSWICK': 'NB', 'MANITOBA': 'MB', 'BRITISH COLUMBIA': 'BC', 'PRINCE EDWARD ISLAND': 'PE', 'PRINCE EDWARD': 'PE', 'SASKATCHEWAN': 'SK', 'ALBERTA': 'AB', 'NEWFOUNDLAND AND LABRADOR': 'NL', 'NEWFOUNDLAND & LABRADOR': 'NL', 'NEWFOUNDLAND': 'NL', 'YUKON': 'YK', 'NUNAVUT': 'NU', 'NORTHWEST TERRITORIES': 'NT', 'NW TERRITORIES': 'NT'}
Map states to abbreviations.
street_expand =
{'ACC': 'ACCESS', 'ALY': 'ALLEY', 'ANX': 'ANEX', 'ARC': 'ARCADE', 'AV': 'AVENUE', 'AVE': 'AVENUE', 'BYU': 'BAYOU', 'BCH': 'BEACH', 'BND': 'BEND', 'BLF': 'BLUFF', 'BLFS': 'BLUFFS', 'BTM': 'BOTTOM', 'BLVD': 'BOULEVARD', 'BR': 'BRANCH', 'BRG': 'BRIDGE', 'BRK': 'BROOK', 'BRKS': 'BROOKS', 'BG': 'BURG', 'BGS': 'BURGS', 'BYP': 'BYPASS', 'CP': 'CAMP', 'CY': 'KEY', 'CYN': 'CANYON', 'CPE': 'CAPE', 'CTR': 'CENTER', 'CTRS': 'CENTERS', 'CIR': 'CIRCLE', 'CIRS': 'CIRCLES', 'CLF': 'CLIFF', 'CLFS': 'CLIFFS', 'CLB': 'CLUB', 'CMN': 'COMMON', 'CMNS': 'COMMONS', 'COR': 'CORNER', 'CORS': 'CORNERS', 'CRSE': 'COURSE', 'CT': 'COURT', 'CTS': 'COURTS', 'CV': 'COVE', 'CVS': 'COVES', 'CRK': 'CREEK', 'CRES': 'CRESCENT', 'CRST': 'CREST', 'CSWY': 'CAUSEWAY', 'CURV': 'CURVE', 'DL': 'DALE', 'DM': 'DAM', 'DV': 'DIVIDE', 'DR': 'DRIVE', 'DRS': 'DRIVES', 'EST': 'ESTATE', 'EXPY': 'EXPRESSWAY', 'EXPWY': 'EXPRESSWAY', 'EXT': 'EXTENSION', 'EXTS': 'EXTENSIONS', 'FGR': 'FORGE', 'FGRS': 'FORGES', 'FLS': 'FALLS', 'FLD': 'FIELD', 'FLDS': 'FIELDS', 'FLT': 'FLAT', 'FLTS': 'FLATS', 'FRD': 'FORD', 'FRDS': 'FORDS', 'FRST': 'FOREST', 'FRG': 'FORGE', 'FRGS': 'FORGES', 'FRK': 'FORK', 'FRKS': 'FORKS', 'FRY': 'FERRY', 'FRYS': 'FERRYS', 'FOR': 'FORD', 'FORS': 'FORDS', 'FT': 'FORT', 'FWY': 'FREEWAY', 'GD': 'GRADE', 'GDN': 'GARDEN', 'GDNS': 'GARDENS', 'GTWY': 'GATEWAY', 'GLN': 'GLEN', 'GLNS': 'GLENS', 'GN': 'GREEN', 'GNS': 'GREENS', 'GRN': 'GREEN', 'GRNS': 'GREENS', 'GRV': 'GROVE', 'GRVS': 'GROVES', 'HBR': 'HARBOR', 'HBRS': 'HARBORS', 'HGWY': 'HIGHWAY', 'HVN': 'HAVEN', 'HTS': 'HEIGHTS', 'HWY': 'HIGHWAY', 'HL': 'HILL', 'HLS': 'HILLS', 'HOLW': 'HOLLOW', 'INLT': 'INLET', 'IS': 'ISLAND', 'ISS': 'ISLANDS', 'JCT': 'JUNCTION', 'JCTS': 'JUNCTIONS', 'KY': 'KEY', 'KYS': 'KEYS', 'KNL': 'KNOLL', 'KNLS': 'KNOLLS', 'LK': 'LAKE', 'LKS': 'LAKES', 'LNDG': 'LANDING', 'LN': 'LANE', 'LGT': 'LIGHT', 'LGTS': 'LIGHTS', 'LF': 'LOAF', 'LCK': 'LOCK', 'LCKS': 'LOCKS', 'LDG': 'LODGE', 'LP': 'LOOP', 'MNR': 'MANOR', 'MNRS': 'MANORS', 'MDW': 'MEADOW', 
'MDWS': 'MEADOWS', 'ML': 'MILL', 'MLS': 'MILLS', 'MSN': 'MISSION', 'MTWY': 'MOTORWAY', 'MT': 'MOUNT', 'MTN': 'MOUNTAIN', 'MTNS': 'MOUNTAINS', 'NCK': 'NECK', 'ORCH': 'ORCHARD', 'OPAS': 'OVERPASS', 'PKY': 'PARKWAY', 'PKWY': 'PARKWAY', 'PSGE': 'PASSAGE', 'PNE': 'PINE', 'PNES': 'PINES', 'PL': 'PLACE', 'PLN': 'PLAIN', 'PLNS': 'PLAINS', 'PLZ': 'PLAZA', 'PT': 'POINT', 'PTS': 'POINTS', 'PRT': 'PORT', 'PRTS': 'PORTS', 'PR': 'PRAIRIE', 'PVT': 'PRIVATE', 'RADL': 'RADIAL', 'RNCH': 'RANCH', 'RPD': 'RAPID', 'RPDS': 'RAPIDS', 'RST': 'REST', 'RDG': 'RIDGE', 'RDGS': 'RIDGES', 'RIV': 'RIVER', 'RD': 'ROAD', 'RDS': 'ROADS', 'RT': 'ROUTE', 'RTE': 'ROUTE', 'SHL': 'SHOAL', 'SHLS': 'SHOALS', 'SHR': 'SHORE', 'SHRS': 'SHORES', 'SKWY': 'SKYWAY', 'SPG': 'SPRING', 'SPGS': 'SPRINGS', 'SQ': 'SQUARE', 'SQS': 'SQUARES', 'STA': 'STATION', 'STRA': 'STRAVENUE', 'STRM': 'STREAM', 'STS': 'STREETS', 'SMT': 'SUMMIT', 'SRVC': 'SERVICE', 'TER': 'TERRACE', 'TRWY': 'THROUGHWAY', 'THFR': 'THOROUGHFARE', 'TRCE': 'TRACE', 'TRAK': 'TRACK', 'TRFY': 'TRAFFICWAY', 'TRL': 'TRAIL', 'TRLR': 'TRAILER', 'TUNL': 'TUNNEL', 'TPKE': 'TURNPIKE', 'UPAS': 'UNDERPASS', 'UN': 'UNION', 'UNP': 'UNDERPASS', 'UNS': 'UNIONS', 'VIA': 'VIADUCT', 'VIAS': 'VIADUCTS', 'VLY': 'VALLEY', 'VLYS': 'VALLEYS', 'VW': 'VIEW', 'VWS': 'VIEWS', 'VLG': 'VILLAGE', 'VL': 'VILLE', 'VIS': 'VISTA', 'WK': 'WALK', 'WKWY': 'WALKWAY', 'WY': 'WAY', 'WL': 'WELL', 'WLS': 'WELLS', 'XING': 'CROSSING', 'XINGS': 'CROSSINGS', 'XRD': 'CROSSROAD', 'XRDS': 'CROSSROADS', 'YU': 'BAYOU'}
Common street type abbreviations.
saints =
['Abigail', 'Agatha', 'Agnes', 'Andrew', 'Anthony', 'Augustine', 'Bernadette', 'Brigid', 'Catherine', 'Charles', 'Christopher', 'Clare', 'Cloud', 'Dymphna', 'Elizabeth', 'Faustina', 'Felix', 'Francis', 'Gabriel,', 'George', 'Gerard', 'James', 'Joan', 'John', 'Joseph', 'Jude', 'Kateri', 'Louis', 'Lucie', 'Lucy', 'Luke', 'Maria', 'Mark', 'Martin', 'Mary', 'Maximilian', 'Michael', 'Monica', 'Padre', 'Patrick', 'Paul', 'Peter', 'Philomena', 'Raphael', 'Rita', 'Rose', 'Sebastian', 'Teresa', 'Therese', 'Thomas', 'Valentine', 'Victor', 'Vincent']
Most common saint names.
bad_zip_first_3 =
['001', '002', '003', '004', '213', '269', '343', '345', '348', '353', '419', '428', '429', '517', '518', '519', '529', '533', '536', '552', '568', '569', '578', '579', '589', '621', '632', '642', '643', '659', '663', '682', '694', '695', '696', '697', '698', '699', '702', '709', '715', '732', '742', '817', '818', '819', '839', '848', '849', '851', '854', '858', '861', '862', '866', '867', '868', '869', '876', '886', '887', '888', '892', '896', '899', '909', '929', '987']
Three-digit combinations that don't represent a zip code.
ABBR_JOIN =
'ARPT|BLDG|CONF|CONV|CNTR|CTR|DWTN|INTL|FT|MT|MTN|SHPG|ACC|ALY|ANX|ARC|AV|AVE|BYU|BCH|BND|BLF|BLFS|BTM|BLVD|BR|BRG|BRK|BRKS|BG|BGS|BYP|CP|CY|CYN|CPE|CTRS|CIR|CIRS|CLF|CLFS|CLB|CMN|CMNS|COR|CORS|CRSE|CT|CTS|CV|CVS|CRK|CRES|CRST|CSWY|CURV|DL|DM|DV|DR|DRS|EST|EXPY|EXPWY|EXT|EXTS|FGR|FGRS|FLS|FLD|FLDS|FLT|FLTS|FRD|FRDS|FRST|FRG|FRGS|FRK|FRKS|FRY|FRYS|FOR|FORS|FWY|GD|GDN|GDNS|GTWY|GLN|GLNS|GN|GNS|GRN|GRNS|GRV|GRVS|HBR|HBRS|HGWY|HVN|HTS|HWY|HL|HLS|HOLW|INLT|IS|ISS|JCT|JCTS|KY|KYS|KNL|KNLS|LK|LKS|LNDG|LN|LGT|LGTS|LF|LCK|LCKS|LDG|LP|MNR|MNRS|MDW|MDWS|ML|MLS|MSN|MTWY|MTNS|NCK|ORCH|OPAS|PKY|PKWY|PSGE|PNE|PNES|PL|PLN|PLNS|PLZ|PT|PTS|PRT|PRTS|PR|PVT|RADL|RNCH|RPD|RPDS|RST|RDG|RDGS|RIV|RD|RDS|RT|RTE|SHL|SHLS|SHR|SHRS|SKWY|SPG|SPGS|SQ|SQS|STA|STRA|STRM|STS|SMT|SRVC|TER|TRWY|THFR|TRCE|TRAK|TRFY|TRL|TRLR|TUNL|TPKE|UPAS|UN|UNP|UNS|VIA|VIAS|VLY|VLYS|VW|VWS|VLG|VL|VIS|WK|WKWY|WY|WL|WLS|XING|XINGS|XRD|XRDS|YU'
abbr_join_comp =
regex.Regex("(\\b(?:ARPT|BLDG|CONF|CONV|CNTR|CTR|DWTN|INTL|FT|MT|MTN|SHPG|ACC|ALY|ANX|ARC|AV|AVE|BYU|BCH|BND|BLF|BLFS|BTM|BLVD|BR|BRG|BRK|BRKS|BG|BGS|BYP|CP|CY|CYN|CPE|CTRS|CIR|CIRS|CLF|CLFS|CLB|CMN|CMNS|COR|CORS|CRSE|CT|CTS|CV|CVS|CRK|CRES|CRST|CSWY|CURV|DL|DM|DV|DR|DRS|EST|EXPY|EXPWY|EXT|EXTS|FGR|FGRS|FLS|FLD|FLDS|FLT|FLTS|FRD|FRDS|FRST|FRG|FRGS|FRK|FRKS|FRY|FRYS|FOR|FORS|FWY|GD|GDN|GDNS|GTWY|GLN|GLNS|GN|GNS|GRN|GRNS|GRV|GRVS|HBR|HBRS|HGWY|HVN|HTS|HWY|HL|HLS|HOLW|INLT|IS|ISS|JCT|JCTS|KY|KYS|KNL|KNLS|LK|LKS|LNDG|LN|LGT|LGTS|LF|LCK|LCKS|LDG|LP|MNR|MNRS|MDW|MDWS|ML|MLS|MSN|MTWY|MTNS|NCK|ORCH|OPAS|PKY|PKWY|PSGE|PNE|PNES|PL|PLN|PLNS|PLZ|PT|PTS|PRT|PRTS|PR|PVT|RADL|RNCH|RPD|RPDS|RST|RDG|RDGS|RIV|RD|RDS|RT|RTE|SHL|SHLS|SHR|SHRS|SKWY|SPG|SPGS|SQ|SQS|STA|STRA|STRM|STS|SMT|SRVC|TER|TRWY|THFR|TRCE|TRAK|TRFY|TRL|TRLR|TUNL|TPKE|UPAS|UN|UNP|UNS|VIA|VIAS|VLY|VLYS|VW|VWS|VLG|VL|VIS|WK|WKWY|WY|WL|WLS|XING|XINGS|XRD|XRDS|YU)\\b\\.?)(?!')", flags=regex.I | regex.V0)
DIR_FILL =
'N\\.?E|S\\.?E|N\\.?W|S\\.?W|N|E|S|W'
dir_fill_comp =
regex.Regex("(?<!(?:^(?:Avenue) |[\\.']))(\\b(?:N\\.?E|S\\.?E|N\\.?W|S\\.?W|N|E|S|W)\\b\\.?)(?!(?:\\.?[a-zA-Z]| (?:Street|Avenue)))", flags=regex.I | regex.V0)
sr_comp =
regex.Regex('(\\bS\\.?R\\b\\.?)(?= \\d+)', flags=regex.I | regex.V0)
saint_comp =
regex.Regex('^(St\\.?)(?= )|(\\bSt\\.?)(?= (?:Abigail|Agatha|Agnes|Andrew|Anthony|Augustine|Bernadette|Brigid|Catherine|Charles|Christopher|Clare|Cloud|Dymphna|Elizabeth|Faustina|Felix|Francis|Gabriel,|George|Gerard|James|Joan|John|Joseph|Jude|Kateri|Louis|Lucie|Lucy|Luke|Maria|Mark|Martin|Mary|Maximilian|Michael|Monica|Padre|Patrick|Paul|Peter|Philomena|Raphael|Rita|Rose|Sebastian|Teresa|Therese|Thomas|Valentine|Victor|Vincent))', flags=regex.I | regex.V0)
street_comp =
regex.Regex('St\\.?(?= [NESW]\\.?[EW]?\\.?)|(?<=\\d[thndstr]{2} )St\\.?\\b|St\\.?$', flags=regex.V0)
post_comp =
regex.Regex('(\\d{5})-?0{4}', flags=regex.V0)
usa_comp =
regex.Regex(',? (?:USA?|United States(?: of America)?|Canada)\\b', flags=regex.V0)
paren_comp =
regex.Regex(' ?\\(.*\\)', flags=regex.V0)
grid_comp =
regex.Regex('\\b([NnSs]\\d{2,}\\s*[EeWw]\\d{2,}|[EeWw]\\d{2,}\\s*[NnSs]\\d{2,})\\b', flags=regex.V0)