atlus.resources

Hold info for the processing script.

  1"""Hold info for the processing script."""
  2
  3import regex
  4
  5direction_expand = {
  6    "NE": "Northeast",
  7    "SE": "Southeast",
  8    "NW": "Northwest",
  9    "SW": "Southwest",
 10    "N": "North",
 11    "E": "East",
 12    "S": "South",
 13    "W": "West",
 14}
 15"""Compass direction abbreviations."""
 16
 17name_expand = {
 18    "ARPT": "airport",
 19    "BLDG": "building",
 20    "CONF": "conference",
 21    "CONV": "convention",
 22    "CNTR": "center",
 23    "CTR": "center",
 24    "DWTN": "downtown",
 25    "INTL": "international",
 26    "FT": "fort",
 27    "MT": "mount",
 28    "MTN": "mountain",
 29    "SHPG": "shopping",
 30}
 31"""Common name abbreviations."""
 32
 33state_expand = {
 34    "ALABAMA": "AL",
 35    "ALA": "AL",
 36    "ALASKA": "AK",
 37    "ALAS": "AK",
 38    "ARIZONA": "AZ",
 39    "ARIZ": "AZ",
 40    "ARKANSAS": "AR",
 41    "ARK": "AR",
 42    "CALIFORNIA": "CA",
 43    "CALIF": "CA",
 44    "CAL": "CA",
 45    "COLORADO": "CO",
 46    "COLO": "CO",
 47    "COL": "CO",
 48    "CONNECTICUT": "CT",
 49    "CONN": "CT",
 50    "DELAWARE": "DE",
 51    "DEL": "DE",
 52    "DISTRICT OF COLUMBIA": "DC",
 53    "FLORIDA": "FL",
 54    "FLA": "FL",
 55    "FLOR": "FL",
 56    "GEORGIA": "GA",
 57    "GA": "GA",
 58    "HAWAII": "HI",
 59    "IDAHO": "ID",
 60    "IDA": "ID",
 61    "ILLINOIS": "IL",
 62    "ILL": "IL",
 63    "INDIANA": "IN",
 64    "IND": "IN",
 65    "IOWA": "IA",
 66    "KANSAS": "KS",
 67    "KANS": "KS",
 68    "KAN": "KS",
 69    "KENTUCKY": "KY",
 70    "KEN": "KY",
 71    "KENT": "KY",
 72    "LOUISIANA": "LA",
 73    "MAINE": "ME",
 74    "MARYLAND": "MD",
 75    "MASSACHUSETTS": "MA",
 76    "MASS": "MA",
 77    "MICHIGAN": "MI",
 78    "MICH": "MI",
 79    "MINNESOTA": "MN",
 80    "MINN": "MN",
 81    "MISSISSIPPI": "MS",
 82    "MISS": "MS",
 83    "MISSOURI": "MO",
 84    "MONTANA": "MT",
 85    "MONT": "MT",
 86    "NEBRASKA": "NE",
 87    "NEBR": "NE",
 88    "NEB": "NE",
 89    "NEVADA": "NV",
 90    "NEV": "NV",
 91    "NEW HAMPSHIRE": "NH",
 92    "NEW JERSEY": "NJ",
 93    "NEW MEXICO": "NM",
 94    "N MEX": "NM",
 95    "NEW M": "NM",
 96    "NEW YORK": "NY",
 97    "NORTH CAROLINA": "NC",
 98    "NORTH DAKOTA": "ND",
 99    "N DAK": "ND",
100    "OHIO": "OH",
101    "OKLAHOMA": "OK",
102    "OKLA": "OK",
103    "OREGON": "OR",
104    "OREG": "OR",
105    "ORE": "OR",
106    "PENNSYLVANIA": "PA",
107    "PENN": "PA",
108    "RHODE ISLAND": "RI",
109    "SOUTH CAROLINA": "SC",
110    "SOUTH DAKOTA": "SD",
111    "S DAK": "SD",
112    "TENNESSEE": "TN",
113    "TENN": "TN",
114    "TEXAS": "TX",
115    "TEX": "TX",
116    "UTAH": "UT",
117    "VERMONT": "VT",
118    "VIRGINIA": "VA",
119    "WASHINGTON": "WA",
120    "WASH": "WA",
121    "WEST VIRGINIA": "WV",
122    "W VA": "WV",
123    "WISCONSIN": "WI",
124    "WIS": "WI",
125    "WISC": "WI",
126    "WYOMING": "WY",
127    "WYO": "WY",
128    "ONTARIO": "ON",
129    "QUEBEC": "QC",
130    "NOVA SCOTIA": "NS",
131    "NEW BRUNSWICK": "NB",
132    "MANITOBA": "MB",
133    "BRITISH COLUMBIA": "BC",
134    "PRINCE EDWARD ISLAND": "PE",
135    "PRINCE EDWARD": "PE",
136    "SASKATCHEWAN": "SK",
137    "ALBERTA": "AB",
138    "NEWFOUNDLAND AND LABRADOR": "NL",
139    "NEWFOUNDLAND & LABRADOR": "NL",
140    "NEWFOUNDLAND": "NL",
141    "YUKON": "YK",
142    "NUNAVUT": "NU",
143    "NORTHWEST TERRITORIES": "NT",
144    "NW TERRITORIES": "NT",
145}
146"""Map states to abbreviations."""
147
# Upper-case street-type abbreviation -> upper-case expansion.
# The keys of this table are combined with those of name_expand to build the
# ABBR_JOIN alternation used by abbr_join_comp; "CTR", "FT", "MT" and "MTN"
# appear in both tables.
# NOTE(review): a few keys (e.g. "FOR", "IS", "EST", "VIA") are also ordinary
# words, and abbr_join_comp matches case-insensitively on word boundaries --
# confirm callers do not expand them inside regular names.
street_expand = {
    "ACC": "ACCESS",
    "ALY": "ALLEY",
    "ANX": "ANEX",
    "ARC": "ARCADE",
    "AV": "AVENUE",
    "AVE": "AVENUE",
    "BYU": "BAYOU",
    "BCH": "BEACH",
    "BND": "BEND",
    "BLF": "BLUFF",
    "BLFS": "BLUFFS",
    "BTM": "BOTTOM",
    "BLVD": "BOULEVARD",
    "BR": "BRANCH",
    "BRG": "BRIDGE",
    "BRK": "BROOK",
    "BRKS": "BROOKS",
    "BG": "BURG",
    "BGS": "BURGS",
    "BYP": "BYPASS",
    "CP": "CAMP",
    "CY": "KEY",
    "CYN": "CANYON",
    "CPE": "CAPE",
    "CTR": "CENTER",
    "CTRS": "CENTERS",
    "CIR": "CIRCLE",
    "CIRS": "CIRCLES",
    "CLF": "CLIFF",
    "CLFS": "CLIFFS",
    "CLB": "CLUB",
    "CMN": "COMMON",
    "CMNS": "COMMONS",
    "COR": "CORNER",
    "CORS": "CORNERS",
    "CRSE": "COURSE",
    "CT": "COURT",
    "CTS": "COURTS",
    "CV": "COVE",
    "CVS": "COVES",
    "CRK": "CREEK",
    "CRES": "CRESCENT",
    "CRST": "CREST",
    "CSWY": "CAUSEWAY",
    "CURV": "CURVE",
    "DL": "DALE",
    "DM": "DAM",
    "DV": "DIVIDE",
    "DR": "DRIVE",
    "DRS": "DRIVES",
    "EST": "ESTATE",
    "EXPY": "EXPRESSWAY",
    "EXPWY": "EXPRESSWAY",
    "EXT": "EXTENSION",
    "EXTS": "EXTENSIONS",
    "FGR": "FORGE",
    "FGRS": "FORGES",
    "FLS": "FALLS",
    "FLD": "FIELD",
    "FLDS": "FIELDS",
    "FLT": "FLAT",
    "FLTS": "FLATS",
    "FRD": "FORD",
    "FRDS": "FORDS",
    "FRST": "FOREST",
    "FRG": "FORGE",
    "FRGS": "FORGES",
    "FRK": "FORK",
    "FRKS": "FORKS",
    "FRY": "FERRY",
    "FRYS": "FERRYS",
    "FOR": "FORD",
    "FORS": "FORDS",
    "FT": "FORT",
    "FWY": "FREEWAY",
    "GD": "GRADE",
    "GDN": "GARDEN",
    "GDNS": "GARDENS",
    "GTWY": "GATEWAY",
    "GLN": "GLEN",
    "GLNS": "GLENS",
    "GN": "GREEN",
    "GNS": "GREENS",
    "GRN": "GREEN",
    "GRNS": "GREENS",
    "GRV": "GROVE",
    "GRVS": "GROVES",
    "HBR": "HARBOR",
    "HBRS": "HARBORS",
    "HGWY": "HIGHWAY",
    "HVN": "HAVEN",
    "HTS": "HEIGHTS",
    "HWY": "HIGHWAY",
    "HL": "HILL",
    "HLS": "HILLS",
    "HOLW": "HOLLOW",
    "INLT": "INLET",
    "IS": "ISLAND",
    "ISS": "ISLANDS",
    "JCT": "JUNCTION",
    "JCTS": "JUNCTIONS",
    "KY": "KEY",
    "KYS": "KEYS",
    "KNL": "KNOLL",
    "KNLS": "KNOLLS",
    "LK": "LAKE",
    "LKS": "LAKES",
    "LNDG": "LANDING",
    "LN": "LANE",
    "LGT": "LIGHT",
    "LGTS": "LIGHTS",
    "LF": "LOAF",
    "LCK": "LOCK",
    "LCKS": "LOCKS",
    "LDG": "LODGE",
    "LP": "LOOP",
    "MNR": "MANOR",
    "MNRS": "MANORS",
    "MDW": "MEADOW",
    "MDWS": "MEADOWS",
    "ML": "MILL",
    "MLS": "MILLS",
    "MSN": "MISSION",
    "MTWY": "MOTORWAY",
    "MT": "MOUNT",
    "MTN": "MOUNTAIN",
    "MTNS": "MOUNTAINS",
    "NCK": "NECK",
    "ORCH": "ORCHARD",
    "OPAS": "OVERPASS",
    "PKY": "PARKWAY",
    "PKWY": "PARKWAY",
    "PSGE": "PASSAGE",
    "PNE": "PINE",
    "PNES": "PINES",
    "PL": "PLACE",
    "PLN": "PLAIN",
    "PLNS": "PLAINS",
    "PLZ": "PLAZA",
    "PT": "POINT",
    "PTS": "POINTS",
    "PRT": "PORT",
    "PRTS": "PORTS",
    "PR": "PRAIRIE",
    "PVT": "PRIVATE",
    "RADL": "RADIAL",
    "RNCH": "RANCH",
    "RPD": "RAPID",
    "RPDS": "RAPIDS",
    "RST": "REST",
    "RDG": "RIDGE",
    "RDGS": "RIDGES",
    "RIV": "RIVER",
    "RD": "ROAD",
    "RDS": "ROADS",
    "RT": "ROUTE",
    "RTE": "ROUTE",
    "SHL": "SHOAL",
    "SHLS": "SHOALS",
    "SHR": "SHORE",
    "SHRS": "SHORES",
    "SKWY": "SKYWAY",
    "SPG": "SPRING",
    "SPGS": "SPRINGS",
    "SQ": "SQUARE",
    "SQS": "SQUARES",
    "STA": "STATION",
    "STRA": "STRAVENUE",
    "STRM": "STREAM",
    "STS": "STREETS",
    "SMT": "SUMMIT",
    "SRVC": "SERVICE",
    "TER": "TERRACE",
    "TRWY": "THROUGHWAY",
    "THFR": "THOROUGHFARE",
    "TRCE": "TRACE",
    "TRAK": "TRACK",
    "TRFY": "TRAFFICWAY",
    "TRL": "TRAIL",
    "TRLR": "TRAILER",
    "TUNL": "TUNNEL",
    "TPKE": "TURNPIKE",
    "UPAS": "UNDERPASS",
    "UN": "UNION",
    "UNP": "UNDERPASS",
    "UNS": "UNIONS",
    "VIA": "VIADUCT",
    "VIAS": "VIADUCTS",
    "VLY": "VALLEY",
    "VLYS": "VALLEYS",
    "VW": "VIEW",
    "VWS": "VIEWS",
    "VLG": "VILLAGE",
    "VL": "VILLE",
    "VIS": "VISTA",
    "WK": "WALK",
    "WKWY": "WALKWAY",
    "WY": "WAY",
    "WL": "WELL",
    "WLS": "WELLS",
    "XING": "CROSSING",
    "XINGS": "CROSSINGS",
    "XRD": "CROSSROAD",
    "XRDS": "CROSSROADS",
    "YU": "BAYOU",
}
"""Common street type abbreviations."""
356
# Given names joined into the saint_comp alternation. Every entry must be a
# bare name: any stray punctuation becomes part of the regex and stops that
# name from matching (the original "Gabriel," only matched when a comma
# followed the name).
saints = [
    "Abigail",
    "Agatha",
    "Agnes",
    "Andrew",
    "Anthony",
    "Augustine",
    "Bernadette",
    "Brigid",
    "Catherine",
    "Charles",
    "Christopher",
    "Clare",
    "Cloud",
    "Dymphna",
    "Elizabeth",
    "Faustina",
    "Felix",
    "Francis",
    "Gabriel",
    "George",
    "Gerard",
    "James",
    "Joan",
    "John",
    "Joseph",
    "Jude",
    "Kateri",
    "Louis",
    "Lucie",
    "Lucy",
    "Luke",
    "Maria",
    "Mark",
    "Martin",
    "Mary",
    "Maximilian",
    "Michael",
    "Monica",
    "Padre",
    "Patrick",
    "Paul",
    "Peter",
    "Philomena",
    "Raphael",
    "Rita",
    "Rose",
    "Sebastian",
    "Teresa",
    "Therese",
    "Thomas",
    "Valentine",
    "Victor",
    "Vincent",
]
"""Most common saint names."""
413
# Prefixes are laid out one leading digit per row purely for readability;
# splitting on whitespace produces the flat, ascending list of strings.
bad_zip_first_3 = """
    001 002 003 004
    213 269
    343 345 348 353
    419 428 429
    517 518 519 529 533 536 552 568 569 578 579 589
    621 632 642 643 659 663 682 694 695 696 697 698 699
    702 709 715 732 742
    817 818 819 839 848 849 851 854 858 861 862 866 867 868 869 876 886 887 888 892 896 899
    909 929 987
""".split()
"""Three-digit combinations that don't represent a zip code."""
485
# pre-compile regex for speed

# Alternation of every abbreviation key from name_expand and street_expand.
# Merging the two dicts first means keys present in both appear only once.
ABBR_JOIN = "|".join({**name_expand, **street_expand})
# Captures a whole-word abbreviation plus an optional trailing period,
# case-insensitively, unless the match is immediately followed by an
# apostrophe.
abbr_join_comp = regex.compile(
    rf"(\b(?:{ABBR_JOIN})\b\.?)(?!')",
    flags=regex.IGNORECASE,
)

# Each direction key with an optional period between its letters
# (e.g. "NE" becomes the pattern N\.?E).
DIR_FILL = "|".join(r"\.?".join(list(abbr)) for abbr in direction_expand)
# Captures a whole-word direction abbreviation plus an optional trailing
# period, case-insensitively. Skipped when preceded by a period, an
# apostrophe, or "Avenue " at the start of the string, and when followed by
# (an optional period and) a letter or by " Street" / " Avenue".
# NOTE(review): the lookbehind alternatives differ in width, which is
# presumably why the third-party `regex` module is used -- confirm.
dir_fill_comp = regex.compile(
    rf"(?<!(?:^(?:Avenue) |[\.']))(\b(?:{DIR_FILL})\b\.?)(?!(?:\.?[a-zA-Z]| (?:Street|Avenue)))",
    flags=regex.IGNORECASE,
)

# Captures "SR" (optionally written with periods, any case) only when it is
# followed by a space and at least one digit.
sr_comp = regex.compile(
    r"(\bS\.?R\b\.?)(?= \d+)",
    flags=regex.IGNORECASE,
)

# Captures "St"/"St." either at the very start of the string before a space,
# or at a word start when the following word begins with one of the names in
# `saints`. Case-insensitive.
saint_comp = regex.compile(
    rf"^(St\.?)(?= )|(\bSt\.?)(?= (?:{'|'.join(saints)}))",
    flags=regex.IGNORECASE,
)

# Case-sensitive "St"/"St." in three positions: before a space and a compass
# letter, after a digit plus two letters from "thndstr" and a space, or at
# the end of the string.
street_comp = regex.compile(
    r"St\.?(?= [NESW]\.?[EW]?\.?)|(?<=\d[thndstr]{2} )St\.?\b|St\.?$"
)

# Captures five digits that are followed by an optional hyphen and "0000".
post_comp = regex.compile(r"(\d{5})-?0{4}")

# Optional comma, a space, then US / USA / United States (of America) /
# Canada ending on a word boundary. Case-sensitive.
usa_comp = regex.compile(r",? (?:USA?|United States(?: of America)?|Canada)\b")

# Optional leading space plus parenthesised text. The `.*` is greedy, so a
# match runs from the first "(" to the last ")" on the line.
paren_comp = regex.compile(r" ?\(.*\)")

# match Wisconsin grid-style addresses: N65w25055, W249 N6620, etc.
grid_comp = regex.compile(
    r"\b([NnSs]\d{2,}\s*[EeWw]\d{2,}|[EeWw]\d{2,}\s*[NnSs]\d{2,})\b"
)
direction_expand = {'NE': 'Northeast', 'SE': 'Southeast', 'NW': 'Northwest', 'SW': 'Southwest', 'N': 'North', 'E': 'East', 'S': 'South', 'W': 'West'}

Compass direction abbreviations.

name_expand = {'ARPT': 'airport', 'BLDG': 'building', 'CONF': 'conference', 'CONV': 'convention', 'CNTR': 'center', 'CTR': 'center', 'DWTN': 'downtown', 'INTL': 'international', 'FT': 'fort', 'MT': 'mount', 'MTN': 'mountain', 'SHPG': 'shopping'}

Common name abbreviations.

state_expand = {'ALABAMA': 'AL', 'ALA': 'AL', 'ALASKA': 'AK', 'ALAS': 'AK', 'ARIZONA': 'AZ', 'ARIZ': 'AZ', 'ARKANSAS': 'AR', 'ARK': 'AR', 'CALIFORNIA': 'CA', 'CALIF': 'CA', 'CAL': 'CA', 'COLORADO': 'CO', 'COLO': 'CO', 'COL': 'CO', 'CONNECTICUT': 'CT', 'CONN': 'CT', 'DELAWARE': 'DE', 'DEL': 'DE', 'DISTRICT OF COLUMBIA': 'DC', 'FLORIDA': 'FL', 'FLA': 'FL', 'FLOR': 'FL', 'GEORGIA': 'GA', 'GA': 'GA', 'HAWAII': 'HI', 'IDAHO': 'ID', 'IDA': 'ID', 'ILLINOIS': 'IL', 'ILL': 'IL', 'INDIANA': 'IN', 'IND': 'IN', 'IOWA': 'IA', 'KANSAS': 'KS', 'KANS': 'KS', 'KAN': 'KS', 'KENTUCKY': 'KY', 'KEN': 'KY', 'KENT': 'KY', 'LOUISIANA': 'LA', 'MAINE': 'ME', 'MARYLAND': 'MD', 'MASSACHUSETTS': 'MA', 'MASS': 'MA', 'MICHIGAN': 'MI', 'MICH': 'MI', 'MINNESOTA': 'MN', 'MINN': 'MN', 'MISSISSIPPI': 'MS', 'MISS': 'MS', 'MISSOURI': 'MO', 'MONTANA': 'MT', 'MONT': 'MT', 'NEBRASKA': 'NE', 'NEBR': 'NE', 'NEB': 'NE', 'NEVADA': 'NV', 'NEV': 'NV', 'NEW HAMPSHIRE': 'NH', 'NEW JERSEY': 'NJ', 'NEW MEXICO': 'NM', 'N MEX': 'NM', 'NEW M': 'NM', 'NEW YORK': 'NY', 'NORTH CAROLINA': 'NC', 'NORTH DAKOTA': 'ND', 'N DAK': 'ND', 'OHIO': 'OH', 'OKLAHOMA': 'OK', 'OKLA': 'OK', 'OREGON': 'OR', 'OREG': 'OR', 'ORE': 'OR', 'PENNSYLVANIA': 'PA', 'PENN': 'PA', 'RHODE ISLAND': 'RI', 'SOUTH CAROLINA': 'SC', 'SOUTH DAKOTA': 'SD', 'S DAK': 'SD', 'TENNESSEE': 'TN', 'TENN': 'TN', 'TEXAS': 'TX', 'TEX': 'TX', 'UTAH': 'UT', 'VERMONT': 'VT', 'VIRGINIA': 'VA', 'WASHINGTON': 'WA', 'WASH': 'WA', 'WEST VIRGINIA': 'WV', 'W VA': 'WV', 'WISCONSIN': 'WI', 'WIS': 'WI', 'WISC': 'WI', 'WYOMING': 'WY', 'WYO': 'WY', 'ONTARIO': 'ON', 'QUEBEC': 'QC', 'NOVA SCOTIA': 'NS', 'NEW BRUNSWICK': 'NB', 'MANITOBA': 'MB', 'BRITISH COLUMBIA': 'BC', 'PRINCE EDWARD ISLAND': 'PE', 'PRINCE EDWARD': 'PE', 'SASKATCHEWAN': 'SK', 'ALBERTA': 'AB', 'NEWFOUNDLAND AND LABRADOR': 'NL', 'NEWFOUNDLAND & LABRADOR': 'NL', 'NEWFOUNDLAND': 'NL', 'YUKON': 'YK', 'NUNAVUT': 'NU', 'NORTHWEST TERRITORIES': 'NT', 'NW TERRITORIES': 'NT'}

Map states to abbreviations.

street_expand = {'ACC': 'ACCESS', 'ALY': 'ALLEY', 'ANX': 'ANEX', 'ARC': 'ARCADE', 'AV': 'AVENUE', 'AVE': 'AVENUE', 'BYU': 'BAYOU', 'BCH': 'BEACH', 'BND': 'BEND', 'BLF': 'BLUFF', 'BLFS': 'BLUFFS', 'BTM': 'BOTTOM', 'BLVD': 'BOULEVARD', 'BR': 'BRANCH', 'BRG': 'BRIDGE', 'BRK': 'BROOK', 'BRKS': 'BROOKS', 'BG': 'BURG', 'BGS': 'BURGS', 'BYP': 'BYPASS', 'CP': 'CAMP', 'CY': 'KEY', 'CYN': 'CANYON', 'CPE': 'CAPE', 'CTR': 'CENTER', 'CTRS': 'CENTERS', 'CIR': 'CIRCLE', 'CIRS': 'CIRCLES', 'CLF': 'CLIFF', 'CLFS': 'CLIFFS', 'CLB': 'CLUB', 'CMN': 'COMMON', 'CMNS': 'COMMONS', 'COR': 'CORNER', 'CORS': 'CORNERS', 'CRSE': 'COURSE', 'CT': 'COURT', 'CTS': 'COURTS', 'CV': 'COVE', 'CVS': 'COVES', 'CRK': 'CREEK', 'CRES': 'CRESCENT', 'CRST': 'CREST', 'CSWY': 'CAUSEWAY', 'CURV': 'CURVE', 'DL': 'DALE', 'DM': 'DAM', 'DV': 'DIVIDE', 'DR': 'DRIVE', 'DRS': 'DRIVES', 'EST': 'ESTATE', 'EXPY': 'EXPRESSWAY', 'EXPWY': 'EXPRESSWAY', 'EXT': 'EXTENSION', 'EXTS': 'EXTENSIONS', 'FGR': 'FORGE', 'FGRS': 'FORGES', 'FLS': 'FALLS', 'FLD': 'FIELD', 'FLDS': 'FIELDS', 'FLT': 'FLAT', 'FLTS': 'FLATS', 'FRD': 'FORD', 'FRDS': 'FORDS', 'FRST': 'FOREST', 'FRG': 'FORGE', 'FRGS': 'FORGES', 'FRK': 'FORK', 'FRKS': 'FORKS', 'FRY': 'FERRY', 'FRYS': 'FERRYS', 'FOR': 'FORD', 'FORS': 'FORDS', 'FT': 'FORT', 'FWY': 'FREEWAY', 'GD': 'GRADE', 'GDN': 'GARDEN', 'GDNS': 'GARDENS', 'GTWY': 'GATEWAY', 'GLN': 'GLEN', 'GLNS': 'GLENS', 'GN': 'GREEN', 'GNS': 'GREENS', 'GRN': 'GREEN', 'GRNS': 'GREENS', 'GRV': 'GROVE', 'GRVS': 'GROVES', 'HBR': 'HARBOR', 'HBRS': 'HARBORS', 'HGWY': 'HIGHWAY', 'HVN': 'HAVEN', 'HTS': 'HEIGHTS', 'HWY': 'HIGHWAY', 'HL': 'HILL', 'HLS': 'HILLS', 'HOLW': 'HOLLOW', 'INLT': 'INLET', 'IS': 'ISLAND', 'ISS': 'ISLANDS', 'JCT': 'JUNCTION', 'JCTS': 'JUNCTIONS', 'KY': 'KEY', 'KYS': 'KEYS', 'KNL': 'KNOLL', 'KNLS': 'KNOLLS', 'LK': 'LAKE', 'LKS': 'LAKES', 'LNDG': 'LANDING', 'LN': 'LANE', 'LGT': 'LIGHT', 'LGTS': 'LIGHTS', 'LF': 'LOAF', 'LCK': 'LOCK', 'LCKS': 'LOCKS', 'LDG': 'LODGE', 'LP': 'LOOP', 'MNR': 'MANOR', 'MNRS': 'MANORS', 
'MDW': 'MEADOW', 'MDWS': 'MEADOWS', 'ML': 'MILL', 'MLS': 'MILLS', 'MSN': 'MISSION', 'MTWY': 'MOTORWAY', 'MT': 'MOUNT', 'MTN': 'MOUNTAIN', 'MTNS': 'MOUNTAINS', 'NCK': 'NECK', 'ORCH': 'ORCHARD', 'OPAS': 'OVERPASS', 'PKY': 'PARKWAY', 'PKWY': 'PARKWAY', 'PSGE': 'PASSAGE', 'PNE': 'PINE', 'PNES': 'PINES', 'PL': 'PLACE', 'PLN': 'PLAIN', 'PLNS': 'PLAINS', 'PLZ': 'PLAZA', 'PT': 'POINT', 'PTS': 'POINTS', 'PRT': 'PORT', 'PRTS': 'PORTS', 'PR': 'PRAIRIE', 'PVT': 'PRIVATE', 'RADL': 'RADIAL', 'RNCH': 'RANCH', 'RPD': 'RAPID', 'RPDS': 'RAPIDS', 'RST': 'REST', 'RDG': 'RIDGE', 'RDGS': 'RIDGES', 'RIV': 'RIVER', 'RD': 'ROAD', 'RDS': 'ROADS', 'RT': 'ROUTE', 'RTE': 'ROUTE', 'SHL': 'SHOAL', 'SHLS': 'SHOALS', 'SHR': 'SHORE', 'SHRS': 'SHORES', 'SKWY': 'SKYWAY', 'SPG': 'SPRING', 'SPGS': 'SPRINGS', 'SQ': 'SQUARE', 'SQS': 'SQUARES', 'STA': 'STATION', 'STRA': 'STRAVENUE', 'STRM': 'STREAM', 'STS': 'STREETS', 'SMT': 'SUMMIT', 'SRVC': 'SERVICE', 'TER': 'TERRACE', 'TRWY': 'THROUGHWAY', 'THFR': 'THOROUGHFARE', 'TRCE': 'TRACE', 'TRAK': 'TRACK', 'TRFY': 'TRAFFICWAY', 'TRL': 'TRAIL', 'TRLR': 'TRAILER', 'TUNL': 'TUNNEL', 'TPKE': 'TURNPIKE', 'UPAS': 'UNDERPASS', 'UN': 'UNION', 'UNP': 'UNDERPASS', 'UNS': 'UNIONS', 'VIA': 'VIADUCT', 'VIAS': 'VIADUCTS', 'VLY': 'VALLEY', 'VLYS': 'VALLEYS', 'VW': 'VIEW', 'VWS': 'VIEWS', 'VLG': 'VILLAGE', 'VL': 'VILLE', 'VIS': 'VISTA', 'WK': 'WALK', 'WKWY': 'WALKWAY', 'WY': 'WAY', 'WL': 'WELL', 'WLS': 'WELLS', 'XING': 'CROSSING', 'XINGS': 'CROSSINGS', 'XRD': 'CROSSROAD', 'XRDS': 'CROSSROADS', 'YU': 'BAYOU'}

Common street type abbreviations.

saints = ['Abigail', 'Agatha', 'Agnes', 'Andrew', 'Anthony', 'Augustine', 'Bernadette', 'Brigid', 'Catherine', 'Charles', 'Christopher', 'Clare', 'Cloud', 'Dymphna', 'Elizabeth', 'Faustina', 'Felix', 'Francis', 'Gabriel,', 'George', 'Gerard', 'James', 'Joan', 'John', 'Joseph', 'Jude', 'Kateri', 'Louis', 'Lucie', 'Lucy', 'Luke', 'Maria', 'Mark', 'Martin', 'Mary', 'Maximilian', 'Michael', 'Monica', 'Padre', 'Patrick', 'Paul', 'Peter', 'Philomena', 'Raphael', 'Rita', 'Rose', 'Sebastian', 'Teresa', 'Therese', 'Thomas', 'Valentine', 'Victor', 'Vincent']

Most common saint names.

bad_zip_first_3 = ['001', '002', '003', '004', '213', '269', '343', '345', '348', '353', '419', '428', '429', '517', '518', '519', '529', '533', '536', '552', '568', '569', '578', '579', '589', '621', '632', '642', '643', '659', '663', '682', '694', '695', '696', '697', '698', '699', '702', '709', '715', '732', '742', '817', '818', '819', '839', '848', '849', '851', '854', '858', '861', '862', '866', '867', '868', '869', '876', '886', '887', '888', '892', '896', '899', '909', '929', '987']

Three-digit combinations that don't represent a zip code.

ABBR_JOIN = 'ARPT|BLDG|CONF|CONV|CNTR|CTR|DWTN|INTL|FT|MT|MTN|SHPG|ACC|ALY|ANX|ARC|AV|AVE|BYU|BCH|BND|BLF|BLFS|BTM|BLVD|BR|BRG|BRK|BRKS|BG|BGS|BYP|CP|CY|CYN|CPE|CTRS|CIR|CIRS|CLF|CLFS|CLB|CMN|CMNS|COR|CORS|CRSE|CT|CTS|CV|CVS|CRK|CRES|CRST|CSWY|CURV|DL|DM|DV|DR|DRS|EST|EXPY|EXPWY|EXT|EXTS|FGR|FGRS|FLS|FLD|FLDS|FLT|FLTS|FRD|FRDS|FRST|FRG|FRGS|FRK|FRKS|FRY|FRYS|FOR|FORS|FWY|GD|GDN|GDNS|GTWY|GLN|GLNS|GN|GNS|GRN|GRNS|GRV|GRVS|HBR|HBRS|HGWY|HVN|HTS|HWY|HL|HLS|HOLW|INLT|IS|ISS|JCT|JCTS|KY|KYS|KNL|KNLS|LK|LKS|LNDG|LN|LGT|LGTS|LF|LCK|LCKS|LDG|LP|MNR|MNRS|MDW|MDWS|ML|MLS|MSN|MTWY|MTNS|NCK|ORCH|OPAS|PKY|PKWY|PSGE|PNE|PNES|PL|PLN|PLNS|PLZ|PT|PTS|PRT|PRTS|PR|PVT|RADL|RNCH|RPD|RPDS|RST|RDG|RDGS|RIV|RD|RDS|RT|RTE|SHL|SHLS|SHR|SHRS|SKWY|SPG|SPGS|SQ|SQS|STA|STRA|STRM|STS|SMT|SRVC|TER|TRWY|THFR|TRCE|TRAK|TRFY|TRL|TRLR|TUNL|TPKE|UPAS|UN|UNP|UNS|VIA|VIAS|VLY|VLYS|VW|VWS|VLG|VL|VIS|WK|WKWY|WY|WL|WLS|XING|XINGS|XRD|XRDS|YU'
abbr_join_comp = regex.Regex("(\\b(?:ARPT|BLDG|CONF|CONV|CNTR|CTR|DWTN|INTL|FT|MT|MTN|SHPG|ACC|ALY|ANX|ARC|AV|AVE|BYU|BCH|BND|BLF|BLFS|BTM|BLVD|BR|BRG|BRK|BRKS|BG|BGS|BYP|CP|CY|CYN|CPE|CTRS|CIR|CIRS|CLF|CLFS|CLB|CMN|CMNS|COR|CORS|CRSE|CT|CTS|CV|CVS|CRK|CRES|CRST|CSWY|CURV|DL|DM|DV|DR|DRS|EST|EXPY|EXPWY|EXT|EXTS|FGR|FGRS|FLS|FLD|FLDS|FLT|FLTS|FRD|FRDS|FRST|FRG|FRGS|FRK|FRKS|FRY|FRYS|FOR|FORS|FWY|GD|GDN|GDNS|GTWY|GLN|GLNS|GN|GNS|GRN|GRNS|GRV|GRVS|HBR|HBRS|HGWY|HVN|HTS|HWY|HL|HLS|HOLW|INLT|IS|ISS|JCT|JCTS|KY|KYS|KNL|KNLS|LK|LKS|LNDG|LN|LGT|LGTS|LF|LCK|LCKS|LDG|LP|MNR|MNRS|MDW|MDWS|ML|MLS|MSN|MTWY|MTNS|NCK|ORCH|OPAS|PKY|PKWY|PSGE|PNE|PNES|PL|PLN|PLNS|PLZ|PT|PTS|PRT|PRTS|PR|PVT|RADL|RNCH|RPD|RPDS|RST|RDG|RDGS|RIV|RD|RDS|RT|RTE|SHL|SHLS|SHR|SHRS|SKWY|SPG|SPGS|SQ|SQS|STA|STRA|STRM|STS|SMT|SRVC|TER|TRWY|THFR|TRCE|TRAK|TRFY|TRL|TRLR|TUNL|TPKE|UPAS|UN|UNP|UNS|VIA|VIAS|VLY|VLYS|VW|VWS|VLG|VL|VIS|WK|WKWY|WY|WL|WLS|XING|XINGS|XRD|XRDS|YU)\\b\\.?)(?!')", flags=regex.I | regex.V0)
DIR_FILL = 'N\\.?E|S\\.?E|N\\.?W|S\\.?W|N|E|S|W'
dir_fill_comp = regex.Regex("(?<!(?:^(?:Avenue) |[\\.']))(\\b(?:N\\.?E|S\\.?E|N\\.?W|S\\.?W|N|E|S|W)\\b\\.?)(?!(?:\\.?[a-zA-Z]| (?:Street|Avenue)))", flags=regex.I | regex.V0)
sr_comp = regex.Regex('(\\bS\\.?R\\b\\.?)(?= \\d+)', flags=regex.I | regex.V0)
saint_comp = regex.Regex('^(St\\.?)(?= )|(\\bSt\\.?)(?= (?:Abigail|Agatha|Agnes|Andrew|Anthony|Augustine|Bernadette|Brigid|Catherine|Charles|Christopher|Clare|Cloud|Dymphna|Elizabeth|Faustina|Felix|Francis|Gabriel,|George|Gerard|James|Joan|John|Joseph|Jude|Kateri|Louis|Lucie|Lucy|Luke|Maria|Mark|Martin|Mary|Maximilian|Michael|Monica|Padre|Patrick|Paul|Peter|Philomena|Raphael|Rita|Rose|Sebastian|Teresa|Therese|Thomas|Valentine|Victor|Vincent))', flags=regex.I | regex.V0)
street_comp = regex.Regex('St\\.?(?= [NESW]\\.?[EW]?\\.?)|(?<=\\d[thndstr]{2} )St\\.?\\b|St\\.?$', flags=regex.V0)
post_comp = regex.Regex('(\\d{5})-?0{4}', flags=regex.V0)
usa_comp = regex.Regex(',? (?:USA?|United States(?: of America)?|Canada)\\b', flags=regex.V0)
paren_comp = regex.Regex(' ?\\(.*\\)', flags=regex.V0)
grid_comp = regex.Regex('\\b([NnSs]\\d{2,}\\s*[EeWw]\\d{2,}|[EeWw]\\d{2,}\\s*[NnSs]\\d{2,})\\b', flags=regex.V0)