Coverage for src/atlus/resources.py: 100%
24 statements
« prev ^ index » next coverage.py v7.6.0, created at 2024-07-27 14:35 -0400
1"""Hold info for the processing script."""
3import regex
# Compass-direction abbreviations mapped to their spelled-out form.
# The two-letter intercardinal keys are listed before the one-letter cardinal
# keys; the key order is preserved because the keys are joined, in order, into
# the DIR_FILL regex alternation elsewhere in this module.
direction_expand = {
    "NE": "Northeast",
    "SE": "Southeast",
    "NW": "Northwest",
    "SW": "Southwest",
    "N": "North",
    "E": "East",
    "S": "South",
    "W": "West",
}
"""Compass direction abbreviations."""
# Upper-case abbreviations that show up in place names, mapped to the
# lower-case word they stand for. The keys are also merged with the
# street_expand keys to build the ABBR_JOIN regex alternation.
name_expand = {
    "ARPT": "airport",
    "BLDG": "building",
    "CONF": "conference",
    "CONV": "convention",
    "CNTR": "center",
    "CTR": "center",
    "DWTN": "downtown",
    "INTL": "international",
    "FT": "fort",
    "MT": "mount",
    "MTN": "mountain",
    "SHPG": "shopping",
}
"""Common name abbreviations."""
# Upper-case full names and long-form abbreviations of US states (plus the
# District of Columbia) and Canadian provinces/territories, mapped to the
# two-letter postal abbreviation.
state_expand = {
    "ALABAMA": "AL",
    "ALA": "AL",
    "ALASKA": "AK",
    "ALAS": "AK",
    "ARIZONA": "AZ",
    "ARIZ": "AZ",
    "ARKANSAS": "AR",
    "ARK": "AR",
    "CALIFORNIA": "CA",
    "CALIF": "CA",
    "CAL": "CA",
    "COLORADO": "CO",
    "COLO": "CO",
    "COL": "CO",
    "CONNECTICUT": "CT",
    "CONN": "CT",
    "DELAWARE": "DE",
    "DEL": "DE",
    "DISTRICT OF COLUMBIA": "DC",
    "FLORIDA": "FL",
    "FLA": "FL",
    "FLOR": "FL",
    "GEORGIA": "GA",
    "GA": "GA",
    "HAWAII": "HI",
    "IDAHO": "ID",
    "IDA": "ID",
    "ILLINOIS": "IL",
    "ILL": "IL",
    "INDIANA": "IN",
    "IND": "IN",
    "IOWA": "IA",
    "KANSAS": "KS",
    "KANS": "KS",
    "KAN": "KS",
    "KENTUCKY": "KY",
    "KEN": "KY",
    "KENT": "KY",
    "LOUISIANA": "LA",
    "MAINE": "ME",
    "MARYLAND": "MD",
    "MASSACHUSETTS": "MA",
    "MASS": "MA",
    "MICHIGAN": "MI",
    "MICH": "MI",
    "MINNESOTA": "MN",
    "MINN": "MN",
    "MISSISSIPPI": "MS",
    "MISS": "MS",
    "MISSOURI": "MO",
    "MONTANA": "MT",
    "MONT": "MT",
    "NEBRASKA": "NE",
    "NEBR": "NE",
    "NEB": "NE",
    "NEVADA": "NV",
    "NEV": "NV",
    "NEW HAMPSHIRE": "NH",
    "NEW JERSEY": "NJ",
    "NEW MEXICO": "NM",
    "N MEX": "NM",
    "NEW M": "NM",
    "NEW YORK": "NY",
    "NORTH CAROLINA": "NC",
    "NORTH DAKOTA": "ND",
    "N DAK": "ND",
    "OHIO": "OH",
    "OKLAHOMA": "OK",
    "OKLA": "OK",
    "OREGON": "OR",
    "OREG": "OR",
    "ORE": "OR",
    "PENNSYLVANIA": "PA",
    "PENN": "PA",
    "RHODE ISLAND": "RI",
    "SOUTH CAROLINA": "SC",
    "SOUTH DAKOTA": "SD",
    "S DAK": "SD",
    "TENNESSEE": "TN",
    "TENN": "TN",
    "TEXAS": "TX",
    "TEX": "TX",
    "UTAH": "UT",
    "VERMONT": "VT",
    "VIRGINIA": "VA",
    "WASHINGTON": "WA",
    "WASH": "WA",
    "WEST VIRGINIA": "WV",
    "W VA": "WV",
    "WISCONSIN": "WI",
    "WIS": "WI",
    "WISC": "WI",
    "WYOMING": "WY",
    "WYO": "WY",
    # Canadian provinces and territories.
    "ONTARIO": "ON",
    "QUEBEC": "QC",
    "NOVA SCOTIA": "NS",
    "NEW BRUNSWICK": "NB",
    "MANITOBA": "MB",
    "BRITISH COLUMBIA": "BC",
    "PRINCE EDWARD ISLAND": "PE",
    "PRINCE EDWARD": "PE",
    "SASKATCHEWAN": "SK",
    "ALBERTA": "AB",
    "NEWFOUNDLAND AND LABRADOR": "NL",
    "NEWFOUNDLAND & LABRADOR": "NL",
    "NEWFOUNDLAND": "NL",
    # The Canada Post symbol for Yukon is "YT", not "YK".
    "YUKON": "YT",
    "NUNAVUT": "NU",
    "NORTHWEST TERRITORIES": "NT",
    "NW TERRITORIES": "NT",
}
"""Map state, province and territory names to postal abbreviations."""
# Upper-case street-type abbreviations mapped to the upper-case word they
# expand to. Several spellings may map to the same word (e.g. "AV"/"AVE",
# "PKY"/"PKWY"). The keys are also merged with the name_expand keys to build
# the ABBR_JOIN regex alternation, so the entry order is kept as-is.
street_expand = {
    "ACC": "ACCESS",
    "ALY": "ALLEY",
    "ANX": "ANEX",
    "ARC": "ARCADE",
    "AV": "AVENUE",
    "AVE": "AVENUE",
    "BYU": "BAYOU",
    "BCH": "BEACH",
    "BND": "BEND",
    "BLF": "BLUFF",
    "BLFS": "BLUFFS",
    "BTM": "BOTTOM",
    "BLVD": "BOULEVARD",
    "BR": "BRANCH",
    "BRG": "BRIDGE",
    "BRK": "BROOK",
    "BRKS": "BROOKS",
    "BG": "BURG",
    "BGS": "BURGS",
    "BYP": "BYPASS",
    "CP": "CAMP",
    "CY": "KEY",
    "CYN": "CANYON",
    "CPE": "CAPE",
    "CTR": "CENTER",
    "CTRS": "CENTERS",
    "CIR": "CIRCLE",
    "CIRS": "CIRCLES",
    "CLF": "CLIFF",
    "CLFS": "CLIFFS",
    "CLB": "CLUB",
    "CMN": "COMMON",
    "CMNS": "COMMONS",
    "COR": "CORNER",
    "CORS": "CORNERS",
    "CRSE": "COURSE",
    "CT": "COURT",
    "CTS": "COURTS",
    "CV": "COVE",
    "CVS": "COVES",
    "CRK": "CREEK",
    "CRES": "CRESCENT",
    "CRST": "CREST",
    "CSWY": "CAUSEWAY",
    "CURV": "CURVE",
    "DL": "DALE",
    "DM": "DAM",
    "DV": "DIVIDE",
    "DR": "DRIVE",
    "DRS": "DRIVES",
    "EST": "ESTATE",
    "EXPY": "EXPRESSWAY",
    "EXPWY": "EXPRESSWAY",
    "EXT": "EXTENSION",
    "EXTS": "EXTENSIONS",
    "FGR": "FORGE",
    "FGRS": "FORGES",
    "FLS": "FALLS",
    "FLD": "FIELD",
    "FLDS": "FIELDS",
    "FLT": "FLAT",
    "FLTS": "FLATS",
    "FRD": "FORD",
    "FRDS": "FORDS",
    "FRST": "FOREST",
    "FRG": "FORGE",
    "FRGS": "FORGES",
    "FRK": "FORK",
    "FRKS": "FORKS",
    "FRY": "FERRY",
    "FRYS": "FERRYS",
    "FOR": "FORD",
    "FORS": "FORDS",
    "FT": "FORT",
    "FWY": "FREEWAY",
    "GD": "GRADE",
    "GDN": "GARDEN",
    "GDNS": "GARDENS",
    "GTWY": "GATEWAY",
    "GLN": "GLEN",
    "GLNS": "GLENS",
    "GN": "GREEN",
    "GNS": "GREENS",
    "GRN": "GREEN",
    "GRNS": "GREENS",
    "GRV": "GROVE",
    "GRVS": "GROVES",
    "HBR": "HARBOR",
    "HBRS": "HARBORS",
    "HGWY": "HIGHWAY",
    "HVN": "HAVEN",
    "HTS": "HEIGHTS",
    "HWY": "HIGHWAY",
    "HL": "HILL",
    "HLS": "HILLS",
    "HOLW": "HOLLOW",
    "INLT": "INLET",
    "IS": "ISLAND",
    "ISS": "ISLANDS",
    "JCT": "JUNCTION",
    "JCTS": "JUNCTIONS",
    "KY": "KEY",
    "KYS": "KEYS",
    "KNL": "KNOLL",
    "KNLS": "KNOLLS",
    "LK": "LAKE",
    "LKS": "LAKES",
    "LNDG": "LANDING",
    "LN": "LANE",
    "LGT": "LIGHT",
    "LGTS": "LIGHTS",
    "LF": "LOAF",
    "LCK": "LOCK",
    "LCKS": "LOCKS",
    "LDG": "LODGE",
    "LP": "LOOP",
    "MNR": "MANOR",
    "MNRS": "MANORS",
    "MDW": "MEADOW",
    "MDWS": "MEADOWS",
    "ML": "MILL",
    "MLS": "MILLS",
    "MSN": "MISSION",
    "MTWY": "MOTORWAY",
    "MT": "MOUNT",
    "MTN": "MOUNTAIN",
    "MTNS": "MOUNTAINS",
    "NCK": "NECK",
    "ORCH": "ORCHARD",
    "OPAS": "OVERPASS",
    "PKY": "PARKWAY",
    "PKWY": "PARKWAY",
    "PSGE": "PASSAGE",
    "PNE": "PINE",
    "PNES": "PINES",
    "PL": "PLACE",
    "PLN": "PLAIN",
    "PLNS": "PLAINS",
    "PLZ": "PLAZA",
    "PT": "POINT",
    "PTS": "POINTS",
    "PRT": "PORT",
    "PRTS": "PORTS",
    "PR": "PRAIRIE",
    "PVT": "PRIVATE",
    "RADL": "RADIAL",
    "RNCH": "RANCH",
    "RPD": "RAPID",
    "RPDS": "RAPIDS",
    "RST": "REST",
    "RDG": "RIDGE",
    "RDGS": "RIDGES",
    "RIV": "RIVER",
    "RD": "ROAD",
    "RDS": "ROADS",
    "RT": "ROUTE",
    "RTE": "ROUTE",
    "SHL": "SHOAL",
    "SHLS": "SHOALS",
    "SHR": "SHORE",
    "SHRS": "SHORES",
    "SKWY": "SKYWAY",
    "SPG": "SPRING",
    "SPGS": "SPRINGS",
    "SQ": "SQUARE",
    "SQS": "SQUARES",
    "STA": "STATION",
    "STRA": "STRAVENUE",
    "STRM": "STREAM",
    "STS": "STREETS",
    "SMT": "SUMMIT",
    "SRVC": "SERVICE",
    "TER": "TERRACE",
    "TRWY": "THROUGHWAY",
    "THFR": "THOROUGHFARE",
    "TRCE": "TRACE",
    "TRAK": "TRACK",
    "TRFY": "TRAFFICWAY",
    "TRL": "TRAIL",
    "TRLR": "TRAILER",
    "TUNL": "TUNNEL",
    "TPKE": "TURNPIKE",
    "UPAS": "UNDERPASS",
    "UN": "UNION",
    "UNP": "UNDERPASS",
    "UNS": "UNIONS",
    "VIA": "VIADUCT",
    "VIAS": "VIADUCTS",
    "VLY": "VALLEY",
    "VLYS": "VALLEYS",
    "VW": "VIEW",
    "VWS": "VIEWS",
    "VLG": "VILLAGE",
    "VL": "VILLE",
    "VIS": "VISTA",
    "WK": "WALK",
    "WKWY": "WALKWAY",
    "WY": "WAY",
    "WL": "WELL",
    "WLS": "WELLS",
    "XING": "CROSSING",
    "XINGS": "CROSSINGS",
    "XRD": "CROSSROAD",
    "XRDS": "CROSSROADS",
    "YU": "BAYOU",
}
"""Common street type abbreviations."""
# Given names that may follow "St" when it stands for "Saint". The list is
# joined into the saint_comp regex alternation, so every entry must be the
# bare name: a stray comma inside the "Gabriel" string previously meant that
# name only matched when followed by a literal comma.
saints = [
    "Abigail",
    "Agatha",
    "Agnes",
    "Andrew",
    "Anthony",
    "Augustine",
    "Bernadette",
    "Brigid",
    "Catherine",
    "Charles",
    "Christopher",
    "Clare",
    "Cloud",
    "Dymphna",
    "Elizabeth",
    "Faustina",
    "Felix",
    "Francis",
    "Gabriel",
    "George",
    "Gerard",
    "James",
    "Joan",
    "John",
    "Joseph",
    "Jude",
    "Kateri",
    "Louis",
    "Lucie",
    "Lucy",
    "Luke",
    "Maria",
    "Mark",
    "Martin",
    "Mary",
    "Maximilian",
    "Michael",
    "Monica",
    "Padre",
    "Patrick",
    "Paul",
    "Peter",
    "Philomena",
    "Raphael",
    "Rita",
    "Rose",
    "Sebastian",
    "Teresa",
    "Therese",
    "Thomas",
    "Valentine",
    "Victor",
    "Vincent",
]
"""Most common saint names."""
# Three-digit prefixes, stored as zero-padded strings, that do not begin any
# valid ZIP code. Kept as a list so existing callers that index, slice or
# concatenate it keep working.
bad_zip_first_3 = [
    "001",
    "002",
    "003",
    "004",
    "213",
    "269",
    "343",
    "345",
    "348",
    "353",
    "419",
    "428",
    "429",
    "517",
    "518",
    "519",
    "529",
    "533",
    "536",
    "552",
    "568",
    "569",
    "578",
    "579",
    "589",
    "621",
    "632",
    "642",
    "643",
    "659",
    "663",
    "682",
    "694",
    "695",
    "696",
    "697",
    "698",
    "699",
    "702",
    "709",
    "715",
    "732",
    "742",
    "817",
    "818",
    "819",
    "839",
    "848",
    "849",
    "851",
    "854",
    "858",
    "861",
    "862",
    "866",
    "867",
    "868",
    "869",
    "876",
    "886",
    "887",
    "888",
    "892",
    "896",
    "899",
    "909",
    "929",
    "987",
]
"""Three-digit combinations that don't represent a zip code."""
# pre-compile regex for speed
# Alternation of every abbreviation key from name_expand and street_expand;
# merging the dicts first means a key present in both appears only once.
ABBR_JOIN = "|".join({**name_expand, **street_expand})
# Group 1 captures a whole-word abbreviation plus an optional trailing period.
# The negative lookahead rejects a match immediately followed by an
# apostrophe. Matching is case-insensitive.
abbr_join_comp = regex.compile(
    rf"(\b(?:{ABBR_JOIN})\b\.?)(?!')",
    flags=regex.IGNORECASE,
)
# Alternation of the direction abbreviations with an optional period allowed
# between the letters, e.g. "NE" becomes r"N\.?E". str.join iterates the
# characters of the string directly, so no intermediate list is needed.
DIR_FILL = "|".join(r"\.?".join(abbr) for abbr in direction_expand)
# Group 1 captures a whole-word direction abbreviation plus an optional
# trailing period. It is skipped when preceded by "Avenue " at the start of
# the string, or by a period or apostrophe; and when followed by a letter
# (optionally after a period) or by " Street" / " Avenue".
# NOTE: the variable-width lookbehind relies on the third-party `regex`
# module; the standard-library `re` module rejects it.
dir_fill_comp = regex.compile(
    rf"(?<!(?:^(?:Avenue) |[\.']))(\b(?:{DIR_FILL})\b\.?)(?!(?:\.?[a-zA-Z]| (?:Street|Avenue)))",
    flags=regex.IGNORECASE,
)
# Group 1 captures "SR" written with optional periods ("SR", "S.R", "S.R.")
# as a whole word, but only when a space and at least one digit follow
# (lookahead, not consumed). Case-insensitive.
# NOTE(review): presumably this marks a "State Route <number>" reference;
# confirm against the caller that uses this pattern.
sr_comp = regex.compile(
    r"(\bS\.?R\b\.?)(?= \d+)",
    flags=regex.IGNORECASE,
)
# Matches "St" / "St." where it should be read as "Saint". Case-insensitive.
#   group 1: "St" at the very start of the string, followed by a space;
#   group 2: "St" at a word boundary, followed by a space and one of the
#            names in `saints`.
# The following space/name is only looked ahead at, not consumed.
saint_comp = regex.compile(
    rf"^(St\.?)(?= )|(\bSt\.?)(?= (?:{'|'.join(saints)}))",
    flags=regex.IGNORECASE,
)
# Matches "St" / "St." where it should be read as "Street". Case-sensitive
# (no flags). Three alternatives:
#   1. "St" followed by a space and a compass letter (N/E/S/W), optionally
#      with periods and a second E/W letter;
#   2. "St" preceded by a digit, two letters from [thndstr] and a space,
#      i.e. an ordinal such as "5th " or "2nd ";
#   3. "St" at the very end of the string.
street_comp = regex.compile(
    r"St\.?(?= [NESW]\.?[EW]?\.?)|(?<=\d[thndstr]{2} )St\.?\b|St\.?$"
)
# Five digits (group 1) followed by an optional hyphen and exactly "0000",
# i.e. a ZIP code carrying an all-zero +4 extension.
post_comp = regex.compile(r"(\d{5})-?0{4}")
# An optional comma, a space, then a country name: "US", "USA",
# "United States", "United States of America" or "Canada", ending at a word
# boundary. Case-sensitive.
usa_comp = regex.compile(r",? (?:USA?|United States(?: of America)?|Canada)\b")
# An optional leading space followed by parenthesised text. The ".*" is
# greedy, so the match runs from the first "(" to the last ")" on the line.
paren_comp = regex.compile(r" ?\(.*\)")
# match Wisconsin grid-style addresses: N65w25055, W249 N6620, etc.
# Group 1 captures a north/south letter plus two or more digits followed by an
# east/west letter plus two or more digits, or the same pair in the opposite
# order, with optional whitespace between the two parts. Both letter cases are
# listed in the character classes, so no IGNORECASE flag is needed.
grid_comp = regex.compile(
    r"\b([NnSs]\d{2,}\s*[EeWw]\d{2,}|[EeWw]\d{2,}\s*[NnSs]\d{2,})\b"
)