Module scrapeTools.phoneScraper
Expand source code
import phonenumbers
def getNumConsecutiveNumbers(text: str, reverse: bool = False) -> int:
"""Finds the number of consecutive numeric characters in a string."""
# limit search to 10 characters
text[:10]
if reverse:
text = text[::-1]
for i, ch in enumerate(text):
if not ch.isnumeric():
return i
return len(text)
def findBySeparator(text: str, separator: str) -> list[str]:
"""Attempts to detect phone numbers according to these
patterns by scanning for separators (typically '-.')
and how many consecutive numbers follow or precede them:
(xxx)xxx{separator}xxxx
(xxx) xxx{separator}xxxx
(xxx){separator}xxx{separator}xxxx
xxx{separator}xxx{separator}xxxx"""
count = text.count(separator)
numbers = []
if count > 0:
lastStopdex = 0
for _ in range(count):
number = ""
sepdex = text.find(separator, lastStopdex)
if sepdex != -1:
nextSepdex = text.find(separator, sepdex + 1)
# consecutive numbers preceding sepdex
startOffset = getNumConsecutiveNumbers(
text[lastStopdex:sepdex], reverse=True
)
# consecutive numbers between sepdex and nextSepdex
firstStopOffset = getNumConsecutiveNumbers(
text[sepdex + 1 : nextSepdex + 1]
)
# consecutive numbers after nextSepdex
secondStopOffset = getNumConsecutiveNumbers(text[nextSepdex + 1 :])
if startOffset == 3 and firstStopOffset == 3 and secondStopOffset == 4:
# xxx{separator}xxx{separator}xxxx
number = text[
sepdex - startOffset : nextSepdex + secondStopOffset + 1
]
elif (
startOffset == 0
and firstStopOffset == 3
and secondStopOffset == 4
and text[sepdex - 1] == ")"
and text[sepdex - 5] == "("
):
# (xxx){separator}xxx{separator}xxxx
number = text[
sepdex - 5 : sepdex + firstStopOffset + secondStopOffset + 2
]
elif startOffset == 3 and text[sepdex - 4] in [")", " "]:
# (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx
number = text[sepdex - 8 : sepdex + 5]
lastStopdex = sepdex + 5
for ch in [separator, "(", ")", " "]:
number = number.replace(ch, "")
if len(number) == 10 and all(ch.isnumeric() for ch in number):
numbers.append(number)
return numbers
def findByHref(text: str) -> list[str]:
"""Scrapes phone numbers by href attribute."""
indicator = 'href="'
count = text.count(indicator)
prefixes = ["tel:", "callto:"]
index = 0
numbers = []
for _ in range(count):
index = text.find(indicator, index + 1)
number = text[index + len(indicator) : text.find('"', index + len(indicator))]
if any(prefix in number for prefix in prefixes):
number = "".join(
[num for num in number[number.find(":") + 1 :] if num.isnumeric()]
)
if len(number) == 10:
numbers.append(number)
return numbers
def scrapePhoneNumbers(text: str) -> list[str]:
"""Scrape for u.s. phone numbers."""
numbers = []
text = text.replace("+1", "")
for separator in "-.":
numbers.extend(findBySeparator(text, separator))
numbers.extend(findByHref(text))
numbers = [
number
for number in numbers
if phonenumbers.is_valid_number(phonenumbers.parse("+1" + number))
]
numbers = sorted(list(set(numbers)))
return numbers
Functions
def findByHref(text: str) ‑> list[str]
-
Scrapes phone numbers by href attribute.
Expand source code
def findByHref(text: str) -> list[str]: """Scrapes phone numbers by href attribute.""" indicator = 'href="' count = text.count(indicator) prefixes = ["tel:", "callto:"] index = 0 numbers = [] for _ in range(count): index = text.find(indicator, index + 1) number = text[index + len(indicator) : text.find('"', index + len(indicator))] if any(prefix in number for prefix in prefixes): number = "".join( [num for num in number[number.find(":") + 1 :] if num.isnumeric()] ) if len(number) == 10: numbers.append(number) return numbers
def findBySeparator(text: str, separator: str) ‑> list[str]
-
Attempts to detect phone numbers according to these patterns by scanning for separators (typically '-.') and how many consecutive numbers follow or precede them:
(xxx)xxx{separator}xxxx
(xxx) xxx{separator}xxxx
(xxx){separator}xxx{separator}xxxx
xxx{separator}xxx{separator}xxxx
Expand source code
def findBySeparator(text: str, separator: str) -> list[str]: """Attempts to detect phone numbers according to these patterns by scanning for separators (typically '-.') and how many consecutive numbers follow or precede them: (xxx)xxx{separator}xxxx (xxx) xxx{separator}xxxx (xxx){separator}xxx{separator}xxxx xxx{separator}xxx{separator}xxxx""" count = text.count(separator) numbers = [] if count > 0: lastStopdex = 0 for _ in range(count): number = "" sepdex = text.find(separator, lastStopdex) if sepdex != -1: nextSepdex = text.find(separator, sepdex + 1) # consecutive numbers preceding sepdex startOffset = getNumConsecutiveNumbers( text[lastStopdex:sepdex], reverse=True ) # consecutive numbers between sepdex and nextSepdex firstStopOffset = getNumConsecutiveNumbers( text[sepdex + 1 : nextSepdex + 1] ) # consecutive numbers after nextSepdex secondStopOffset = getNumConsecutiveNumbers(text[nextSepdex + 1 :]) if startOffset == 3 and firstStopOffset == 3 and secondStopOffset == 4: # xxx{separator}xxx{separator}xxxx number = text[ sepdex - startOffset : nextSepdex + secondStopOffset + 1 ] elif ( startOffset == 0 and firstStopOffset == 3 and secondStopOffset == 4 and text[sepdex - 1] == ")" and text[sepdex - 5] == "(" ): # (xxx){separator}xxx{separator}xxxx number = text[ sepdex - 5 : sepdex + firstStopOffset + secondStopOffset + 2 ] elif startOffset == 3 and text[sepdex - 4] in [")", " "]: # (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx number = text[sepdex - 8 : sepdex + 5] lastStopdex = sepdex + 5 for ch in [separator, "(", ")", " "]: number = number.replace(ch, "") if len(number) == 10 and all(ch.isnumeric() for ch in number): numbers.append(number) return numbers
def getNumConsecutiveNumbers(text: str, reverse: bool = False) ‑> int
-
Finds the number of consecutive numeric characters in a string.
Expand source code
def getNumConsecutiveNumbers(text: str, reverse: bool = False) -> int: """Finds the number of consecutive numeric characters in a string.""" # limit search to 10 characters text[:10] if reverse: text = text[::-1] for i, ch in enumerate(text): if not ch.isnumeric(): return i return len(text)
def scrapePhoneNumbers(text: str) ‑> list[str]
-
Scrape for u.s. phone numbers.
Expand source code
def scrapePhoneNumbers(text: str) -> list[str]: """Scrape for u.s. phone numbers.""" numbers = [] text = text.replace("+1", "") for separator in "-.": numbers.extend(findBySeparator(text, separator)) numbers.extend(findByHref(text)) numbers = [ number for number in numbers if phonenumbers.is_valid_number(phonenumbers.parse("+1" + number)) ] numbers = sorted(list(set(numbers))) return numbers