Package scrapeTools
Source code:
from .emailScraper import scrapeEmails
from .inputScraper import scrapeInputs
from .linkScraper import LinkScraper
from .phoneScraper import scrapePhoneNumbers
__all__ = ["scrapeEmails", "LinkScraper", "scrapePhoneNumbers", "scrapeInputs"]
Sub-modules
scrapeTools.emailScraper
scrapeTools.inputScraper
scrapeTools.linkScraper
scrapeTools.phoneScraper
Functions
def scrapeEmails(text: str) -> list[str]
-
Extracts potential emails from the given text and returns them as a list of strings.
Source code:

def scrapeEmails(text: str) -> list[str]:
    """Extracts potential emails from the given text and returns them as a list of strings."""
    if "%" in text:
        # decode percent encoding
        text = unquote(text)
    for ch in ["\n", "\t", "\r"]:
        text = text.replace(ch, " ")
    atCount = text.count("@")
    emails = []
    if atCount > 0:
        lastStopdex = 0
        for i in range(atCount):
            atdex = text.find("@", lastStopdex)
            nextAtdex = text.find("@", atdex + 1)
            try:
                # Take the slice between this "@" and the next one
                # (or the end of the text) as the candidate chunk.
                chunk = (
                    text[lastStopdex:nextAtdex]
                    if nextAtdex != -1
                    else text[lastStopdex:]
                )
                chunkAtdex = chunk.find("@")
                startdex = findLastValidCharacterOffset(chunk[: chunkAtdex + 1])
                stopdex = findLastValidCharacterOffset(chunk[chunkAtdex:])
                email = chunk[chunkAtdex - startdex : stopdex + chunkAtdex + 1]
                # Trim trailing characters that can't end an email address.
                while email[-1].isnumeric() or not email[-1].isalpha():
                    email = email[:-1]
                if validate(email):
                    emails.append(email.lower())
                # The extra '+ 1' is to ensure lastStopdex increments
                # if len(email.split('@')[1]) is 0.
                lastStopdex = atdex + len(email.split("@")[1]) + 1
            except Exception as e:
                lastStopdex = atdex + 1
    emails = sorted(list(set(stripUnicode(emails))))
    return emails
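A minimal usage sketch (the sample text is made up, and the exact output depends on the module's internal validator):

from scrapeTools import scrapeEmails

sample = "Contact support@example.com or SALES@example.org for help."
emails = scrapeEmails(sample)
# Results are lowercased, deduplicated, and sorted, e.g.:
# ['sales@example.org', 'support@example.com']
print(emails)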
def scrapeInputs(source: str) -> tuple[list[bs4.element.Tag]]
-
Searches html for various user input elements.
Returns a tuple where each element is a list of BeautifulSoup Tag elements.
The tuple elements are forms, inputs, buttons, select elements, and textAreas. If an element type was not found, it will be an empty list.
The inputs, buttons, select elements, and textAreas are ones not already found in a form element.
Source code:

def scrapeInputs(source: str) -> tuple[list[Tag]]:
    """Searches html for various user input elements.

    Returns a tuple where each element is a list of BeautifulSoup Tag elements.
    The tuple elements are forms, inputs, buttons, select elements, and textAreas.
    If an element type was not found, it will be an empty list.

    The inputs, buttons, select elements, and textAreas are ones not already found in a form element."""
    soup = BeautifulSoup(source, "html.parser")
    forms = soup("form")
    # Extract the forms from the soup so the remaining searches
    # only match elements that aren't inside a form.
    for form in forms:
        form.extract()
    inputs = soup("input")
    buttons = soup("button")
    selects = soup("select")
    # The html tag name is "textarea"; searching for "textAreas" matches nothing.
    textAreas = soup("textarea")
    return forms, inputs, buttons, selects, textAreas
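A quick sketch of how the form extraction behaves (hypothetical markup; the input inside the form is reported with the form rather than in the standalone inputs list):

from scrapeTools import scrapeInputs

html = """
<form><input name="q"/></form>
<input name="orphan"/>
<button>Go</button>
"""
forms, inputs, buttons, selects, textAreas = scrapeInputs(html)
print(len(forms), len(inputs), len(buttons), len(selects), len(textAreas))
# 1 1 1 0 0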
def scrapePhoneNumbers(text: str) -> list[str]
-
Scrape for U.S. phone numbers.
Source code:

def scrapePhoneNumbers(text: str) -> list[str]:
    """Scrape for U.S. phone numbers."""
    numbers = []
    text = text.replace("+1", "")
    for separator in "-.":
        numbers.extend(findBySeparator(text, separator))
    numbers.extend(findByHref(text))
    # Keep only candidates the phonenumbers package considers valid.
    numbers = [
        number
        for number in numbers
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + number))
    ]
    numbers = sorted(list(set(numbers)))
    return numbers
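A rough usage sketch (the number below is a fictional U.S. example number; the exact output depends on how the module's internal helpers findBySeparator and findByHref normalize their matches):

from scrapeTools import scrapePhoneNumbers

text = "Call 201-555-0123 or 201.555.0123 today."
print(scrapePhoneNumbers(text))
# e.g. ['201-555-0123', '201.555.0123']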
Classes
class LinkScraper (htmlSrc: str, pageUrl: str)
-
Source code:

class LinkScraper:
    def __init__(self, htmlSrc: str, pageUrl: str):
        self.soup = BeautifulSoup(htmlSrc, features="html.parser")
        self.parsedUrl = urlparse(pageUrl)
        self.pageLinks = []
        self.imgLinks = []
        self.scriptLinks = []

Each method's source is shown under Methods below.
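A minimal end-to-end sketch (the markup and URLs are made up for illustration):

from scrapeTools import LinkScraper

html = """
<a href="/about">About</a>
<a href="https://other.site/page">External</a>
<img src="/logo.png"/>
<script src="/app.js"></script>
"""
scraper = LinkScraper(html, "https://example.com/index.html")
scraper.scrapePage()
print(scraper.pageLinks)    # ['https://example.com/about', 'https://other.site/page']
print(scraper.imgLinks)     # ['https://example.com/logo.png']
print(scraper.scriptLinks)  # ['https://example.com/app.js']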
Methods
def filterSameSite(self, links: list[str]) -> list[str]
-
Filters out links that don't match self.parsedUrl.netloc
Source code:

def filterSameSite(self, links: list[str]) -> list[str]:
    """Filters out links that don't match self.parsedUrl.netloc"""
    return [
        link
        for link in links
        # removeprefix only drops a leading "www."; str.strip("www.")
        # would also eat other leading/trailing 'w' and '.' characters.
        if urlparse(link).netloc.removeprefix("www.")
        == self.parsedUrl.netloc.removeprefix("www.")
    ]
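For instance, with a hypothetical scraper built for https://www.example.com, both the bare and the www-prefixed host count as same-site:

scraper = LinkScraper("", "https://www.example.com")
links = [
    "https://example.com/a",
    "https://www.example.com/b",
    "https://other.site/c",
]
print(scraper.filterSameSite(links))
# ['https://example.com/a', 'https://www.example.com/b']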
def findAll(self, tagName: str, attributeName: str) -> list[str]
-
Finds all results according to tagName and attributeName.
Filters out fragments.
Source code:

def findAll(self, tagName: str, attributeName: str) -> list[str]:
    """Finds all results according to tagName and attributeName.\n Filters out fragments."""
    return [
        tag.get(attributeName)
        for tag in self.soup(tagName, recursive=True)
        if tag.get(attributeName) is not None
        and "#" not in tag.get(attributeName)
    ]
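A small illustration (hypothetical markup; the fragment-only href is dropped):

scraper = LinkScraper('<a href="/a">x</a><a href="#top">y</a>', "https://example.com")
print(scraper.findAll("a", "href"))  # ['/a']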
def formatRelativeLinks(self, links: list[str]) -> list[str]
-
Parses a list of links and constructs a full url according to self.parsedUrl for the ones that don't have a 'netloc' component returned by urlparse.
Full urls are returned unedited other than stripping any leading or trailing forward slashes.
Source code:

def formatRelativeLinks(self, links: list[str]) -> list[str]:
    """Parses a list of links and constructs a full url according to
    self.parsedUrl for the ones that don't have a 'netloc' component
    returned by urlparse.

    Full urls are returned unedited other than stripping any leading
    or trailing forward slashes."""
    formattedLinks = []
    for link in links:
        # Strip whitespace and stray quote/backslash characters.
        link = (
            link.strip(" \n\t\r")
            .replace('"', "")
            .replace("\\", "")
            .replace("'", "")
        )
        parsedUrl = urlparse(link)
        # Skip links containing "@" or spaces (e.g. mailto values).
        if all(ch not in link for ch in "@ "):
            parsedUrl = list(parsedUrl)
            if parsedUrl[0] == "":
                parsedUrl[0] = self.parsedUrl.scheme
            if parsedUrl[1] == "":
                parsedUrl[1] = self.parsedUrl.netloc
            formattedLinks.append(urlunparse(parsedUrl).strip("/"))
    return formattedLinks
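For example (made-up inputs; the scheme and host are borrowed from the page url for relative and protocol-relative links):

scraper = LinkScraper("", "https://example.com/blog")
print(scraper.formatRelativeLinks([
    "/about",
    "//cdn.example.com/a.js",
    "https://other.site/",
]))
# ['https://example.com/about', 'https://cdn.example.com/a.js', 'https://other.site']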
def getLinks(self, linkType: str = 'all', sameSiteOnly: bool = False, excludedLinks: list[str] = None) -> list[str]
-
Returns a list of urls found on the page.
:param linkType: Can be 'all', 'page', 'img', or 'script'.
:param sameSiteOnly: Excludes external urls if True.
:param excludedLinks: A list of urls to filter out of the results. Useful for excluding duplicates when recursively scraping a website. Can also be used with linkType='all' to get two link types in one call:
e.g. links = linkScraper.getLinks(linkType = 'all', excludedLinks = linkScraper.scriptLinks) will return page links and img links.
Source code:

def getLinks(
    self,
    linkType: str = "all",
    sameSiteOnly: bool = False,
    excludedLinks: list[str] = None,
) -> list[str]:
    """Returns a list of urls found on the page.

    :param linkType: Can be 'all', 'page', 'img', or 'script'.

    :param sameSiteOnly: Excludes external urls if True.

    :param excludedLinks: A list of urls to filter out of the results.
    Useful for excluding duplicates when recursively scraping a website.
    Can also be used with linkType='all' to get two link types in one call:
    e.g. links = linkScraper.getLinks(linkType = 'all', excludedLinks = linkScraper.scriptLinks)
    will return page links and img links."""
    match linkType:
        case "all":
            links = self.removeDuplicates(
                self.pageLinks + self.imgLinks + self.scriptLinks
            )
        case "page":
            links = self.pageLinks
        case "img":
            links = self.imgLinks
        case "script":
            links = self.scriptLinks
    if sameSiteOnly:
        links = self.filterSameSite(links)
    if excludedLinks:
        links = [link for link in links if link not in excludedLinks]
    return sorted(links)
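A sketch of the excludedLinks trick from the docstring (assuming scraper is a LinkScraper whose scrapePage() has already run):

# page + img links in one call, by excluding the script links
links = scraper.getLinks(linkType="all", excludedLinks=scraper.scriptLinks)
# only internal page links
internal = scraper.getLinks(linkType="page", sameSiteOnly=True)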
def mergeImageLinksFromNonImgTags(self)
-
Finds links in self.scriptLinks and self.pageLinks that have one of these image file extensions and adds them to self.imgLinks
Source code:

def mergeImageLinksFromNonImgTags(self):
    """Finds links in self.scriptLinks and self.pageLinks that have
    one of these image file extensions and adds them to self.imgLinks"""
    formats = [
        ".jpg", ".jpeg", ".png", ".svg", ".bmp", ".tiff", ".pdf", ".eps",
        ".gif", ".jfif", ".webp", ".heif", ".avif", ".bat", ".bpg",
    ]
    for link in self.scriptLinks + self.pageLinks:
        if any(ext in link for ext in formats):
            self.imgLinks.append(link)
    self.imgLinks = sorted(self.removeDuplicates(self.imgLinks))
def processLinks(self, links: list[str]) -> list[str]
-
Formats relative links, removes duplicates, and sorts in alphabetical order.
Source code:

def processLinks(self, links: list[str]) -> list[str]:
    """Formats relative links, removes duplicates, and sorts in alphabetical order."""
    return sorted(self.removeDuplicates(self.formatRelativeLinks(links)))
def removeDuplicates(self, obj: list) -> list
-
Removes duplicate members.
Source code:

def removeDuplicates(self, obj: list) -> list:
    """Removes duplicate members."""
    return list(set(obj))
def scrapeImgLinks(self)
-
Scrape links from src attribute of <img> tags.
Source code:

def scrapeImgLinks(self):
    """Scrape links from src attribute of <img> tags."""
    self.imgLinks = self.processLinks(
        self.findAll("img", "src") + self.findAll("img", "data-src")
    )
def scrapePage(self)
-
Scrape all link types.
Source code:

def scrapePage(self):
    """Scrape all link types."""
    for scrape in [
        self.scrapePageLinks,
        self.scrapeImgLinks,
        self.scrapeScriptLinks,
    ]:
        scrape()
    self.mergeImageLinksFromNonImgTags()
def scrapePageLinks(self)
-
Scrape links from href attribute of <a> and <link> tags.
Source code:

def scrapePageLinks(self):
    """Scrape links from href attribute of <a> and <link> tags."""
    links = self.findAll("a", "href")
    links.extend(self.findAll("link", "href"))
    self.pageLinks = self.processLinks(links)
def scrapeScriptLinks(self)
-
Scrape script links from src attribute of <script> tags.

Source code:

def scrapeScriptLinks(self):
    """Scrape script links from src attribute of <script> tags."""
    self.scriptLinks = self.processLinks(self.findAll("script", "src"))