Package scrapeTools

from .emailScraper import scrapeEmails
from .inputScraper import scrapeInputs
from .linkScraper import LinkScraper
from .phoneScraper import scrapePhoneNumbers

__all__ = ["scrapeEmails", "LinkScraper", "scrapePhoneNumbers", "scrapeInputs"]

Sub-modules

scrapeTools.emailScraper
scrapeTools.inputScraper
scrapeTools.linkScraper
scrapeTools.phoneScraper

Functions

def scrapeEmails(text: str) ‑> list[str]

Extracts potential emails from the given text and returns them as a list of strings.

def scrapeEmails(text: str) -> list[str]:
    """Extracts potential emails from the given text
    and returns them as a list of strings."""
    # unquote comes from urllib.parse; findLastValidCharacterOffset,
    # validate, and stripUnicode are helpers defined elsewhere in this module.
    if "%" in text:
        # decode percent encoding
        text = unquote(text)
    for ch in ["\n", "\t", "\r"]:
        text = text.replace(ch, " ")
    atCount = text.count("@")
    emails = []
    if atCount > 0:
        lastStopdex = 0
        for _ in range(atCount):
            atdex = text.find("@", lastStopdex)
            nextAtdex = text.find("@", atdex + 1)
            try:
                # the candidate chunk runs from the last stopping point
                # to the next "@" (or to the end of the text)
                chunk = (
                    text[lastStopdex:nextAtdex]
                    if nextAtdex != -1
                    else text[lastStopdex:]
                )
                chunkAtdex = chunk.find("@")
                # find how far the address extends on either side of the "@"
                startdex = findLastValidCharacterOffset(chunk[: chunkAtdex + 1])
                stopdex = findLastValidCharacterOffset(chunk[chunkAtdex:])
                email = chunk[chunkAtdex - startdex : stopdex + chunkAtdex + 1]
                # trim trailing characters that can't end an address
                while email[-1].isnumeric() or not email[-1].isalpha():
                    email = email[:-1]
                if validate(email):
                    emails.append(email.lower())
                # The extra '+ 1' ensures lastStopdex increments
                # even if len(email.split("@")[1]) is 0.
                lastStopdex = atdex + len(email.split("@")[1]) + 1
            except Exception:
                lastStopdex = atdex + 1
        emails = sorted(set(stripUnicode(emails)))
    return emails
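
A minimal usage sketch; the sample text is illustrative, and the printed result assumes the internal validate helper accepts both addresses:

from scrapeTools import scrapeEmails

text = "Contact sales@example.com or support@example.org for help."
print(scrapeEmails(text))
# expected: ['sales@example.com', 'support@example.org']
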
def scrapeInputs(source: str) ‑> tuple[list[bs4.element.Tag], ...]

Searches html for various user input elements.

Returns a tuple where each element is a list of BeautifulSoup Tag elements.

The tuple elements are forms, inputs, buttons, select elements, and textAreas. If an element type was not found, it will be an empty list.

The inputs, buttons, select elements, and textAreas are ones not already found in a form element.

def scrapeInputs(source: str) -> tuple[list[Tag], ...]:
    """Searches html for various user input elements.

    Returns a tuple where each element is a list of BeautifulSoup Tag elements.

    The tuple elements are forms, inputs, buttons, select elements,
    and textAreas. If an element type was not found, it will be an empty list.

    The inputs, buttons, select elements, and textAreas are ones
    not already found in a form element."""
    soup = BeautifulSoup(source, "html.parser")
    forms = soup("form")
    # pull each form out of the tree so the searches below only
    # match elements that aren't inside a form
    for form in forms:
        form.extract()
    inputs = soup("input")
    buttons = soup("button")
    selects = soup("select")
    # the html tag name is <textarea>; soup("textAreas") would match nothing
    textAreas = soup("textarea")

    return forms, inputs, buttons, selects, textAreas
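
A minimal usage sketch (the html snippet is illustrative). Note that the form's own input comes back inside forms, not in inputs:

from scrapeTools import scrapeInputs

html = (
    '<form action="/login"><input name="user"></form>'
    '<input name="q" type="search">'
    '<button type="button">Go</button>'
)
forms, inputs, buttons, selects, textAreas = scrapeInputs(html)
print(len(forms), len(inputs), len(buttons), len(selects), len(textAreas))
# expected: 1 1 1 0 0
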
def scrapePhoneNumbers(text: str) ‑> list[str]

Scrape for U.S. phone numbers.

def scrapePhoneNumbers(text: str) -> list[str]:
    """Scrape for U.S. phone numbers."""
    # findBySeparator and findByHref are helpers defined elsewhere in this
    # module; phonenumbers is the third party phonenumbers package.
    numbers = []
    # drop any explicit country code before matching
    text = text.replace("+1", "")
    for separator in "-.":
        numbers.extend(findBySeparator(text, separator))
    numbers.extend(findByHref(text))
    # keep only candidates that parse as valid U.S. numbers
    numbers = [
        number
        for number in numbers
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + number))
    ]
    numbers = sorted(set(numbers))
    return numbers
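
A minimal usage sketch. 202-555-0143 is a reserved fictional number that still parses as a valid U.S. pattern; the exact output format depends on the findBySeparator and findByHref helpers:

from scrapeTools import scrapePhoneNumbers

text = "Call 202-555-0143 or 202.555.0143 for details."
print(scrapePhoneNumbers(text))
# expected: a sorted, deduplicated list containing the matched number(s)
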

Classes

class LinkScraper (htmlSrc: str, pageUrl: str)
class LinkScraper:
    def __init__(self, htmlSrc: str, pageUrl: str):
        self.soup = BeautifulSoup(htmlSrc, features="html.parser")
        self.parsedUrl = urlparse(pageUrl)
        self.pageLinks = []
        self.imgLinks = []
        self.scriptLinks = []

    def formatRelativeLinks(self, links: list[str]) -> list[str]:
        """Parses list of links and constructs a full url
        according to self.parsedUrl for the ones that don't have a
        'netloc' property returned by urlparse.

        Full urls are returned unedited other than stripping any
        leading or trailing forward slashes."""
        formattedLinks = []
        for link in links:
            link = (
                link.strip(" \n\t\r")
                .replace('"', "")
                .replace("\\", "")
                .replace("'", "")
            )
            parsedUrl = urlparse(link)
            if all(ch not in link for ch in "@ "):
                parsedUrl = list(parsedUrl)
                if parsedUrl[0] == "":
                    parsedUrl[0] = self.parsedUrl.scheme
                if parsedUrl[1] == "":
                    parsedUrl[1] = self.parsedUrl.netloc
                formattedLinks.append(urlunparse(parsedUrl).strip("/"))
        return formattedLinks

    def removeDuplicates(self, obj: list) -> list:
        """Removes duplicate members."""
        return list(set(obj))

    def processLinks(self, links: list[str]) -> list[str]:
        """Formats relative links, removes duplicates, and sorts in alphabetical order."""
        return sorted(self.removeDuplicates(self.formatRelativeLinks(links)))

    def findAll(self, tagName: str, attributeName: str) -> list[str]:
        """Finds all results according to tagName and attributeName.\n
        Filters out fragments."""
        return [
            tag.get(attributeName)
            for tag in self.soup(tagName, recursive=True)
            if tag.get(attributeName) is not None and "#" not in tag.get(attributeName)
        ]

    def filterSameSite(self, links: list[str]) -> list[str]:
        """Filters out links that don't match self.parsedUrl.netloc"""
        # removeprefix drops only a leading "www."; strip("www.") would
        # remove any run of 'w' and '.' characters from both ends
        return [
            link
            for link in links
            if urlparse(link).netloc.removeprefix("www.")
            == self.parsedUrl.netloc.removeprefix("www.")
        ]

    def scrapePageLinks(self):
        """Scrape links from href attribute of <a> and <link> tags."""
        links = self.findAll("a", "href")
        links.extend(self.findAll("link", "href"))
        self.pageLinks = self.processLinks(links)

    def scrapeImgLinks(self):
        """Scrape links from src attribute of <img> tags."""
        self.imgLinks = self.processLinks(
            self.findAll("img", "src") + self.findAll("img", "data-src")
        )

    def scrapeScriptLinks(self):
        """Scrape script links from src attribute of <script> tags."""
        self.scriptLinks = self.processLinks(self.findAll("script", "src"))

    def scrapePage(self):
        """Scrape all link types."""
        for scrape in [
            self.scrapePageLinks,
            self.scrapeImgLinks,
            self.scrapeScriptLinks,
        ]:
            scrape()
        self.mergeImageLinksFromNonImgTags()

    def mergeImageLinksFromNonImgTags(self):
        """Finds links in self.scriptLinks and self.pageLinks
        that have one of these image file extensions and adds them
        to self.imgLinks"""
        formats = [
            ".jpg",
            ".jpeg",
            ".png",
            ".svg",
            ".bmp",
            ".tiff",
            ".pdf",
            ".eps",
            ".gif",
            ".jfif",
            ".webp",
            ".heif",
            ".avif",
            ".bat",
            ".bpg",
        ]
        for link in self.scriptLinks + self.pageLinks:
            if any(ext in link for ext in formats):
                self.imgLinks.append(link)
        self.imgLinks = sorted(self.removeDuplicates(self.imgLinks))

    def getLinks(
        self,
        linkType: str = "all",
        sameSiteOnly: bool = False,
        excludedLinks: list[str] | None = None,
    ) -> list[str]:
        """Returns a list of urls found on the page.

        :param linkType: Can be 'all', 'page', 'img', or 'script'.

        :param sameSiteOnly: Excludes external urls if True.

        :param excludedLinks: A list of urls to filter out of the results.
        Useful for excluding duplicates when recursively scraping a website.
        Can also be used with linkType='all' to get two link types in one call:

        e.g. links = linkScraper.getLinks(linkType = 'all', excludedLinks = linkScraper.scriptLinks)
        will return page links and img links."""
        match linkType:
            case "all":
                links = self.removeDuplicates(
                    self.pageLinks + self.imgLinks + self.scriptLinks
                )
            case "page":
                links = self.pageLinks
            case "img":
                links = self.imgLinks
            case "script":
                links = self.scriptLinks
            case _:
                raise ValueError(f"Invalid linkType: {linkType}")
        if sameSiteOnly:
            links = self.filterSameSite(links)
        if excludedLinks:
            links = [link for link in links if link not in excludedLinks]
        return sorted(links)
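
A minimal end-to-end sketch (the html and url are illustrative):

from scrapeTools import LinkScraper

html = (
    '<a href="/about">About</a>'
    '<img src="/logo.png">'
    '<script src="https://cdn.example.com/app.js"></script>'
)
scraper = LinkScraper(html, "https://example.com/home")
scraper.scrapePage()
print(scraper.pageLinks)    # expected: ['https://example.com/about']
print(scraper.imgLinks)     # expected: ['https://example.com/logo.png']
print(scraper.scriptLinks)  # expected: ['https://cdn.example.com/app.js']
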

Methods

def filterSameSite(self, links: list[str]) ‑> list[str]

Filters out links that don't match self.parsedUrl.netloc

def filterSameSite(self, links: list[str]) -> list[str]:
    """Filters out links that don't match self.parsedUrl.netloc"""
    # removeprefix drops only a leading "www."; strip("www.") would
    # remove any run of 'w' and '.' characters from both ends
    return [
        link
        for link in links
        if urlparse(link).netloc.removeprefix("www.")
        == self.parsedUrl.netloc.removeprefix("www.")
    ]
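
For example, with a hypothetical scraper built for https://www.example.com, a bare-host link on the same site passes the filter while an external one does not:

scraper = LinkScraper("", "https://www.example.com")
print(scraper.filterSameSite(["https://example.com/a", "https://other.org/b"]))
# expected: ['https://example.com/a']
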
def findAll(self, tagName: str, attributeName: str) ‑> list[str]

Finds all results according to tagName and attributeName.

Filters out fragments.

def findAll(self, tagName: str, attributeName: str) -> list[str]:
    """Finds all results according to tagName and attributeName.\n
    Filters out fragments."""
    return [
        tag.get(attributeName)
        for tag in self.soup(tagName, recursive=True)
        if tag.get(attributeName) is not None and "#" not in tag.get(attributeName)
    ]
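
For example, fragment links are dropped (a hypothetical snippet):

scraper = LinkScraper('<a href="/a">x</a><a href="#top">y</a>', "https://example.com")
print(scraper.findAll("a", "href"))
# expected: ['/a']
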

def formatRelativeLinks(self, links: list[str]) ‑> list[str]

Parses list of links and constructs a full url according to self.parsedUrl for the ones that don't have a 'netloc' property returned by urlparse.

Full urls are returned unedited other than stripping any leading or trailing forward slashes.

def formatRelativeLinks(self, links: list[str]) -> list[str]:
    """Parses list of links and constructs a full url
    according to self.parsedUrl for the ones that don't have a
    'netloc' property returned by urlparse.

    Full urls are returned unedited other than stripping any
    leading or trailing forward slashes."""
    formattedLinks = []
    for link in links:
        link = (
            link.strip(" \n\t\r")
            .replace('"', "")
            .replace("\\", "")
            .replace("'", "")
        )
        parsedUrl = urlparse(link)
        if all(ch not in link for ch in "@ "):
            parsedUrl = list(parsedUrl)
            if parsedUrl[0] == "":
                parsedUrl[0] = self.parsedUrl.scheme
            if parsedUrl[1] == "":
                parsedUrl[1] = self.parsedUrl.netloc
            formattedLinks.append(urlunparse(parsedUrl).strip("/"))
    return formattedLinks
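
For example, with a hypothetical scraper built for https://example.com/blog, path-relative and scheme-relative links are completed while full urls pass through:

scraper = LinkScraper("", "https://example.com/blog")
print(scraper.formatRelativeLinks(["/about", "//cdn.example.com/a.js", "https://other.org/x"]))
# expected: ['https://example.com/about', 'https://cdn.example.com/a.js', 'https://other.org/x']
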

def getLinks(self, linkType: str = 'all', sameSiteOnly: bool = False, excludedLinks: list[str] | None = None) ‑> list[str]

Returns a list of urls found on the page.

:param linkType: Can be 'all', 'page', 'img', or 'script'.

:param sameSiteOnly: Excludes external urls if True.

:param excludedLinks: A list of urls to filter out of the results. Useful for excluding duplicates when recursively scraping a website. Can also be used with linkType='all' to get two link types in one call:

e.g. links = linkScraper.getLinks(linkType = 'all', excludedLinks = linkScraper.scriptLinks) will return page links and img links.

def getLinks(
    self,
    linkType: str = "all",
    sameSiteOnly: bool = False,
    excludedLinks: list[str] | None = None,
) -> list[str]:
    """Returns a list of urls found on the page.

    :param linkType: Can be 'all', 'page', 'img', or 'script'.

    :param sameSiteOnly: Excludes external urls if True.

    :param excludedLinks: A list of urls to filter out of the results.
    Useful for excluding duplicates when recursively scraping a website.
    Can also be used with linkType='all' to get two link types in one call:

    e.g. links = linkScraper.getLinks(linkType = 'all', excludedLinks = linkScraper.scriptLinks)
    will return page links and img links."""
    match linkType:
        case "all":
            links = self.removeDuplicates(
                self.pageLinks + self.imgLinks + self.scriptLinks
            )
        case "page":
            links = self.pageLinks
        case "img":
            links = self.imgLinks
        case "script":
            links = self.scriptLinks
        case _:
            raise ValueError(f"Invalid linkType: {linkType}")
    if sameSiteOnly:
        links = self.filterSameSite(links)
    if excludedLinks:
        links = [link for link in links if link not in excludedLinks]
    return sorted(links)
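
For example, following the docstring's tip, a hypothetical call that returns page and img links in one pass:

links = scraper.getLinks(linkType="all", excludedLinks=scraper.scriptLinks)
# all links except the script links, i.e. page links and img links
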
def mergeImageLinksFromNonImgTags(self)

Finds links in self.scriptLinks and self.pageLinks that have one of these image file extensions and adds them to self.imgLinks

def mergeImageLinksFromNonImgTags(self):
    """Finds links in self.scriptLinks and self.pageLinks
    that have one of these image file extensions and adds them
    to self.imgLinks"""
    formats = [
        ".jpg",
        ".jpeg",
        ".png",
        ".svg",
        ".bmp",
        ".tiff",
        ".pdf",
        ".eps",
        ".gif",
        ".jfif",
        ".webp",
        ".heif",
        ".avif",
        ".bat",
        ".bpg",
    ]
    for link in self.scriptLinks + self.pageLinks:
        if any(ext in link for ext in formats):
            self.imgLinks.append(link)
    self.imgLinks = sorted(self.removeDuplicates(self.imgLinks))

def processLinks(self, links: list[str]) ‑> list[str]

Formats relative links, removes duplicates, and sorts in alphabetical order.

def processLinks(self, links: list[str]) -> list[str]:
    """Formats relative links, removes duplicates, and sorts in alphabetical order."""
    return sorted(self.removeDuplicates(self.formatRelativeLinks(links)))
def removeDuplicates(self, obj: list) ‑> list

Removes duplicate members.

def removeDuplicates(self, obj: list) -> list:
    """Removes duplicate members."""
    return list(set(obj))

def scrapeImgLinks(self)

Scrape links from src attribute of <img> tags.

def scrapeImgLinks(self):
    """Scrape links from src attribute of <img> tags."""
    self.imgLinks = self.processLinks(
        self.findAll("img", "src") + self.findAll("img", "data-src")
    )
def scrapePage(self)

Scrape all link types.

def scrapePage(self):
    """Scrape all link types."""
    for scrape in [
        self.scrapePageLinks,
        self.scrapeImgLinks,
        self.scrapeScriptLinks,
    ]:
        scrape()
    self.mergeImageLinksFromNonImgTags()

def scrapePageLinks(self)

Scrape links from href attribute of <a> and <link> tags.

def scrapePageLinks(self):
    """Scrape links from href attribute of <a> and <link> tags."""
    links = self.findAll("a", "href")
    links.extend(self.findAll("link", "href"))
    self.pageLinks = self.processLinks(links)

def scrapeScriptLinks(self)

Scrape script links from src attribute of <script> tags.

def scrapeScriptLinks(self):
    """Scrape script links from src attribute of <script> tags."""
    self.scriptLinks = self.processLinks(self.findAll("script", "src"))