Module scrapeTools.linkScraper
from urllib.parse import urlparse, urlunparse

from bs4 import BeautifulSoup


class LinkScraper:
    def __init__(self, htmlSrc: str, pageUrl: str):
        self.soup = BeautifulSoup(htmlSrc, features="html.parser")
        self.parsedUrl = urlparse(pageUrl)
        self.pageLinks = []
        self.imgLinks = []
        self.scriptLinks = []

    def formatRelativeLinks(self, links: list[str]) -> list[str]:
        """Parses a list of links and constructs a full url
        according to self.parsedUrl for any that lack a
        'netloc' component when parsed by urlparse.

        Full urls are returned unedited apart from stripping any
        leading or trailing forward slashes."""
        formattedLinks = []
        for link in links:
            # Strip whitespace and any stray quote or backslash characters.
            link = (
                link.strip(" \n\t\r")
                .replace('"', "")
                .replace("\\", "")
                .replace("'", "")
            )
            parsedUrl = urlparse(link)
            # Skip links containing '@' or spaces (e.g. mailto addresses).
            if all(ch not in link for ch in "@ "):
                parsedUrl = list(parsedUrl)
                if parsedUrl[0] == "":
                    parsedUrl[0] = self.parsedUrl.scheme
                if parsedUrl[1] == "":
                    parsedUrl[1] = self.parsedUrl.netloc
                formattedLinks.append(urlunparse(parsedUrl).strip("/"))
        return formattedLinks

    def removeDuplicates(self, obj: list) -> list:
        """Removes duplicate members."""
        return list(set(obj))

    def processLinks(self, links: list[str]) -> list[str]:
        """Formats relative links, removes duplicates, and sorts in alphabetical order."""
        return sorted(self.removeDuplicates(self.formatRelativeLinks(links)))

    def findAll(self, tagName: str, attributeName: str) -> list[str]:
        """Returns the attributeName value of every matching tagName tag.

        Filters out links containing a '#' fragment."""
        return [
            tag.get(attributeName)
            for tag in self.soup(tagName, recursive=True)
            if tag.get(attributeName) is not None
            and "#" not in tag.get(attributeName)
        ]

    def filterSameSite(self, links: list[str]) -> list[str]:
        """Filters out links whose netloc doesn't match
        self.parsedUrl.netloc; a leading 'www.' is ignored."""
        # removeprefix is used rather than strip("www."), which would
        # eat leading 'w' and '.' characters from hosts like 'weather.com'.
        return [
            link
            for link in links
            if urlparse(link).netloc.removeprefix("www.")
            == self.parsedUrl.netloc.removeprefix("www.")
        ]

    def scrapePageLinks(self):
        """Scrapes links from the href attribute of <a> and <link> tags."""
        links = self.findAll("a", "href")
        links.extend(self.findAll("link", "href"))
        self.pageLinks = self.processLinks(links)

    def scrapeImgLinks(self):
        """Scrapes links from the src and data-src attributes of <img> tags."""
        self.imgLinks = self.processLinks(
            self.findAll("img", "src") + self.findAll("img", "data-src")
        )

    def scrapeScriptLinks(self):
        """Scrapes script links from the src attribute of <script> tags."""
        self.scriptLinks = self.processLinks(self.findAll("script", "src"))

    def scrapePage(self):
        """Scrapes all link types."""
        for scrape in [
            self.scrapePageLinks,
            self.scrapeImgLinks,
            self.scrapeScriptLinks,
        ]:
            scrape()
        self.mergeImageLinksFromNonImgTags()

    def mergeImageLinksFromNonImgTags(self):
        """Finds links in self.scriptLinks and self.pageLinks
        that have one of these image file extensions and adds them
        to self.imgLinks."""
        formats = [
            ".jpg",
            ".jpeg",
            ".png",
            ".svg",
            ".bmp",
            ".tiff",
            ".pdf",
            ".eps",
            ".gif",
            ".jfif",
            ".webp",
            ".heif",
            ".avif",
            ".bpg",
        ]
        for link in self.scriptLinks + self.pageLinks:
            if any(ext in link for ext in formats):
                self.imgLinks.append(link)
        self.imgLinks = sorted(self.removeDuplicates(self.imgLinks))

    def getLinks(
        self,
        linkType: str = "all",
        sameSiteOnly: bool = False,
        excludedLinks: list[str] | None = None,
    ) -> list[str]:
        """Returns a list of urls found on the page.

        :param linkType: Can be 'all', 'page', 'img', or 'script'.

        :param sameSiteOnly: Excludes external urls if True.

        :param excludedLinks: A list of urls to filter out of the results.
        Useful for excluding duplicates when recursively scraping a website.
        Can also be used with linkType='all' to get two link types in one call, e.g.
        links = linkScraper.getLinks(linkType='all', excludedLinks=linkScraper.scriptLinks)
        will return page links and img links."""
        match linkType:
            case "all":
                links = self.removeDuplicates(
                    self.pageLinks + self.imgLinks + self.scriptLinks
                )
            case "page":
                links = self.pageLinks
            case "img":
                links = self.imgLinks
            case "script":
                links = self.scriptLinks
            case _:
                # Guard against 'links' being unbound for an unknown type.
                raise ValueError(f"Invalid linkType: {linkType}")
        if sameSiteOnly:
            links = self.filterSameSite(links)
        if excludedLinks:
            links = [link for link in links if link not in excludedLinks]
        return sorted(links)
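A minimal usage sketch (the markup and urls below are invented for illustration; they are not part of the module):

html = '''
<a href="/about">About</a>
<a href="https://other-site.com/page">External</a>
<img src="/static/logo.png">
<script src="/static/app.js"></script>
'''
scraper = LinkScraper(html, "https://example.com")
scraper.scrapePage()
scraper.getLinks(linkType="page", sameSiteOnly=True)  # ['https://example.com/about']
scraper.getLinks(linkType="img")     # ['https://example.com/static/logo.png']
scraper.getLinks(linkType="script")  # ['https://example.com/static/app.js']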
Classes
class LinkScraper (htmlSrc: str, pageUrl: str)
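The page source is supplied by the caller; fetching it is outside the scope of this class. A typical construction, assuming requests as one possible http client:

import requests  # assumption: any client that returns the page source works

response = requests.get("https://example.com")
scraper = LinkScraper(response.text, "https://example.com")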
Methods
def filterSameSite(self, links: list[str]) -> list[str]
Filters out links whose netloc doesn't match self.parsedUrl.netloc; a leading 'www.' is ignored.
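For instance, with a scraper built for a hypothetical https://example.com page:

scraper = LinkScraper("", "https://example.com")
scraper.filterSameSite([
    "https://example.com/a",
    "https://www.example.com/b",  # 'www.' is ignored when comparing netlocs
    "https://other.com/c",
])
# ['https://example.com/a', 'https://www.example.com/b']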
def findAll(self, tagName: str, attributeName: str) -> list[str]
Returns the attributeName value of every matching tagName tag.
Filters out links containing a '#' fragment.
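A small illustration (hypothetical markup); note the return values are the raw attribute strings, before any formatting:

scraper = LinkScraper('<a href="/x">x</a> <a href="/y#top">y</a>', "https://example.com")
scraper.findAll("a", "href")  # ['/x']; '/y#top' contains '#' and is dropped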
def formatRelativeLinks(self, links: list[str]) -> list[str]
Parses a list of links and constructs a full url according to self.parsedUrl for any that lack a 'netloc' component when parsed by urlparse.
Full urls are returned unedited apart from stripping any leading or trailing forward slashes.
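For example, with a scraper constructed for a hypothetical https://example.com page:

scraper = LinkScraper("", "https://example.com")
scraper.formatRelativeLinks(["/about", "https://example.com/contact/"])
# ['https://example.com/about', 'https://example.com/contact']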
def getLinks(self, linkType: str = 'all', sameSiteOnly: bool = False, excludedLinks: list[str] | None = None) -> list[str]
Returns a list of urls found on the page.
:param linkType: Can be 'all', 'page', 'img', or 'script'.
:param sameSiteOnly: Excludes external urls if True.
:param excludedLinks: A list of urls to filter out of the results. Useful for excluding duplicates when recursively scraping a website. Can also be used with linkType='all' to get two link types in one call, e.g. links = linkScraper.getLinks(linkType='all', excludedLinks=linkScraper.scriptLinks) will return page links and img links.
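A sketch of the excludedLinks idiom from the docstring (htmlSrc stands in for page source obtained elsewhere):

scraper = LinkScraper(htmlSrc, "https://example.com")
scraper.scrapePage()
# Page and img links in one call, without script links:
links = scraper.getLinks(linkType="all", excludedLinks=scraper.scriptLinks)
# Only internal links:
internal = scraper.getLinks(linkType="page", sameSiteOnly=True)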
def mergeImageLinksFromNonImgTags(self)
Finds links in self.scriptLinks and self.pageLinks that have an image file extension and adds them to self.imgLinks.
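For example, an image url sitting in an <a> tag ends up in imgLinks after a full scrape (hypothetical markup):

scraper = LinkScraper('<a href="/photos/cat.png">cat</a>', "https://example.com")
scraper.scrapePage()
scraper.imgLinks  # ['https://example.com/photos/cat.png']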
def processLinks(self, links: list[str]) -> list[str]
Formats relative links, removes duplicates, and sorts in alphabetical order.
def removeDuplicates(self, obj: list) -> list
Removes duplicate members.
def scrapeImgLinks(self)
Scrapes links from the src and data-src attributes of <img> tags.
def scrapePage(self)
Scrapes all link types, then merges image links found in non-img tags into self.imgLinks.
def scrapePageLinks(self)
Scrapes links from the href attribute of <a> and <link> tags.
def scrapeScriptLinks(self)
Scrapes script links from the src attribute of <script> tags.