gruel.gruel
import inspect
import time
from typing import Any

import loggi
import requests
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier
from printbuddies import ProgBar
from whosyouragent import get_agent

ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class."""

    def __init__(self, name: str | None = None):
        self._name = name
        self._init_logger()
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0

    @property
    def name(self) -> str:
        """Returns the name passed to `__init__`, or the stem of the file this instance was defined in if none was given."""
        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    def _init_logger(self):
        log_dir = Pathier.cwd() / "gruel_logs"
        self.logger = loggi.getLogger(self.name, log_dir)

    def get_page(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> requests.Response:
        """Request `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param."""
        try:
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )
        except Exception:
            # One blind retry after a short pause; a second failure propagates.
            time.sleep(1)
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )

    def as_soup(self, response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return a `BeautifulSoup` object."""
        return self.as_soup(self.get_page(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip `\\n\\r\\t` and whitespace from `text`."""
        return text.strip(" \n\t\r")

    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        loggi.close(self.logger)

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. the first 10 results for an endpoint that returns JSON content:
        >>> return self.get_page(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        >>>     parsed = {}
        >>>     parsed["thing1"] = item["element"].split()[0]
        >>>     self.success_count += 1
        >>>     return parsed
        >>> except Exception:
        >>>     self.logger.exception("message")
        >>>     self.fail_count += 1
        >>>     return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def _parse_items_no_prog_bar(self, parsable_items: list[ParsableItem]):
        for item in parsable_items:
            parsed_item = self.parse_item(item)
            if parsed_item:
                self.store_item(parsed_item)

    def _parse_items_prog_bar(self, parsable_items: list[ParsableItem]):
        with ProgBar(len(parsable_items)) as bar:
            for item in parsable_items:
                parsed_item = self.parse_item(item)
                if parsed_item:
                    self.store_item(parsed_item)
                bar.display(f"{bar.runtime}")

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse items
        4. store items
        5. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                parsable_items = self.get_parsable_items()
                self.logger.info(
                    f"{self.name}:get_parsable_items() returned {len(parsable_items)} items"
                )
            except Exception:
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                if parse_items_prog_bar_display:
                    self._parse_items_prog_bar(parsable_items)
                else:
                    self._parse_items_no_prog_bar(parsable_items)
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        self.postscrape_chores()
class Gruel:
Scraper base class.
name: str
Returns the name passed to `__init__`, or the stem of the file this instance was defined in if none was given.
def get_page(self, url: str, method: str = 'get', headers: dict[str, str] = {}) -> requests.models.Response:
Request `url` and return the `requests.Response` object.

By default, the only header sent is a randomized user agent string. This can be overridden by supplying a user agent in the `headers` param.
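A brief usage sketch (the URL and header values here are hypothetical):

scraper = Gruel("demo")
# Default request: only a randomized User-Agent header is sent.
response = scraper.get_page("https://example.com")
# The header dicts are merged with `|` (custom headers last), so supplying
# a "User-Agent" key overrides the randomized one; extra keys are merged in.
response = scraper.get_page(
    "https://example.com",
    headers={"User-Agent": "my-bot/1.0", "Accept": "text/html"},
)
print(response.status_code)

Note the single built-in retry: if the first request raises, `get_page` sleeps one second and tries once more before letting any error propagate.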
def as_soup(self, response: requests.models.Response) -> bs4.BeautifulSoup:
Returns the text content of `response` as a `BeautifulSoup` object.
def get_soup(self, url: str, method: str = 'get', headers: dict[str, str] = {}) -> bs4.BeautifulSoup:
Request `url` with `headers` and return a `BeautifulSoup` object.
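`get_soup` simply composes `get_page` and `as_soup`. A short sketch (the URL and tag choice are hypothetical):

scraper = Gruel("demo")
soup = scraper.get_soup("https://example.com")
# `soup` is an ordinary BeautifulSoup object parsed with "html.parser".
for link in soup.find_all("a"):
    print(link.get("href"))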
def clean_string(self, text: str) -> str:
Strip `\n\r\t` and whitespace from `text`.
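For example, assuming a `Gruel` instance named `scraper`:

>>> scraper.clean_string("\n\t  some text  \r\n")
'some text'

Only leading and trailing characters are removed; interior whitespace is left alone.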
def get_parsable_items(self) -> list[dict | str | bs4.element.Tag]:
Get relevant webpages and extract raw data that needs to be parsed.
e.g. the first 10 results for an endpoint that returns JSON content:
>>> return self.get_page(some_url).json()[:10]
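For HTML pages, an override might return `Tag` objects instead. A sketch in a hypothetical subclass (the URL and CSS class are made up for illustration):

from bs4 import Tag

class ListingScraper(Gruel):
    def get_parsable_items(self) -> list[Tag]:
        soup = self.get_soup("https://example.com/listings")
        # Each matching Tag becomes one raw item handed to parse_item().
        return soup.find_all("div", class_="listing")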
def parse_item(self, item: dict | str | bs4.element.Tag) -> Any:
Parse `item` and return parsed data.

e.g.

>>> try:
>>>     parsed = {}
>>>     parsed["thing1"] = item["element"].split()[0]
>>>     self.success_count += 1
>>>     return parsed
>>> except Exception:
>>>     self.logger.exception("message")
>>>     self.fail_count += 1
>>>     return None
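Continuing the hypothetical `ListingScraper` sketch above, an override that parses one `div.listing` Tag (the `h2` and `a` elements are assumptions):

class ListingScraper(Gruel):
    # ...get_parsable_items() as sketched above...

    def parse_item(self, item: Tag) -> dict | None:
        try:
            parsed = {
                "title": self.clean_string(item.find("h2").text),
                "url": item.find("a").get("href"),
            }
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse a listing.")
            self.fail_count += 1
            # Returning None means scrape() skips store_item() for this item.
            return None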
def scrape(self, parse_items_prog_bar_display: bool = False):
Run the scraper:
1. prescrape chores
2. get parsable items
3. parse items
4. store items
5. postscrape chores
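Putting the pieces together, a minimal end-to-end sketch; the import path, URL, selectors, and storage are all hypothetical stand-ins:

from bs4 import Tag

from gruel import Gruel  # assumed import path

class ListingScraper(Gruel):
    def get_parsable_items(self) -> list[Tag]:
        soup = self.get_soup("https://example.com/listings")
        return soup.find_all("div", class_="listing")

    def parse_item(self, item: Tag) -> dict | None:
        try:
            parsed = {"title": self.clean_string(item.find("h2").text)}
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse a listing.")
            self.fail_count += 1
            return None

    def store_item(self, item: dict):
        # Stand-in storage; a real scraper might write to a file or database.
        print(item)

if __name__ == "__main__":
    ListingScraper("listings").scrape(parse_items_prog_bar_display=True)

Logs for the run are written under ./gruel_logs/ (see `_init_logger`), and the closing log line reports the elapsed time along with the success and failure counts.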