gruel.gruel
import inspect
import logging
import time
from typing import Any

import requests
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier
from printbuddies import ProgBar
from whosyouragent import get_agent

ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class."""

    def __init__(self, name: str | None = None):
        self._name = name
        self._init_logger()
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0

    @property
    def name(self) -> str:
        """Returns the stem of the file this instance was defined in."""
        return Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    def _init_logger(self):
        log_dir = Pathier.cwd() / "logs"
        log_dir.mkdir()
        self.logger = logging.getLogger(self.name)
        if not self.logger.hasHandlers():
            handler = logging.FileHandler(
                (log_dir / self.name).with_suffix(".log"), encoding="utf-8"
            )
            handler.setFormatter(
                logging.Formatter(
                    "{levelname}|-|{asctime}|-|{message}",
                    style="{",
                    datefmt="%m/%d/%Y %I:%M:%S %p",
                )
            )
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

    def get_page(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> requests.Response:
        """Request `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param."""
        try:
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )
        except Exception as e:
            time.sleep(1)
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )

    def as_soup(self, response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return `BeautifulSoup` object."""
        return self.as_soup(self.get_page(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip `\\n\\r\\t` and whitespace from `text`."""
        return text.strip(" \n\t\r")

    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        ...

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. first 10 results for an endpoint that returns json content
        >>> return self.get_page(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        >>>     parsed = {}
        >>>     parsed["thing1"] = item["element"].split()[0]
        >>>     self.success_count += 1
        >>>     return parsed
        >>> except Exception:
        >>>     self.logger.exception("message")
        >>>     self.fail_count += 1
        >>>     return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse items
        4. store items
        5. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                parsable_items = self.get_parsable_items()
            except Exception:
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                with ProgBar(len(parsable_items)) as bar:
                    for item in parsable_items:
                        parsed_item = self.parse_item(item)
                        if parsed_item:
                            self.store_item(parsed_item)
                        if parse_items_prog_bar_display:
                            bar.display(f"{bar.runtime}")
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        self.postscrape_chores()
class Gruel:
Scraper base class.
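Only the three methods that raise NotImplementedError have to be overridden; logging, requests, timing, counters, and the progress bar come with the base class. A minimal sketch of the required interface (the class name is made up):

from typing import Any

from gruel.gruel import Gruel, ParsableItem


class MyScraper(Gruel):
    def get_parsable_items(self) -> list[ParsableItem]:
        # Return the raw things to parse (Tags, dicts, or strings).
        ...

    def parse_item(self, item: ParsableItem) -> Any:
        # Turn one raw item into structured data, or return None on failure.
        ...

    def store_item(self, item: Any):
        # Persist one parsed item.
        ...

Concrete bodies are sketched under get_parsable_items, parse_item, and scrape below.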
def get_page(self, url: str, method: str = 'get', headers: dict[str, str] = {}) -> requests.models.Response:
Request `url` and return the `requests.Response` object.

By default, the only header sent is a randomized user agent string. This can be overridden by supplying a user agent in the `headers` param.
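For example, inside a subclass, a POST request with an extra header (the endpoint and token below are placeholders); supplying your own "User-Agent" key would replace the randomized one:

response = self.get_page(
    "https://example.com/api/search",                 # hypothetical endpoint
    method="post",
    headers={"Authorization": "Bearer <token>"},      # merged with the random User-Agent
)
self.logger.info(f"Search returned {response.status_code}.")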
def as_soup(self, response: requests.models.Response) -> bs4.BeautifulSoup:
Returns the text content of `response` as a `BeautifulSoup` object.
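Handy when you want the `requests.Response` first (say, to check the status code) and only then parse it; a small sketch with a placeholder URL:

response = self.get_page("https://example.com/listings")   # placeholder URL
if response.ok:
    soup = self.as_soup(response)
    self.logger.info(f"Page title: {soup.title.text if soup.title else 'n/a'}")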
def get_soup(self, url: str, method: str = 'get', headers: dict[str, str] = {}) -> bs4.BeautifulSoup:
Request `url` with `headers` and return `BeautifulSoup` object.
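The one-call equivalent of `self.as_soup(self.get_page(url))`; for instance, collecting links from a hypothetical page:

soup = self.get_soup("https://example.com/listings")        # placeholder URL
links = [a["href"] for a in soup.find_all("a", href=True)]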
def clean_string(self, text: str) -> str:
Strip `\n\r\t` and whitespace from `text`.
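For example (only the ends are stripped; interior whitespace is left alone):
>>> self.clean_string("\n\t  99 Luftballons \r\n")
'99 Luftballons'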
def get_parsable_items(self) -> list[dict | str | bs4.element.Tag]:
Get relevant webpages and extract raw data that needs to be parsed.
e.g. first 10 results for an endpoint that returns json content
>>> return self.get_page(some_url).json()[:10]
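Filling in the MyScraper sketch from above, an override for a hypothetical HTML listing page might look like this (the URL and selector are made up; any list of Tags, dicts, or strings works):

def get_parsable_items(self) -> list[ParsableItem]:
    # One index page; every listing card becomes one item to parse.
    soup = self.get_soup("https://example.com/listings")      # hypothetical URL
    return soup.find_all("div", class_="listing-card")        # hypothetical markup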
def parse_item(self, item: dict | str | bs4.element.Tag) -> Any:
Parse `item` and return parsed data.

e.g.
>>> try:
>>> parsed = {}
>>> parsed["thing1"] = item["element"].split()[0]
>>> self.success_count += 1
>>> return parsed
>>> except Exception:
>>> self.logger.exception("message")
>>> self.fail_count += 1
>>> return None
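Continuing the hypothetical listing scraper, an override that pulls a couple of fields out of each card and keeps the built-in counters up to date (the tag names and classes are assumptions):

def parse_item(self, item: ParsableItem) -> Any:
    try:
        parsed = {
            "title": self.clean_string(item.find("h2").text),                   # assumed markup
            "price": self.clean_string(item.find("span", class_="price").text), # assumed markup
        }
        self.success_count += 1
        return parsed
    except Exception:
        self.logger.exception("Failed to parse listing card.")
        self.fail_count += 1
        return None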
def scrape(self, parse_items_prog_bar_display: bool = False):
Run the scraper:
- prescrape chores
- get parsable items
- parse items
- store items
- postscrape chores
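Reusing the imports and method sketches from the sections above, the whole pipeline then runs with a single call; store_item here just collects results in memory for illustration:

class MyScraper(Gruel):
    # get_parsable_items and parse_item as sketched in the sections above.

    def __init__(self):
        super().__init__()
        self.items: list[dict] = []

    def store_item(self, item: Any):
        self.items.append(item)


if __name__ == "__main__":
    scraper = MyScraper()
    scraper.scrape(parse_items_prog_bar_display=True)
    print(f"Scraped {len(scraper.items)} listings.")

Each run is logged to ./logs/<source file stem>.log via the logger configured in `_init_logger`, and the completion message reports the success and failure counts.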