gruel.gruel
import inspect
import time
from typing import Any

import loggi
import requests
import whosyouragent
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier, Pathish
from printbuddies import ProgBar

ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class.

    Classes subclassing `Gruel` need to implement the following methods:

    * `get_parsable_items(self) -> list[Any]`
    * `parse_item(self, item: Any) -> Any`
    * `store_item(self, item: Any)`

    Calling the `scrape()` method will execute:
    1. `self.prescrape_chores()` (does nothing unless overridden)
    2. `self.get_parsable_items()`
    3. `self.parse_item()` for each item returned by `self.get_parsable_items()`
    4. `self.store_item()` for each successfully parsed item
    5. `self.postscrape_chores()` (only closes this instance's log file unless overridden)

    When overriding `self.postscrape_chores`, it's recommended to either
    call `super().postscrape_chores()` or make sure to call `loggi.close(self.logger)`.
    Otherwise, running a large number of scrapers can cause file handle limit issues."""

    def __init__(self, name: str | None = None, log_dir: Pathish | None = None):
        """
        :params:
        * `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in.
        i.e. A `Gruel` subclass located in a file called `myscraper.py` will have the name `"myscraper"`.
        * `log_dir`: The directory this scraper's logs should be saved to.
        If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
        """
        self._name = name
        self._init_logger(log_dir)
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0

    @property
    def name(self) -> str:
        """Returns the name given to __init__ or the stem of the file this instance was defined in if one wasn't given."""
        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    def _init_logger(self, log_dir: Pathish | None):
        log_dir = Pathier.cwd() / "gruel_logs" if not log_dir else Pathier(log_dir)
        self.logger = loggi.getLogger(self.name, log_dir)

    @staticmethod
    def request(
        url: str,
        method: str = "get",
        headers: dict[str, str] = {},
        params: dict | None = None,
        data: dict | None = None,
        timeout: int | None = None,
        retry_on_fail: bool = True,
    ) -> requests.Response:
        """Send a request to `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param.

        If `retry_on_fail` is `True`, the request will be repeated after 1 second if the original request raises an exception.
        Otherwise, the exception will be raised."""
        args = [method, url]
        headers = whosyouragent.get_header() | headers
        kwargs = {
            "headers": headers,
            "timeout": timeout,
            "params": params,
            "data": data,
        }
        try:
            response = requests.request(*args, **kwargs)
            return response
        except Exception as e:
            if retry_on_fail:
                time.sleep(1)
                return requests.request(*args, **kwargs)
            else:
                raise e

    @staticmethod
    def as_soup(response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return a `BeautifulSoup` object."""
        return self.as_soup(self.request(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip `\\n\\r\\t` and whitespace from `text`."""
        return text.strip(" \n\t\r")

    # |==============================================================================|
    # Overridables
    # |==============================================================================|
    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        loggi.close(self.logger)

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. the first 10 results for an endpoint that returns JSON content
        >>> return self.request(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        >>>     parsed = {}
        >>>     parsed["thing1"] = item["element"].split()[0]
        >>>     self.success_count += 1
        >>>     return parsed
        >>> except Exception:
        >>>     self.logger.exception("message")
        >>>     self.fail_count += 1
        >>>     return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def _parse_items_no_prog_bar(self, parsable_items: list[ParsableItem]):
        for item in parsable_items:
            parsed_item = self.parse_item(item)
            if parsed_item:
                self.store_item(parsed_item)

    def _parse_items_prog_bar(self, parsable_items: list[ParsableItem]):
        with ProgBar(len(parsable_items)) as bar:
            for item in parsable_items:
                parsed_item = self.parse_item(item)
                if parsed_item:
                    self.store_item(parsed_item)
                bar.display(f"{bar.runtime}")

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse and store items
        4. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                parsable_items = self.get_parsable_items()
                self.logger.info(
                    f"{self.name}:get_parsable_items() returned {(len(parsable_items))} items"
                )
            except Exception:
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                if parse_items_prog_bar_display:
                    self._parse_items_prog_bar(parsable_items)
                else:
                    self._parse_items_no_prog_bar(parsable_items)
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        self.postscrape_chores()
class Gruel

Scraper base class.

Classes subclassing `Gruel` need to implement the following methods:

* `get_parsable_items(self) -> list[Any]`
* `parse_item(self, item: Any) -> Any`
* `store_item(self, item: Any)`

Calling the `scrape()` method will execute:

1. `self.prescrape_chores()` (does nothing unless overridden)
2. `self.get_parsable_items()`
3. `self.parse_item()` for each item returned by `self.get_parsable_items()`
4. `self.store_item()` for each successfully parsed item
5. `self.postscrape_chores()` (only closes this instance's log file unless overridden)

When overriding `self.postscrape_chores`, it's recommended to either call `super().postscrape_chores()` or make sure to call `loggi.close(self.logger)`. Otherwise, running a large number of scrapers can cause file handle limit issues.
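For orientation, here is a minimal subclass sketch built on the three required overrides. The endpoint URL, the `text`/`author` field names, and the `QuoteScraper` name are hypothetical; everything else uses the methods and attributes shown in the source above.

from typing import Any

from gruel.gruel import Gruel


class QuoteScraper(Gruel):
    """Hypothetical subclass; the URL and JSON field names below are made up for illustration."""

    def __init__(self):
        super().__init__()
        self.items: list[dict] = []

    def get_parsable_items(self) -> list[dict]:
        # Fetch raw records that still need parsing (endpoint is illustrative).
        return self.request("https://example.com/api/quotes").json()

    def parse_item(self, item: dict) -> Any:
        try:
            parsed = {"text": item["text"].strip(), "author": item["author"]}
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse item.")
            self.fail_count += 1
            return None

    def store_item(self, item: Any):
        # Keep parsed results in memory; a real scraper would write to a file or database.
        self.items.append(item)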
def __init__(self, name: str | None = None, log_dir: Pathish | None = None)
:params:

* `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in, i.e. a `Gruel` subclass located in a file called `myscraper.py` will have the name `"myscraper"`.
* `log_dir`: The directory this scraper's logs should be saved to. If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
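A short usage sketch of the constructor; the name and log directory values are illustrative.

from pathier import Pathier

from gruel.gruel import Gruel

# Defaults: the name comes from the defining file's stem and logs go to ./gruel_logs.
scraper = Gruel()

# Explicit name and log directory (both values are illustrative).
scraper = Gruel(name="books", log_dir=Pathier("logs"))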
@property
def name(self) -> str

Returns the name given to `__init__`, or the stem of the file this instance was defined in if one wasn't given.
@staticmethod
def request(
    url: str,
    method: str = "get",
    headers: dict[str, str] = {},
    params: dict | None = None,
    data: dict | None = None,
    timeout: int | None = None,
    retry_on_fail: bool = True,
) -> requests.Response
Send a request to `url` and return the `requests.Response` object.

By default, the only header sent is a randomized user agent string. This can be overridden by supplying a user agent in the `headers` param.

If `retry_on_fail` is `True`, the request will be repeated after 1 second if the original request raises an exception. Otherwise, the exception will be raised.
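A sketch of calling `request` directly; the URL, header value, and query parameters are illustrative.

from gruel.gruel import Gruel

response = Gruel.request(
    "https://example.com/api/items",
    headers={"User-Agent": "my-scraper/1.0"},  # replaces the randomized user agent
    params={"page": 1},
    timeout=10,
    retry_on_fail=False,  # raise immediately instead of retrying once after 1 second
)
print(response.status_code)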
@staticmethod
def as_soup(response: requests.Response) -> BeautifulSoup
Returns the text content of `response` as a `BeautifulSoup` object.
def get_soup(self, url: str, method: str = "get", headers: dict[str, str] = {}) -> BeautifulSoup
Request `url` with `headers` and return a `BeautifulSoup` object.
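A sketch showing `get_soup` alongside the equivalent two-step `request` plus `as_soup` form; the URL and scraper name are illustrative.

from gruel.gruel import Gruel

scraper = Gruel(name="soup_demo")  # name is illustrative

# One call: fetch the page and parse it into a BeautifulSoup object.
soup = scraper.get_soup("https://example.com")
print(soup.title)

# Equivalent two-step form using the static helpers.
response = Gruel.request("https://example.com")
soup = Gruel.as_soup(response)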
def clean_string(self, text: str) -> str
Strip `\n\r\t` and whitespace from `text`.
def postscrape_chores(self)
Chores to do after scraping.
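Because the default implementation closes this instance's log file, an override would usually defer to the base class. A sketch (the summary log message is illustrative):

from gruel.gruel import Gruel


class MyScraper(Gruel):
    def postscrape_chores(self):
        # Illustrative cleanup, then defer to the base class so the log file is closed.
        self.logger.info(f"Finished with {self.success_count} successes this run.")
        super().postscrape_chores()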
def get_parsable_items(self) -> list[ParsableItem]
Get relevant webpages and extract raw data that needs to be parsed.

e.g. the first 10 results for an endpoint that returns JSON content:
>>> return self.request(some_url).json()[:10]
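A sketch of an HTML-oriented override that collects the tags to be parsed later; the URL, the `article` tag, and the `HeadlineScraper` name are illustrative.

from gruel.gruel import Gruel, ParsableItem


class HeadlineScraper(Gruel):
    def get_parsable_items(self) -> list[ParsableItem]:
        # Gather the raw tags; parse_item() will handle each one individually.
        soup = self.get_soup("https://example.com/news")
        return soup.find_all("article")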
def parse_item(self, item: ParsableItem) -> Any
Parse `item` and return parsed data.

e.g.
>>> try:
>>>     parsed = {}
>>>     parsed["thing1"] = item["element"].split()[0]
>>>     self.success_count += 1
>>>     return parsed
>>> except Exception:
>>>     self.logger.exception("message")
>>>     self.fail_count += 1
>>>     return None
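The third required override, `store_item`, receives each successfully parsed item. A sketch that appends each record to a JSON Lines file; the output path and the `NewsScraper` name are illustrative.

import json
from pathlib import Path
from typing import Any

from gruel.gruel import Gruel


class NewsScraper(Gruel):
    def store_item(self, item: Any):
        # Append each parsed record as one JSON line (output path is illustrative).
        with Path("scraped_items.jsonl").open("a", encoding="utf-8") as file:
            file.write(json.dumps(item) + "\n")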
def scrape(self, parse_items_prog_bar_display: bool = False)
Run the scraper:
- prescrape chores
- get parsable items
- parse and store items
- postscrape chores
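Putting the pieces together, a self-contained sketch that runs the full `scrape()` flow on hard-coded data; the `DemoScraper` class and its values are illustrative.

from typing import Any

from gruel.gruel import Gruel


class DemoScraper(Gruel):
    """Trivial subclass used only to demonstrate the scrape() flow; the data is hard-coded."""

    def get_parsable_items(self) -> list[dict]:
        return [{"value": "a"}, {"value": "b"}]

    def parse_item(self, item: dict) -> Any:
        self.success_count += 1
        return item["value"].upper()

    def store_item(self, item: Any):
        self.logger.info(f"Stored {item}")


scraper = DemoScraper(name="demo")
scraper.scrape(parse_items_prog_bar_display=True)  # display a progress bar while parsing
print(f"{scraper.success_count} successes, {scraper.fail_count} failures")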