gruel.grueler
```python
import inspect
import time
from typing import Any

import loggi
import requests
import whosyouragent
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier, Pathish
from printbuddies import ProgBar

ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class.

    Classes subclassing `Gruel` need to implement the following methods:

    * `get_parsable_items(self) -> list[Any]`
    * `parse_item(self, item: Any) -> Any`
    * `store_item(self, item: Any)`

    Calling the `scrape()` method will execute:
    1. `self.prescrape_chores()` (does nothing unless overridden)
    2. `self.get_parsable_items()`
    3. `self.parse_item()` for each item returned by `self.get_parsable_items()`
    4. `self.store_item()` for each successfully parsed item
    5. `self.postscrape_chores()` (only closes this instance's log file unless overridden)

    When overriding `self.postscrape_chores`, it's recommended to either
    call `super().postscrape_chores()` or make sure to call `loggi.close(self.logger)`.
    Otherwise running a large number of scrapers can cause file handle limit issues."""

    def __init__(self, name: str | None = None, log_dir: Pathish | None = None):
        """
        :params:
        * `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in.
        i.e. A `Gruel` subclass located in a file called `myscraper.py` will have the name `"myscraper"`.
        * `log_dir`: The directory this scraper's logs should be saved to.
        If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
        """
        self._name = name
        self._init_logger(log_dir)
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0
        self.failed_to_get_parsable_items = False
        self.unexpected_failure_occured = False
        self.parsable_items = []
        self.parsed_items = []

    @property
    def name(self) -> str:
        """Returns the name given to __init__ or the stem of the file this instance was defined in if one wasn't given."""
        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    @property
    def had_failures(self) -> bool:
        """`True` if getting parsable items, parsing items, or unexpected failures occurred."""
        return (
            (self.fail_count > 0)
            or self.failed_to_get_parsable_items
            or self.unexpected_failure_occured
        )

    def _init_logger(self, log_dir: Pathish | None):
        log_dir = Pathier.cwd() / "gruel_logs" if not log_dir else Pathier(log_dir)
        self.logger = loggi.getLogger(self.name, log_dir)

    @staticmethod
    def request(
        url: str,
        method: str = "get",
        headers: dict[str, str] = {},
        params: dict | None = None,
        data: dict | None = None,
        timeout: int | None = None,
        retry_on_fail: bool = True,
        json_: Any | None = None,
    ) -> requests.Response:
        """Send a request to `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param.

        If `retry_on_fail` is `True`, the request will be repeated after 1 second if the original request causes an exception to be thrown.
        Otherwise, the exception will be raised."""
        args = [method, url]
        headers = whosyouragent.get_header() | headers
        kwargs = {
            "headers": headers,
            "timeout": timeout,
            "params": params,
            "data": data,
            "json": json_,
        }
        try:
            response = requests.request(*args, **kwargs)
            return response
        except Exception as e:
            if retry_on_fail:
                time.sleep(1)
                return requests.request(*args, **kwargs)
            else:
                raise e

    @staticmethod
    def as_soup(response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return `BeautifulSoup` object."""
        return self.as_soup(self.request(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip `\\n\\r\\t` and whitespace from `text`."""
        return text.strip(" \n\t\r")

    # |==============================================================================|
    # Overridables
    # |==============================================================================|
    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        loggi.close(self.logger)

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. first 10 results for an endpoint that returns json content
        >>> return self.request(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        >>>     parsed = {}
        >>>     parsed["thing1"] = item["element"].split()[0]
        >>>     self.success_count += 1
        >>>     return parsed
        >>> except Exception:
        >>>     self.logger.exception("message")
        >>>     self.fail_count += 1
        >>>     return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def _parse_items_no_prog_bar(self):
        for item in self.parsable_items:
            parsed_item = self.parse_item(item)
            if parsed_item:
                self.store_item(parsed_item)
            # Append to `self.parsed_items` even if `None`
            # so `parsable_items` and `parsed_items` are equal length
            self.parsed_items.append(parsed_item)

    def _parse_items_prog_bar(self):
        with ProgBar(len(self.parsable_items)) as bar:
            for item in self.parsable_items:
                parsed_item = self.parse_item(item)
                if parsed_item:
                    self.store_item(parsed_item)
                bar.display(f"{bar.runtime}")
                # Append to `self.parsed_items` even if `None`
                # so `parsable_items` and `parsed_items` are equal length
                self.parsed_items.append(parsed_item)

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse and store items
        4. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                self.parsable_items = self.get_parsable_items()
                self.logger.info(
                    f"{self.name}:get_parsable_items() returned {(len(self.parsable_items))} items"
                )
            except Exception:
                self.failed_to_get_parsable_items = True
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                if parse_items_prog_bar_display:
                    self._parse_items_prog_bar()
                else:
                    self._parse_items_no_prog_bar()
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.unexpected_failure_occured = True
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        self.postscrape_chores()
```
class Gruel:
Scraper base class.

Classes subclassing `Gruel` need to implement the following methods:

* `get_parsable_items(self) -> list[Any]`
* `parse_item(self, item: Any) -> Any`
* `store_item(self, item: Any)`

Calling the `scrape()` method will execute:

1. `self.prescrape_chores()` (does nothing unless overridden)
2. `self.get_parsable_items()`
3. `self.parse_item()` for each item returned by `self.get_parsable_items()`
4. `self.store_item()` for each successfully parsed item
5. `self.postscrape_chores()` (only closes this instance's log file unless overridden)

When overriding `self.postscrape_chores`, it's recommended to either call `super().postscrape_chores()` or make sure the logger gets closed (i.e. `loggi.close(self.logger)`). Otherwise, running a large number of scrapers can cause file handle limit issues.
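To make that contract concrete, here is a minimal subclass sketch. The URL, CSS selectors, and import path are illustrative assumptions, not part of the library; only the `Gruel` methods and counters shown in the source above are real.

```python
from typing import Any

from gruel import Gruel  # assumed import path; use `from gruel.grueler import Gruel` if the class isn't re-exported


class QuoteScraper(Gruel):
    """Hypothetical scraper; the page and selectors below are placeholders."""

    def get_parsable_items(self) -> list[Any]:
        # Fetch one page and treat each matching tag as a parsable item.
        soup = self.get_soup("https://example.com/quotes")
        return soup.find_all("div", class_="quote")

    def parse_item(self, item: Any) -> Any:
        try:
            parsed = {
                "text": self.clean_string(item.find("span").text),
                "author": self.clean_string(item.find("small").text),
            }
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse item.")
            self.fail_count += 1
            return None

    def store_item(self, item: Any):
        # Stand-in for a real sink (database, file, etc.).
        print(item)
```

Calling `QuoteScraper().scrape()` then runs the five steps listed above.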
`Gruel(name: str | None = None, log_dir: Pathish | None = None)`
:params:

* `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in, i.e. a `Gruel` subclass located in a file called `myscraper.py` will have the name `"myscraper"`.
* `log_dir`: The directory this scraper's logs should be saved to. If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
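For illustration, two hedged construction examples (assuming the package root re-exports `Gruel` and that a plain string is an acceptable `Pathish`):

```python
from gruel import Gruel  # assumed import path

# Name falls back to the defining file's stem; logs go to ./gruel_logs in the current working directory.
scraper = Gruel()

# Explicit name and log directory.
scraper = Gruel(name="jobs", log_dir="logs")
```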
`name: str`

Returns the name given to `__init__` or, if one wasn't given, the stem of the file this instance was defined in.
`request(url: str, method: str = "get", headers: dict[str, str] = {}, params: dict | None = None, data: dict | None = None, timeout: int | None = None, retry_on_fail: bool = True, json_: Any | None = None) -> requests.Response` (staticmethod)
Send a request to `url` and return the `requests.Response` object.

By default, the only header sent is a randomized user agent string. This can be overridden by supplying a user agent in the `headers` param.

If `retry_on_fail` is `True`, the request will be repeated after 1 second if the original request raises an exception. Otherwise, the exception will be raised.
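As a sketch (placeholder URL and parameters, assuming the endpoint returns JSON), a call that overrides the randomized user agent and relies on the single built-in retry might look like:

```python
from gruel import Gruel  # assumed import path

response = Gruel.request(
    "https://example.com/api/items",  # placeholder endpoint
    params={"page": 1},
    timeout=10,
    headers={"User-Agent": "my-scraper/1.0"},  # replaces the randomized user agent
)
response.raise_for_status()
items = response.json()
```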
`as_soup(response: requests.Response) -> BeautifulSoup` (staticmethod)
Returns the text content of `response` as a `BeautifulSoup` object.
`get_soup(self, url: str, method: str = "get", headers: dict[str, str] = {}) -> BeautifulSoup`
Request `url` with `headers` and return a `BeautifulSoup` object.
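A hypothetical use (placeholder URL and selector):

```python
from gruel import Gruel  # assumed import path

scraper = Gruel(name="demo")
soup = scraper.get_soup("https://example.com/listings")  # placeholder page
titles = [tag.text for tag in soup.select("h2.title")]
```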
`clean_string(self, text: str) -> str`
Strip `\n\r\t` and whitespace from `text`.
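For example (assuming `scraper` is any `Gruel` instance):

```python
scraper.clean_string("\n\t  Hello, world \r\n")  # -> "Hello, world"
```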
`postscrape_chores(self)`
Chores to do after scraping. By default, this just closes the instance's logger via `loggi.close(self.logger)`.
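If you override it, the class docstring above recommends keeping the log cleanup; a hedged sketch:

```python
from gruel import Gruel  # assumed import path


class MyScraper(Gruel):  # hypothetical subclass
    def postscrape_chores(self):
        # Custom teardown first (this log line is illustrative)...
        self.logger.info("Custom post-scrape cleanup finished.")
        # ...then let the base class close this instance's log file.
        super().postscrape_chores()
```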
`get_parsable_items(self) -> list[ParsableItem]`
Get relevant webpages and extract raw data that needs to be parsed, e.g. the first 10 results from an endpoint that returns JSON content:

>>> return self.request(some_url).json()[:10]
`parse_item(self, item: ParsableItem) -> Any`
Parse `item` and return parsed data, e.g.:

>>> try:
>>>     parsed = {}
>>>     parsed["thing1"] = item["element"].split()[0]
>>>     self.success_count += 1
>>>     return parsed
>>> except Exception:
>>>     self.logger.exception("message")
>>>     self.fail_count += 1
>>>     return None
`scrape(self, parse_items_prog_bar_display: bool = False)`
Run the scraper:
- prescrape chores
- get parsable items
- parse and store items
- postscrape chores
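End to end, running a scraper and checking the outcome might look like this (using the hypothetical `QuoteScraper` sketched earlier):

```python
scraper = QuoteScraper(log_dir="logs")
scraper.scrape(parse_items_prog_bar_display=True)

if scraper.had_failures:
    print(f"{scraper.fail_count} item(s) failed; see the '{scraper.name}' log for details.")
# parsed_items keeps one entry per parsable item, including None for failed parses.
print(f"Stored {scraper.success_count} of {len(scraper.parsed_items)} item(s).")
```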