gruel.grueler

import inspect
import time
from typing import Any

import loggi
import requests
import whosyouragent
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier, Pathish
from printbuddies import ProgBar

# Raw data units a scraper extracts: json-ish dicts, strings, or bs4 tags.
ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class.

    Classes subclassing `Gruel` need to implement the following methods:

    * `get_parsable_items(self) -> list[Any]`
    * `parse_item(self, item: Any) -> Any`
    * `store_item(self, item: Any)`

    Calling the `scrape()` method will execute:
    1. `self.prescrape_chores()` (does nothing unless overridden)
    2. `self.get_parsable_items()`
    3. `self.parse_item()` for each item returned by `self.get_parsable_items()`
    4. `self.store_item()` for each successfully parsed item
    5. `self.postscrape_chores()` (only closes this instance's log file unless overridden)

    When overriding `self.postscrape_chores`, it's recommended to either
    call `super().postscrape_chores()` or make sure to call `loggi.close(self.logger)`.
    Otherwise, running a large number of scrapers can cause file handle limit issues."""

    def __init__(self, name: str | None = None, log_dir: Pathish | None = None):
        """
        :params:
        * `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in.
        e.g. A `Gruel` subclass located in a file called `myscraper.py` will have the name `"myscraper"`.
        * `log_dir`: The directory this scraper's logs should be saved to.
        If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
        """
        self._name = name
        self._init_logger(log_dir)
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0
        self.failed_to_get_parsable_items = False
        self.unexpected_failure_occured = False
        self.parsable_items = []
        self.parsed_items = []

    @property
    def name(self) -> str:
        """Returns the name given to `__init__` or, if one wasn't given, the stem of the file this instance's class was defined in."""
        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    @property
    def had_failures(self) -> bool:
        """`True` if getting parsable items failed, any individual items failed to parse, or an unexpected failure occurred."""
        return (
            (self.fail_count > 0)
            or self.failed_to_get_parsable_items
            or self.unexpected_failure_occured
        )

    def _init_logger(self, log_dir: Pathish | None):
        log_dir = Pathier.cwd() / "gruel_logs" if not log_dir else Pathier(log_dir)
        self.logger = loggi.getLogger(self.name, log_dir)

    @staticmethod
    def request(
        url: str,
        method: str = "get",
        headers: dict[str, str] = {},
        params: dict | None = None,
        data: dict | None = None,
        timeout: int | None = None,
        retry_on_fail: bool = True,
        json_: Any | None = None,
    ) -> requests.Response:
        """Send a request to `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param.

        If `retry_on_fail` is `True`, the request will be retried once after 1 second if the original request raises an exception.
        Otherwise, the exception will be raised."""
        args = [method, url]
        # Custom headers take precedence over the randomized user agent.
        headers = whosyouragent.get_header() | headers
        kwargs = {
            "headers": headers,
            "timeout": timeout,
            "params": params,
            "data": data,
            "json": json_,
        }
        try:
            return requests.request(*args, **kwargs)
        except Exception as e:
            if retry_on_fail:
                time.sleep(1)
                return requests.request(*args, **kwargs)
            else:
                raise e

    @staticmethod
    def as_soup(response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return a `BeautifulSoup` object."""
        return self.as_soup(self.request(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip `\\n\\r\\t` and whitespace from `text`."""
        return text.strip(" \n\t\r")

    # |==============================================================================|
    # Overridables
    # |==============================================================================|
    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        loggi.close(self.logger)

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. first 10 results for an endpoint that returns json content
        >>> return self.request(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        >>>     parsed = {}
        >>>     parsed["thing1"] = item["element"].split()[0]
        >>>     self.success_count += 1
        >>>     return parsed
        >>> except Exception:
        >>>     self.logger.exception("message")
        >>>     self.fail_count += 1
        >>>     return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def _parse_items_no_prog_bar(self):
        for item in self.parsable_items:
            parsed_item = self.parse_item(item)
            if parsed_item:
                self.store_item(parsed_item)
            # Append to `self.parsed_items` even if `None`
            # so `parsable_items` and `parsed_items` are equal length
            self.parsed_items.append(parsed_item)

    def _parse_items_prog_bar(self):
        with ProgBar(len(self.parsable_items)) as bar:
            for item in self.parsable_items:
                parsed_item = self.parse_item(item)
                if parsed_item:
                    self.store_item(parsed_item)
                    bar.display(f"{bar.runtime}")
                # Append to `self.parsed_items` even if `None`
                # so `parsable_items` and `parsed_items` are equal length
                self.parsed_items.append(parsed_item)

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse and store items
        4. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                self.parsable_items = self.get_parsable_items()
                self.logger.info(
                    f"{self.name}:get_parsable_items() returned {len(self.parsable_items)} items"
                )
            except Exception:
                self.failed_to_get_parsable_items = True
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                if parse_items_prog_bar_display:
                    self._parse_items_prog_bar()
                else:
                    self._parse_items_no_prog_bar()
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.unexpected_failure_occured = True
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        self.postscrape_chores()

Scraper base class.

Classes subclassing Gruel need to implement the following methods:

  • get_parsable_items(self) -> list[Any]
  • parse_item(self, item: Any) -> Any
  • store_item(self, item: Any)

Calling the scrape() method will execute:

  1. self.prescrape_chores() (does nothing unless overridden)
  2. self.get_parsable_items()
  3. self.parse_item() for each item returned by self.get_parsable_items()
  4. self.store_item() for each successfully parsed item
  5. self.postscrape_chores() (only closes this instance's log file unless overridden)

When overriding self.postscrape_chores, it's recommended to either call super().postscrape_chores() or make sure to call loggi.close(self.logger). Otherwise, running a large number of scrapers can cause file handle limit issues.
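
For orientation, here is a minimal subclass sketch. The endpoint URL and field names are hypothetical, and it assumes Gruel is importable from the package root:

from typing import Any

from gruel import Gruel


class MyScraper(Gruel):
    def get_parsable_items(self) -> list[dict]:
        # Hypothetical JSON endpoint; substitute a real one.
        return self.request("https://example.com/api/listings").json()

    def parse_item(self, item: dict) -> Any:
        try:
            parsed = {"title": item["title"].strip()}
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse item.")
            self.fail_count += 1
            return None

    def store_item(self, item: Any):
        # Stand-in for real storage (database, file, etc.).
        print(item)


MyScraper().scrape()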

Gruel(name: str | None = None, log_dir: pathier.pathier.Pathier | pathlib.Path | str | None = None)

:params:

  • name: The name of this scraper. If None, the name will be the stem of the file this class/subclass was defined in. e.g. A Gruel subclass located in a file called myscraper.py will have the name "myscraper".
  • log_dir: The directory this scraper's logs should be saved to. If None, the logs will be written to a folder called "gruel_logs" within the current working directory.
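
A quick illustration, reusing the hypothetical MyScraper sketched above:

scraper = MyScraper()                         # name derived from the defining file's stem; logs under ./gruel_logs
scraper = MyScraper("jobs", log_dir="logs")   # explicit name and log directory
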
name: str

Returns the name given to __init__ or, if one wasn't given, the stem of the file this instance's class was defined in.

had_failures: bool

True if getting parsable items failed, any individual items failed to parse, or an unexpected failure occurred.
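
A sketch of how this flag can drive a retry decision (MyScraper as above):

scraper = MyScraper()
scraper.scrape()
if scraper.had_failures:
    # Distinguish the failure mode before deciding what to do.
    if scraper.failed_to_get_parsable_items:
        print("Fetching items failed entirely.")
    else:
        print(f"{scraper.fail_count} of {len(scraper.parsable_items)} items failed to parse.")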

@staticmethod
def request(url: str, method: str = 'get', headers: dict[str, str] = {}, params: dict | None = None, data: dict | None = None, timeout: int | None = None, retry_on_fail: bool = True, json_: typing.Any | None = None) -> requests.models.Response:

Send a request to url and return the requests.Response object.

By default, the only header sent is a randomized user agent string.

This can be overridden by supplying a user agent in the headers param.

If retry_on_fail is True, the request will be retried once after 1 second if the original request raises an exception. Otherwise, the exception will be raised.
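
Because request is a staticmethod, it also works outside a scraper. A sketch with a hypothetical URL:

response = Gruel.request(
    "https://example.com/api/listings",
    headers={"User-Agent": "my-bot/1.0"},  # takes precedence over the randomized agent
    params={"page": 1},
    timeout=10,
    retry_on_fail=False,  # raise immediately instead of retrying once
)
response.raise_for_status()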

@staticmethod
def as_soup(response: requests.models.Response) -> bs4.BeautifulSoup:

Returns the text content of response as a BeautifulSoup object.

def get_soup(self, url: str, method: str = 'get', headers: dict[str, str] = {}) -> bs4.BeautifulSoup:

Request url with headers and return BeautifulSoup object.
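
For example (hypothetical URL), fetch a page and pull links out of the soup:

soup = scraper.get_soup("https://example.com/listings")
links = [a["href"] for a in soup.find_all("a", href=True)]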

def clean_string(self, text: str) -> str:

Strip \n\r\t and whitespace from text.
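
Only leading and trailing characters are removed; interior whitespace is preserved:

>>> scraper.clean_string("\n\t  Senior Widget Engineer \r\n")
'Senior Widget Engineer'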

def prescrape_chores(self):

Chores to do before scraping.
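
An override sketch that prepares an output location before scraping (the output_dir attribute is hypothetical):

def prescrape_chores(self):
    self.output_dir = Pathier("scraped_data")
    self.output_dir.mkdir(parents=True, exist_ok=True)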

def postscrape_chores(self):

Chores to do after scraping.
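
An override sketch that logs a summary line while still letting the base class release the log file handle:

def postscrape_chores(self):
    self.logger.info(f"{self.success_count} items stored, {self.fail_count} failed.")
    super().postscrape_chores()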

def get_parsable_items(self) -> list[dict | str | bs4.element.Tag]:

Get relevant webpages and extract raw data that needs to be parsed.

e.g. first 10 results for an endpoint that returns json content

>>> return self.request(some_url).json()[:10]
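
An HTML-based variant returning Tag objects (URL and markup are hypothetical):

>>> soup = self.get_soup("https://example.com/listings")
>>> return soup.find_all("div", class_="listing")
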
def parse_item(self, item: dict | str | bs4.element.Tag) -> Any:

Parse item and return parsed data.

e.g.

>>> try:
>>>     parsed = {}
>>>     parsed["thing1"] = item["element"].split()[0]
>>>     self.success_count += 1
>>>     return parsed
>>> except Exception:
>>>     self.logger.exception("message")
>>>     self.fail_count += 1
>>>     return None
def store_item(self, item: Any):

Store item.

def scrape(self, parse_items_prog_bar_display: bool = False):

Run the scraper:

  1. prescrape chores
  2. get parsable items
  3. parse and store items
  4. postscrape chores
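
Putting it together with the hypothetical MyScraper from above:

scraper = MyScraper()
scraper.scrape(parse_items_prog_bar_display=True)  # show a progress bar while parsing
print(scraper.timer.elapsed_str)   # runtime so far; scrape() starts the timer but never stops it
print(len(scraper.parsed_items))   # one entry per parsable item, None where parsing failed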