gruel.grueler

  1import inspect
  2import time
  3from typing import Any
  4
  5import loggi
  6import requests
  7import whosyouragent
  8from bs4 import BeautifulSoup, Tag
  9from noiftimer import Timer
 10from pathier import Pathier, Pathish
 11from printbuddies import ProgBar
 12
 13ParsableItem = dict | str | Tag
 14
 15
 16class Gruel:
 17    """Scraper base class.
 18
 19    Classes subclassing `Gruel` need to implement the following methods:
 20
 21    * `get_parsable_items(self) -> list[Any]`
 22    * `parse_item(self, item: Any) -> Any`
 23    * `store_item(self, item: Any)`
 24
 25    Calling the `scrape()` method will execute:
 26    1. `self.prescrape_chores()` (does nothing unless overridden)
 27    2. `self.get_parsable_items()`
 28    3. `self.parse_item()` for each item returned by `self.get_parsable_items()`
 29    4. `self.store_item()` for each successfully parsed item
 30    5. `self.postscrape_chores()` (only closes this instance's log file unless overridden)
 31
 32    When overriding `self.postscrape_chores`, it's recommended to either
 33    call `super().postscrape_chores()` or make sure to call `loggi.close(self.logger)`.
 34    Otherwise, running a large number of scrapers can cause file handle limit issues."""
 35
 36    def __init__(self, name: str | None = None, log_dir: Pathish | None = None):
 37        """
 38        :params:
 39        * `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in.
 40        e.g. a `Gruel` subclass defined in a file called `myscraper.py` will have the name `"myscraper"`.
 41        * `log_dir`: The directory this scraper's logs should be saved to.
 42        If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
 43        """
 44        self._name = name
 45        self._init_logger(log_dir)
 46        self.timer = Timer()
 47        self.success_count = 0
 48        self.fail_count = 0
 49
 50    @property
 51    def name(self) -> str:
 52        """Returns the name given to __init__ or the stem of the file this instance was defined in if one wasn't given."""
 53        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore
 54
 55    def _init_logger(self, log_dir: Pathish | None):
 56        log_dir = Pathier.cwd() / "gruel_logs" if not log_dir else Pathier(log_dir)
 57        self.logger = loggi.getLogger(self.name, log_dir)
 58
 59    @staticmethod
 60    def request(
 61        url: str,
 62        method: str = "get",
 63        headers: dict[str, str] = {},
 64        params: dict | None = None,
 65        data: dict | None = None,
 66        timeout: int | None = None,
 67        retry_on_fail: bool = True,
 68    ) -> requests.Response:
 69        """Send a request to `url` and return the `requests.Response` object.
 70
 71        By default, the only header sent is a randomized user agent string.
 72
 73        This can be overridden by supplying a user agent in the `headers` param.
 74
 75        If `retry_on_fail` is `True`, the request will be retried once after 1 second if the original request raises an exception.
 76        Otherwise, the exception will be raised."""
 77        args = [method, url]
 78        headers = whosyouragent.get_header() | headers
 79        kwargs = {
 80            "headers": headers,
 81            "timeout": timeout,
 82            "params": params,
 83            "data": data,
 84        }
 85        try:
 86            response = requests.request(*args, **kwargs)
 87            return response
 88        except Exception as e:
 89            if retry_on_fail:
 90                time.sleep(1)
 91                return requests.request(*args, **kwargs)
 92            else:
 93                raise e
 94
 95    @staticmethod
 96    def as_soup(response: requests.Response) -> BeautifulSoup:
 97        """Returns the text content of `response` as a `BeautifulSoup` object."""
 98        return BeautifulSoup(response.text, "html.parser")
 99
100    def get_soup(
101        self, url: str, method: str = "get", headers: dict[str, str] = {}
102    ) -> BeautifulSoup:
103        """Request `url` with `headers` and return `BeautifulSoup` object."""
104        return self.as_soup(self.request(url, method, headers))
105
106    def clean_string(self, text: str) -> str:
107        """Strip `\\n\\r\\t` and whitespace from `text`."""
108        return text.strip(" \n\t\r")
109
110    # |==============================================================================|
111    # Overridables
112    # |==============================================================================|
113    def prescrape_chores(self):
114        """Chores to do before scraping."""
115        ...
116
117    def postscrape_chores(self):
118        """Chores to do after scraping."""
119        loggi.close(self.logger)
120
121    def get_parsable_items(self) -> list[ParsableItem]:
122        """Get relevant webpages and extract raw data that needs to be parsed.
123
124        e.g. first 10 results for an endpoint that returns json content
125        >>> return self.request(some_url).json()[:10]"""
126        raise NotImplementedError
127
128    def parse_item(self, item: ParsableItem) -> Any:
129        """Parse `item` and return parsed data.
130
131        e.g.
132        >>> try:
133        >>>     parsed = {}
134        >>>     parsed["thing1"] = item["element"].split()[0]
135        >>>     self.success_count += 1
136        >>>     return parsed
137        >>> except Exception:
138        >>>     self.logger.exception("message")
139        >>>     self.fail_count += 1
140        >>>     return None"""
141        raise NotImplementedError
142
143    def store_item(self, item: Any):
144        """Store `item`."""
145        raise NotImplementedError
146
147    def _parse_items_no_prog_bar(self, parsable_items: list[ParsableItem]):
148        for item in parsable_items:
149            parsed_item = self.parse_item(item)
150            if parsed_item:
151                self.store_item(parsed_item)
152
153    def _parse_items_prog_bar(self, parsable_items: list[ParsableItem]):
154        with ProgBar(len(parsable_items)) as bar:
155            for item in parsable_items:
156                parsed_item = self.parse_item(item)
157                if parsed_item:
158                    self.store_item(parsed_item)
159                    bar.display(f"{bar.runtime}")
160
161    def scrape(self, parse_items_prog_bar_display: bool = False):
162        """Run the scraper:
163        1. prescrape chores
164        2. get parsable items
165        3. parse and store items
166        4. postscrape chores"""
167        try:
168            self.timer.start()
169            self.logger.info("Scrape started.")
170            self.prescrape_chores()
171            try:
172                parsable_items = self.get_parsable_items()
173                self.logger.info(
174                    f"{self.name}:get_parsable_items() returned {(len(parsable_items))} items"
175                )
176            except Exception:
177                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
178            else:
179                if parse_items_prog_bar_display:
180                    self._parse_items_prog_bar(parsable_items)
181                else:
182                    self._parse_items_no_prog_bar(parsable_items)
183                self.logger.info(
184                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
185                )
186        except Exception:
187            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
188        self.postscrape_chores()
class Gruel:

Scraper base class.

Classes subclassing Gruel need to implement the following methods:

  • get_parsable_items(self) -> list[Any]
  • parse_item(self, item: Any) -> Any
  • store_item(self, item: Any)

Calling the scrape() method will execute:

  1. self.prescrape_chores() (does nothing unless overridden)
  2. self.get_parsable_items()
  3. self.parse_item() for each item returned by self.get_parsable_items()
  4. self.store_item() for each successfully parsed item
  5. self.postscrape_chores() (only closes this instance's log file unless overridden)

When overriding self.postscrape_chores, it's recommended to either call super().postscrape_chores() or make sure to call loggi.close(self.logger). Otherwise, running a large number of scrapers can cause file handle limit issues.
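
For orientation, here is a minimal sketch of a subclass; the endpoint URL, field names, and output file are hypothetical placeholders rather than anything the library provides:

import json
from typing import Any

from gruel.grueler import Gruel, ParsableItem


class QuoteScraper(Gruel):
    def get_parsable_items(self) -> list[ParsableItem]:
        # Hypothetical endpoint that returns a JSON list of quote objects.
        return self.request("https://example.com/api/quotes").json()

    def parse_item(self, item: ParsableItem) -> Any:
        try:
            parsed = {
                "author": item["author"],
                "text": self.clean_string(item["text"]),
            }
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse item.")
            self.fail_count += 1
            return None

    def store_item(self, item: Any):
        # Append each parsed quote to a local JSON lines file.
        with open("quotes.jsonl", "a", encoding="utf-8") as file:
            file.write(json.dumps(item) + "\n")


if __name__ == "__main__":
    QuoteScraper().scrape()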

Gruel(name: str | None = None, log_dir: pathier.pathier.Pathier | pathlib.Path | str | None = None)
37    def __init__(self, name: str | None = None, log_dir: Pathish | None = None):
38        """
39        :params:
40        * `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in.
41        e.g. a `Gruel` subclass defined in a file called `myscraper.py` will have the name `"myscraper"`.
42        * `log_dir`: The directory this scraper's logs should be saved to.
43        If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
44        """
45        self._name = name
46        self._init_logger(log_dir)
47        self.timer = Timer()
48        self.success_count = 0
49        self.fail_count = 0

:params:

  • name: The name of this scraper. If None, the name will be the stem of the file this class/subclass was defined in, e.g. a Gruel subclass defined in a file called myscraper.py will have the name "myscraper".
  • log_dir: The directory this scraper's logs should be saved to. If None, the logs will be written to a folder called "gruel_logs" within the current working directory.
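
A small usage sketch; QuoteScraper is the hypothetical subclass sketched under the class docstring above:

scraper = QuoteScraper()  # name falls back to the defining file's stem; logs go to ./gruel_logs/
scraper = QuoteScraper(name="quotes", log_dir="logs")  # explicit name and log directory
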
name: str

Returns the name passed to __init__ or, if none was given, the stem of the file this class/subclass was defined in.

@staticmethod
def request(url: str, method: str = 'get', headers: dict[str, str] = {}, params: dict | None = None, data: dict | None = None, timeout: int | None = None, retry_on_fail: bool = True) -> requests.models.Response:
60    @staticmethod
61    def request(
62        url: str,
63        method: str = "get",
64        headers: dict[str, str] = {},
65        params: dict | None = None,
66        data: dict | None = None,
67        timeout: int | None = None,
68        retry_on_fail: bool = True,
69    ) -> requests.Response:
70        """Send a request to `url` and return the `requests.Response` object.
71
72        By default, the only header sent is a randomized user agent string.
73
74        This can be overridden by supplying a user agent in the `headers` param.
75
76        If `retry_on_fail` is `True`, the request will be retried once after 1 second if the original request raises an exception.
77        Otherwise, the exception will be raised."""
78        args = [method, url]
79        headers = whosyouragent.get_header() | headers
80        kwargs = {
81            "headers": headers,
82            "timeout": timeout,
83            "params": params,
84            "data": data,
85        }
86        try:
87            response = requests.request(*args, **kwargs)
88            return response
89        except Exception as e:
90            if retry_on_fail:
91                time.sleep(1)
92                return requests.request(*args, **kwargs)
93            else:
94                raise e

Send a request to url and return the requests.Response object.

By default, the only header sent is a randomized user agent string.

This can be overridden by supplying a user agent in the headers param.

If retry_on_fail is True, the request will be retried once after 1 second if the original request raises an exception. Otherwise, the exception will be raised.
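
A short illustration of calling it directly; the URL, user agent string, and params are placeholders:

import requests

from gruel.grueler import Gruel

# Keys supplied in `headers` take precedence over the randomized user agent,
# and the request is retried once after 1 second if the first attempt raises.
response: requests.Response = Gruel.request(
    "https://example.com/api/items",
    headers={"User-Agent": "my-scraper/1.0"},
    params={"page": 1},
    timeout=10,
)
print(response.status_code)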

@staticmethod
def as_soup(response: requests.models.Response) -> bs4.BeautifulSoup:
96    @staticmethod
97    def as_soup(response: requests.Response) -> BeautifulSoup:
98        """Returns the text content of `response` as a `BeautifulSoup` object."""
99        return BeautifulSoup(response.text, "html.parser")

Returns the text content of response as a BeautifulSoup object.

def get_soup(self, url: str, method: str = 'get', headers: dict[str, str] = {}) -> bs4.BeautifulSoup:
101    def get_soup(
102        self, url: str, method: str = "get", headers: dict[str, str] = {}
103    ) -> BeautifulSoup:
104        """Request `url` with `headers` and return `BeautifulSoup` object."""
105        return self.as_soup(self.request(url, method, headers))

Request url with headers and return BeautifulSoup object.
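
get_soup() just chains request() and as_soup(), so it can be used from any instance; the URL below is a placeholder:

from gruel.grueler import Gruel

scraper = Gruel(name="demo")  # throwaway instance used only for its helpers
soup = scraper.get_soup("https://example.com")
links = [a.get("href") for a in soup.find_all("a")]
scraper.postscrape_chores()  # close this instance's log file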

def clean_string(self, text: str) -> str:
107    def clean_string(self, text: str) -> str:
108        """Strip `\\n\\r\\t` and whitespace from `text`."""
109        return text.strip(" \n\t\r")

Strip leading and trailing \n\r\t characters and spaces from text.
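
For example, only leading and trailing characters are removed, not interior whitespace:

>>> self.clean_string("\n\t  Hello, world!  \r\n")
'Hello, world!'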

def prescrape_chores(self):
114    def prescrape_chores(self):
115        """Chores to do before scraping."""
116        ...

Chores to do before scraping.
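
A possible override, assuming the scraper wants an output folder ready before it runs; the directory name is hypothetical:

from pathlib import Path

from gruel.grueler import Gruel


class MyScraper(Gruel):
    def prescrape_chores(self):
        # scrape() calls this before get_parsable_items().
        Path("output").mkdir(exist_ok=True)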

def postscrape_chores(self):
118    def postscrape_chores(self):
119        """Chores to do after scraping."""
120        loggi.close(self.logger)

Chores to do after scraping.
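
When overriding, keep the log-closing behavior by calling the base implementation, e.g.:

from gruel.grueler import Gruel


class MyScraper(Gruel):
    def postscrape_chores(self):
        self.logger.info("Wrapping up.")  # hypothetical extra teardown
        super().postscrape_chores()  # closes this instance's log file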

def get_parsable_items(self) -> list[dict | str | bs4.element.Tag]:
122    def get_parsable_items(self) -> list[ParsableItem]:
123        """Get relevant webpages and extract raw data that needs to be parsed.
124
125        e.g. first 10 results for an endpoint that returns json content
126        >>> return self.request(some_url).json()[:10]"""
127        raise NotImplementedError

Get relevant webpages and extract raw data that needs to be parsed.

e.g. first 10 results for an endpoint that returns json content

>>> return self.request(some_url).json()[:10]
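
An HTML-oriented scraper can instead return tags for parse_item() to work on; the URL and selector are hypothetical:

>>> soup = self.get_soup("https://example.com/quotes")
>>> return soup.find_all("div", class_="quote")
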
def parse_item(self, item: dict | str | bs4.element.Tag) -> Any:
129    def parse_item(self, item: ParsableItem) -> Any:
130        """Parse `item` and return parsed data.
131
132        e.g.
133        >>> try:
134        >>>     parsed = {}
135        >>>     parsed["thing1"] = item["element"].split()[0]
136        >>>     self.success_count += 1
137        >>>     return parsed
138        >>> except Exception:
139        >>>     self.logger.exception("message")
140        >>>     self.fail_count += 1
141        >>>     return None"""
142        raise NotImplementedError

Parse item and return parsed data.

e.g.

>>> try:
>>>     parsed = {}
>>>     parsed["thing1"] = item["element"].split()[0]
>>>     self.success_count += 1
>>>     return parsed
>>> except Exception:
>>>     self.logger.exception("message")
>>>     self.fail_count += 1
>>>     return None
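
If get_parsable_items() returns bs4 tags rather than dicts, the same pattern applies with tag lookups inside the try block; the selectors here are hypothetical:

>>> parsed["author"] = item.find("small").text
>>> parsed["text"] = self.clean_string(item.find("span").text)
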
def store_item(self, item: Any):
144    def store_item(self, item: Any):
145        """Store `item`."""
146        raise NotImplementedError

Store item.

def scrape(self, parse_items_prog_bar_display: bool = False):
162    def scrape(self, parse_items_prog_bar_display: bool = False):
163        """Run the scraper:
164        1. prescrape chores
165        2. get parsable items
166        3. parse and store items
167        4. postscrape chores"""
168        try:
169            self.timer.start()
170            self.logger.info("Scrape started.")
171            self.prescrape_chores()
172            try:
173                parsable_items = self.get_parsable_items()
174                self.logger.info(
175                    f"{self.name}:get_parsable_items() returned {(len(parsable_items))} items"
176                )
177            except Exception:
178                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
179            else:
180                if parse_items_prog_bar_display:
181                    self._parse_items_prog_bar(parsable_items)
182                else:
183                    self._parse_items_no_prog_bar(parsable_items)
184                self.logger.info(
185                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
186                )
187        except Exception:
188            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
189        self.postscrape_chores()

Run the scraper:

  1. prescrape chores
  2. get parsable items
  3. parse and store items
  4. postscrape chores
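
For example, using the hypothetical QuoteScraper subclass from earlier:

scraper = QuoteScraper()
scraper.scrape(parse_items_prog_bar_display=True)  # show a progress bar while parsing and storing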