gruel.gruel

import inspect
import time
from typing import Any

import loggi
import requests
import whosyouragent
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier, Pathish
from printbuddies import ProgBar

ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class.

    Classes subclassing `Gruel` need to implement the following methods:

    * `get_parsable_items(self) -> list[Any]`
    * `parse_item(self, item: Any) -> Any`
    * `store_item(self, item: Any)`

    Calling the `scrape()` method will execute:
    1. `self.prescrape_chores()` (does nothing unless overridden)
    2. `self.get_parsable_items()`
    3. `self.parse_item()` for each item returned by `self.get_parsable_items()`
    4. `self.store_item()` for each successfully parsed item
    5. `self.postscrape_chores()` (only closes this instance's log file unless overridden)

    When overriding `self.postscrape_chores`, it's recommended to either
    call `super().postscrape_chores()` or make sure to call `loggi.close(self.logger)`.
    Otherwise, running a large number of scrapers can cause file handle limit issues."""

    def __init__(self, name: str | None = None, log_dir: Pathish | None = None):
        """
        :params:
        * `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in.
        e.g. a `Gruel` subclass located in a file called `myscraper.py` will have the name `"myscraper"`.
        * `log_dir`: The directory this scraper's logs should be saved to.
        If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
        """
        self._name = name
        self._init_logger(log_dir)
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0

    @property
    def name(self) -> str:
        """Returns the name given to __init__ or the stem of the file this instance was defined in if one wasn't given."""
        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    def _init_logger(self, log_dir: Pathish | None):
        """Create this scraper's logger, writing to `log_dir` (or `./gruel_logs` when `log_dir` is `None`)."""
        log_dir = Pathier.cwd() / "gruel_logs" if not log_dir else Pathier(log_dir)
        self.logger = loggi.getLogger(self.name, log_dir)

    def get_page(
        self,
        url: str,
        method: str = "get",
        headers: dict[str, str] = {},
        timeout: int | None = None,
        retry_on_fail: bool = True,
        params: dict | None = None,
        data: dict | None = None,
    ) -> requests.Response:
        """Request `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param."""
        args = [method, url]
        # User-supplied headers are merged over the randomized user agent header.
        headers = whosyouragent.get_header() | headers
        kwargs = {
            "headers": headers,
            "timeout": timeout,
            "params": params,
            "data": data,
        }
        try:
            return requests.request(*args, **kwargs)
        except Exception as e:
            if retry_on_fail:
                # Wait briefly, then retry the request once.
                time.sleep(1)
                return requests.request(*args, **kwargs)
            else:
                raise e

    def as_soup(self, response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return `BeautifulSoup` object."""
        return self.as_soup(self.get_page(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip leading and trailing spaces, newlines, tabs, and carriage returns from `text`."""
        return text.strip(" \n\t\r")

    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        loggi.close(self.logger)

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. first 10 results for an endpoint that returns json content
        >>> return self.get_page(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        >>>     parsed = {}
        >>>     parsed["thing1"] = item["element"].split()[0]
        >>>     self.success_count += 1
        >>>     return parsed
        >>> except Exception:
        >>>     self.logger.exception("message")
        >>>     self.fail_count += 1
        >>>     return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def _parse_items_no_prog_bar(self, parsable_items: list[ParsableItem]):
        """Parse and store each item without displaying a progress bar."""
        for item in parsable_items:
            parsed_item = self.parse_item(item)
            if parsed_item:
                self.store_item(parsed_item)

    def _parse_items_prog_bar(self, parsable_items: list[ParsableItem]):
        """Parse and store each item while displaying a progress bar."""
        with ProgBar(len(parsable_items)) as bar:
            for item in parsable_items:
                parsed_item = self.parse_item(item)
                if parsed_item:
                    self.store_item(parsed_item)
                    bar.display(f"{bar.runtime}")

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse items
        4. store items
        5. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                parsable_items = self.get_parsable_items()
                self.logger.info(
                    f"{self.name}:get_parsable_items() returned {len(parsable_items)} items"
                )
            except Exception:
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                if parse_items_prog_bar_display:
                    self._parse_items_prog_bar(parsable_items)
                else:
                    self._parse_items_no_prog_bar(parsable_items)
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        self.postscrape_chores()

class Gruel:

Scraper base class.

Classes subclassing Gruel need to implement the following methods:

  • get_parsable_items(self) -> list[Any]
  • parse_item(self, item: Any) -> Any
  • store_item(self, item: Any)

Calling the scrape() method will execute:

  1. self.prescrape_chores() (does nothing unless overridden)
  2. self.get_parsable_items()
  3. self.parse_item() for each item returned by self.get_parsable_items()
  4. self.store_item() for each successfully parsed item
  5. self.postscrape_chores() (only closes this instance's log file unless overridden)

When overriding self.postscrape_chores, it's recommended to either call super().postscrape_chores() or make sure to call loggi.close(self.logger). Otherwise, running a large number of scrapers can cause file handle limit issues.
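
For orientation, here is a minimal end-to-end sketch of a subclass. The JobScraper name, the target URL, the "job-card" markup, and the in-memory results list are hypothetical stand-ins, not part of gruel itself:

from typing import Any

from bs4 import Tag
from gruel.gruel import Gruel, ParsableItem


class JobScraper(Gruel):
    def prescrape_chores(self):
        # Hypothetical storage target; any sink (file, database, etc.) works.
        self.results: list[dict] = []

    def get_parsable_items(self) -> list[ParsableItem]:
        # Hypothetical URL and markup.
        soup = self.get_soup("https://example.com/jobs")
        return soup.find_all("div", class_="job-card")

    def parse_item(self, item: Tag) -> Any:
        try:
            parsed = {
                "title": self.clean_string(item.find("h2").text),
                "url": item.find("a").get("href"),
            }
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse a job card.")
            self.fail_count += 1
            return None

    def store_item(self, item: Any):
        self.results.append(item)


if __name__ == "__main__":
    JobScraper().scrape()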

Gruel(name: str | None = None, log_dir: pathier.pathier.Pathier | pathlib.Path | str | None = None)

:params:

  • name: The name of this scraper. If None, the name will be the stem of the file this class/subclass was defined in, e.g. a Gruel subclass located in a file called myscraper.py will have the name "myscraper".
  • log_dir: The directory this scraper's logs should be saved to. If None, the logs will be written to a folder called "gruel_logs" within the current working directory. A short construction sketch follows below.
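
A construction sketch (the name and directory values are hypothetical; log_dir accepts a str, a pathlib.Path, or a Pathier, and the same arguments apply to any subclass):

from gruel.gruel import Gruel

scraper = Gruel(name="example", log_dir="logs/scrapers")
print(scraper.name)  # "example"
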
name: str

Returns the name given to __init__ or the stem of the file this instance was defined in if one wasn't given.

def get_page(self, url: str, method: str = 'get', headers: dict[str, str] = {}, timeout: int | None = None, retry_on_fail: bool = True, params: dict | None = None, data: dict | None = None) -> requests.models.Response:

Request url and return the requests.Response object.

By default, the only header sent is a randomized user agent string.

This can be overridden by supplying a user agent in the headers param.
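
A usage sketch, assuming a Gruel (or subclass) instance named scraper; the URL, query parameter, and Accept header are placeholders:

# Supplied headers are merged with the randomized user agent
# (a supplied "User-Agent" key would replace it).
response = scraper.get_page(
    "https://example.com/api/jobs",
    headers={"Accept": "application/json"},
    params={"page": 1},
    timeout=10,
)
data = response.json()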

def as_soup(self, response: requests.models.Response) -> bs4.BeautifulSoup:

Returns the text content of response as a BeautifulSoup object.

def get_soup(self, url: str, method: str = 'get', headers: dict[str, str] = {}) -> bs4.BeautifulSoup:

Request url with headers and return BeautifulSoup object.
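
For example (a sketch assuming a scraper instance; the URL and h2 markup are placeholders), with clean_string tidying the extracted text:

soup = scraper.get_soup("https://example.com/jobs")
titles = [scraper.clean_string(h2.text) for h2 in soup.find_all("h2")]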

def clean_string(self, text: str) -> str:

Strip leading and trailing spaces, newlines, tabs, and carriage returns from text.

def prescrape_chores(self):

Chores to do before scraping.

def postscrape_chores(self):

Chores to do after scraping.
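
When overriding this hook, keep the base call so the log file is closed. A sketch (MyScraper and the extra log line are illustrative):

from gruel.gruel import Gruel

class MyScraper(Gruel):
    def postscrape_chores(self):
        # Hypothetical extra teardown for this scraper.
        self.logger.info(f"Stored {self.success_count} items.")
        # The base implementation closes this scraper's log file.
        super().postscrape_chores()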

def get_parsable_items(self) -> list[dict | str | bs4.element.Tag]:

Get relevant webpages and extract raw data that needs to be parsed.

e.g. first 10 results for an endpoint that returns json content

>>> return self.get_page(some_url).json()[:10]
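
A further sketch, collecting and parsing records from several pages of a JSON endpoint; the URL, page range, and record keys ("results", "title", "company") are hypothetical:

from typing import Any

from gruel.gruel import Gruel, ParsableItem


class PagedScraper(Gruel):
    def get_parsable_items(self) -> list[ParsableItem]:
        items: list[ParsableItem] = []
        for page in range(1, 4):
            response = self.get_page(
                "https://example.com/api/jobs", params={"page": page}
            )
            items.extend(response.json()["results"])
        return items

    def parse_item(self, item: dict) -> Any:
        try:
            parsed = {"title": item["title"], "company": item["company"]["name"]}
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Malformed record.")
            self.fail_count += 1
            # Falsey results are skipped; store_item is never called for them.
            return None
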
def parse_item(self, item: dict | str | bs4.element.Tag) -> Any:

Parse item and return parsed data.

e.g.

>>> try:
>>>     parsed = {}
>>>     parsed["thing1"] = item["element"].split()[0]
>>>     self.success_count += 1
>>>     return parsed
>>> except Exception:
>>>     self.logger.exception("message")
>>>     self.fail_count += 1
>>>     return None
def store_item(self, item: Any):

Store item.
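
A storage sketch: append each parsed record to a JSON-lines file. The output path is hypothetical; any sink (database, spreadsheet, etc.) works the same way:

import json
from typing import Any

from gruel.gruel import Gruel

class JsonlScraper(Gruel):
    def store_item(self, item: Any):
        with open("scraped_items.jsonl", "a", encoding="utf-8") as file:
            file.write(json.dumps(item) + "\n")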

def scrape(self, parse_items_prog_bar_display: bool = False):

Run the scraper:

  1. prescrape chores
  2. get parsable items
  3. parse items
  4. store items
  5. postscrape chores
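
A run sketch, reusing the hypothetical JobScraper from the top of this page; parse_items_prog_bar_display=True shows a progress bar while items are parsed and stored:

scraper = JobScraper(log_dir="logs")  # "logs" is an arbitrary directory
scraper.scrape(parse_items_prog_bar_display=True)
print(scraper.results)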