gruel.gruel

import inspect
import time
from typing import Any

import loggi
import requests
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier
from printbuddies import ProgBar
from whosyouragent import get_agent

ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class."""

    def __init__(self, name: str | None = None):
        self._name = name
        self._init_logger()
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0

    @property
    def name(self) -> str:
        """Returns the name given to `__init__`, or, if none was given, the stem of the file this instance was defined in."""
        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    def _init_logger(self):
        log_dir = Pathier.cwd() / "gruel_logs"
        self.logger = loggi.getLogger(self.name, log_dir)

    def get_page(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> requests.Response:
        """Request `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param."""
        try:
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )
        except Exception:
            # Wait a second, then retry once before letting any exception propagate.
            time.sleep(1)
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )

    def as_soup(self, response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return a `BeautifulSoup` object."""
        return self.as_soup(self.get_page(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip `\\n\\r\\t` and whitespace from `text`."""
        return text.strip(" \n\t\r")

    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        loggi.close(self.logger)

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. the first 10 results from an endpoint that returns JSON content
        >>> return self.get_page(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        >>>     parsed = {}
        >>>     parsed["thing1"] = item["element"].split()[0]
        >>>     self.success_count += 1
        >>>     return parsed
        >>> except Exception:
        >>>     self.logger.exception("message")
        >>>     self.fail_count += 1
        >>>     return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def _parse_items_no_prog_bar(self, parsable_items: list[ParsableItem]):
        for item in parsable_items:
            parsed_item = self.parse_item(item)
            if parsed_item:
                self.store_item(parsed_item)

    def _parse_items_prog_bar(self, parsable_items: list[ParsableItem]):
        with ProgBar(len(parsable_items)) as bar:
            for item in parsable_items:
                parsed_item = self.parse_item(item)
                if parsed_item:
                    self.store_item(parsed_item)
                    bar.display(f"{bar.runtime}")

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse items
        4. store items
        5. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                parsable_items = self.get_parsable_items()
                self.logger.info(
                    f"{self.name}:get_parsable_items() returned {len(parsable_items)} items"
                )
            except Exception:
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                if parse_items_prog_bar_display:
                    self._parse_items_prog_bar(parsable_items)
                else:
                    self._parse_items_no_prog_bar(parsable_items)
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        self.postscrape_chores()
class Gruel:

Scraper base class.

Gruel(name: str | None = None)

name: str

Returns the name given to __init__, or, if none was given, the stem of the file this instance was defined in.
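
For example (a minimal sketch; the file name my_scraper.py is hypothetical):

    from gruel.gruel import Gruel

    class MyScraper(Gruel):
        ...

    # An explicit name wins over the fallback.
    assert Gruel(name="custom").name == "custom"
    # With no name, the stem of the defining file is used,
    # e.g. "my_scraper" if MyScraper lives in my_scraper.py.
    print(MyScraper().name)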

def get_page( self, url: str, method: str = 'get', headers: dict[str, str] = {}) -> requests.models.Response:

Request url and return the requests.Response object.

By default, the only header sent is a randomized user agent string.

This can be overridden by supplying a user agent in the headers param.
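
For instance, to pin a specific user agent instead of the randomized default (a sketch; the URL and agent string are placeholders):

    from gruel.gruel import Gruel

    scraper = Gruel("example")
    # The supplied "User-Agent" key wins because `headers` is merged
    # in last: {"User-Agent": get_agent()} | headers.
    response = scraper.get_page(
        "https://example.com", headers={"User-Agent": "my-bot/1.0"}
    )
    print(response.status_code)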

def as_soup(self, response: requests.models.Response) -> bs4.BeautifulSoup:

Returns the text content of response as a BeautifulSoup object.

def get_soup( self, url: str, method: str = 'get', headers: dict[str, str] = {}) -> bs4.BeautifulSoup:

Request url with headers and return a BeautifulSoup object.
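
Typical usage (a sketch; the URL is a placeholder):

    from gruel.gruel import Gruel

    scraper = Gruel("example")
    soup = scraper.get_soup("https://example.com")
    # `soup` is an ordinary BeautifulSoup object.
    for anchor in soup.find_all("a"):
        print(anchor.get("href"))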

def clean_string(self, text: str) -> str:

Strip \n\r\t and whitespace from text.
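
Note that only leading and trailing characters are removed; interior whitespace is left alone:

    from gruel.gruel import Gruel

    scraper = Gruel("example")
    assert scraper.clean_string("\n\t  Hello  world \r\n") == "Hello  world"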

def prescrape_chores(self):

Chores to do before scraping.

def postscrape_chores(self):

Chores to do after scraping.
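
Subclasses that override these hooks should generally chain to the base implementation so the logger still gets closed; a sketch (the `items` attribute is hypothetical):

    from gruel.gruel import Gruel

    class MyScraper(Gruel):
        def prescrape_chores(self):
            # Reset per-run state before each scrape.
            self.items = []

        def postscrape_chores(self):
            # Custom cleanup first, then let the base class close the logger.
            self.logger.info(f"Collected {len(self.items)} items.")
            super().postscrape_chores()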

def get_parsable_items(self) -> list[dict | str | bs4.element.Tag]:

Get relevant webpages and extract raw data that needs to be parsed.

e.g. the first 10 results from an endpoint that returns JSON content

>>> return self.get_page(some_url).json()[:10]
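
An HTML-based variant might instead collect elements from a listing page (a sketch; the URL and selector are assumptions):

    from gruel.gruel import Gruel, ParsableItem

    class MyScraper(Gruel):
        def get_parsable_items(self) -> list[ParsableItem]:
            # Each matching row becomes one raw item handed to parse_item().
            soup = self.get_soup("https://example.com/listings")
            return soup.find_all("tr", class_="listing")
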
def parse_item(self, item: dict | str | bs4.element.Tag) -> Any:

Parse item and return parsed data.

e.g.

>>> try:
>>>     parsed = {}
>>>     parsed["thing1"] = item["element"].split()[0]
>>>     self.success_count += 1
>>>     return parsed
>>> except Exception:
>>>     self.logger.exception("message")
>>>     self.fail_count += 1
>>>     return None

def store_item(self, item: Any):

Store item.
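
A minimal implementation might append each parsed record to a JSON-lines file (a sketch; the file name is an assumption):

    import json

    from gruel.gruel import Gruel

    class MyScraper(Gruel):
        def store_item(self, item: dict):
            # scrape() calls this once for every truthy value
            # returned by parse_item().
            with open("items.jsonl", "a", encoding="utf-8") as f:
                f.write(json.dumps(item) + "\n")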

def scrape(self, parse_items_prog_bar_display: bool = False):

Run the scraper:

  1. prescrape chores
  2. get parsable items
  3. parse items
  4. store items
  5. postscrape chores
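
Putting it together, a subclass only needs to implement the three abstract hooks; logging, timing, and the optional progress bar come from the base class. A sketch against a hypothetical JSON endpoint:

    import json
    from typing import Any

    from gruel.gruel import Gruel, ParsableItem

    class QuoteScraper(Gruel):
        def get_parsable_items(self) -> list[ParsableItem]:
            # Hypothetical endpoint returning a JSON list of records.
            return self.get_page("https://example.com/api/quotes").json()

        def parse_item(self, item: ParsableItem) -> Any:
            try:
                parsed = {"text": self.clean_string(item["text"])}
                self.success_count += 1
                return parsed
            except Exception:
                self.logger.exception("Failed to parse item.")
                self.fail_count += 1
                return None

        def store_item(self, item: Any):
            with open("quotes.jsonl", "a", encoding="utf-8") as f:
                f.write(json.dumps(item) + "\n")

    if __name__ == "__main__":
        QuoteScraper().scrape(parse_items_prog_bar_display=True)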