gruel.brewer

  1import argparse
  2import importlib
  3import importlib.machinery
  4import importlib.util
  5import inspect
  6from types import ModuleType
  7from typing import Any, Sequence, Type
  8
  9import loggi
 10import quickpool
 11from pathier import Pathier, Pathish
 12from younotyou import Matcher, younotyou
 13
 14from gruel.grueler import Gruel
 15
 16
 17class GruelFinder:
 18    """Find and load classes that subclass `Gruel`."""
 19
 20    def __init__(
 21        self,
 22        subgruel_classes: list[str] = ["*"],
 23        file_exclude_patterns: list[str] = [],
 24        scan_path: Pathier | None = None,
 25        file_include_patterns: list[str] = ["*.py"],
 26        recursive: bool = True,
 27        log_dir: Pathish | None = None,
 28    ):
 29        """#### :params:
 30
 31        `subgruel_classes`: A list of class names for scrapers that should be loaded.
 32        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 33        Can use wildcard ('*') patterns for matching.
 34
 35        `file_exclude_patterns`: Files that match these patterns will not be scanned.
 36
 37        `scan_path`: The path to scan for scraper classes.
 38
 39        `file_include_patterns`: Files that match these patterns will be scanned.
 40
 41        `recursive`: Whether the scan should be recursive or not.
 42
 43        `log_dir`: The directory this instance's log should be saved to.
 44        If `None`, it will be saved to the current working directory.
 45
 46        Will find and load all classes in the "scrapers" directory that inherit from `Gruel`
 47        and start with "MySubGruel", but don't contain "Scratch" in the name:
 48        >>> finder = finder(["MySubGruel*"], ["*Scratch*"], "scrapers")
 49        >>> gruels = finder.find()"""
 50        self.subgruel_classes = subgruel_classes
 51        self.file_exclude_patterns = file_exclude_patterns
 52        self.scan_path = scan_path or Pathier.cwd()
 53        self.file_include_patterns = file_include_patterns
 54        self.recursive = recursive
 55        self.logger = loggi.getLogger(
 56            "gruel_finder", Pathier(log_dir) if log_dir else Pathier.cwd()
 57        )
 58
 59    def get_bases(self, object: Any) -> list[Any]:
 60        """Returns a recursive list of all the classes `object` inherits from."""
 61        parents = []
 62        bases = object.__bases__
 63        if not bases:
 64            return parents
 65        for base in bases:
 66            parents.append(base)
 67            parents.extend(self.get_bases(base))
 68        return parents
 69
 70    def is_subgruel(self, object: Any) -> bool:
 71        """Returns whether `object` inherits from `Gruel` somewhere in its ancestory."""
 72        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
 73            return False
 74        return True
 75
 76    def glob_files(self) -> list[Pathier]:
 77        """Search `self.scan_path` for files according to `self.file_include_patterns` and `self.file_exclude_patterns`.
 78
 79        Returns the file list."""
 80        globber = self.scan_path.rglob if self.recursive else self.scan_path.glob
 81        files = [
 82            str(file)
 83            for pattern in self.file_include_patterns
 84            for file in globber(pattern)
 85        ]
 86        files = [
 87            Pathier(file)
 88            for file in younotyou(files, exclude_patterns=self.file_exclude_patterns)
 89        ]
 90        return files
 91
 92    def load_module_from_file(self, file: Pathier) -> ModuleType | None:
 93        """Attempts to load and return a module from `file`."""
 94        module_name = file.stem
 95        try:
 96            module = importlib.machinery.SourceFileLoader(
 97                module_name, str(file)
 98            ).load_module()
 99            self.logger.info(f"Successfully imported `{module_name}` from `{file}`.")
100            return module
101        except Exception as e:
102            self.logger.exception(f"Failed to load `{module_name}` from `{file}`.")
103
104    def strain_for_gruel(self, modules: list[ModuleType]) -> list[Type[Gruel]]:
105        """Searches `modules` for classes that inherit from `Gruel` and are in `self.subgruel_classes`.
106
107        Returns the list of classes."""
108        matcher = Matcher(self.subgruel_classes)
109        return [
110            getattr(module, class_)
111            for module in modules
112            for class_ in dir(module)
113            if class_ in matcher and self.is_subgruel(getattr(module, class_))
114        ]
115
116    def find(self) -> list[Type[Gruel]]:
117        """Run the scan and return `Gruel` subclasses."""
118        files = self.glob_files()
119        modules = []
120        for file in files:
121            if module := self.load_module_from_file(file):
122                modules.append(module)
123        return self.strain_for_gruel(modules)
124
125
class Brewer:
    """Use to do multithreaded execution of a list of scrapers.

    Intended to be used with `Gruel` scrapers, but anything with a `scrape` method can be passed.

    To run any `Gruel` scrapers from the current directory:
    >>> Brewer(GruelFinder().find()).brew()

    The `prescrape_chores` and `postscrape_chores` can be set/overridden like the same methods in `Gruel`.

    When calling the `brew` method they will be executed once before and after all the scrapers have been executed.

    i.e.
    >>> brewer = Brewer(GruelFinder().find())
    >>> brewer.prescrape_chores()
    >>> results = brewer.scrape()
    >>> brewer.postscrape_chores()

    is equivalent to
    >>> results = Brewer(GruelFinder().find()).brew()

    except `brew()` has some logging."""

    def __init__(
        self,
        scrapers: Sequence[Any],
        scraper_args: Sequence[Sequence[Any]] | None = None,
        scraper_kwargs: Sequence[dict[str, Any]] | None = None,
        log_dir: Pathish | None = None,
    ):
        """#### :params:

        `scrapers`: A list of scraper classes to initialize and execute.
        A scraper should not be instantiated before being passed.
        When `Brewer` runs a scraper it will instantiate the object at execution time and call it's `scrape` method.

        `scraper_args`: A list where each element is a list of positional arguments to be passed to the corresponding scraper's `__init__` function.

        `scraper_kwargs`: A list of dictionaries where each dictionary is a set of keyword arguments to be passed to the corresponding scraper's `__init__` function.

        `log_dir`: The directory to store `Brewer` logs in. Defaults to the current working directory.

        e.g.
        >>> class MyGruel(Gruel):
        >>>   def __init__(self, value:int):
        >>>     super().__init__()
        >>>     self.value = value
        >>>
        >>>   def scrape(self)->int:
        >>>     return self.value
        >>>
        >>> num_scrapers = 5
        >>> values = list(range(5))
        >>> brewer = Brewer(
        >>>   [MyGruel]*num_scrapers,
        >>>   [(val,) for val in values]
        >>> )
        >>> results = brewer.brew()
        >>> print(results)
        >>> [0, 1, 2, 3, 4]"""
        self._init_logger(log_dir)
        self.scrapers = scrapers
        num_scrapers = len(self.scrapers)
        # Pad args and kwargs if there aren't any given.
        # Build a distinct list/dict per scraper instead of `[[]] * n`,
        # which would make every element the same shared object.
        self.scraper_args = scraper_args or [[] for _ in range(num_scrapers)]
        self.scraper_kwargs = scraper_kwargs or [{} for _ in range(num_scrapers)]

    def _init_logger(self, log_dir: Pathish | None = None):
        # When Brewer is subclassed, use that file's stem instead of `brewer`
        log_dir = Pathier(log_dir) if log_dir else Pathier.cwd()
        source_file = inspect.getsourcefile(type(self))
        if source_file:
            log_name = Pathier(source_file).stem
        else:
            log_name = Pathier(__file__).stem
        self.logger = loggi.getLogger(log_name, log_dir)

    def prescrape_chores(self):
        """Override to add any tasks to be done before running the scrapers."""
        ...

    def postscrape_chores(self):
        """Override to add any tasks to be done after running the scrapers."""
        ...

    def _prep_scrapers(self) -> list[tuple[Any, Sequence[Any], dict[str, Any]]]:
        # Pair each scraper class with its positional args and kwargs.
        return [
            (scraper, args, kwargs)
            for scraper, args, kwargs in zip(
                self.scrapers, self.scraper_args, self.scraper_kwargs
            )
        ]

    def scrape(self) -> list[Any]:
        """Run the `scrape()` method for each scraper in `scrapers`.

        Execution is multithreaded."""

        def execute(scraper, args, kwargs):
            # Instantiate at execution time, then run the scraper.
            return scraper(*args, **kwargs).scrape()

        pool = quickpool.ThreadPool(
            [execute] * len(self.scrapers), self._prep_scrapers()
        )
        return pool.execute()

    def brew(self) -> list[Any] | None:
        """Execute pipeline.

        1. self.prescrape_chores()
        2. self.scrape()
        3. self.postscrape_chores()

        Returns the scrape results, or `None` if an exception occurred."""

        try:
            self.logger.logprint("Beginning brew")
            # 1--------------------------------------------
            self.logger.logprint("Executing prescrape chores")
            self.prescrape_chores()
            # 2--------------------------------------------
            self.logger.logprint("Starting scrape")
            results = self.scrape()
            self.logger.logprint("Scrape complete")
            # 3--------------------------------------------
            self.logger.logprint("Executing postscrape chores")
            self.postscrape_chores()
            self.logger.logprint("Brew complete")
            return results
        except Exception as e:
            print(e)
            self.logger.exception("Exception occurred during brew():")
255
256
def get_args() -> argparse.Namespace:
    """Build the `Brewer` command line interface and parse `sys.argv`.

    Returns the parsed namespace with `path` converted to a `Pathier` instance."""
    parser = argparse.ArgumentParser(
        prog="Brewer", description="Invoke `Brewer` from the command line."
    )
    # (flags, options) pairs, added to the parser in declaration order.
    argument_specs: list[tuple[tuple[str, ...], dict[str, Any]]] = [
        (
            ("subgruel_classes",),
            {
                "type": str,
                "nargs": "*",
                "help": """ A list of Gruel scraper class names to find and import. """,
            },
        ),
        (
            ("-e", "--excludes"),
            {
                "type": str,
                "nargs": "*",
                "default": [],
                "help": """ A list of glob style file patterns to exclude from the scan. """,
            },
        ),
        (
            ("-i", "--includes"),
            {
                "type": str,
                "nargs": "*",
                "default": ["*.py"],
                "help": """ A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
            },
        ),
        (
            ("-p", "--path"),
            {
                "type": str,
                "default": Pathier.cwd(),
                "help": """ The directory path to scan. Defaults to the current working directory. """,
            },
        ),
        (
            ("-r", "--recursive"),
            {
                "action": "store_true",
                "help": """ Whether -p/--path should be scanned recursively or not. """,
            },
        ),
        (
            ("-l", "--log_dir"),
            {
                "type": str,
                "default": None,
                "help": """ The directory to save the brew log to.""",
            },
        ),
    ]
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)
    args = parser.parse_args()
    # Normalize the scan path to a `Pathier` object.
    args.path = Pathier(args.path)
    return args
308
309
def main(args: argparse.Namespace | None = None):
    """Find `Gruel` subclasses with `GruelFinder`, then execute them with `Brewer`.

    #### :params:

    `args`: Parsed command line arguments. When `None`, they are parsed from `sys.argv`."""
    if not args:
        args = get_args()
    finder = GruelFinder(
        args.subgruel_classes,
        args.excludes,
        args.path,
        args.includes,
        args.recursive,
        args.log_dir,
    )
    # Pass `log_dir` by keyword: previously it was passed positionally and
    # `Brewer` received it as `scraper_args` instead of the log directory.
    brewer = Brewer(
        finder.find(),
        log_dir=args.log_dir,
    )
    brewer.brew()


if __name__ == "__main__":
    main(get_args())
class GruelFinder:
 18class GruelFinder:
 19    """Find and load classes that subclass `Gruel`."""
 20
 21    def __init__(
 22        self,
 23        subgruel_classes: list[str] = ["*"],
 24        file_exclude_patterns: list[str] = [],
 25        scan_path: Pathier | None = None,
 26        file_include_patterns: list[str] = ["*.py"],
 27        recursive: bool = True,
 28        log_dir: Pathish | None = None,
 29    ):
 30        """#### :params:
 31
 32        `subgruel_classes`: A list of class names for scrapers that should be loaded.
 33        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 34        Can use wildcard ('*') patterns for matching.
 35
 36        `file_exclude_patterns`: Files that match these patterns will not be scanned.
 37
 38        `scan_path`: The path to scan for scraper classes.
 39
 40        `file_include_patterns`: Files that match these patterns will be scanned.
 41
 42        `recursive`: Whether the scan should be recursive or not.
 43
 44        `log_dir`: The directory this instance's log should be saved to.
 45        If `None`, it will be saved to the current working directory.
 46
 47        Will find and load all classes in the "scrapers" directory that inherit from `Gruel`
 48        and start with "MySubGruel", but don't contain "Scratch" in the name:
 49        >>> finder = finder(["MySubGruel*"], ["*Scratch*"], "scrapers")
 50        >>> gruels = finder.find()"""
 51        self.subgruel_classes = subgruel_classes
 52        self.file_exclude_patterns = file_exclude_patterns
 53        self.scan_path = scan_path or Pathier.cwd()
 54        self.file_include_patterns = file_include_patterns
 55        self.recursive = recursive
 56        self.logger = loggi.getLogger(
 57            "gruel_finder", Pathier(log_dir) if log_dir else Pathier.cwd()
 58        )
 59
 60    def get_bases(self, object: Any) -> list[Any]:
 61        """Returns a recursive list of all the classes `object` inherits from."""
 62        parents = []
 63        bases = object.__bases__
 64        if not bases:
 65            return parents
 66        for base in bases:
 67            parents.append(base)
 68            parents.extend(self.get_bases(base))
 69        return parents
 70
 71    def is_subgruel(self, object: Any) -> bool:
 72        """Returns whether `object` inherits from `Gruel` somewhere in its ancestory."""
 73        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
 74            return False
 75        return True
 76
 77    def glob_files(self) -> list[Pathier]:
 78        """Search `self.scan_path` for files according to `self.file_include_patterns` and `self.file_exclude_patterns`.
 79
 80        Returns the file list."""
 81        globber = self.scan_path.rglob if self.recursive else self.scan_path.glob
 82        files = [
 83            str(file)
 84            for pattern in self.file_include_patterns
 85            for file in globber(pattern)
 86        ]
 87        files = [
 88            Pathier(file)
 89            for file in younotyou(files, exclude_patterns=self.file_exclude_patterns)
 90        ]
 91        return files
 92
 93    def load_module_from_file(self, file: Pathier) -> ModuleType | None:
 94        """Attempts to load and return a module from `file`."""
 95        module_name = file.stem
 96        try:
 97            module = importlib.machinery.SourceFileLoader(
 98                module_name, str(file)
 99            ).load_module()
100            self.logger.info(f"Successfully imported `{module_name}` from `{file}`.")
101            return module
102        except Exception as e:
103            self.logger.exception(f"Failed to load `{module_name}` from `{file}`.")
104
105    def strain_for_gruel(self, modules: list[ModuleType]) -> list[Type[Gruel]]:
106        """Searches `modules` for classes that inherit from `Gruel` and are in `self.subgruel_classes`.
107
108        Returns the list of classes."""
109        matcher = Matcher(self.subgruel_classes)
110        return [
111            getattr(module, class_)
112            for module in modules
113            for class_ in dir(module)
114            if class_ in matcher and self.is_subgruel(getattr(module, class_))
115        ]
116
117    def find(self) -> list[Type[Gruel]]:
118        """Run the scan and return `Gruel` subclasses."""
119        files = self.glob_files()
120        modules = []
121        for file in files:
122            if module := self.load_module_from_file(file):
123                modules.append(module)
124        return self.strain_for_gruel(modules)

Find and load classes that subclass Gruel.

GruelFinder( subgruel_classes: list[str] = ['*'], file_exclude_patterns: list[str] = [], scan_path: pathier.pathier.Pathier | None = None, file_include_patterns: list[str] = ['*.py'], recursive: bool = True, log_dir: pathier.pathier.Pathier | pathlib.Path | str | None = None)
21    def __init__(
22        self,
23        subgruel_classes: list[str] = ["*"],
24        file_exclude_patterns: list[str] = [],
25        scan_path: Pathier | None = None,
26        file_include_patterns: list[str] = ["*.py"],
27        recursive: bool = True,
28        log_dir: Pathish | None = None,
29    ):
30        """#### :params:
31
32        `subgruel_classes`: A list of class names for scrapers that should be loaded.
33        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
34        Can use wildcard ('*') patterns for matching.
35
36        `file_exclude_patterns`: Files that match these patterns will not be scanned.
37
38        `scan_path`: The path to scan for scraper classes.
39
40        `file_include_patterns`: Files that match these patterns will be scanned.
41
42        `recursive`: Whether the scan should be recursive or not.
43
44        `log_dir`: The directory this instance's log should be saved to.
45        If `None`, it will be saved to the current working directory.
46
47        Will find and load all classes in the "scrapers" directory that inherit from `Gruel`
48        and start with "MySubGruel", but don't contain "Scratch" in the name:
49        >>> finder = finder(["MySubGruel*"], ["*Scratch*"], "scrapers")
50        >>> gruels = finder.find()"""
51        self.subgruel_classes = subgruel_classes
52        self.file_exclude_patterns = file_exclude_patterns
53        self.scan_path = scan_path or Pathier.cwd()
54        self.file_include_patterns = file_include_patterns
55        self.recursive = recursive
56        self.logger = loggi.getLogger(
57            "gruel_finder", Pathier(log_dir) if log_dir else Pathier.cwd()
58        )

:params:

subgruel_classes: A list of class names for scrapers that should be loaded. In order to be loaded, a scraper class must have a name in this list and have Gruel somewhere in its inheritance hierarchy. Can use wildcard ('*') patterns for matching.

file_exclude_patterns: Files that match these patterns will not be scanned.

scan_path: The path to scan for scraper classes.

file_include_patterns: Files that match these patterns will be scanned.

recursive: Whether the scan should be recursive or not.

log_dir: The directory this instance's log should be saved to. If None, it will be saved to the current working directory.

Will find and load all classes in the "scrapers" directory that inherit from Gruel and start with "MySubGruel", but don't contain "Scratch" in the name:

>>> finder = finder(["MySubGruel*"], ["*Scratch*"], "scrapers")
>>> gruels = finder.find()
def get_bases(self, object: Any) -> list[typing.Any]:
60    def get_bases(self, object: Any) -> list[Any]:
61        """Returns a recursive list of all the classes `object` inherits from."""
62        parents = []
63        bases = object.__bases__
64        if not bases:
65            return parents
66        for base in bases:
67            parents.append(base)
68            parents.extend(self.get_bases(base))
69        return parents

Returns a recursive list of all the classes object inherits from.

def is_subgruel(self, object: Any) -> bool:
71    def is_subgruel(self, object: Any) -> bool:
72        """Returns whether `object` inherits from `Gruel` somewhere in its ancestory."""
73        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
74            return False
75        return True

Returns whether object inherits from Gruel somewhere in its ancestry.

def glob_files(self) -> list[pathier.pathier.Pathier]:
77    def glob_files(self) -> list[Pathier]:
78        """Search `self.scan_path` for files according to `self.file_include_patterns` and `self.file_exclude_patterns`.
79
80        Returns the file list."""
81        globber = self.scan_path.rglob if self.recursive else self.scan_path.glob
82        files = [
83            str(file)
84            for pattern in self.file_include_patterns
85            for file in globber(pattern)
86        ]
87        files = [
88            Pathier(file)
89            for file in younotyou(files, exclude_patterns=self.file_exclude_patterns)
90        ]
91        return files

Search self.scan_path for files according to self.file_include_patterns and self.file_exclude_patterns.

Returns the file list.

def load_module_from_file(self, file: pathier.pathier.Pathier) -> module | None:
 93    def load_module_from_file(self, file: Pathier) -> ModuleType | None:
 94        """Attempts to load and return a module from `file`."""
 95        module_name = file.stem
 96        try:
 97            module = importlib.machinery.SourceFileLoader(
 98                module_name, str(file)
 99            ).load_module()
100            self.logger.info(f"Successfully imported `{module_name}` from `{file}`.")
101            return module
102        except Exception as e:
103            self.logger.exception(f"Failed to load `{module_name}` from `{file}`.")

Attempts to load and return a module from file.

def strain_for_gruel(self, modules: list[module]) -> list[typing.Type[gruel.grueler.Gruel]]:
105    def strain_for_gruel(self, modules: list[ModuleType]) -> list[Type[Gruel]]:
106        """Searches `modules` for classes that inherit from `Gruel` and are in `self.subgruel_classes`.
107
108        Returns the list of classes."""
109        matcher = Matcher(self.subgruel_classes)
110        return [
111            getattr(module, class_)
112            for module in modules
113            for class_ in dir(module)
114            if class_ in matcher and self.is_subgruel(getattr(module, class_))
115        ]

Searches modules for classes that inherit from Gruel and are in self.subgruel_classes.

Returns the list of classes.

def find(self) -> list[typing.Type[gruel.grueler.Gruel]]:
117    def find(self) -> list[Type[Gruel]]:
118        """Run the scan and return `Gruel` subclasses."""
119        files = self.glob_files()
120        modules = []
121        for file in files:
122            if module := self.load_module_from_file(file):
123                modules.append(module)
124        return self.strain_for_gruel(modules)

Run the scan and return Gruel subclasses.

class Brewer:
127class Brewer:
128    """Use to do multithreaded execution of a list of scrapers.
129
130    Intended to be used with `Gruel` scrapers, but anything with a `scrape` method can be passed.
131
132    To run any `Gruel` scrapers from the current directory:
133    >>> Brewer(GruelFinder().find()).brew()
134
135    The `prescrape_chores` and `postscrape_chores` can be set/overridden like the same methods in `Gruel`.
136
137    When calling the `brew` method they will be executed once before and after all the scrapers have been executed.
138
139    i.e.
140    >>> brewer = Brewer(GruelFinder().find())
141    >>> brewer.prescrape_chores()
142    >>> results = brewer.scrape()
143    >>> brewer.postscrape_chores()
144
145    is equivalent to
146    >>> results = Brewer(GruelFinder().find()).brew()
147
148    except `brew()` has some logging."""
149
150    def __init__(
151        self,
152        scrapers: Sequence[Any],
153        scraper_args: Sequence[Sequence[Any]] = [],
154        scraper_kwargs: Sequence[dict[str, Any]] = [],
155        log_dir: Pathish | None = None,
156    ):
157        """#### :params:
158
159        `scrapers`: A list of scraper classes to initialize and execute.
160        A scraper should not be instantiated before being passed.
161        When `Brewer` runs a scraper it will instantiate the object at execution time and call it's `scrape` method.
162
163        `scraper_args`: A list where each element is a list of positional arguments to be passed to the corresponding scraper's `__init__` function.
164
165        `scraper_kwargs`: A list of dictionaries where each dictionary is a set of keyword arguments to be passed to the corresponding scraper's `__init__` function.
166
167        `log_dir`: The directory to store `Brewer` logs in. Defaults to the current working directory.
168
169        e.g.
170        >>> class MyGruel(Gruel):
171        >>>   def __init__(self, value:int):
172        >>>     super().__init__()
173        >>>     self.value = value
174        >>>
175        >>>   def scrape(self)->int:
176        >>>     return self.value
177        >>>
178        >>> num_scrapers = 5
179        >>> values = list(range(5))
180        >>> brewer = Brewer(
181        >>>   [MyGruel]*num_scrapers,
182        >>>   [(val,) for val in values]
183        >>> results = brewer.brew()
184        >>> print(results)
185        >>> [0, 1, 2, 3, 4]"""
186        self._init_logger(log_dir)
187        self.scrapers = scrapers
188        num_scrapers = len(self.scrapers)
189        # Pad args and kwargs if there aren't any given
190        self.scraper_args = scraper_args or [[]] * num_scrapers
191        self.scraper_kwargs = scraper_kwargs or [{}] * num_scrapers
192
193    def _init_logger(self, log_dir: Pathish | None = None):
194        # When Brewer is subclassed, use that file's stem instead of `brewer`
195        log_dir = Pathier(log_dir) if log_dir else Pathier.cwd()
196        source_file = inspect.getsourcefile(type(self))
197        if source_file:
198            log_name = Pathier(source_file).stem
199        else:
200            log_name = Pathier(__file__).stem
201        self.logger = loggi.getLogger(log_name, log_dir)
202
203    def prescrape_chores(self):
204        """Override to add any tasks to be done before running the scrapers."""
205        ...
206
207    def postscrape_chores(self):
208        """Override to add any tasks to be done after running the scrapers."""
209        ...
210
211    def _prep_scrapers(self) -> list[tuple[Any, Sequence[Any], dict[str, Any]]]:
212        return [
213            (scraper, args, kwargs)
214            for scraper, args, kwargs in zip(
215                self.scrapers, self.scraper_args, self.scraper_kwargs
216            )
217        ]
218
219    def scrape(self) -> list[Any]:
220        """Run the `scrape()` method for each scraper in `scrapers`.
221
222        Execution is multithreaded."""
223
224        def execute(scraper, args, kwargs):
225            return scraper(*args, **kwargs).scrape()
226
227        pool = quickpool.ThreadPool(
228            [execute] * len(self.scrapers), self._prep_scrapers()
229        )
230        return pool.execute()
231
232    def brew(self) -> list[Any] | None:
233        """Execute pipeline.
234
235        1. self.prescrape_chores()
236        2. self.scrape()
237        3. self.postscrape_chores()"""
238
239        try:
240            self.logger.logprint("Beginning brew")
241            # 1--------------------------------------------
242            self.logger.logprint("Executing prescrape chores")
243            self.prescrape_chores()
244            # 2--------------------------------------------
245            self.logger.logprint("Starting scrape")
246            results = self.scrape()
247            self.logger.logprint("Scrape complete")
248            # 4--------------------------------------------
249            self.logger.logprint("Executing postscrape chores")
250            self.postscrape_chores()
251            self.logger.logprint("Brew complete")
252            return results
253        except Exception as e:
254            print(e)
255            self.logger.exception("Exception occured during brew():")

Use to do multithreaded execution of a list of scrapers.

Intended to be used with Gruel scrapers, but anything with a scrape method can be passed.

To run any Gruel scrapers from the current directory:

>>> Brewer(GruelFinder().find()).brew()

The prescrape_chores and postscrape_chores can be set/overridden like the same methods in Gruel.

When calling the brew method they will be executed once before and after all the scrapers have been executed.

i.e.

>>> brewer = Brewer(GruelFinder().find())
>>> brewer.prescrape_chores()
>>> results = brewer.scrape()
>>> brewer.postscrape_chores()

is equivalent to

>>> results = Brewer(GruelFinder().find()).brew()

except brew() has some logging.

Brewer( scrapers: Sequence[Any], scraper_args: Sequence[Sequence[Any]] = [], scraper_kwargs: Sequence[dict[str, Any]] = [], log_dir: pathier.pathier.Pathier | pathlib.Path | str | None = None)
150    def __init__(
151        self,
152        scrapers: Sequence[Any],
153        scraper_args: Sequence[Sequence[Any]] = [],
154        scraper_kwargs: Sequence[dict[str, Any]] = [],
155        log_dir: Pathish | None = None,
156    ):
157        """#### :params:
158
159        `scrapers`: A list of scraper classes to initialize and execute.
160        A scraper should not be instantiated before being passed.
161        When `Brewer` runs a scraper it will instantiate the object at execution time and call it's `scrape` method.
162
163        `scraper_args`: A list where each element is a list of positional arguments to be passed to the corresponding scraper's `__init__` function.
164
165        `scraper_kwargs`: A list of dictionaries where each dictionary is a set of keyword arguments to be passed to the corresponding scraper's `__init__` function.
166
167        `log_dir`: The directory to store `Brewer` logs in. Defaults to the current working directory.
168
169        e.g.
170        >>> class MyGruel(Gruel):
171        >>>   def __init__(self, value:int):
172        >>>     super().__init__()
173        >>>     self.value = value
174        >>>
175        >>>   def scrape(self)->int:
176        >>>     return self.value
177        >>>
178        >>> num_scrapers = 5
179        >>> values = list(range(5))
180        >>> brewer = Brewer(
181        >>>   [MyGruel]*num_scrapers,
182        >>>   [(val,) for val in values]
183        >>> results = brewer.brew()
184        >>> print(results)
185        >>> [0, 1, 2, 3, 4]"""
186        self._init_logger(log_dir)
187        self.scrapers = scrapers
188        num_scrapers = len(self.scrapers)
189        # Pad args and kwargs if there aren't any given
190        self.scraper_args = scraper_args or [[]] * num_scrapers
191        self.scraper_kwargs = scraper_kwargs or [{}] * num_scrapers

:params:

scrapers: A list of scraper classes to initialize and execute. A scraper should not be instantiated before being passed. When Brewer runs a scraper it will instantiate the object at execution time and call it's scrape method.

scraper_args: A list where each element is a list of positional arguments to be passed to the corresponding scraper's __init__ function.

scraper_kwargs: A list of dictionaries where each dictionary is a set of keyword arguments to be passed to the corresponding scraper's __init__ function.

log_dir: The directory to store Brewer logs in. Defaults to the current working directory.

e.g.

>>> class MyGruel(Gruel):
>>>   def __init__(self, value:int):
>>>     super().__init__()
>>>     self.value = value
>>>
>>>   def scrape(self)->int:
>>>     return self.value
>>>
>>> num_scrapers = 5
>>> values = list(range(5))
>>> brewer = Brewer(
>>>   [MyGruel]*num_scrapers,
>>>   [(val,) for val in values]
>>> )
>>> results = brewer.brew()
>>> print(results)
>>> [0, 1, 2, 3, 4]
def prescrape_chores(self):
203    def prescrape_chores(self):
204        """Override to add any tasks to be done before running the scrapers."""
205        ...

Override to add any tasks to be done before running the scrapers.

def postscrape_chores(self):
207    def postscrape_chores(self):
208        """Override to add any tasks to be done after running the scrapers."""
209        ...

Override to add any tasks to be done after running the scrapers.

def scrape(self) -> list[typing.Any]:
219    def scrape(self) -> list[Any]:
220        """Run the `scrape()` method for each scraper in `scrapers`.
221
222        Execution is multithreaded."""
223
224        def execute(scraper, args, kwargs):
225            return scraper(*args, **kwargs).scrape()
226
227        pool = quickpool.ThreadPool(
228            [execute] * len(self.scrapers), self._prep_scrapers()
229        )
230        return pool.execute()

Run the scrape() method for each scraper in scrapers.

Execution is multithreaded.

def brew(self) -> list[typing.Any] | None:
232    def brew(self) -> list[Any] | None:
233        """Execute pipeline.
234
235        1. self.prescrape_chores()
236        2. self.scrape()
237        3. self.postscrape_chores()"""
238
239        try:
240            self.logger.logprint("Beginning brew")
241            # 1--------------------------------------------
242            self.logger.logprint("Executing prescrape chores")
243            self.prescrape_chores()
244            # 2--------------------------------------------
245            self.logger.logprint("Starting scrape")
246            results = self.scrape()
247            self.logger.logprint("Scrape complete")
248            # 4--------------------------------------------
249            self.logger.logprint("Executing postscrape chores")
250            self.postscrape_chores()
251            self.logger.logprint("Brew complete")
252            return results
253        except Exception as e:
254            print(e)
255            self.logger.exception("Exception occured during brew():")

Execute pipeline.

  1. self.prescrape_chores()
  2. self.scrape()
  3. self.postscrape_chores()
def get_args() -> argparse.Namespace:
    """Build the `Brewer` CLI parser and return the parsed arguments.

    `args.path` is converted to a `Pathier` before returning."""
    parser = argparse.ArgumentParser(
        prog="Brewer", description="Invoke `Brewer` from the command line."
    )
    # Positional: names of the Gruel scraper classes to load.
    parser.add_argument(
        "subgruel_classes",
        type=str,
        nargs="*",
        help=""" A list of Gruel scraper class names to find and import. """,
    )
    # Optional flags, declared as (flags, options) pairs and registered in order.
    optional_arguments = [
        (
            ("-e", "--excludes"),
            {
                "type": str,
                "nargs": "*",
                "default": [],
                "help": """ A list of glob style file patterns to exclude from the scan. """,
            },
        ),
        (
            ("-i", "--includes"),
            {
                "type": str,
                "nargs": "*",
                "default": ["*.py"],
                "help": """ A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
            },
        ),
        (
            ("-p", "--path"),
            {
                "type": str,
                "default": Pathier.cwd(),
                "help": """ The directory path to scan. Defaults to the current working directory. """,
            },
        ),
        (
            ("-r", "--recursive"),
            {
                "action": "store_true",
                "help": """ Whether -p/--path should be scanned recursively or not. """,
            },
        ),
        (
            ("-l", "--log_dir"),
            {
                "type": str,
                "default": None,
                "help": """ The directory to save the brew log to.""",
            },
        ),
    ]
    for flags, options in optional_arguments:
        parser.add_argument(*flags, **options)
    args = parser.parse_args()
    args.path = Pathier(args.path)
    return args
def main(args: argparse.Namespace | None = None):
    """Find `Gruel` subclasses per `args` and brew them.

    :param args: Parsed CLI arguments; when `None`, they are parsed from
    the command line via `get_args()`."""
    if args is None:
        args = get_args()
    finder = GruelFinder(
        args.subgruel_classes,
        args.excludes,
        args.path,
        args.includes,
        args.recursive,
        args.log_dir,
    )
    # BUG FIX: `args.log_dir` was previously passed as the second positional
    # argument, where `Brewer.__init__` expects `scraper_args` — the log
    # directory was ignored and a non-None value corrupted the scraper args.
    # Pass it by keyword instead.
    brewer = Brewer(finder.find(), log_dir=args.log_dir)
    brewer.brew()