gruel.brewer

import argparse
import importlib
import importlib.machinery
import importlib.util
import inspect
from typing import Any

import loggi
import quickpool
from pathier import Pathier, Pathish
from younotyou import younotyou

from gruel import Gruel


class Brewer:
    def __init__(
        self,
        subgruel_classes: list[str],
        file_exclude_patterns: list[str] = [],
        scan_path: Pathish = Pathier.cwd(),
        file_include_patterns: list[str] = ["*.py"],
        recursive: bool = True,
    ):
        """Run `Gruel` scrapers.

        #### :params:

        `subgruel_classes`: A list of class names for scrapers that should be loaded.
        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.

        `file_exclude_patterns`: Files that match these patterns will not be scanned.

        `scan_path`: The path to scan for scraper classes.

        `file_include_patterns`: Files that match these patterns will be scanned.

        `recursive`: Whether the scan should be recursive or not.

        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
        >>> brewer.brew()"""
        self._init_logger()
        self.subgruel_classes = subgruel_classes
        self.file_exclude_patterns = file_exclude_patterns
        self.file_include_patterns = file_include_patterns
        self.scan_path = Pathier(scan_path)
        self.recursive = recursive

    def _init_logger(self):
        # When `Brewer` is subclassed, use that file's stem instead of `brewer`.
        source_file = inspect.getsourcefile(type(self))
        if source_file:
            log_name = Pathier(source_file).stem
        else:
            log_name = Pathier(__file__).stem
        self.logger = loggi.getLogger(log_name)

    def load_scrapers(self) -> list[type[Gruel]]:
        """Load and return scraper classes that inherit from `Gruel`.

        Files under `self.scan_path` that match `self.file_include_patterns`,
        but none of `self.file_exclude_patterns`, are scanned for classes whose
        names appear in `self.subgruel_classes`.

        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.

        >>> Brewer(["VenueScraper"], ["*template*"], "getToTheGig/scrapers").load_scrapers()"""
        globber = self.scan_path.glob
        if self.recursive:
            globber = self.scan_path.rglob
        files = [
            str(file)
            for pattern in self.file_include_patterns
            for file in globber(pattern)
        ]
        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
        self.modules = {}
        self._module_names = []
        for file in files:
            module_name = Pathier(file).stem
            try:
                # `SourceFileLoader.load_module()` is deprecated (removed in
                # Python 3.12), so build the module from a spec and execute it.
                loader = importlib.machinery.SourceFileLoader(module_name, file)
                spec = importlib.util.spec_from_loader(module_name, loader)
                assert spec
                module = importlib.util.module_from_spec(spec)
                loader.exec_module(module)
            except Exception:
                self.logger.exception(
                    f"Failed to load module '{module_name}' from '{file}'."
                )
            else:
                self._module_names.append(module_name)
                self.modules[module_name] = module
        gruels = [
            getattr(module, class_)
            for module in self.modules.values()
            for class_ in self.subgruel_classes
            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
        ]
        self.logger.info(
            "\n".join(
                [f"Imported {len(gruels)} scrapers:"]
                + [str(gruel) for gruel in gruels]
            )
        )
        return gruels

    def pop_modules(self):
        """Unload modules."""
        # `del` on a loop variable only unbinds the local name, so drop the
        # module references by clearing the containers instead.
        self.modules = {}
        self._module_names = []

    def get_bases(self, object: Any) -> list[Any]:
        """Returns a recursive list of all the classes `object` inherits from."""
        parents = []
        bases = object.__bases__
        if not bases:
            return parents
        for base in bases:
            parents.append(base)
            parents.extend(self.get_bases(base))
        return parents

    def is_subgruel(self, object: Any) -> bool:
        """Returns whether `object` inherits from `Gruel` somewhere in its ancestry."""
        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
            return False
        return True

    def prescrape_chores(self):
        """Override to add any tasks to be done before running the scrapers."""
        ...

    def postscrape_chores(self):
        """Override to add any tasks to be done after running the scrapers."""
        self.pop_modules()

    def scrape(self, scrapers: list[type[Gruel]]):
        """Instantiate each scraper class in `scrapers` and run its `scrape()` method.

        Execution is multithreaded."""
        pool = quickpool.ThreadPool([scraper().scrape for scraper in scrapers])
        pool.execute()

    def logprint(self, message: str):
        """Log and print `message`."""
        self.logger.info(message)
        print(message)

    def brew(self):
        """Execute the pipeline:

        1. self.prescrape_chores()
        2. self.load_scrapers()
        3. self.scrape()
        4. self.postscrape_chores()"""

        try:
            self.logprint("Beginning brew")
            # 1--------------------------------------------
            self.logprint("Executing prescrape chores")
            self.prescrape_chores()
            # 2--------------------------------------------
            self.logprint("Loading scrapers")
            scrapers = self.load_scrapers()
            print(f"Loaded {len(scrapers)} scrapers")
            # 3--------------------------------------------
            self.logprint("Starting scrape")
            self.scrape(scrapers)
            self.logprint("Scrape complete")
            # 4--------------------------------------------
            self.logprint("Executing postscrape chores")
            self.postscrape_chores()
            self.logprint("Brew complete")
        except Exception as e:
            print(e)
            self.logger.exception("Exception occurred during brew():")


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "subgruel_classes",
        type=str,
        nargs="*",
        help=""" A list of Gruel scraper class names to find and import. """,
    )
    parser.add_argument(
        "-e",
        "--excludes",
        type=str,
        nargs="*",
        default=[],
        help=""" A list of glob style file patterns to exclude from the scan. """,
    )
    parser.add_argument(
        "-i",
        "--includes",
        type=str,
        nargs="*",
        default=["*.py"],
        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
    )
    parser.add_argument(
        "-p",
        "--path",
        type=str,
        default=Pathier.cwd(),
        help=""" The directory path to scan. Defaults to the current working directory. """,
    )
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Whether -p/--path should be scanned recursively or not. """,
    )
    args = parser.parse_args()
    args.path = Pathier(args.path)

    return args


def main(args: argparse.Namespace | None = None):
    if not args:
        args = get_args()
    brewer = Brewer(
        args.subgruel_classes, args.excludes, args.path, args.includes, args.recursive
    )
    brewer.brew()


if __name__ == "__main__":
    main(get_args())
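
Putting the pieces together: a minimal usage sketch, assuming a `scrapers/` directory that defines a `VenueScraper(Gruel)` class as in the docstring examples above.

from gruel.brewer import Brewer

# Find any class named `VenueScraper` under `scrapers/`, skipping template
# files, then instantiate each match and run its `scrape()` in a thread pool.
brewer = Brewer(
    ["VenueScraper"],
    file_exclude_patterns=["*template*"],
    scan_path="scrapers",
)
brewer.brew()

The same run works from the command line via `main()`, along the lines of `python -m gruel.brewer VenueScraper -p scrapers -e "*template*" -r` (assuming the package layout makes the module runnable with `-m`).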
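
Because `prescrape_chores()` and `postscrape_chores()` are explicit override points, per-run setup and teardown belong in a `Brewer` subclass. Below is a sketch under the same assumptions (the output directory and summary message are hypothetical); note that `_init_logger()` will name the log after the subclass's source file rather than `brewer`.

from pathier import Pathier

from gruel.brewer import Brewer


class VenueBrewer(Brewer):
    def prescrape_chores(self):
        # Hypothetical setup: make sure an output directory exists.
        Pathier("scraped_data").mkdir(parents=True, exist_ok=True)

    def postscrape_chores(self):
        # Log a summary first, then keep the base behavior of unloading the
        # scraper modules (which clears `_module_names`).
        self.logprint(f"Ran scrapers from: {', '.join(self._module_names)}")
        super().postscrape_chores()


if __name__ == "__main__":
    VenueBrewer(["VenueScraper"], scan_path="scrapers").brew()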