gruel.brewer

  1import argparse
  2import importlib
  3import importlib.machinery
  4import importlib.util
  5import inspect
  6from typing import Any
  7
  8import loggi
  9import quickpool
 10from pathier import Pathier, Pathish
 11from younotyou import younotyou
 12
 13from gruel import Gruel
 14
 15
 16class Brewer:
 17    def __init__(
 18        self,
 19        subgruel_classes: list[str],
 20        file_exclude_patterns: list[str] = [],
 21        scan_path: Pathish = Pathier.cwd(),
 22        file_include_patterns: list[str] = ["*.py"],
 23        recursive: bool = True,
 24    ):
 25        """Run `Gruel` scrapers.
 26
 27        #### :params:
 28
 29        `subgruel_classes`: A list of class names for scrapers that should be loaded.
 30        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 31
 32        `file_exclude_patterns`: Files that match these patterns will not be scanned.
 33
 34        `scan_path`: The path to scan for scraper classes.
 35
 36        `file_include_patterns`: Files that match these patterns will be scanned.
 37
 38        `recursive`: Whether the scan should be recursive or not.
 39
 40        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
 41        >>> brewer.brew()"""
 42        self._init_logger()
 43        self.subgruel_classes = subgruel_classes
 44        self.file_exclude_patterns = file_exclude_patterns
 45        self.file_include_patterns = file_include_patterns
 46        self.scan_path = Pathier(scan_path)
 47        self.recursive = recursive
 48
 49    def _init_logger(self):
 50        # When Brewer is subclassed, use that file's stem instead of `brewer`
 51        source_file = inspect.getsourcefile(type(self))
 52        if source_file:
 53            log_name = Pathier(source_file).stem
 54        else:
 55            log_name = Pathier(__file__).stem
 56        self.logger = loggi.getLogger(log_name)
 57
 58    def load_scrapers(self) -> list[Gruel]:
 59        """Load scraper classes that inherit from `Gruel`.
 60
 61        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.
 62
 63        #### :params:
 64
 65        `directory`: The path to scan for scraper classes.
 66
 67        `class_names`: A list of class names for scrapers that should be loaded.
 68        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 69
 70        `include_patterns`: Files that match these patterns will be scanned.
 71
 72        `exclude_patterns`: Files that match these patterns will not be scanned.
 73
 74        `recursive`: Whether the search should be recursive or not.
 75
 76        >>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])
 77        """
 78        globber = self.scan_path.glob
 79        if self.recursive:
 80            globber = self.scan_path.rglob
 81        files = [
 82            str(file)
 83            for pattern in self.file_include_patterns
 84            for file in globber(pattern)
 85        ]
 86        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
 87        self.modules = {}
 88        self._module_names = []
 89        for file in files:
 90            module_name = Pathier(file).stem
 91            try:
 92                module = importlib.machinery.SourceFileLoader(
 93                    module_name, file
 94                ).load_module()
 95            except Exception as e:
 96                self.logger.exception(
 97                    f"Failed to load module '{module_name}' from '{file}'."
 98                )
 99            else:
100                self._module_names.append(module_name)
101                self.modules[module] = module
102        gruels = [
103            getattr(module, class_)
104            for module in self.modules.values()
105            for class_ in self.subgruel_classes
106            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
107        ]
108        self.logger.info(
109            "\n".join(
110                [f"Imported {len(gruels)} scrapers: "]
111                + [str(gruel) for gruel in gruels]
112            )
113        )
114        return gruels
115
116    def pop_modules(self):
117        """Unload modules."""
118        for module in self.modules:
119            del module
120        self._module_names = []
121
122    def get_bases(self, object: Any) -> list[Any]:
123        """Returns a recursive list of all the classes `object` inherits from."""
124        parents = []
125        bases = object.__bases__
126        if not bases:
127            return parents
128        for base in bases:
129            parents.append(base)
130            parents.extend(self.get_bases(base))
131        return parents
132
133    def is_subgruel(self, object: Any) -> bool:
134        """Returns whether `object` inherits from `Gruel` somewhere in its ancestory."""
135        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
136            return False
137        return True
138
139    def prescrape_chores(self):
140        """Override to add any tasks to be done before running the scrapers."""
141        ...
142
143    def postscrape_chores(self):
144        """Override to add any tasks to be done after running the scrapers."""
145        self.pop_modules()
146
147    def scrape(self, scrapers: list[Gruel]):
148        """Run the `scrape()` method for each scraper in `scrapers`.
149
150        Execution is multithreaded."""
151        execute = lambda scraper: scraper().scrape()
152        pool = quickpool.ThreadPool(
153            [execute] * len(scrapers), [(scraper,) for scraper in scrapers]
154        )
155        pool.execute()
156
157    def logprint(self, message: str):
158        """Log and print `message`."""
159        self.logger.info(message)
160        print(message)
161
162    def brew(self):
163        """Execute pipeline.
164
165        1. self.prescrape_chores()
166        2. self.load_scrapers()
167        3. self.scrape()
168        4. self.postscrape_chores()"""
169
170        try:
171            self.logprint("Beginning brew")
172            # 1--------------------------------------------
173            self.logprint("Executing prescrape chores")
174            self.prescrape_chores()
175            # 2--------------------------------------------
176            self.logprint("Loading scrapers")
177            scrapers = self.load_scrapers()
178            print(f"Loaded {len(scrapers)} scrapers")
179            # 3--------------------------------------------
180            self.logprint("Starting scrape")
181            self.scrape(scrapers)
182            self.logprint("Scrape complete")
183            # 4--------------------------------------------
184            self.logprint("Executing postscrape chores")
185            self.postscrape_chores()
186            self.logprint("Brew complete")
187        except Exception as e:
188            print(e)
189            self.logger.exception("Exception occured during brew():")
190
191
def get_args() -> argparse.Namespace:
    """Parse and return the command line arguments for running `Brewer` as a script."""
    arg_parser = argparse.ArgumentParser()

    # Positional: scraper class names to hunt for.
    arg_parser.add_argument(
        "subgruel_classes",
        type=str,
        nargs="*",
        help=""" A list of Gruel scraper class names to find and import. """,
    )
    # Optional glob filters controlling which files get scanned.
    arg_parser.add_argument(
        "-e",
        "--excludes",
        type=str,
        nargs="*",
        default=[],
        help=""" A list of glob style file patterns to exclude from the scan. """,
    )
    arg_parser.add_argument(
        "-i",
        "--includes",
        type=str,
        nargs="*",
        default=["*.py"],
        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
    )
    # Where and how to scan.
    arg_parser.add_argument(
        "-p",
        "--path",
        type=str,
        default=Pathier.cwd(),
        help=""" The directory path to scan. Defaults to the current working directory. """,
    )
    arg_parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Whether -p/--path should be scanned recursively or not. """,
    )

    parsed = arg_parser.parse_args()
    # Normalize the path argument into a `Pathier` object for `Brewer`.
    parsed.path = Pathier(parsed.path)
    return parsed
234
235
def main(args: argparse.Namespace | None = None):
    """Build a `Brewer` from `args` (parsing the command line if omitted) and run it."""
    if args is None:
        args = get_args()
    Brewer(
        args.subgruel_classes,
        args.excludes,
        args.path,
        args.includes,
        args.recursive,
    ).brew()
243
244
if __name__ == "__main__":
    # `main` parses the command line itself when called without args.
    main()
class Brewer:
 17class Brewer:
 18    def __init__(
 19        self,
 20        subgruel_classes: list[str],
 21        file_exclude_patterns: list[str] = [],
 22        scan_path: Pathish = Pathier.cwd(),
 23        file_include_patterns: list[str] = ["*.py"],
 24        recursive: bool = True,
 25    ):
 26        """Run `Gruel` scrapers.
 27
 28        #### :params:
 29
 30        `subgruel_classes`: A list of class names for scrapers that should be loaded.
 31        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 32
 33        `file_exclude_patterns`: Files that match these patterns will not be scanned.
 34
 35        `scan_path`: The path to scan for scraper classes.
 36
 37        `file_include_patterns`: Files that match these patterns will be scanned.
 38
 39        `recursive`: Whether the scan should be recursive or not.
 40
 41        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
 42        >>> brewer.brew()"""
 43        self._init_logger()
 44        self.subgruel_classes = subgruel_classes
 45        self.file_exclude_patterns = file_exclude_patterns
 46        self.file_include_patterns = file_include_patterns
 47        self.scan_path = Pathier(scan_path)
 48        self.recursive = recursive
 49
 50    def _init_logger(self):
 51        # When Brewer is subclassed, use that file's stem instead of `brewer`
 52        source_file = inspect.getsourcefile(type(self))
 53        if source_file:
 54            log_name = Pathier(source_file).stem
 55        else:
 56            log_name = Pathier(__file__).stem
 57        self.logger = loggi.getLogger(log_name)
 58
 59    def load_scrapers(self) -> list[Gruel]:
 60        """Load scraper classes that inherit from `Gruel`.
 61
 62        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.
 63
 64        #### :params:
 65
 66        `directory`: The path to scan for scraper classes.
 67
 68        `class_names`: A list of class names for scrapers that should be loaded.
 69        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 70
 71        `include_patterns`: Files that match these patterns will be scanned.
 72
 73        `exclude_patterns`: Files that match these patterns will not be scanned.
 74
 75        `recursive`: Whether the search should be recursive or not.
 76
 77        >>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])
 78        """
 79        globber = self.scan_path.glob
 80        if self.recursive:
 81            globber = self.scan_path.rglob
 82        files = [
 83            str(file)
 84            for pattern in self.file_include_patterns
 85            for file in globber(pattern)
 86        ]
 87        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
 88        self.modules = {}
 89        self._module_names = []
 90        for file in files:
 91            module_name = Pathier(file).stem
 92            try:
 93                module = importlib.machinery.SourceFileLoader(
 94                    module_name, file
 95                ).load_module()
 96            except Exception as e:
 97                self.logger.exception(
 98                    f"Failed to load module '{module_name}' from '{file}'."
 99                )
100            else:
101                self._module_names.append(module_name)
102                self.modules[module] = module
103        gruels = [
104            getattr(module, class_)
105            for module in self.modules.values()
106            for class_ in self.subgruel_classes
107            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
108        ]
109        self.logger.info(
110            "\n".join(
111                [f"Imported {len(gruels)} scrapers: "]
112                + [str(gruel) for gruel in gruels]
113            )
114        )
115        return gruels
116
117    def pop_modules(self):
118        """Unload modules."""
119        for module in self.modules:
120            del module
121        self._module_names = []
122
123    def get_bases(self, object: Any) -> list[Any]:
124        """Returns a recursive list of all the classes `object` inherits from."""
125        parents = []
126        bases = object.__bases__
127        if not bases:
128            return parents
129        for base in bases:
130            parents.append(base)
131            parents.extend(self.get_bases(base))
132        return parents
133
134    def is_subgruel(self, object: Any) -> bool:
135        """Returns whether `object` inherits from `Gruel` somewhere in its ancestory."""
136        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
137            return False
138        return True
139
140    def prescrape_chores(self):
141        """Override to add any tasks to be done before running the scrapers."""
142        ...
143
144    def postscrape_chores(self):
145        """Override to add any tasks to be done after running the scrapers."""
146        self.pop_modules()
147
148    def scrape(self, scrapers: list[Gruel]):
149        """Run the `scrape()` method for each scraper in `scrapers`.
150
151        Execution is multithreaded."""
152        execute = lambda scraper: scraper().scrape()
153        pool = quickpool.ThreadPool(
154            [execute] * len(scrapers), [(scraper,) for scraper in scrapers]
155        )
156        pool.execute()
157
158    def logprint(self, message: str):
159        """Log and print `message`."""
160        self.logger.info(message)
161        print(message)
162
163    def brew(self):
164        """Execute pipeline.
165
166        1. self.prescrape_chores()
167        2. self.load_scrapers()
168        3. self.scrape()
169        4. self.postscrape_chores()"""
170
171        try:
172            self.logprint("Beginning brew")
173            # 1--------------------------------------------
174            self.logprint("Executing prescrape chores")
175            self.prescrape_chores()
176            # 2--------------------------------------------
177            self.logprint("Loading scrapers")
178            scrapers = self.load_scrapers()
179            print(f"Loaded {len(scrapers)} scrapers")
180            # 3--------------------------------------------
181            self.logprint("Starting scrape")
182            self.scrape(scrapers)
183            self.logprint("Scrape complete")
184            # 4--------------------------------------------
185            self.logprint("Executing postscrape chores")
186            self.postscrape_chores()
187            self.logprint("Brew complete")
188        except Exception as e:
189            print(e)
190            self.logger.exception("Exception occured during brew():")
Brewer( subgruel_classes: list[str], file_exclude_patterns: list[str] = [], scan_path: pathier.pathier.Pathier | pathlib.Path | str = WindowsPath('E:/1vsCode/python/gruel'), file_include_patterns: list[str] = ['*.py'], recursive: bool = True)
18    def __init__(
19        self,
20        subgruel_classes: list[str],
21        file_exclude_patterns: list[str] = [],
22        scan_path: Pathish = Pathier.cwd(),
23        file_include_patterns: list[str] = ["*.py"],
24        recursive: bool = True,
25    ):
26        """Run `Gruel` scrapers.
27
28        #### :params:
29
30        `subgruel_classes`: A list of class names for scrapers that should be loaded.
31        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
32
33        `file_exclude_patterns`: Files that match these patterns will not be scanned.
34
35        `scan_path`: The path to scan for scraper classes.
36
37        `file_include_patterns`: Files that match these patterns will be scanned.
38
39        `recursive`: Whether the scan should be recursive or not.
40
41        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
42        >>> brewer.brew()"""
43        self._init_logger()
44        self.subgruel_classes = subgruel_classes
45        self.file_exclude_patterns = file_exclude_patterns
46        self.file_include_patterns = file_include_patterns
47        self.scan_path = Pathier(scan_path)
48        self.recursive = recursive

Run Gruel scrapers.

:params:

subgruel_classes: A list of class names for scrapers that should be loaded. In order to be loaded, a scraper class must have a name in this list and have Gruel somewhere in its inheritance hierarchy.

file_exclude_patterns: Files that match these patterns will not be scanned.

scan_path: The path to scan for scraper classes.

file_include_patterns: Files that match these patterns will be scanned.

recursive: Whether the scan should be recursive or not.

>>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
>>> brewer.brew()
def load_scrapers(self) -> list[gruel.gruel.Gruel]:
 59    def load_scrapers(self) -> list[Gruel]:
 60        """Load scraper classes that inherit from `Gruel`.
 61
 62        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.
 63
 64        #### :params:
 65
 66        `directory`: The path to scan for scraper classes.
 67
 68        `class_names`: A list of class names for scrapers that should be loaded.
 69        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 70
 71        `include_patterns`: Files that match these patterns will be scanned.
 72
 73        `exclude_patterns`: Files that match these patterns will not be scanned.
 74
 75        `recursive`: Whether the search should be recursive or not.
 76
 77        >>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])
 78        """
 79        globber = self.scan_path.glob
 80        if self.recursive:
 81            globber = self.scan_path.rglob
 82        files = [
 83            str(file)
 84            for pattern in self.file_include_patterns
 85            for file in globber(pattern)
 86        ]
 87        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
 88        self.modules = {}
 89        self._module_names = []
 90        for file in files:
 91            module_name = Pathier(file).stem
 92            try:
 93                module = importlib.machinery.SourceFileLoader(
 94                    module_name, file
 95                ).load_module()
 96            except Exception as e:
 97                self.logger.exception(
 98                    f"Failed to load module '{module_name}' from '{file}'."
 99                )
100            else:
101                self._module_names.append(module_name)
102                self.modules[module] = module
103        gruels = [
104            getattr(module, class_)
105            for module in self.modules.values()
106            for class_ in self.subgruel_classes
107            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
108        ]
109        self.logger.info(
110            "\n".join(
111                [f"Imported {len(gruels)} scrapers: "]
112                + [str(gruel) for gruel in gruels]
113            )
114        )
115        return gruels

Load scraper classes that inherit from Gruel.

NOTE: Classes are loaded, but scraper objects are not instantiated until the scrape() method is called.

:params:

directory: The path to scan for scraper classes.

class_names: A list of class names for scrapers that should be loaded. In order to be loaded, a scraper class must have a name in this list and have Gruel somewhere in its inheritance hierarchy.

include_patterns: Files that match these patterns will be scanned.

exclude_patterns: Files that match these patterns will not be scanned.

recursive: Whether the search should be recursive or not.

>>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])
def pop_modules(self):
117    def pop_modules(self):
118        """Unload modules."""
119        for module in self.modules:
120            del module
121        self._module_names = []

Unload modules.

def get_bases(self, object: Any) -> list[typing.Any]:
123    def get_bases(self, object: Any) -> list[Any]:
124        """Returns a recursive list of all the classes `object` inherits from."""
125        parents = []
126        bases = object.__bases__
127        if not bases:
128            return parents
129        for base in bases:
130            parents.append(base)
131            parents.extend(self.get_bases(base))
132        return parents

Returns a recursive list of all the classes object inherits from.

def is_subgruel(self, object: Any) -> bool:
134    def is_subgruel(self, object: Any) -> bool:
135        """Returns whether `object` inherits from `Gruel` somewhere in its ancestory."""
136        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
137            return False
138        return True

Returns whether object inherits from Gruel somewhere in its ancestry.

def prescrape_chores(self):
140    def prescrape_chores(self):
141        """Override to add any tasks to be done before running the scrapers."""
142        ...

Override to add any tasks to be done before running the scrapers.

def postscrape_chores(self):
144    def postscrape_chores(self):
145        """Override to add any tasks to be done after running the scrapers."""
146        self.pop_modules()

Override to add any tasks to be done after running the scrapers.

def scrape(self, scrapers: list[gruel.gruel.Gruel]):
148    def scrape(self, scrapers: list[Gruel]):
149        """Run the `scrape()` method for each scraper in `scrapers`.
150
151        Execution is multithreaded."""
152        execute = lambda scraper: scraper().scrape()
153        pool = quickpool.ThreadPool(
154            [execute] * len(scrapers), [(scraper,) for scraper in scrapers]
155        )
156        pool.execute()

Run the scrape() method for each scraper in scrapers.

Execution is multithreaded.

def logprint(self, message: str):
158    def logprint(self, message: str):
159        """Log and print `message`."""
160        self.logger.info(message)
161        print(message)

Log and print message.

def brew(self):
163    def brew(self):
164        """Execute pipeline.
165
166        1. self.prescrape_chores()
167        2. self.load_scrapers()
168        3. self.scrape()
169        4. self.postscrape_chores()"""
170
171        try:
172            self.logprint("Beginning brew")
173            # 1--------------------------------------------
174            self.logprint("Executing prescrape chores")
175            self.prescrape_chores()
176            # 2--------------------------------------------
177            self.logprint("Loading scrapers")
178            scrapers = self.load_scrapers()
179            print(f"Loaded {len(scrapers)} scrapers")
180            # 3--------------------------------------------
181            self.logprint("Starting scrape")
182            self.scrape(scrapers)
183            self.logprint("Scrape complete")
184            # 4--------------------------------------------
185            self.logprint("Executing postscrape chores")
186            self.postscrape_chores()
187            self.logprint("Brew complete")
188        except Exception as e:
189            print(e)
190            self.logger.exception("Exception occured during brew():")

Execute pipeline.

  1. self.prescrape_chores()
  2. self.load_scrapers()
  3. self.scrape()
  4. self.postscrape_chores()
def get_args() -> argparse.Namespace:
193def get_args() -> argparse.Namespace:
194    parser = argparse.ArgumentParser()
195
196    parser.add_argument(
197        "subgruel_classes",
198        type=str,
199        nargs="*",
200        help=""" A list of Gruel scraper class names to find and import. """,
201    )
202    parser.add_argument(
203        "-e",
204        "--excludes",
205        type=str,
206        nargs="*",
207        default=[],
208        help=""" A list of glob style file patterns to exclude from the scan. """,
209    )
210    parser.add_argument(
211        "-i",
212        "--includes",
213        type=str,
214        nargs="*",
215        default=["*.py"],
216        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
217    )
218    parser.add_argument(
219        "-p",
220        "--path",
221        type=str,
222        default=Pathier.cwd(),
223        help=""" The directory path to scan. Defaults to the current working directory. """,
224    )
225    parser.add_argument(
226        "-r",
227        "--recursive",
228        action="store_true",
229        help=""" Whether -p/--path should be scanned recursively or not. """,
230    )
231    args = parser.parse_args()
232    args.path = Pathier(args.path)
233
234    return args
def main(args: argparse.Namespace | None = None):
237def main(args: argparse.Namespace | None = None):
238    if not args:
239        args = get_args()
240    brewer = Brewer(
241        args.subgruel_classes, args.excludes, args.path, args.includes, args.recursive
242    )
243    brewer.brew()