gruel.brewer

  1import argparse
  2import importlib
  3import importlib.machinery
  4import importlib.util
  5import inspect
  6import logging
  7import sys
  8from typing import Any
  9
 10from pathier import Pathier, Pathish
 11from printbuddies import PoolBar
 12from younotyou import younotyou
 13
 14from gruel import Gruel
 15
 16
 17class Brewer:
 18    def __init__(
 19        self,
 20        subgruel_classes: list[str],
 21        file_exclude_patterns: list[str] = [],
 22        scan_path: Pathish = Pathier.cwd(),
 23        file_include_patterns: list[str] = ["*.py"],
 24        recursive: bool = True,
 25    ):
 26        """Run `Gruel` scrapers.
 27
 28        #### :params:
 29
 30        `subgruel_classes`: A list of class names for scrapers that should be loaded.
 31        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 32
 33        `file_exclude_patterns`: Files that match these patterns will not be scanned.
 34
 35        `scan_path`: The path to scan for scraper classes.
 36
 37        `file_include_patterns`: Files that match these patterns will be scanned.
 38
 39        `recursive`: Whether the scan should be recursive or not.
 40
 41        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
 42        >>> brewer.brew()"""
 43        self._init_logger()
 44        self.subgruel_classes = subgruel_classes
 45        self.file_exclude_patterns = file_exclude_patterns
 46        self.file_include_patterns = file_include_patterns
 47        self.scan_path = Pathier(scan_path)
 48        self.recursive = recursive
 49
 50    def _init_logger(self):
 51        self.logger = logging.getLogger(Pathier(__file__).stem)
 52        if not self.logger.hasHandlers():
 53            handler = logging.FileHandler(Pathier(__file__).stem + ".log")
 54            handler.setFormatter(
 55                logging.Formatter(
 56                    "{levelname}|-|{asctime}|-|{message}",
 57                    style="{",
 58                    datefmt="%m/%d/%Y %I:%M:%S %p",
 59                )
 60            )
 61            self.logger.addHandler(handler)
 62            self.logger.setLevel(logging.INFO)
 63
 64    def load_scrapers(self) -> list[Gruel]:
 65        """Load scraper classes that inherit from `Gruel`.
 66
 67        #### :params:
 68
 69        `directory`: The path to scan for scraper classes.
 70
 71        `class_names`: A list of class names for scrapers that should be loaded.
 72        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 73
 74        `include_patterns`: Files that match these patterns will be scanned.
 75
 76        `exclude_patterns`: Files that match these patterns will not be scanned.
 77
 78        `recursive`: Whether the search should be recursive or not.
 79
 80        >>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])"""
 81        globber = self.scan_path.glob
 82        if self.recursive:
 83            globber = self.scan_path.rglob
 84        files = [
 85            str(file)
 86            for pattern in self.file_include_patterns
 87            for file in globber(pattern)
 88        ]
 89        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
 90        modules = []
 91        self._module_names = []
 92        for file in files:
 93            try:
 94                module_name = Pathier(file).stem
 95                self._module_names.append(module_name)
 96                module = importlib.machinery.SourceFileLoader(
 97                    module_name, file
 98                ).load_module()
 99                modules.append(module)
100            except Exception as e:
101                ...
102        gruels = [
103            getattr(module, class_)
104            for module in modules
105            for class_ in self.subgruel_classes
106            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
107        ]
108        self.logger.info(
109            "\n".join(
110                [f"Imported {len(gruels)} scrapers: "]
111                + [str(gruel) for gruel in gruels]
112            )
113        )
114        return gruels
115
116    def pop_modules(self):
117        """Unload modules."""
118        for module in self._module_names:
119            sys.modules.pop(module)
120        self._module_names = []
121
122    def get_bases(self, object: Any) -> list[Any]:
123        """Returns a recursive list of all the classes `object` inherits from."""
124        parents = []
125        bases = object.__bases__
126        if not bases:
127            return parents
128        for base in bases:
129            parents.append(base)
130            parents.extend(self.get_bases(base))
131        return parents
132
133    def is_subgruel(self, object: Any) -> bool:
134        """Returns whether `object` inherits from `Gruel` somewhere in its ancestory."""
135        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
136            return False
137        return True
138
139    def prescrape_chores(self):
140        """Override to add any tasks to be done before running the scrapers."""
141        ...
142
143    def postscrape_chores(self):
144        """Override to add any tasks to be done after running the scrapers."""
145        self.pop_modules()
146
147    def scrape(self, scrapers: list[Gruel]):
148        """Run the `scrape()` method for each scraper in `scrapers`.
149
150        Execution is multithreaded."""
151        pool = PoolBar("thread", [scraper().scrape for scraper in scrapers])  # type: ignore
152        pool.execute()
153
154    def brew(self):
155        """Execute pipeline.
156
157        1. self.prescrape_chores()
158        2. self.load_scrapers()
159        3. self.scrape()
160        4. self.postscrape_chores()"""
161        self.logger.info("Beginning brew")
162        print("Beginning brew")
163        print("Executing prescrape chores...")
164        self.prescrape_chores()
165        print("Loading scrapers...")
166        scrapers = self.load_scrapers()
167        print(f"Loaded {len(scrapers)} scrapers.")
168        print("Starting scrape...")
169        self.scrape(scrapers)
170        print("Scrape complete.")
171        print("Executing postscrape chores...")
172        self.postscrape_chores()
173        print("Brew complete.")
174        self.logger.info("Brew complete.")
175
176
def get_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command line arguments for the `Brewer` CLI.

    #### :params:

    `argv`: Optional argument list to parse. When `None` (the default),
    `sys.argv[1:]` is parsed, exactly as before; passing an explicit list
    makes this function usable programmatically and in tests."""
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "subgruel_classes",
        type=str,
        nargs="*",
        help=""" A list of Gruel scraper class names to find and import. """,
    )
    parser.add_argument(
        "-e",
        "--excludes",
        type=str,
        nargs="*",
        default=[],
        help=""" A list of glob style file patterns to exclude from the scan. """,
    )
    parser.add_argument(
        "-i",
        "--includes",
        type=str,
        nargs="*",
        default=["*.py"],
        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
    )
    parser.add_argument(
        "-p",
        "--path",
        type=str,
        default=Pathier.cwd(),
        help=""" The directory path to scan. Defaults to the current working directory. """,
    )
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Whether -p/--path should be scanned recursively or not. """,
    )
    args = parser.parse_args(argv)
    # Normalize the path argument into a `Pathier` for downstream use.
    args.path = Pathier(args.path)

    return args
219
220
def main(args: argparse.Namespace | None = None):
    """CLI entry point: build a `Brewer` from parsed arguments and run its pipeline."""
    namespace = args if args else get_args()
    Brewer(
        namespace.subgruel_classes,
        namespace.excludes,
        namespace.path,
        namespace.includes,
        namespace.recursive,
    ).brew()


if __name__ == "__main__":
    main(get_args())
class Brewer:
 18class Brewer:
 19    def __init__(
 20        self,
 21        subgruel_classes: list[str],
 22        file_exclude_patterns: list[str] = [],
 23        scan_path: Pathish = Pathier.cwd(),
 24        file_include_patterns: list[str] = ["*.py"],
 25        recursive: bool = True,
 26    ):
 27        """Run `Gruel` scrapers.
 28
 29        #### :params:
 30
 31        `subgruel_classes`: A list of class names for scrapers that should be loaded.
 32        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 33
 34        `file_exclude_patterns`: Files that match these patterns will not be scanned.
 35
 36        `scan_path`: The path to scan for scraper classes.
 37
 38        `file_include_patterns`: Files that match these patterns will be scanned.
 39
 40        `recursive`: Whether the scan should be recursive or not.
 41
 42        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
 43        >>> brewer.brew()"""
 44        self._init_logger()
 45        self.subgruel_classes = subgruel_classes
 46        self.file_exclude_patterns = file_exclude_patterns
 47        self.file_include_patterns = file_include_patterns
 48        self.scan_path = Pathier(scan_path)
 49        self.recursive = recursive
 50
 51    def _init_logger(self):
 52        self.logger = logging.getLogger(Pathier(__file__).stem)
 53        if not self.logger.hasHandlers():
 54            handler = logging.FileHandler(Pathier(__file__).stem + ".log")
 55            handler.setFormatter(
 56                logging.Formatter(
 57                    "{levelname}|-|{asctime}|-|{message}",
 58                    style="{",
 59                    datefmt="%m/%d/%Y %I:%M:%S %p",
 60                )
 61            )
 62            self.logger.addHandler(handler)
 63            self.logger.setLevel(logging.INFO)
 64
 65    def load_scrapers(self) -> list[Gruel]:
 66        """Load scraper classes that inherit from `Gruel`.
 67
 68        #### :params:
 69
 70        `scan_path`: The path to scan for scraper classes.
 71
 72        `subgruel_classes`: A list of class names for scrapers that should be loaded.
 73        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 74
 75        `file_include_patterns`: Files that match these patterns will be scanned.
 76
 77        `file_exclude_patterns`: Files that match these patterns will not be scanned.
 78
 79        `recursive`: Whether the search should be recursive or not.
 80
 81        >>> Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "getToTheGig/scrapers").load_scrapers()"""
 82        globber = self.scan_path.glob
 83        if self.recursive:
 84            globber = self.scan_path.rglob
 85        files = [
 86            str(file)
 87            for pattern in self.file_include_patterns
 88            for file in globber(pattern)
 89        ]
 90        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
 91        modules = []
 92        self._module_names = []
 93        for file in files:
 94            try:
 95                module_name = Pathier(file).stem
 96                self._module_names.append(module_name)
 97                module = importlib.machinery.SourceFileLoader(
 98                    module_name, file
 99                ).load_module()
100                modules.append(module)
101            except Exception as e:
102                ...
103        gruels = [
104            getattr(module, class_)
105            for module in modules
106            for class_ in self.subgruel_classes
107            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
108        ]
109        self.logger.info(
110            "\n".join(
111                [f"Imported {len(gruels)} scrapers: "]
112                + [str(gruel) for gruel in gruels]
113            )
114        )
115        return gruels
116
117    def pop_modules(self):
118        """Unload modules."""
119        for module in self._module_names:
120            sys.modules.pop(module)
121        self._module_names = []
122
123    def get_bases(self, object: Any) -> list[Any]:
124        """Returns a recursive list of all the classes `object` inherits from."""
125        parents = []
126        bases = object.__bases__
127        if not bases:
128            return parents
129        for base in bases:
130            parents.append(base)
131            parents.extend(self.get_bases(base))
132        return parents
133
134    def is_subgruel(self, object: Any) -> bool:
135        """Returns whether `object` inherits from `Gruel` somewhere in its ancestry."""
136        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
137            return False
138        return True
139
140    def prescrape_chores(self):
141        """Override to add any tasks to be done before running the scrapers."""
142        ...
143
144    def postscrape_chores(self):
145        """Override to add any tasks to be done after running the scrapers."""
146        self.pop_modules()
147
148    def scrape(self, scrapers: list[Gruel]):
149        """Run the `scrape()` method for each scraper in `scrapers`.
150
151        Execution is multithreaded."""
152        pool = PoolBar("thread", [scraper().scrape for scraper in scrapers])  # type: ignore
153        pool.execute()
154
155    def brew(self):
156        """Execute pipeline.
157
158        1. self.prescrape_chores()
159        2. self.load_scrapers()
160        3. self.scrape()
161        4. self.postscrape_chores()"""
162        self.logger.info("Beginning brew")
163        print("Beginning brew")
164        print("Executing prescrape chores...")
165        self.prescrape_chores()
166        print("Loading scrapers...")
167        scrapers = self.load_scrapers()
168        print(f"Loaded {len(scrapers)} scrapers.")
169        print("Starting scrape...")
170        self.scrape(scrapers)
171        print("Scrape complete.")
172        print("Executing postscrape chores...")
173        self.postscrape_chores()
174        print("Brew complete.")
175        self.logger.info("Brew complete.")
Brewer( subgruel_classes: list[str], file_exclude_patterns: list[str] = [], scan_path: pathier.pathier.Pathier | pathlib.Path | str = Pathier.cwd(), file_include_patterns: list[str] = ['*.py'], recursive: bool = True)
19    def __init__(
20        self,
21        subgruel_classes: list[str],
22        file_exclude_patterns: list[str] = [],
23        scan_path: Pathish = Pathier.cwd(),
24        file_include_patterns: list[str] = ["*.py"],
25        recursive: bool = True,
26    ):
27        """Run `Gruel` scrapers.
28
29        #### :params:
30
31        `subgruel_classes`: A list of class names for scrapers that should be loaded.
32        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
33
34        `file_exclude_patterns`: Files that match these patterns will not be scanned.
35
36        `scan_path`: The path to scan for scraper classes.
37
38        `file_include_patterns`: Files that match these patterns will be scanned.
39
40        `recursive`: Whether the scan should be recursive or not.
41
42        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
43        >>> brewer.brew()"""
44        self._init_logger()
45        self.subgruel_classes = subgruel_classes
46        self.file_exclude_patterns = file_exclude_patterns
47        self.file_include_patterns = file_include_patterns
48        self.scan_path = Pathier(scan_path)
49        self.recursive = recursive

Run Gruel scrapers.

:params:

subgruel_classes: A list of class names for scrapers that should be loaded. In order to be loaded, a scraper class must have a name in this list and have Gruel somewhere in its inheritance hierarchy.

file_exclude_patterns: Files that match these patterns will not be scanned.

scan_path: The path to scan for scraper classes.

file_include_patterns: Files that match these patterns will be scanned.

recursive: Whether the scan should be recursive or not.

>>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
>>> brewer.brew()
def load_scrapers(self) -> list[gruel.gruel.Gruel]:
 65    def load_scrapers(self) -> list[Gruel]:
 66        """Load scraper classes that inherit from `Gruel`.
 67
 68        #### :params:
 69
 70        `scan_path`: The path to scan for scraper classes.
 71
 72        `subgruel_classes`: A list of class names for scrapers that should be loaded.
 73        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 74
 75        `file_include_patterns`: Files that match these patterns will be scanned.
 76
 77        `file_exclude_patterns`: Files that match these patterns will not be scanned.
 78
 79        `recursive`: Whether the search should be recursive or not.
 80
 81        >>> Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "getToTheGig/scrapers").load_scrapers()"""
 82        globber = self.scan_path.glob
 83        if self.recursive:
 84            globber = self.scan_path.rglob
 85        files = [
 86            str(file)
 87            for pattern in self.file_include_patterns
 88            for file in globber(pattern)
 89        ]
 90        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
 91        modules = []
 92        self._module_names = []
 93        for file in files:
 94            try:
 95                module_name = Pathier(file).stem
 96                self._module_names.append(module_name)
 97                module = importlib.machinery.SourceFileLoader(
 98                    module_name, file
 99                ).load_module()
100                modules.append(module)
101            except Exception as e:
102                ...
103        gruels = [
104            getattr(module, class_)
105            for module in modules
106            for class_ in self.subgruel_classes
107            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
108        ]
109        self.logger.info(
110            "\n".join(
111                [f"Imported {len(gruels)} scrapers: "]
112                + [str(gruel) for gruel in gruels]
113            )
114        )
115        return gruels

Load scraper classes that inherit from Gruel.

:params:

scan_path: The path to scan for scraper classes. (These are attributes set on the Brewer instance; load_scrapers itself takes no arguments.)

subgruel_classes: A list of class names for scrapers that should be loaded. In order to be loaded, a scraper class must have a name in this list and have Gruel somewhere in its inheritance hierarchy.

file_include_patterns: Files that match these patterns will be scanned.

file_exclude_patterns: Files that match these patterns will not be scanned.

recursive: Whether the search should be recursive or not.

>>> Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "getToTheGig/scrapers").load_scrapers()
def pop_modules(self):
117    def pop_modules(self):
118        """Unload modules."""
119        for module in self._module_names:
120            sys.modules.pop(module)
121        self._module_names = []

Unload modules.

def get_bases(self, object: Any) -> list[typing.Any]:
123    def get_bases(self, object: Any) -> list[Any]:
124        """Returns a recursive list of all the classes `object` inherits from."""
125        parents = []
126        bases = object.__bases__
127        if not bases:
128            return parents
129        for base in bases:
130            parents.append(base)
131            parents.extend(self.get_bases(base))
132        return parents

Returns a recursive list of all the classes object inherits from.

def is_subgruel(self, object: Any) -> bool:
134    def is_subgruel(self, object: Any) -> bool:
135        """Returns whether `object` inherits from `Gruel` somewhere in its ancestry."""
136        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
137            return False
138        return True

Returns whether object inherits from Gruel somewhere in its ancestry.

def prescrape_chores(self):
140    def prescrape_chores(self):
141        """Override to add any tasks to be done before running the scrapers."""
142        ...

Override to add any tasks to be done before running the scrapers.

def postscrape_chores(self):
144    def postscrape_chores(self):
145        """Override to add any tasks to be done after running the scrapers."""
146        self.pop_modules()

Override to add any tasks to be done after running the scrapers.

def scrape(self, scrapers: list[gruel.gruel.Gruel]):
148    def scrape(self, scrapers: list[Gruel]):
149        """Run the `scrape()` method for each scraper in `scrapers`.
150
151        Execution is multithreaded."""
152        pool = PoolBar("thread", [scraper().scrape for scraper in scrapers])  # type: ignore
153        pool.execute()

Run the scrape() method for each scraper in scrapers.

Execution is multithreaded.

def brew(self):
155    def brew(self):
156        """Execute pipeline.
157
158        1. self.prescrape_chores()
159        2. self.load_scrapers()
160        3. self.scrape()
161        4. self.postscrape_chores()"""
162        self.logger.info("Beginning brew")
163        print("Beginning brew")
164        print("Executing prescrape chores...")
165        self.prescrape_chores()
166        print("Loading scrapers...")
167        scrapers = self.load_scrapers()
168        print(f"Loaded {len(scrapers)} scrapers.")
169        print("Starting scrape...")
170        self.scrape(scrapers)
171        print("Scrape complete.")
172        print("Executing postscrape chores...")
173        self.postscrape_chores()
174        print("Brew complete.")
175        self.logger.info("Brew complete.")

Execute pipeline.

  1. self.prescrape_chores()
  2. self.load_scrapers()
  3. self.scrape()
  4. self.postscrape_chores()
def get_args() -> argparse.Namespace:
178def get_args() -> argparse.Namespace:
179    parser = argparse.ArgumentParser()
180
181    parser.add_argument(
182        "subgruel_classes",
183        type=str,
184        nargs="*",
185        help=""" A list of Gruel scraper class names to find and import. """,
186    )
187    parser.add_argument(
188        "-e",
189        "--excludes",
190        type=str,
191        nargs="*",
192        default=[],
193        help=""" A list of glob style file patterns to exclude from the scan. """,
194    )
195    parser.add_argument(
196        "-i",
197        "--includes",
198        type=str,
199        nargs="*",
200        default=["*.py"],
201        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
202    )
203    parser.add_argument(
204        "-p",
205        "--path",
206        type=str,
207        default=Pathier.cwd(),
208        help=""" The directory path to scan. Defaults to the current working directory. """,
209    )
210    parser.add_argument(
211        "-r",
212        "--recursive",
213        action="store_true",
214        help=""" Whether -p/--path should be scanned recursively or not. """,
215    )
216    args = parser.parse_args()
217    args.path = Pathier(args.path)
218
219    return args
def main(args: argparse.Namespace | None = None):
222def main(args: argparse.Namespace | None = None):
223    if not args:
224        args = get_args()
225    brewer = Brewer(
226        args.subgruel_classes, args.excludes, args.path, args.includes, args.recursive
227    )
228    brewer.brew()