"""gruel.brewer

Discover, import, and run `Gruel` scraper classes found in python files on disk.
"""

import argparse
import importlib
import importlib.machinery
import importlib.util
import inspect
import sys
from typing import Any

import loggi
import quickpool
from pathier import Pathier, Pathish
from younotyou import younotyou

from gruel import Gruel


class Brewer:
    def __init__(
        self,
        subgruel_classes: list[str],
        file_exclude_patterns: list[str] | None = None,
        scan_path: Pathish | None = None,
        file_include_patterns: list[str] | None = None,
        recursive: bool = True,
    ):
        """Run `Gruel` scrapers.

        #### :params:

        `subgruel_classes`: A list of class names for scrapers that should be loaded.
        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.

        `file_exclude_patterns`: Files that match these patterns will not be scanned.

        `scan_path`: The path to scan for scraper classes. Defaults to the current working directory.

        `file_include_patterns`: Files that match these patterns will be scanned. Defaults to `["*.py"]`.

        `recursive`: Whether the scan should be recursive or not.

        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
        >>> brewer.brew()"""
        self._init_logger()
        self.subgruel_classes = subgruel_classes
        # `None` sentinels instead of mutable defaults so instances never share
        # (and accidentally mutate) the same default list objects.
        self.file_exclude_patterns = (
            [] if file_exclude_patterns is None else file_exclude_patterns
        )
        self.file_include_patterns = (
            ["*.py"] if file_include_patterns is None else file_include_patterns
        )
        # Resolve the cwd at call time; a `Pathier.cwd()` default would be
        # frozen to whatever the cwd was when this module was imported.
        self.scan_path = Pathier.cwd() if scan_path is None else Pathier(scan_path)
        self.recursive = recursive

    def _init_logger(self):
        """Create `self.logger`, named after the file the concrete class is defined in."""
        # When Brewer is subclassed, use that file's stem instead of `brewer`.
        source_file = inspect.getsourcefile(type(self))
        if source_file:
            log_name = Pathier(source_file).stem
        else:
            log_name = Pathier(__file__).stem
        self.logger = loggi.getLogger(log_name)

    def load_scrapers(self) -> list[type[Gruel]]:
        """Scan `self.scan_path` and return scraper classes that inherit from `Gruel`.

        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.

        Files matching `self.file_include_patterns`, minus those matching
        `self.file_exclude_patterns`, are imported; classes whose names appear in
        `self.subgruel_classes` and that pass `is_subgruel()` are returned.
        Loaded modules are cached in `self.modules`, keyed by module name.

        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "getToTheGig/scrapers")
        >>> scrapers = brewer.load_scrapers()
        """
        globber = self.scan_path.rglob if self.recursive else self.scan_path.glob
        files = [
            str(file)
            for pattern in self.file_include_patterns
            for file in globber(pattern)
        ]
        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
        self.modules = {}
        self._module_names = []
        for file in files:
            module_name = Pathier(file).stem
            try:
                # `SourceFileLoader.load_module()` is deprecated (removed in 3.12);
                # use the spec-based loading API instead.
                spec = importlib.util.spec_from_file_location(module_name, file)
                if spec is None or spec.loader is None:
                    raise ImportError(f"Could not create a module spec for '{file}'.")
                module = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(module)
            except Exception:
                # Best effort: a broken scraper file shouldn't abort the whole scan.
                self.logger.exception(
                    f"Failed to load module '{module_name}' from '{file}'."
                )
            else:
                self._module_names.append(module_name)
                # Key by name — the original keyed by the module object itself,
                # which made lookups by name impossible.
                self.modules[module_name] = module
        gruels = [
            getattr(module, class_)
            for module in self.modules.values()
            for class_ in self.subgruel_classes
            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
        ]
        self.logger.info(
            "\n".join(
                [f"Imported {len(gruels)} scrapers: "]
                + [str(gruel) for gruel in gruels]
            )
        )
        return gruels

    def pop_modules(self):
        """Unload modules loaded by `load_scrapers()`."""
        # The original `for module in self.modules: del module` only unbound the
        # loop variable and unloaded nothing. Drop our references and any
        # `sys.modules` entries so the files can be cleanly re-imported.
        for name in self._module_names:
            sys.modules.pop(name, None)
        self.modules = {}
        self._module_names = []

    def get_bases(self, object: Any) -> list[Any]:
        """Returns a recursive list of all the classes `object` inherits from.

        The object itself is not included; with multiple inheritance a shared
        ancestor may appear more than once."""
        parents = []
        bases = object.__bases__
        if not bases:
            return parents
        for base in bases:
            parents.append(base)
            parents.extend(self.get_bases(base))
        return parents

    def is_subgruel(self, object: Any) -> bool:
        """Returns whether `object` is a class that inherits from `Gruel` somewhere in its ancestry.

        `Gruel` itself is not considered a subgruel."""
        return inspect.isclass(object) and Gruel in self.get_bases(object)

    def prescrape_chores(self):
        """Override to add any tasks to be done before running the scrapers."""
        ...

    def postscrape_chores(self):
        """Override to add any tasks to be done after running the scrapers."""
        self.pop_modules()

    def scrape(self, scrapers: list[type[Gruel]]):
        """Instantiate each class in `scrapers` and run its `scrape()` method.

        Execution is multithreaded."""

        def execute(scraper: type[Gruel]):
            # Instantiation is deferred to here so each thread builds its own scraper.
            scraper().scrape()

        pool = quickpool.ThreadPool(
            [execute] * len(scrapers), [(scraper,) for scraper in scrapers]
        )
        pool.execute()

    def logprint(self, message: str):
        """Log and print `message`."""
        self.logger.info(message)
        print(message)

    def brew(self):
        """Execute pipeline.

        1. self.prescrape_chores()
        2. self.load_scrapers()
        3. self.scrape()
        4. self.postscrape_chores()"""

        try:
            self.logprint("Beginning brew")
            # 1--------------------------------------------
            self.logprint("Executing prescrape chores")
            self.prescrape_chores()
            # 2--------------------------------------------
            self.logprint("Loading scrapers")
            scrapers = self.load_scrapers()
            print(f"Loaded {len(scrapers)} scrapers")
            # 3--------------------------------------------
            self.logprint("Starting scrape")
            self.scrape(scrapers)
            self.logprint("Scrape complete")
            # 4--------------------------------------------
            self.logprint("Executing postscrape chores")
            self.postscrape_chores()
            self.logprint("Brew complete")
        except Exception as e:
            # Top-level boundary: surface the error without crashing the caller.
            print(e)
            self.logger.exception("Exception occurred during brew():")


def get_args() -> argparse.Namespace:
    """Parse and return command line arguments for running `Brewer` as a script."""
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "subgruel_classes",
        type=str,
        nargs="*",
        help=""" A list of Gruel scraper class names to find and import. """,
    )
    parser.add_argument(
        "-e",
        "--excludes",
        type=str,
        nargs="*",
        default=[],
        help=""" A list of glob style file patterns to exclude from the scan. """,
    )
    parser.add_argument(
        "-i",
        "--includes",
        type=str,
        nargs="*",
        default=["*.py"],
        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
    )
    parser.add_argument(
        "-p",
        "--path",
        type=str,
        default=Pathier.cwd(),
        help=""" The directory path to scan. Defaults to the current working directory. """,
    )
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Whether -p/--path should be scanned recursively or not. """,
    )
    args = parser.parse_args()
    args.path = Pathier(args.path)

    return args


def main(args: argparse.Namespace | None = None):
    """Build a `Brewer` from `args` (parsing `sys.argv` when `args` is `None`) and brew."""
    # `is None` rather than truthiness: an empty-but-valid Namespace is falsy-adjacent
    # only by accident and must not trigger a re-parse.
    if args is None:
        args = get_args()
    brewer = Brewer(
        args.subgruel_classes, args.excludes, args.path, args.includes, args.recursive
    )
    brewer.brew()


if __name__ == "__main__":
    main(get_args())
17class Brewer: 18 def __init__( 19 self, 20 subgruel_classes: list[str], 21 file_exclude_patterns: list[str] = [], 22 scan_path: Pathish = Pathier.cwd(), 23 file_include_patterns: list[str] = ["*.py"], 24 recursive: bool = True, 25 ): 26 """Run `Gruel` scrapers. 27 28 #### :params: 29 30 `subgruel_classes`: A list of class names for scrapers that should be loaded. 31 In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy. 32 33 `file_exclude_patterns`: Files that match these patterns will not be scanned. 34 35 `scan_path`: The path to scan for scraper classes. 36 37 `file_include_patterns`: Files that match these patterns will be scanned. 38 39 `recursive`: Whether the scan should be recursive or not. 40 41 >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers") 42 >>> brewer.brew()""" 43 self._init_logger() 44 self.subgruel_classes = subgruel_classes 45 self.file_exclude_patterns = file_exclude_patterns 46 self.file_include_patterns = file_include_patterns 47 self.scan_path = Pathier(scan_path) 48 self.recursive = recursive 49 50 def _init_logger(self): 51 # When Brewer is subclassed, use that file's stem instead of `brewer` 52 source_file = inspect.getsourcefile(type(self)) 53 if source_file: 54 log_name = Pathier(source_file).stem 55 else: 56 log_name = Pathier(__file__).stem 57 self.logger = loggi.getLogger(log_name) 58 59 def load_scrapers(self) -> list[Gruel]: 60 """Load scraper classes that inherit from `Gruel`. 61 62 NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called. 63 64 #### :params: 65 66 `directory`: The path to scan for scraper classes. 67 68 `class_names`: A list of class names for scrapers that should be loaded. 69 In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy. 
70 71 `include_patterns`: Files that match these patterns will be scanned. 72 73 `exclude_patterns`: Files that match these patterns will not be scanned. 74 75 `recursive`: Whether the search should be recursive or not. 76 77 >>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"]) 78 """ 79 globber = self.scan_path.glob 80 if self.recursive: 81 globber = self.scan_path.rglob 82 files = [ 83 str(file) 84 for pattern in self.file_include_patterns 85 for file in globber(pattern) 86 ] 87 files = younotyou(files, exclude_patterns=self.file_exclude_patterns) 88 self.modules = {} 89 self._module_names = [] 90 for file in files: 91 module_name = Pathier(file).stem 92 try: 93 module = importlib.machinery.SourceFileLoader( 94 module_name, file 95 ).load_module() 96 except Exception as e: 97 self.logger.exception( 98 f"Failed to load module '{module_name}' from '{file}'." 99 ) 100 else: 101 self._module_names.append(module_name) 102 self.modules[module] = module 103 gruels = [ 104 getattr(module, class_) 105 for module in self.modules.values() 106 for class_ in self.subgruel_classes 107 if class_ in dir(module) and self.is_subgruel(getattr(module, class_)) 108 ] 109 self.logger.info( 110 "\n".join( 111 [f"Imported {len(gruels)} scrapers: "] 112 + [str(gruel) for gruel in gruels] 113 ) 114 ) 115 return gruels 116 117 def pop_modules(self): 118 """Unload modules.""" 119 for module in self.modules: 120 del module 121 self._module_names = [] 122 123 def get_bases(self, object: Any) -> list[Any]: 124 """Returns a recursive list of all the classes `object` inherits from.""" 125 parents = [] 126 bases = object.__bases__ 127 if not bases: 128 return parents 129 for base in bases: 130 parents.append(base) 131 parents.extend(self.get_bases(base)) 132 return parents 133 134 def is_subgruel(self, object: Any) -> bool: 135 """Returns whether `object` inherits from `Gruel` somewhere in its ancestory.""" 136 if not inspect.isclass(object) or Gruel 
not in self.get_bases(object): 137 return False 138 return True 139 140 def prescrape_chores(self): 141 """Override to add any tasks to be done before running the scrapers.""" 142 ... 143 144 def postscrape_chores(self): 145 """Override to add any tasks to be done after running the scrapers.""" 146 self.pop_modules() 147 148 def scrape(self, scrapers: list[Gruel]): 149 """Run the `scrape()` method for each scraper in `scrapers`. 150 151 Execution is multithreaded.""" 152 execute = lambda scraper: scraper().scrape() 153 pool = quickpool.ThreadPool( 154 [execute] * len(scrapers), [(scraper,) for scraper in scrapers] 155 ) 156 pool.execute() 157 158 def logprint(self, message: str): 159 """Log and print `message`.""" 160 self.logger.info(message) 161 print(message) 162 163 def brew(self): 164 """Execute pipeline. 165 166 1. self.prescrape_chores() 167 2. self.load_scrapers() 168 3. self.scrape() 169 4. self.postscrape_chores()""" 170 171 try: 172 self.logprint("Beginning brew") 173 # 1-------------------------------------------- 174 self.logprint("Executing prescrape chores") 175 self.prescrape_chores() 176 # 2-------------------------------------------- 177 self.logprint("Loading scrapers") 178 scrapers = self.load_scrapers() 179 print(f"Loaded {len(scrapers)} scrapers") 180 # 3-------------------------------------------- 181 self.logprint("Starting scrape") 182 self.scrape(scrapers) 183 self.logprint("Scrape complete") 184 # 4-------------------------------------------- 185 self.logprint("Executing postscrape chores") 186 self.postscrape_chores() 187 self.logprint("Brew complete") 188 except Exception as e: 189 print(e) 190 self.logger.exception("Exception occured during brew():")
18 def __init__( 19 self, 20 subgruel_classes: list[str], 21 file_exclude_patterns: list[str] = [], 22 scan_path: Pathish = Pathier.cwd(), 23 file_include_patterns: list[str] = ["*.py"], 24 recursive: bool = True, 25 ): 26 """Run `Gruel` scrapers. 27 28 #### :params: 29 30 `subgruel_classes`: A list of class names for scrapers that should be loaded. 31 In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy. 32 33 `file_exclude_patterns`: Files that match these patterns will not be scanned. 34 35 `scan_path`: The path to scan for scraper classes. 36 37 `file_include_patterns`: Files that match these patterns will be scanned. 38 39 `recursive`: Whether the scan should be recursive or not. 40 41 >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers") 42 >>> brewer.brew()""" 43 self._init_logger() 44 self.subgruel_classes = subgruel_classes 45 self.file_exclude_patterns = file_exclude_patterns 46 self.file_include_patterns = file_include_patterns 47 self.scan_path = Pathier(scan_path) 48 self.recursive = recursive
Run `Gruel` scrapers.
:params:
subgruel_classes
: A list of class names for scrapers that should be loaded.
In order to be loaded, a scraper class must have a name in this list and have Gruel
somewhere in its inheritance hierarchy.
file_exclude_patterns
: Files that match these patterns will not be scanned.
scan_path
: The path to scan for scraper classes.
file_include_patterns
: Files that match these patterns will be scanned.
recursive
: Whether the scan should be recursive or not.
>>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
>>> brewer.brew()
59 def load_scrapers(self) -> list[Gruel]: 60 """Load scraper classes that inherit from `Gruel`. 61 62 NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called. 63 64 #### :params: 65 66 `directory`: The path to scan for scraper classes. 67 68 `class_names`: A list of class names for scrapers that should be loaded. 69 In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy. 70 71 `include_patterns`: Files that match these patterns will be scanned. 72 73 `exclude_patterns`: Files that match these patterns will not be scanned. 74 75 `recursive`: Whether the search should be recursive or not. 76 77 >>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"]) 78 """ 79 globber = self.scan_path.glob 80 if self.recursive: 81 globber = self.scan_path.rglob 82 files = [ 83 str(file) 84 for pattern in self.file_include_patterns 85 for file in globber(pattern) 86 ] 87 files = younotyou(files, exclude_patterns=self.file_exclude_patterns) 88 self.modules = {} 89 self._module_names = [] 90 for file in files: 91 module_name = Pathier(file).stem 92 try: 93 module = importlib.machinery.SourceFileLoader( 94 module_name, file 95 ).load_module() 96 except Exception as e: 97 self.logger.exception( 98 f"Failed to load module '{module_name}' from '{file}'." 99 ) 100 else: 101 self._module_names.append(module_name) 102 self.modules[module] = module 103 gruels = [ 104 getattr(module, class_) 105 for module in self.modules.values() 106 for class_ in self.subgruel_classes 107 if class_ in dir(module) and self.is_subgruel(getattr(module, class_)) 108 ] 109 self.logger.info( 110 "\n".join( 111 [f"Imported {len(gruels)} scrapers: "] 112 + [str(gruel) for gruel in gruels] 113 ) 114 ) 115 return gruels
Load scraper classes that inherit from `Gruel`.

NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.
:params:
directory
: The path to scan for scraper classes.
class_names
: A list of class names for scrapers that should be loaded.
In order to be loaded, a scraper class must have a name in this list and have Gruel
somewhere in its inheritance hierarchy.
include_patterns
: Files that match these patterns will be scanned.
exclude_patterns
: Files that match these patterns will not be scanned.
recursive
: Whether the search should be recursive or not.
>>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])
117 def pop_modules(self): 118 """Unload modules.""" 119 for module in self.modules: 120 del module 121 self._module_names = []
Unload modules.
123 def get_bases(self, object: Any) -> list[Any]: 124 """Returns a recursive list of all the classes `object` inherits from.""" 125 parents = [] 126 bases = object.__bases__ 127 if not bases: 128 return parents 129 for base in bases: 130 parents.append(base) 131 parents.extend(self.get_bases(base)) 132 return parents
Returns a recursive list of all the classes `object` inherits from.
134 def is_subgruel(self, object: Any) -> bool: 135 """Returns whether `object` inherits from `Gruel` somewhere in its ancestory.""" 136 if not inspect.isclass(object) or Gruel not in self.get_bases(object): 137 return False 138 return True
Returns whether `object` inherits from `Gruel` somewhere in its ancestry.
140 def prescrape_chores(self): 141 """Override to add any tasks to be done before running the scrapers.""" 142 ...
Override to add any tasks to be done before running the scrapers.
144 def postscrape_chores(self): 145 """Override to add any tasks to be done after running the scrapers.""" 146 self.pop_modules()
Override to add any tasks to be done after running the scrapers.
148 def scrape(self, scrapers: list[Gruel]): 149 """Run the `scrape()` method for each scraper in `scrapers`. 150 151 Execution is multithreaded.""" 152 execute = lambda scraper: scraper().scrape() 153 pool = quickpool.ThreadPool( 154 [execute] * len(scrapers), [(scraper,) for scraper in scrapers] 155 ) 156 pool.execute()
Run the `scrape()` method for each scraper in `scrapers`.

Execution is multithreaded.
158 def logprint(self, message: str): 159 """Log and print `message`.""" 160 self.logger.info(message) 161 print(message)
Log and print `message`.
163 def brew(self): 164 """Execute pipeline. 165 166 1. self.prescrape_chores() 167 2. self.load_scrapers() 168 3. self.scrape() 169 4. self.postscrape_chores()""" 170 171 try: 172 self.logprint("Beginning brew") 173 # 1-------------------------------------------- 174 self.logprint("Executing prescrape chores") 175 self.prescrape_chores() 176 # 2-------------------------------------------- 177 self.logprint("Loading scrapers") 178 scrapers = self.load_scrapers() 179 print(f"Loaded {len(scrapers)} scrapers") 180 # 3-------------------------------------------- 181 self.logprint("Starting scrape") 182 self.scrape(scrapers) 183 self.logprint("Scrape complete") 184 # 4-------------------------------------------- 185 self.logprint("Executing postscrape chores") 186 self.postscrape_chores() 187 self.logprint("Brew complete") 188 except Exception as e: 189 print(e) 190 self.logger.exception("Exception occured during brew():")
Execute pipeline.
- self.prescrape_chores()
- self.load_scrapers()
- self.scrape()
- self.postscrape_chores()
193def get_args() -> argparse.Namespace: 194 parser = argparse.ArgumentParser() 195 196 parser.add_argument( 197 "subgruel_classes", 198 type=str, 199 nargs="*", 200 help=""" A list of Gruel scraper class names to find and import. """, 201 ) 202 parser.add_argument( 203 "-e", 204 "--excludes", 205 type=str, 206 nargs="*", 207 default=[], 208 help=""" A list of glob style file patterns to exclude from the scan. """, 209 ) 210 parser.add_argument( 211 "-i", 212 "--includes", 213 type=str, 214 nargs="*", 215 default=["*.py"], 216 help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """, 217 ) 218 parser.add_argument( 219 "-p", 220 "--path", 221 type=str, 222 default=Pathier.cwd(), 223 help=""" The directory path to scan. Defaults to the current working directory. """, 224 ) 225 parser.add_argument( 226 "-r", 227 "--recursive", 228 action="store_true", 229 help=""" Whether -p/--path should be scanned recursively or not. """, 230 ) 231 args = parser.parse_args() 232 args.path = Pathier(args.path) 233 234 return args