gruel.brewer
import argparse
import importlib
import importlib.machinery
import importlib.util
import inspect
from typing import Any

import loggi
import quickpool
from pathier import Pathier, Pathish
from younotyou import younotyou

from gruel import Gruel


class Brewer:
    def __init__(
        self,
        subgruel_classes: list[str],
        file_exclude_patterns: list[str] = [],
        scan_path: Pathish = Pathier.cwd(),
        file_include_patterns: list[str] = ["*.py"],
        recursive: bool = True,
        log_dir: Pathish | None = None,
    ):
        """Run `Gruel` scrapers.

        #### :params:

        `subgruel_classes`: A list of class names for scrapers that should be loaded.
        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.

        `file_exclude_patterns`: Files that match these patterns will not be scanned.

        `scan_path`: The path to scan for scraper classes.

        `file_include_patterns`: Files that match these patterns will be scanned.

        `recursive`: Whether the scan should be recursive or not.

        `log_dir`: The directory this instance's log should be saved to.
        If `None`, it will be saved to the current working directory.

        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
        >>> brewer.brew()"""
        self._init_logger(log_dir)
        self.subgruel_classes = subgruel_classes
        self.file_exclude_patterns = file_exclude_patterns
        self.file_include_patterns = file_include_patterns
        self.scan_path = Pathier(scan_path)
        self.recursive = recursive

    def _init_logger(self, log_dir: Pathish | None = None):
        # When Brewer is subclassed, use that file's stem instead of `brewer`
        log_dir = Pathier(log_dir) if log_dir else Pathier.cwd()
        source_file = inspect.getsourcefile(type(self))
        if source_file:
            log_name = Pathier(source_file).stem
        else:
            log_name = Pathier(__file__).stem
        self.logger = loggi.getLogger(log_name, log_dir)

    def load_scrapers(self) -> list[Gruel]:
        """Load scraper classes that inherit from `Gruel`.

        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.

        #### :params:

        `directory`: The path to scan for scraper classes.

        `class_names`: A list of class names for scrapers that should be loaded.
        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.

        `include_patterns`: Files that match these patterns will be scanned.

        `exclude_patterns`: Files that match these patterns will not be scanned.

        `recursive`: Whether the search should be recursive or not.

        >>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])
        """
        globber = self.scan_path.glob
        if self.recursive:
            globber = self.scan_path.rglob
        files = [
            str(file)
            for pattern in self.file_include_patterns
            for file in globber(pattern)
        ]
        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
        self.modules = {}
        self._module_names = []
        for file in files:
            module_name = Pathier(file).stem
            try:
                module = importlib.machinery.SourceFileLoader(
                    module_name, file
                ).load_module()
            except Exception as e:
                self.logger.exception(
                    f"Failed to load module '{module_name}' from '{file}'."
                )
            else:
                self._module_names.append(module_name)
                self.modules[module] = module
        gruels = [
            getattr(module, class_)
            for module in self.modules.values()
            for class_ in self.subgruel_classes
            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
        ]
        self.logger.info(
            "\n".join(
                [f"Imported {len(gruels)} scrapers: "]
                + [str(gruel) for gruel in gruels]
            )
        )
        return gruels

    def pop_modules(self):
        """Unload modules."""
        for module in self.modules:
            del module
        self._module_names = []

    def get_bases(self, object: Any) -> list[Any]:
        """Returns a recursive list of all the classes `object` inherits from."""
        parents = []
        bases = object.__bases__
        if not bases:
            return parents
        for base in bases:
            parents.append(base)
            parents.extend(self.get_bases(base))
        return parents

    def is_subgruel(self, object: Any) -> bool:
        """Returns whether `object` inherits from `Gruel` somewhere in its ancestory."""
        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
            return False
        return True

    def prescrape_chores(self):
        """Override to add any tasks to be done before running the scrapers."""
        ...

    def postscrape_chores(self):
        """Override to add any tasks to be done after running the scrapers."""
        self.pop_modules()

    def scrape(self, scrapers: list[Gruel]):
        """Run the `scrape()` method for each scraper in `scrapers`.

        Execution is multithreaded."""
        execute = lambda scraper: scraper().scrape()
        pool = quickpool.ThreadPool(
            [execute] * len(scrapers), [(scraper,) for scraper in scrapers]
        )
        pool.execute()

    def logprint(self, message: str):
        """Log and print `message`."""
        self.logger.info(message)
        print(message)

    def brew(self):
        """Execute pipeline.

        1. self.prescrape_chores()
        2. self.load_scrapers()
        3. self.scrape()
        4. self.postscrape_chores()"""

        try:
            self.logprint("Beginning brew")
            # 1--------------------------------------------
            self.logprint("Executing prescrape chores")
            self.prescrape_chores()
            # 2--------------------------------------------
            self.logprint("Loading scrapers")
            scrapers = self.load_scrapers()
            print(f"Loaded {len(scrapers)} scrapers")
            # 3--------------------------------------------
            self.logprint("Starting scrape")
            self.scrape(scrapers)
            self.logprint("Scrape complete")
            # 4--------------------------------------------
            self.logprint("Executing postscrape chores")
            self.postscrape_chores()
            self.logprint("Brew complete")
        except Exception as e:
            print(e)
            self.logger.exception("Exception occured during brew():")


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "subgruel_classes",
        type=str,
        nargs="*",
        help=""" A list of Gruel scraper class names to find and import. """,
    )
    parser.add_argument(
        "-e",
        "--excludes",
        type=str,
        nargs="*",
        default=[],
        help=""" A list of glob style file patterns to exclude from the scan. """,
    )
    parser.add_argument(
        "-i",
        "--includes",
        type=str,
        nargs="*",
        default=["*.py"],
        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
    )
    parser.add_argument(
        "-p",
        "--path",
        type=str,
        default=Pathier.cwd(),
        help=""" The directory path to scan. Defaults to the current working directory. """,
    )
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Whether -p/--path should be scanned recursively or not. """,
    )
    parser.add_argument(
        "-l",
        "--log_dir",
        type=str,
        default=None,
        help=""" The directory to save the brew log to.""",
    )
    args = parser.parse_args()
    args.path = Pathier(args.path)

    return args


def main(args: argparse.Namespace | None = None):
    if not args:
        args = get_args()
    brewer = Brewer(
        args.subgruel_classes,
        args.excludes,
        args.path,
        args.includes,
        args.recursive,
        args.log_dir,
    )
    brewer.brew()


if __name__ == "__main__":
    main(get_args())
class Brewer:

def __init__(
    self,
    subgruel_classes: list[str],
    file_exclude_patterns: list[str] = [],
    scan_path: Pathish = Pathier.cwd(),
    file_include_patterns: list[str] = ["*.py"],
    recursive: bool = True,
    log_dir: Pathish | None = None,
):
Run `Gruel` scrapers.

:params:

`subgruel_classes`: A list of class names for scrapers that should be loaded.
In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.

`file_exclude_patterns`: Files that match these patterns will not be scanned.

`scan_path`: The path to scan for scraper classes.

`file_include_patterns`: Files that match these patterns will be scanned.

`recursive`: Whether the scan should be recursive or not.

`log_dir`: The directory this instance's log should be saved to.
If `None`, it will be saved to the current working directory.

>>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
>>> brewer.brew()
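A slightly fuller construction, written with keyword arguments for clarity; the `scrapers` directory, scraper class names, and log path below are hypothetical placeholders, not part of gruel:

from gruel.brewer import Brewer

# Hypothetical layout: scraper modules live in ./scrapers and logs go to ./scrapers/logs.
brewer = Brewer(
    subgruel_classes=["VenueScraper", "TicketScraper"],
    file_exclude_patterns=["*template*"],
    scan_path="scrapers",
    file_include_patterns=["*.py"],
    recursive=False,
    log_dir="scrapers/logs",
)
brewer.brew()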
def load_scrapers(self) -> list[Gruel]:
Load scraper classes that inherit from `Gruel`.

NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.

The scan is driven by the settings given to `__init__`: files under `scan_path` matching `file_include_patterns` (and none of `file_exclude_patterns`) are searched, recursively when `recursive` is `True`, and a class is returned only if its name appears in `subgruel_classes` and it has `Gruel` somewhere in its inheritance hierarchy.

>>> scrapers = brewer.load_scrapers()
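For a file to contribute a scraper, it only needs to define a class whose name is listed in `subgruel_classes` and that inherits from `Gruel`. A minimal, hypothetical `scrapers/venue_scraper.py`:

# scrapers/venue_scraper.py -- hypothetical file; the class body is elided
from gruel import Gruel


class VenueScraper(Gruel):
    # What the scraper actually does is defined by Gruel's own hooks, which are
    # not part of this module; for discovery, the name and the Gruel base suffice.
    ...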
def pop_modules(self):
Unload modules.
def get_bases(self, object: Any) -> list[Any]:
Returns a recursive list of all the classes `object` inherits from.
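A standalone sketch of the same recursive walk, using toy classes that are not part of gruel:

from typing import Any


def get_bases(obj: Any) -> list[Any]:
    # Depth-first collection of every base class, ending at object (whose
    # __bases__ is empty).
    parents: list[Any] = []
    for base in getattr(obj, "__bases__", ()):
        parents.append(base)
        parents.extend(get_bases(base))
    return parents


class A: ...
class B(A): ...
class C(B): ...

print(get_bases(C))  # -> [B, A, object]

`is_subgruel` below then reduces to checking that the candidate is a class and that `Gruel` appears somewhere in this list.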
def is_subgruel(self, object: Any) -> bool:
Returns whether `object` inherits from `Gruel` somewhere in its ancestry.
def prescrape_chores(self):
Override to add any tasks to be done before running the scrapers.
def postscrape_chores(self):
Override to add any tasks to be done after running the scrapers.
def scrape(self, scrapers: list[Gruel]):
Run the `scrape()` method for each scraper in `scrapers`.

Execution is multithreaded.
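Judging from the call in the source above, `quickpool.ThreadPool` receives one callable per scraper together with its argument tuple. A rough standard-library equivalent of that dispatch (a sketch of the behavior, not of quickpool itself):

from concurrent.futures import ThreadPoolExecutor


def scrape_all(scrapers):
    # One worker per scraper class: instantiate it, then call its scrape().
    def execute(scraper):
        return scraper().scrape()

    with ThreadPoolExecutor() as pool:
        futures = [pool.submit(execute, scraper) for scraper in scrapers]
        for future in futures:
            future.result()  # surfaces any exception raised in a worker thread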
def logprint(self, message: str):
Log and print `message`.
def brew(self):
Execute pipeline.
1. self.prescrape_chores()
2. self.load_scrapers()
3. self.scrape()
4. self.postscrape_chores()
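Because `prescrape_chores()` and `postscrape_chores()` are explicit hook points in this pipeline, a common pattern is to subclass `Brewer` and override them. A hedged sketch; the subclass name and the chores shown are placeholders:

from gruel.brewer import Brewer


class NightlyBrewer(Brewer):
    # Note: _init_logger names the log after this file's stem, not "brewer".
    def prescrape_chores(self):
        print("preparing output location")  # placeholder setup work

    def postscrape_chores(self):
        super().postscrape_chores()  # keep the default module cleanup
        print("archiving results")  # placeholder teardown work


NightlyBrewer(["VenueScraper"], ["*template*"], "scrapers").brew()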
def get_args() -> argparse.Namespace:
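The module also runs as a script (`main(get_args())` under `if __name__ == "__main__"`), so the same pipeline can be driven from the command line using the flags defined above. Assuming the package is installed and importable as `gruel`, and with hypothetical class and directory names, an invocation might look like:

    python -m gruel.brewer VenueScraper -p scrapers -r -e "*template*" -l logs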