gruel.brewer
import argparse
import importlib.util
import inspect
from typing import Any

import loggi
import quickpool
from pathier import Pathier, Pathish
from younotyou import younotyou

from gruel import Gruel


class Brewer:
    def __init__(
        self,
        subgruel_classes: list[str],
        file_exclude_patterns: list[str] = [],
        scan_path: Pathish = Pathier.cwd(),
        file_include_patterns: list[str] = ["*.py"],
        recursive: bool = True,
    ):
        """Run `Gruel` scrapers.

        #### :params:

        `subgruel_classes`: A list of class names for scrapers that should be loaded.
        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.

        `file_exclude_patterns`: Files that match these patterns will not be scanned.

        `scan_path`: The path to scan for scraper classes.

        `file_include_patterns`: Files that match these patterns will be scanned.

        `recursive`: Whether the scan should be recursive or not.

        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
        >>> brewer.brew()"""
        self._init_logger()
        self.subgruel_classes = subgruel_classes
        self.file_exclude_patterns = file_exclude_patterns
        self.file_include_patterns = file_include_patterns
        self.scan_path = Pathier(scan_path)
        self.recursive = recursive

    def _init_logger(self):
        # When Brewer is subclassed, use that file's stem instead of `brewer`.
        source_file = inspect.getsourcefile(type(self))
        if source_file:
            log_name = Pathier(source_file).stem
        else:
            log_name = Pathier(__file__).stem
        self.logger = loggi.getLogger(log_name)

    def load_scrapers(self) -> list[Gruel]:
        """Load scraper classes that inherit from `Gruel`.

        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.

        The scan uses the `scan_path`, `file_include_patterns`, `file_exclude_patterns`,
        and `recursive` settings passed to the constructor; only classes whose names appear
        in `subgruel_classes` and that have `Gruel` somewhere in their inheritance hierarchy
        are returned."""
        globber = self.scan_path.glob
        if self.recursive:
            globber = self.scan_path.rglob
        files = [
            str(file)
            for pattern in self.file_include_patterns
            for file in globber(pattern)
        ]
        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
        self.modules = {}
        self._module_names = []
        for file in files:
            module_name = Pathier(file).stem
            try:
                # `SourceFileLoader.load_module()` is deprecated; build the
                # module from a spec instead.
                spec = importlib.util.spec_from_file_location(module_name, file)
                if spec is None or spec.loader is None:
                    raise ImportError(f"Could not create a spec for '{file}'.")
                module = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(module)
            except Exception:
                self.logger.exception(
                    f"Failed to load module '{module_name}' from '{file}'."
                )
            else:
                self._module_names.append(module_name)
                self.modules[module_name] = module
        gruels = [
            getattr(module, class_)
            for module in self.modules.values()
            for class_ in self.subgruel_classes
            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
        ]
        self.logger.info(
            "\n".join(
                [f"Imported {len(gruels)} scrapers: "]
                + [str(gruel) for gruel in gruels]
            )
        )
        return gruels

    def pop_modules(self):
        """Unload modules."""
        # Drop the references so the loaded modules can be garbage collected.
        self.modules = {}
        self._module_names = []

    def get_bases(self, object: Any) -> list[Any]:
        """Returns a recursive list of all the classes `object` inherits from."""
        parents = []
        bases = object.__bases__
        if not bases:
            return parents
        for base in bases:
            parents.append(base)
            parents.extend(self.get_bases(base))
        return parents

    def is_subgruel(self, object: Any) -> bool:
        """Returns whether `object` inherits from `Gruel` somewhere in its ancestry."""
        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
            return False
        return True

    def prescrape_chores(self):
        """Override to add any tasks to be done before running the scrapers."""
        ...

    def postscrape_chores(self):
        """Override to add any tasks to be done after running the scrapers."""
        self.pop_modules()

    def scrape(self, scrapers: list[Gruel]):
        """Run the `scrape()` method for each scraper in `scrapers`.

        Execution is multithreaded."""
        pool = quickpool.ThreadPool([scraper().scrape for scraper in scrapers])  # type: ignore
        pool.execute()

    def logprint(self, message: str):
        """Log and print `message`."""
        self.logger.info(message)
        print(message)

    def brew(self):
        """Execute the pipeline:

        1. self.prescrape_chores()
        2. self.load_scrapers()
        3. self.scrape()
        4. self.postscrape_chores()"""
        try:
            self.logprint("Beginning brew")
            # 1--------------------------------------------
            self.logprint("Executing prescrape chores")
            self.prescrape_chores()
            # 2--------------------------------------------
            self.logprint("Loading scrapers")
            scrapers = self.load_scrapers()
            print(f"Loaded {len(scrapers)} scrapers")
            # 3--------------------------------------------
            self.logprint("Starting scrape")
            self.scrape(scrapers)
            self.logprint("Scrape complete")
            # 4--------------------------------------------
            self.logprint("Executing postscrape chores")
            self.postscrape_chores()
            self.logprint("Brew complete")
        except Exception as e:
            print(e)
            self.logger.exception("Exception occurred during brew():")


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "subgruel_classes",
        type=str,
        nargs="*",
        help=""" A list of Gruel scraper class names to find and import. """,
    )
    parser.add_argument(
        "-e",
        "--excludes",
        type=str,
        nargs="*",
        default=[],
        help=""" A list of glob style file patterns to exclude from the scan. """,
    )
    parser.add_argument(
        "-i",
        "--includes",
        type=str,
        nargs="*",
        default=["*.py"],
        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
    )
    parser.add_argument(
        "-p",
        "--path",
        type=str,
        default=Pathier.cwd(),
        help=""" The directory path to scan. Defaults to the current working directory. """,
    )
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Whether -p/--path should be scanned recursively or not. """,
    )
    args = parser.parse_args()
    args.path = Pathier(args.path)

    return args


def main(args: argparse.Namespace | None = None):
    if args is None:
        args = get_args()
    brewer = Brewer(
        args.subgruel_classes, args.excludes, args.path, args.includes, args.recursive
    )
    brewer.brew()


if __name__ == "__main__":
    main(get_args())
class Brewer:
Brewer(
    subgruel_classes: list[str],
    file_exclude_patterns: list[str] = [],
    scan_path: Pathish = Pathier.cwd(),
    file_include_patterns: list[str] = ["*.py"],
    recursive: bool = True,
)
Run `Gruel` scrapers.

:params:

`subgruel_classes`: A list of class names for scrapers that should be loaded.
In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.

`file_exclude_patterns`: Files that match these patterns will not be scanned.

`scan_path`: The path to scan for scraper classes.

`file_include_patterns`: Files that match these patterns will be scanned.

`recursive`: Whether the scan should be recursive or not.

>>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
>>> brewer.brew()
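For the example above to load anything, the `scrapers` folder must contain a file defining a matching class. A minimal hypothetical sketch of such a file — it assumes a `Gruel` subclass needs only a no-argument constructor and a `scrape()` method, since `Brewer.scrape()` calls `scraper().scrape`; check the `Gruel` docs for the real override points:

# scrapers/venue_scraper.py -- hypothetical scraper module for the
# doctest above.
from gruel import Gruel


class VenueScraper(Gruel):
    def scrape(self):
        ...  # fetch and parse pages here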
def load_scrapers(self) -> list[Gruel]:
Load scraper classes that inherit from `Gruel`.

NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.

The scan uses the `scan_path`, `file_include_patterns`, `file_exclude_patterns`, and `recursive` settings passed to the constructor; only classes whose names appear in `subgruel_classes` and that have `Gruel` somewhere in their inheritance hierarchy are returned.

>>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
>>> scrapers = brewer.load_scrapers()
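The interesting step is loading each matching file as a module by path; the rest of `load_scrapers()` is glob filtering and a `getattr` sweep. A condensed sketch of that loading step using `importlib`'s spec machinery:

import importlib.util

def load_module_from_file(path: str, name: str):
    # Build and execute a module object from an arbitrary file path.
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Could not create a spec for '{path}'.")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module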
def pop_modules(self):
Unload modules.
def get_bases(self, object: Any) -> list[Any]:
Returns a recursive list of all the classes `object` inherits from.
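A small sketch of the recursion (a `Brewer` instance is cheap to create here; `__init__` only sets up a logger and stores the settings):

from gruel.brewer import Brewer

brewer = Brewer([])  # no scraper names needed just to use the helpers

class A: ...
class B(A): ...

brewer.get_bases(B)  # [A, object]
brewer.get_bases(A)  # [object]

For single inheritance this is `inspect.getmro(B)` minus `B` itself; with diamond inheritance `get_bases` can list a base more than once, which is harmless for the membership test in `is_subgruel()`.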
def is_subgruel(self, object: Any) -> bool:
Returns whether `object` inherits from `Gruel` somewhere in its ancestry.
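Concretely, reusing the `brewer` from the sketch above — note that `Gruel` itself fails the check, because a class is not among its own bases:

from gruel import Gruel

class VenueScraper(Gruel): ...

brewer.is_subgruel(VenueScraper)    # True
brewer.is_subgruel(Gruel)           # False: Gruel is not among its own bases
brewer.is_subgruel("VenueScraper")  # False: not a class at all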
def prescrape_chores(self):
Override to add any tasks to be done before running the scrapers.
def postscrape_chores(self):
Override to add any tasks to be done after running the scrapers. The default implementation unloads the scraper modules by calling `pop_modules()`.
def scrape(self, scrapers: list[Gruel]):
Run the `scrape()` method for each scraper in `scrapers`.

Execution is multithreaded.
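The `scrapers` passed in are the classes returned by `load_scrapers()`, not instances; each is instantiated when the pool is built (hence the `# type: ignore` in the source). `quickpool.ThreadPool` is handed one bound `scrape` method per instance and runs them concurrently. A rough standard-library equivalent, shown only to make the threading model concrete — a sketch, not gruel's implementation:

from concurrent.futures import ThreadPoolExecutor

def scrape_with_futures(scrapers):
    # Instantiate each scraper class and run its scrape() on a worker thread.
    with ThreadPoolExecutor() as pool:
        futures = [pool.submit(scraper().scrape) for scraper in scrapers]
        for future in futures:
            future.result()  # re-raise any exception from the worker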
def logprint(self, message: str):
Log and print `message`.
def brew(self):
Execute the pipeline:

1. self.prescrape_chores()
2. self.load_scrapers()
3. self.scrape()
4. self.postscrape_chores()
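Steps 1 and 4 are the customization points: step 1 is a no-op by default and step 4 only unloads the scraper modules. A hypothetical subclass wiring its own chores into the pipeline (the chore bodies are placeholder tasks):

from gruel.brewer import Brewer

class NightlyBrewer(Brewer):
    def prescrape_chores(self):
        self.logprint("Rotating yesterday's output")  # hypothetical task

    def postscrape_chores(self):
        super().postscrape_chores()  # keep the default module cleanup
        self.logprint("Uploading results")  # hypothetical task

NightlyBrewer(["VenueScraper"], scan_path="scrapers").brew()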
def get_args() -> argparse.Namespace:

Parse the command line arguments (`subgruel_classes`, `-e`/`--excludes`, `-i`/`--includes`, `-p`/`--path`, `-r`/`--recursive`); `args.path` is converted to a `Pathier` instance before the namespace is returned.
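`get_args()` feeds `main()`, which builds a `Brewer` from the parsed namespace. The same entry point can be driven programmatically; a sketch mirroring the parser's options, where `VenueScraper`, the `*template*` pattern, and the `scrapers` path are placeholder values:

import argparse

from pathier import Pathier

from gruel.brewer import main

args = argparse.Namespace(
    subgruel_classes=["VenueScraper"],
    excludes=["*template*"],
    includes=["*.py"],
    path=Pathier("scrapers"),
    recursive=True,
)
main(args)  # equivalent CLI: brewer.py VenueScraper -e "*template*" -p scrapers -r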