gruel.brewer
"""Discover `Gruel` scraper subclasses on disk and run them."""
import argparse
import importlib
import importlib.machinery
import importlib.util
import inspect
import logging
import sys
from typing import Any

from pathier import Pathier, Pathish
from printbuddies import PoolBar
from younotyou import younotyou

from gruel import Gruel


class Brewer:
    def __init__(
        self,
        subgruel_classes: list[str],
        file_exclude_patterns: list[str] | None = None,
        scan_path: Pathish | None = None,
        file_include_patterns: list[str] | None = None,
        recursive: bool = True,
    ):
        """Run `Gruel` scrapers.

        #### :params:

        `subgruel_classes`: A list of class names for scrapers that should be loaded.
        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.

        `file_exclude_patterns`: Files that match these patterns will not be scanned. Defaults to no exclusions.

        `scan_path`: The path to scan for scraper classes. Defaults to the current working directory.

        `file_include_patterns`: Files that match these patterns will be scanned. Defaults to `["*.py"]`.

        `recursive`: Whether the scan should be recursive or not.

        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
        >>> brewer.brew()"""
        self._init_logger()
        self.subgruel_classes = subgruel_classes
        # `None` sentinels instead of mutable defaults so instances never share list state.
        self.file_exclude_patterns = (
            [] if file_exclude_patterns is None else file_exclude_patterns
        )
        self.file_include_patterns = (
            ["*.py"] if file_include_patterns is None else file_include_patterns
        )
        # Resolve the cwd at call time rather than once at import time.
        self.scan_path = Pathier.cwd() if scan_path is None else Pathier(scan_path)
        self.recursive = recursive

    def _init_logger(self):
        """Create `self.logger` writing to `<module stem>.log`.

        Guarded so repeated `Brewer` instantiation doesn't stack duplicate handlers."""
        self.logger = logging.getLogger(Pathier(__file__).stem)
        if not self.logger.hasHandlers():
            handler = logging.FileHandler(Pathier(__file__).stem + ".log")
            handler.setFormatter(
                logging.Formatter(
                    "{levelname}|-|{asctime}|-|{message}",
                    style="{",
                    datefmt="%m/%d/%Y %I:%M:%S %p",
                )
            )
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

    def load_scrapers(self) -> list[Gruel]:
        """Import and return scraper classes that inherit from `Gruel`.

        Files under `self.scan_path` matching `self.file_include_patterns` (and not
        matching `self.file_exclude_patterns`) are imported; classes whose names are
        in `self.subgruel_classes` and that pass `self.is_subgruel()` are returned.

        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "getToTheGig/scrapers")
        >>> brewer.load_scrapers()"""
        globber = self.scan_path.rglob if self.recursive else self.scan_path.glob
        files = [
            str(file)
            for pattern in self.file_include_patterns
            for file in globber(pattern)
        ]
        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
        modules = []
        self._module_names = []
        for file in files:
            module_name = Pathier(file).stem
            try:
                # Modern replacement for the deprecated `SourceFileLoader.load_module()`.
                spec = importlib.util.spec_from_file_location(module_name, file)
                if spec is None or spec.loader is None:
                    raise ImportError(f"Could not create a module spec for {file}")
                module = importlib.util.module_from_spec(spec)
                # Register before executing so `pop_modules()` can unload it later.
                sys.modules[module_name] = module
                spec.loader.exec_module(module)
            except Exception:
                # A broken scraper file shouldn't abort the whole brew,
                # but it shouldn't vanish silently either.
                self.logger.exception(
                    "Failed to import module '%s' from '%s'.", module_name, file
                )
                sys.modules.pop(module_name, None)
            else:
                # Only track modules that actually imported; tracking failures
                # made `pop_modules()` raise `KeyError`.
                self._module_names.append(module_name)
                modules.append(module)
        gruels = [
            getattr(module, class_)
            for module in modules
            for class_ in self.subgruel_classes
            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
        ]
        self.logger.info(
            "\n".join(
                [f"Imported {len(gruels)} scrapers: "]
                + [str(gruel) for gruel in gruels]
            )
        )
        return gruels

    def pop_modules(self):
        """Unload modules previously imported by `load_scrapers()` from `sys.modules`."""
        for module_name in self._module_names:
            # Default of `None` so an already-removed module doesn't raise `KeyError`.
            sys.modules.pop(module_name, None)
        self._module_names = []

    def get_bases(self, object: Any) -> list[Any]:
        """Returns a recursive list of all the classes `object` inherits from."""
        parents = []
        for base in object.__bases__:
            parents.append(base)
            parents.extend(self.get_bases(base))
        return parents

    def is_subgruel(self, object: Any) -> bool:
        """Returns whether `object` is a class with `Gruel` somewhere in its ancestry."""
        return inspect.isclass(object) and Gruel in self.get_bases(object)

    def prescrape_chores(self):
        """Override to add any tasks to be done before running the scrapers."""

    def postscrape_chores(self):
        """Override to add any tasks to be done after running the scrapers.

        The base implementation unloads the imported scraper modules."""
        self.pop_modules()

    def scrape(self, scrapers: list[Gruel]):
        """Run the `scrape()` method for each scraper in `scrapers`.

        Execution is multithreaded."""
        pool = PoolBar("thread", [scraper().scrape for scraper in scrapers])  # type: ignore
        pool.execute()

    def brew(self):
        """Execute pipeline.

        1. self.prescrape_chores()
        2. self.load_scrapers()
        3. self.scrape()
        4. self.postscrape_chores()"""
        self.logger.info("Beginning brew")
        print("Beginning brew")
        print("Executing prescrape chores...")
        self.prescrape_chores()
        print("Loading scrapers...")
        scrapers = self.load_scrapers()
        print(f"Loaded {len(scrapers)} scrapers.")
        print("Starting scrape...")
        self.scrape(scrapers)
        print("Scrape complete.")
        print("Executing postscrape chores...")
        self.postscrape_chores()
        print("Brew complete.")
        self.logger.info("Brew complete.")


def get_args() -> argparse.Namespace:
    """Parse and return command line arguments for running `Brewer` from a terminal."""
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "subgruel_classes",
        type=str,
        nargs="*",
        help=""" A list of Gruel scraper class names to find and import. """,
    )
    parser.add_argument(
        "-e",
        "--excludes",
        type=str,
        nargs="*",
        default=[],
        help=""" A list of glob style file patterns to exclude from the scan. """,
    )
    parser.add_argument(
        "-i",
        "--includes",
        type=str,
        nargs="*",
        default=["*.py"],
        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
    )
    parser.add_argument(
        "-p",
        "--path",
        type=str,
        default=Pathier.cwd(),
        help=""" The directory path to scan. Defaults to the current working directory. """,
    )
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Whether -p/--path should be scanned recursively or not. """,
    )
    args = parser.parse_args()
    args.path = Pathier(args.path)

    return args


def main(args: argparse.Namespace | None = None):
    """Build a `Brewer` from `args` (parsed from the command line when omitted) and brew."""
    if args is None:
        args = get_args()
    brewer = Brewer(
        args.subgruel_classes, args.excludes, args.path, args.includes, args.recursive
    )
    brewer.brew()


if __name__ == "__main__":
    main(get_args())
class Brewer:
    """Discover `Gruel` scraper subclasses on disk and run them."""

    def __init__(
        self,
        subgruel_classes: list[str],
        file_exclude_patterns: list[str] | None = None,
        scan_path: Pathish | None = None,
        file_include_patterns: list[str] | None = None,
        recursive: bool = True,
    ):
        """Run `Gruel` scrapers.

        #### :params:

        `subgruel_classes`: A list of class names for scrapers that should be loaded.
        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.

        `file_exclude_patterns`: Files that match these patterns will not be scanned. Defaults to no exclusions.

        `scan_path`: The path to scan for scraper classes. Defaults to the current working directory.

        `file_include_patterns`: Files that match these patterns will be scanned. Defaults to `["*.py"]`.

        `recursive`: Whether the scan should be recursive or not.

        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
        >>> brewer.brew()"""
        self._init_logger()
        self.subgruel_classes = subgruel_classes
        # `None` sentinels instead of mutable defaults so instances never share list state.
        self.file_exclude_patterns = (
            [] if file_exclude_patterns is None else file_exclude_patterns
        )
        self.file_include_patterns = (
            ["*.py"] if file_include_patterns is None else file_include_patterns
        )
        # Resolve the cwd at call time rather than once at import time.
        self.scan_path = Pathier.cwd() if scan_path is None else Pathier(scan_path)
        self.recursive = recursive

    def _init_logger(self):
        """Create `self.logger` writing to `<module stem>.log`.

        Guarded so repeated `Brewer` instantiation doesn't stack duplicate handlers."""
        self.logger = logging.getLogger(Pathier(__file__).stem)
        if not self.logger.hasHandlers():
            handler = logging.FileHandler(Pathier(__file__).stem + ".log")
            handler.setFormatter(
                logging.Formatter(
                    "{levelname}|-|{asctime}|-|{message}",
                    style="{",
                    datefmt="%m/%d/%Y %I:%M:%S %p",
                )
            )
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

    def load_scrapers(self) -> list[Gruel]:
        """Import and return scraper classes that inherit from `Gruel`.

        Files under `self.scan_path` matching `self.file_include_patterns` (and not
        matching `self.file_exclude_patterns`) are imported; classes whose names are
        in `self.subgruel_classes` and that pass `self.is_subgruel()` are returned."""
        globber = self.scan_path.rglob if self.recursive else self.scan_path.glob
        files = [
            str(file)
            for pattern in self.file_include_patterns
            for file in globber(pattern)
        ]
        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
        modules = []
        self._module_names = []
        for file in files:
            module_name = Pathier(file).stem
            try:
                # Modern replacement for the deprecated `SourceFileLoader.load_module()`.
                spec = importlib.util.spec_from_file_location(module_name, file)
                if spec is None or spec.loader is None:
                    raise ImportError(f"Could not create a module spec for {file}")
                module = importlib.util.module_from_spec(spec)
                # Register before executing so `pop_modules()` can unload it later.
                sys.modules[module_name] = module
                spec.loader.exec_module(module)
            except Exception:
                # A broken scraper file shouldn't abort the whole brew,
                # but it shouldn't vanish silently either.
                self.logger.exception(
                    "Failed to import module '%s' from '%s'.", module_name, file
                )
                sys.modules.pop(module_name, None)
            else:
                # Only track modules that actually imported; tracking failures
                # made `pop_modules()` raise `KeyError`.
                self._module_names.append(module_name)
                modules.append(module)
        gruels = [
            getattr(module, class_)
            for module in modules
            for class_ in self.subgruel_classes
            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
        ]
        self.logger.info(
            "\n".join(
                [f"Imported {len(gruels)} scrapers: "]
                + [str(gruel) for gruel in gruels]
            )
        )
        return gruels

    def pop_modules(self):
        """Unload modules previously imported by `load_scrapers()` from `sys.modules`."""
        for module_name in self._module_names:
            # Default of `None` so an already-removed module doesn't raise `KeyError`.
            sys.modules.pop(module_name, None)
        self._module_names = []

    def get_bases(self, object: Any) -> list[Any]:
        """Returns a recursive list of all the classes `object` inherits from."""
        parents = []
        for base in object.__bases__:
            parents.append(base)
            parents.extend(self.get_bases(base))
        return parents

    def is_subgruel(self, object: Any) -> bool:
        """Returns whether `object` is a class with `Gruel` somewhere in its ancestry."""
        return inspect.isclass(object) and Gruel in self.get_bases(object)

    def prescrape_chores(self):
        """Override to add any tasks to be done before running the scrapers."""

    def postscrape_chores(self):
        """Override to add any tasks to be done after running the scrapers.

        The base implementation unloads the imported scraper modules."""
        self.pop_modules()

    def scrape(self, scrapers: list[Gruel]):
        """Run the `scrape()` method for each scraper in `scrapers`.

        Execution is multithreaded."""
        pool = PoolBar("thread", [scraper().scrape for scraper in scrapers])  # type: ignore
        pool.execute()

    def brew(self):
        """Execute pipeline.

        1. self.prescrape_chores()
        2. self.load_scrapers()
        3. self.scrape()
        4. self.postscrape_chores()"""
        self.logger.info("Beginning brew")
        print("Beginning brew")
        print("Executing prescrape chores...")
        self.prescrape_chores()
        print("Loading scrapers...")
        scrapers = self.load_scrapers()
        print(f"Loaded {len(scrapers)} scrapers.")
        print("Starting scrape...")
        self.scrape(scrapers)
        print("Scrape complete.")
        print("Executing postscrape chores...")
        self.postscrape_chores()
        print("Brew complete.")
        self.logger.info("Brew complete.")
Run Gruel
scrapers.
:params:
subgruel_classes
: A list of class names for scrapers that should be loaded.
In order to be loaded, a scraper class must have a name in this list and have Gruel
somewhere in its inheritance hierarchy.
file_exclude_patterns
: Files that match these patterns will not be scanned.
scan_path
: The path to scan for scraper classes.
file_include_patterns
: Files that match these patterns will be scanned.
recursive
: Whether the scan should be recursive or not.
>>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
>>> brewer.brew()
65 def load_scrapers(self) -> list[Gruel]: 66 """Load scraper classes that inherit from `Gruel`. 67 68 #### :params: 69 70 `directory`: The path to scan for scraper classes. 71 72 `class_names`: A list of class names for scrapers that should be loaded. 73 In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy. 74 75 `include_patterns`: Files that match these patterns will be scanned. 76 77 `exclude_patterns`: Files that match these patterns will not be scanned. 78 79 `recursive`: Whether the search should be recursive or not. 80 81 >>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])""" 82 globber = self.scan_path.glob 83 if self.recursive: 84 globber = self.scan_path.rglob 85 files = [ 86 str(file) 87 for pattern in self.file_include_patterns 88 for file in globber(pattern) 89 ] 90 files = younotyou(files, exclude_patterns=self.file_exclude_patterns) 91 modules = [] 92 self._module_names = [] 93 for file in files: 94 try: 95 module_name = Pathier(file).stem 96 self._module_names.append(module_name) 97 module = importlib.machinery.SourceFileLoader( 98 module_name, file 99 ).load_module() 100 modules.append(module) 101 except Exception as e: 102 ... 103 gruels = [ 104 getattr(module, class_) 105 for module in modules 106 for class_ in self.subgruel_classes 107 if class_ in dir(module) and self.is_subgruel(getattr(module, class_)) 108 ] 109 self.logger.info( 110 "\n".join( 111 [f"Imported {len(gruels)} scrapers: "] 112 + [str(gruel) for gruel in gruels] 113 ) 114 ) 115 return gruels
Load scraper classes that inherit from Gruel
.
:params:
directory
: The path to scan for scraper classes.
class_names
: A list of class names for scrapers that should be loaded.
In order to be loaded, a scraper class must have a name in this list and have Gruel
somewhere in its inheritance hierarchy.
include_patterns
: Files that match these patterns will be scanned.
exclude_patterns
: Files that match these patterns will not be scanned.
recursive
: Whether the search should be recursive or not.
>>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])
117 def pop_modules(self): 118 """Unload modules.""" 119 for module in self._module_names: 120 sys.modules.pop(module) 121 self._module_names = []
Unload modules.
123 def get_bases(self, object: Any) -> list[Any]: 124 """Returns a recursive list of all the classes `object` inherits from.""" 125 parents = [] 126 bases = object.__bases__ 127 if not bases: 128 return parents 129 for base in bases: 130 parents.append(base) 131 parents.extend(self.get_bases(base)) 132 return parents
Returns a recursive list of all the classes object
inherits from.
134 def is_subgruel(self, object: Any) -> bool: 135 """Returns whether `object` inherits from `Gruel` somewhere in its ancestory.""" 136 if not inspect.isclass(object) or Gruel not in self.get_bases(object): 137 return False 138 return True
Returns whether object
inherits from Gruel
somewhere in its ancestry.
140 def prescrape_chores(self): 141 """Override to add any tasks to be done before running the scrapers.""" 142 ...
Override to add any tasks to be done before running the scrapers.
144 def postscrape_chores(self): 145 """Override to add any tasks to be done after running the scrapers.""" 146 self.pop_modules()
Override to add any tasks to be done after running the scrapers.
148 def scrape(self, scrapers: list[Gruel]): 149 """Run the `scrape()` method for each scraper in `scrapers`. 150 151 Execution is multithreaded.""" 152 pool = PoolBar("thread", [scraper().scrape for scraper in scrapers]) # type: ignore 153 pool.execute()
Run the scrape()
method for each scraper in scrapers
.
Execution is multithreaded.
155 def brew(self): 156 """Execute pipeline. 157 158 1. self.prescrape_chores() 159 2. self.load_scrapers() 160 3. self.scrape() 161 4. self.postscrape_chores()""" 162 self.logger.info("Beginning brew") 163 print("Beginning brew") 164 print("Executing prescrape chores...") 165 self.prescrape_chores() 166 print("Loading scrapers...") 167 scrapers = self.load_scrapers() 168 print(f"Loaded {len(scrapers)} scrapers.") 169 print("Starting scrape...") 170 self.scrape(scrapers) 171 print("Scrape complete.") 172 print("Executing postscrape chores...") 173 self.postscrape_chores() 174 print("Brew complete.") 175 self.logger.info("Brew complete.")
Execute pipeline.
- self.prescrape_chores()
- self.load_scrapers()
- self.scrape()
- self.postscrape_chores()
178def get_args() -> argparse.Namespace: 179 parser = argparse.ArgumentParser() 180 181 parser.add_argument( 182 "subgruel_classes", 183 type=str, 184 nargs="*", 185 help=""" A list of Gruel scraper class names to find and import. """, 186 ) 187 parser.add_argument( 188 "-e", 189 "--excludes", 190 type=str, 191 nargs="*", 192 default=[], 193 help=""" A list of glob style file patterns to exclude from the scan. """, 194 ) 195 parser.add_argument( 196 "-i", 197 "--includes", 198 type=str, 199 nargs="*", 200 default=["*.py"], 201 help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """, 202 ) 203 parser.add_argument( 204 "-p", 205 "--path", 206 type=str, 207 default=Pathier.cwd(), 208 help=""" The directory path to scan. Defaults to the current working directory. """, 209 ) 210 parser.add_argument( 211 "-r", 212 "--recursive", 213 action="store_true", 214 help=""" Whether -p/--path should be scanned recursively or not. """, 215 ) 216 args = parser.parse_args() 217 args.path = Pathier(args.path) 218 219 return args