seleniumuser.seleniumuser
1import atexit 2import os 3import random 4import sys 5import time 6from pathlib import Path 7from types import LambdaType 8from typing import Any 9from warnings import warn 10 11from bs4 import BeautifulSoup 12from noiftimer import Timer 13from selenium import webdriver 14from selenium.webdriver.chrome.options import Options as ChromeOptions 15from selenium.webdriver.chrome.service import Service as ChromeService 16from selenium.webdriver.common.by import By 17from selenium.webdriver.common.keys import Keys 18from selenium.webdriver.firefox.firefox_profile import FirefoxProfile 19from selenium.webdriver.firefox.options import Options as FirefoxOptions 20from selenium.webdriver.firefox.service import Service as FirefoxService 21from selenium.webdriver.remote.webelement import WebElement 22from selenium.webdriver.support.ui import Select 23from voxscribe import get_text_from_url 24from whosyouragent import get_agent 25 26 27class User: 28 """Sits on top of selenium to streamline 29 automation and scraping tasks.""" 30 31 def __init__( 32 self, 33 headless: bool = False, 34 browser_type: str = "firefox", 35 implicit_wait: int = 10, 36 page_load_timeout: int = 60, 37 open_browser: bool = True, 38 locator_method: str = "xpath", 39 randomize_user_agent: bool = True, 40 user_agent_rotation_period: int = None, 41 move_window_by: tuple[int, int] = (0, -1000), 42 download_dir: str | Path = None, 43 driver_path: str | Path = None, 44 ): 45 """ 46 :param headless: If True, browser window will not be visible. 47 48 :param browser_type: Which browser to use. Can be 'firefox' or 'chrome'. 49 50 :param implicit_wait: Number of seconds to look for a specified element before 51 selenium considers it missing and throws an exception. 52 53 :param page_load_timeout: Time in seconds for selenium to wait for a page to load 54 before throwing an exception. 55 56 :param open_browser: If True, opens a browser window when a User object is created. 
57 If False, a manual call to self.open_browser() must be made. 58 59 :param locator_method: The locator type User should expect to be given. 60 Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'. 61 Every member function with a 'locator' argument refers to a string matching 62 the current locator_method. 63 64 :param randomize_user_agent: If True, a random useragent will be used whenever 65 the browser is opened. If False, the native useragent will be used. 66 67 :param user_agent_rotation_period: If not None, the browser window will be closed 68 and reopened with a new useragent every user_agent_rotation_period number of minutes. 69 Rotation occurs on the first call to self.get() after the time period has elapsed. 70 Ignored if randomize_user_agent is False. 71 72 :param move_window_by: The x and y amount of pixels to move the browser window by after opening. 73 74 :param download_dir: The download folder to use. If None, the default folder will be used. 75 76 :param driver_path: The path to the webdriver executable selenium should use. 77 If None, the system PATH will be checked for the executable. 78 If the executable isn't found, the parent directories and the immediate child directories 79 of the current working directory will be searched. 
80 """ 81 self.headless = headless 82 browser_type = browser_type.lower() 83 if browser_type in ["firefox", "chrome"]: 84 self.browser_type = browser_type 85 else: 86 raise ValueError("'browser_type' parameter must be 'firefox' or 'chrome'") 87 self.browser_open = False 88 self.implicit_wait = implicit_wait 89 self.page_load_timeout = page_load_timeout 90 self.rotation_timer = Timer() 91 self.timer = Timer() 92 self.timer.start() 93 self.randomize_user_agent = randomize_user_agent 94 self.user_agent_rotation_period = user_agent_rotation_period 95 self.locator_method = locator_method 96 self.turbo() 97 self.keys = Keys 98 self.move_window_by = move_window_by 99 self.download_dir = download_dir 100 self.driver_path = driver_path 101 if not self.driver_path: 102 self.search_for_driver() 103 if open_browser: 104 self.open_browser() 105 else: 106 self.browser = None 107 atexit.register(self.close_browser) 108 109 def __enter__(self): 110 return self 111 112 def __exit__(self, *args): 113 self.close_browser() 114 115 def configure_firefox(self) -> FirefoxService: 116 """Configure options and profile for firefox.""" 117 self.options = FirefoxOptions() 118 self.options.headless = self.headless 119 self.options.set_preference( 120 "widget.windows.window_occlusion_tracking.enabled", False 121 ) 122 self.options.set_preference("dom.webaudio.enabled", False) 123 if self.randomize_user_agent: 124 self.options.set_preference("general.useragent.override", get_agent()) 125 if self.download_dir: 126 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 127 self.profile = FirefoxProfile() 128 self.profile.set_preference("browser.download.dir", str(self.download_dir)) 129 self.profile.set_preference("browser.download.folderList", 2) 130 else: 131 self.profile = None 132 self.service = FirefoxService( 133 executable_path=str(self.driver_path), log_path=os.devnull 134 ) 135 136 def configure_chrome(self) -> ChromeService: 137 """Configure options and profile for chrome.""" 138 
self.options = ChromeOptions() 139 self.options.headless = self.headless 140 self.options.add_argument("--disable-blink-features=AutomationControlled") 141 self.options.add_argument("--mute-audio") 142 self.options.add_argument("--disable-infobars") 143 self.options.add_argument("--disable-notifications") 144 self.options.add_argument("--log-level=3") 145 if self.randomize_user_agent: 146 self.options.add_argument(f"--user-agent={get_agent()}") 147 self.options.add_experimental_option("useAutomationExtension", False) 148 if self.download_dir: 149 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 150 self.options.add_experimental_option( 151 "prefs", {"download.default_directory": str(self.download_dir)} 152 ) 153 self.service = ChromeService( 154 executable_path=str(self.driver_path), log_path=os.devnull 155 ) 156 157 def search_for_driver(self): 158 """Searches for the webdriver executable.""" 159 cwd = Path.cwd() 160 found = False 161 match self.browser_type: 162 case "firefox": 163 driver = "geckodriver.exe" 164 case "chrome": 165 driver = "chromedriver.exe" 166 # search PATH 167 env_path = os.environ["PATH"] 168 if sys.platform == "win32": 169 env_paths = env_path.split(";") 170 else: 171 env_paths = env_path.split(":") 172 driver = driver[: driver.find(".")] 173 for path in env_paths: 174 if (Path(path) / driver).exists(): 175 self.driver_path = Path(path) / driver 176 found = True 177 break 178 # check current working directory and parent folders 179 if not found: 180 while cwd != cwd.parent: 181 if (cwd / driver).exists(): 182 self.driver_path = cwd / driver 183 found = True 184 break 185 cwd = cwd.parent 186 # check top most level 187 if not found and (cwd / driver).exists(): 188 self.driver_path = cwd / driver 189 found = True 190 # check child folders (only 1 level down) 191 if not found: 192 for child in Path.cwd().iterdir(): 193 if child.is_dir() and (child / driver).exists(): 194 self.driver_path = child / driver 195 found = True 196 if not 
found: 197 warn(f"Could not find {driver}") 198 199 def set_implicit_wait(self, wait_time: int = None): 200 """Sets to default time if no arg given.""" 201 if not wait_time: 202 self.browser.implicitly_wait(self.implicit_wait) 203 else: 204 self.browser.implicitly_wait(wait_time) 205 206 def open_browser(self): 207 """Configures and opens selenium browser.""" 208 if not self.browser_open: 209 match self.browser_type: 210 case "firefox": 211 self.configure_firefox() 212 self.browser = webdriver.Firefox( 213 options=self.options, 214 service=self.service, 215 firefox_profile=self.profile, 216 ) 217 case "chrome": 218 self.configure_chrome() 219 self.browser = webdriver.Chrome( 220 options=self.options, service=self.service 221 ) 222 self.set_implicit_wait() 223 self.browser.maximize_window() 224 self.browser.set_window_position( 225 self.move_window_by[0], self.move_window_by[1] 226 ) 227 self.browser.maximize_window() 228 self.browser.set_page_load_timeout(self.page_load_timeout) 229 self.browser_open = True 230 self.tab_index = 0 231 self.rotation_timer.start() 232 else: 233 warn("Browser already open.") 234 235 def close_browser(self): 236 """Close browser window.""" 237 if self.browser_open: 238 self.browser_open = False 239 self.browser.quit() 240 241 def open_tab(self, url: str = "", switch_to_tab: bool = True): 242 """Opens new tab and, if provided, goes to url. 
243 244 New tab is inserted after currently active tab.""" 245 self.script("window.open(arguments[0]);", url) 246 if switch_to_tab: 247 self.switch_to_tab(self.tab_index + 1) 248 249 def switch_to_tab(self, tab_index: int): 250 """Switch to a tab in browser, zero indexed.""" 251 self.browser.switch_to.window(self.browser.window_handles[tab_index]) 252 self.tab_index = tab_index 253 254 def get_num_tabs(self) -> int: 255 """Returns number of tabs open.""" 256 return len(self.browser.window_handles) 257 258 def close_tab(self, tab_index: int = 1): 259 """Close specified tab and 260 switches to tab index 0.""" 261 self.switch_to_tab(tab_index) 262 self.browser.close() 263 self.switch_to_tab(0) 264 265 def get(self, url: str): 266 """Requests webpage at given url and rotates userAgent if necessary.""" 267 if not self.browser_open: 268 self.open_browser() 269 if ( 270 self.randomize_user_agent 271 and self.user_agent_rotation_period is not None 272 and self.rotation_timer.check(format=False) 273 > (60 * self.user_agent_rotation_period) 274 ): 275 self.rotation_timer.stop() 276 self.close_browser() 277 self.open_browser() 278 self.browser.get(url) 279 self.script("Object.defineProperty(navigator, 'webdriver', {get: () => false})") 280 self.chill(self.arrival_wait) 281 282 def get_soup(self) -> BeautifulSoup: 283 """Returns a BeautifulSoup object 284 of the current page source.""" 285 return BeautifulSoup(self.browser.page_source, "html.parser") 286 287 def current_url(self) -> str: 288 """Returns current url of active tab.""" 289 return self.browser.current_url 290 291 def delete_cookies(self): 292 """Delete all cookies for 293 this browser instance.""" 294 self.browser.delete_all_cookies() 295 296 def turbo(self, engage: bool = True): 297 """When engaged, strings will be sent 298 to elements all at once and there will be 299 no waiting after actions. 
300 301 When disengaged, strings will be sent to elements 302 'one key at a time' with randomized amounts of 303 time between successive keys and after actions.""" 304 if engage: 305 self.after_key_wait = (0, 0) 306 self.after_field_wait = (0, 0) 307 self.after_click_wait = (0, 0) 308 self.arrival_wait = (1, 1) 309 self.one_key_at_a_time = False 310 self.turbo_engaged = True 311 else: 312 self.after_key_wait = (0.1, 0.5) 313 self.after_field_wait = (1, 2) 314 self.after_click_wait = (0.25, 1.5) 315 self.arrival_wait = (4, 10) 316 self.one_key_at_a_time = True 317 self.turbo_engaged = False 318 319 def chill(self, min_max: tuple[float, float]): 320 """Sleeps a random amount 321 between min_max[0] and min_max[1].""" 322 time.sleep(random.uniform(min_max[0], min_max[1])) 323 324 def script(self, script: str, args: Any = None) -> Any: 325 """Execute javascript code and returns result.""" 326 return self.browser.execute_script(script, args) 327 328 def remove(self, locator: str): 329 """Removes element from DOM.""" 330 self.script("arguments[0].remove();", self.find(locator)) 331 332 def get_length(self, locator: str) -> int: 333 """Returns number of child elements for a given element.""" 334 return int(self.script("return arguments[0].length;", self.find(locator))) 335 336 def find(self, locator: str) -> WebElement: 337 """Finds and returns a WebElement.""" 338 match self.locator_method: 339 case "xpath": 340 return self.browser.find_element(By.XPATH, locator) 341 case "id": 342 return self.browser.find_element(By.ID, locator) 343 case "className": 344 return self.browser.find_element(By.CLASS_NAME, locator) 345 case "name": 346 return self.browser.find_element(By.NAME, locator) 347 case "cssSelector": 348 return self.browser.find_element(By.CSS_SELECTOR, locator) 349 350 def find_children(self, locator: str) -> list[WebElement]: 351 """Returns a list of child WebElements 352 for given locator arg.""" 353 element = self.find(locator) 354 return 
element.find_elements("xpath", "./*") 355 356 def scroll(self, amount: int = None, fraction: float = None): 357 """Scroll web page. 358 :param amount: The number of lines to scroll if not None. 359 360 :param fraction: The amount between 0.0 and 1.0 361 of the page height to scroll. 362 363 If values are provided for both arguments, 364 amount will be used. 365 366 If values are provided for neither argument, 367 the entire page length will be scrolled. 368 369 Scrolls one line at a time if self.turbo is False.""" 370 if amount: 371 amount_to_scroll = amount 372 elif fraction: 373 amount_to_scroll = int( 374 fraction 375 * ( 376 int(self.script("return document.body.scrollHeight;")) 377 - int(self.script("return window.pageYOffset;")) 378 ) 379 ) 380 else: 381 amount_to_scroll = int(self.script("return document.body.scrollHeight;")) 382 if self.turbo_engaged: 383 self.script("window.scrollBy(0,arguments[0]);", amount_to_scroll) 384 else: 385 for _ in range(abs(amount_to_scroll)): 386 if amount_to_scroll >= 0: 387 self.script("window.scrollBy(0,1);") 388 else: 389 self.script("window.scrollBy(0,-1);") 390 self.chill(self.after_click_wait) 391 392 def scroll_into_view(self, locator: str) -> WebElement: 393 """Scrolls to a given element and returns the element.""" 394 element = self.find(locator) 395 self.script("arguments[0].scroll_into_view();", element) 396 self.chill(self.after_click_wait) 397 return element 398 399 def text(self, locator: str) -> str: 400 """Returns text of WebElement.""" 401 return self.find(locator).text 402 403 def click(self, locator: str) -> WebElement: 404 """Clicks on and returns WebElement.""" 405 element = self.find(locator) 406 element.click() 407 self.chill(self.after_click_wait) 408 return element 409 410 def clear(self, locator: str) -> WebElement: 411 """Clears content of WebElement if able 412 and then returns WebElement.""" 413 element = self.find(locator) 414 element.clear() 415 self.chill(self.after_click_wait) 416 return 
element 417 418 def switch_to_iframe(self, locator: str): 419 """Switch to an iframe from given locator.""" 420 self.browser.switch_to.frame(self.find(locator)) 421 422 def switch_to_parent_frame(self): 423 """Move up a frame level from current frame.""" 424 self.browser.switch_to.parent_frame() 425 426 def select( 427 self, locator: str, method: str, choice: str | int | tuple 428 ) -> WebElement: 429 """Select a choice from Select element. 430 Returns the Select element from the locator string, 431 not the option element that is selected. 432 433 :param method: Can be 'value' or 'index' 434 435 :param choice: The option to select. 436 437 If method is 'value', then choice should be 438 the html 'value' attribute of the desired option. 439 440 If method is 'index', choice can either be a single 441 int for the desired option or it can be a two-tuple. 442 If the tuple is provided, a random option between the 443 two indicies (inclusive) will be selected.""" 444 element = self.click(locator) 445 match method: 446 case "value": 447 Select(element).select_by_value(choice) 448 case "index": 449 if type(choice) == tuple: 450 choice = random.randint(choice[0], choice[1]) 451 Select(element).select_by_index(choice) 452 self.chill(self.after_field_wait) 453 return element 454 455 def click_elements( 456 self, locators: list[str], max_selections: int = None, min_selections: int = 1 457 ) -> WebElement: 458 """Click a random number of WebElements 459 and return the last WebElement clicked. 460 461 :param locators: A list of element locators to choose from. 462 463 :param max_selections: The maximum number of elements to click. 464 If None, the maximum will be the length of the locators list. 465 466 :param min_selections: The minimum number of elements to click. 467 468 e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3) 469 will click between 1 and 3 random elements from the list. 
470 """ 471 if not max_selections: 472 max_selections = len(locators) 473 for option in random.sample( 474 locators, k=random.randint(min_selections, max_selections) 475 ): 476 element = self.click(option) 477 return element 478 479 def get_click_list( 480 self, num_options: int, max_choices: int = 1, min_choices: int = 1 481 ) -> list[str]: 482 """Similar to self.click_elements(), but for use with the self.fill_next() method. 483 484 Creates a list of length 'num_options' where every element is 'skip'. 485 486 A random number of elements in the list between 'min_choices' and 'max_choices' are 487 replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).""" 488 click_list = ["skip"] * num_options 489 selected_indexes = [] 490 for i in range(random.randint(min_choices, max_choices)): 491 index = random.randint(0, num_options - 1) 492 while index in selected_indexes: 493 index = random.randint(0, num_options - 1) 494 selected_indexes.append(index) 495 click_list[index] = self.keys.SPACE 496 return click_list 497 498 def send_keys( 499 self, 500 locator: str, 501 data: str, 502 click_first: bool = True, 503 clear_first: bool = False, 504 ) -> WebElement: 505 """Types data into element and returns the element. 506 507 :param data: The string to send to the element. 508 509 :param click_first: If True, the element is clicked on 510 before the data is sent. 
511 512 :param clear_first: If True, the current text of the element 513 is cleared before the data is sent.""" 514 element = self.click(locator) if click_first else self.find(locator) 515 if clear_first: 516 element.clear() 517 self.chill(self.after_click_wait) 518 if self.one_key_at_a_time: 519 for ch in str(data): 520 element.send_keys(ch) 521 self.chill(self.after_key_wait) 522 else: 523 element.send_keys(str(data)) 524 self.chill(self.after_field_wait) 525 return element 526 527 def fill_next( 528 self, data: list[str | tuple], start_element: WebElement = None 529 ) -> WebElement: 530 """Fills a form by tabbing from the current WebElement 531 to the next one and using the corresponding item in data. 532 Returns the last WebElement. 533 534 :param data: A list of form data. If an item is a string (except for 'skip') 535 it will be typed into the current WebElement. 536 537 An item in data can be a two-tuple of the form 538 ('downArrow', numberOfPresses:int|tuple[int, int]). 539 540 If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent 541 that many times to the WebElement. 542 543 If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random 544 number of times between numberOfPresses[0] and numberOfPresses[1] inclusive. 545 This is typically for use with Select elements. 546 547 An item in data can also be 'skip', which will perform no action on the current 548 WebElement and will continue to the next one. 549 550 An item in data can also be 'click=n', where 'n' is an integer b/t 0 and 100, 551 representing a percent chance an element will be clicked or skipped: 552 >>> user.fill_next(["click=70"]) 553 554 has a 70% chance of being 555 >>> user.fill_next([user.keys.SPACE]) 556 557 and a 30% chance of being 558 >>> user.fill_next(["skip"]) 559 560 561 :param start_element: The WebElement to start tabbing from. 562 The currently active element will be used if start_element is None. 
563 564 Note: The function tabs to the next element before sending data, 565 so the start_element should the WebElement before the one 566 that should receive data[0]. 567 """ 568 element = ( 569 self.browser.switch_to.active_element 570 if not start_element 571 else start_element 572 ) 573 for datum in data: 574 element.send_keys(Keys.TAB) 575 element = self.browser.switch_to.active_element 576 self.chill(self.after_key_wait) 577 if type(datum) == str and datum.strip().startswith("click="): 578 chance = int(datum.split("=")[1].strip()) 579 if random.randint(0, 100) <= chance: 580 datum = Keys.SPACE 581 else: 582 datum = "skip" 583 if datum[0] == "downArrow": 584 if type(datum[1]) == tuple: 585 times = random.randint(datum[1][0], datum[1][1]) 586 else: 587 times = datum[1] 588 for _ in range(times): 589 element.send_keys(Keys.ARROW_DOWN) 590 self.chill(self.after_key_wait) 591 elif datum == "skip": 592 self.chill(self.after_key_wait) 593 else: 594 595 if self.turbo_engaged: 596 element.send_keys(str(datum)) 597 else: 598 for ch in str(datum): 599 element.send_keys(ch) 600 self.chill(self.after_key_wait) 601 self.chill(self.after_field_wait) 602 return element 603 604 def wait_until( 605 self, condition: LambdaType, max_wait: float = 10, polling_interval: float = 0.1 606 ): 607 """Checks condition repeatedly until either it is true, 608 or the max_wait is exceeded. 609 610 Raises a TimeoutError if the condition doesn't success within max_wait. 611 612 Useful for determing whether a form has been successfully submitted. 613 614 :param condition: The condition function to check. 615 616 :param max_wait: Number of seconds to continue checking condition 617 before throwing a TimeoutError. 618 619 :param polling_interval: The number of seconds to sleep before 620 checking the condition function again after it fails. 621 622 e.g. 
self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]))""" 623 start_time = time.time() 624 while True: 625 try: 626 if condition(): 627 time.sleep(1) 628 break 629 elif (time.time() - start_time) > max_wait: 630 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 631 else: 632 time.sleep(polling_interval) 633 except: 634 if (time.time() - start_time) > max_wait: 635 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 636 else: 637 time.sleep(polling_interval) 638 639 def dismiss_alert(self): 640 """Dismiss alert dialog.""" 641 self.browser.switch_to.alert.dismiss() 642 643 def solve_recaptcha_v3( 644 self, 645 outer_iframe_xpath: str = '//iframe[@title="reCAPTCHA"]', 646 inner_iframe_xpath: str = '//iframe[@title="recaptcha challenge expires in two minutes"]', 647 ): 648 """Pass google recaptcha v3 by solving an audio puzzle. 649 650 :param outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox. 651 If it's the recaptcha without the initial checkbox that just shows the image puzzle, 652 pass None to this argument. 653 654 """ 655 locator_method = self.locator_method 656 self.locator_method = "xpath" 657 try: 658 if outer_iframe_xpath: 659 self.switch_to_iframe(outer_iframe_xpath) 660 self.click('//*[@id="recaptcha-anchor"]') 661 self.switch_to_parent_frame() 662 self.switch_to_iframe(inner_iframe_xpath) 663 self.click('//*[@id="recaptcha-audio-button"]') 664 mp3_url = self.find( 665 '//a[@class="rc-audiochallenge-tdownload-link"]' 666 ).get_attribute("href") 667 text = get_text_from_url(mp3_url, ".mp3") 668 self.send_keys('//*[@id="audio-response"]', text) 669 self.click('//*[@id="recaptcha-verify-button"]') 670 except Exception as e: 671 print(e) 672 raise Exception("Could not solve captcha") 673 finally: 674 self.switch_to_parent_frame() 675 self.locator_method = locator_method
28class User: 29 """Sits on top of selenium to streamline 30 automation and scraping tasks.""" 31 32 def __init__( 33 self, 34 headless: bool = False, 35 browser_type: str = "firefox", 36 implicit_wait: int = 10, 37 page_load_timeout: int = 60, 38 open_browser: bool = True, 39 locator_method: str = "xpath", 40 randomize_user_agent: bool = True, 41 user_agent_rotation_period: int = None, 42 move_window_by: tuple[int, int] = (0, -1000), 43 download_dir: str | Path = None, 44 driver_path: str | Path = None, 45 ): 46 """ 47 :param headless: If True, browser window will not be visible. 48 49 :param browser_type: Which browser to use. Can be 'firefox' or 'chrome'. 50 51 :param implicit_wait: Number of seconds to look for a specified element before 52 selenium considers it missing and throws an exception. 53 54 :param page_load_timeout: Time in seconds for selenium to wait for a page to load 55 before throwing an exception. 56 57 :param open_browser: If True, opens a browser window when a User object is created. 58 If False, a manual call to self.open_browser() must be made. 59 60 :param locator_method: The locator type User should expect to be given. 61 Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'. 62 Every member function with a 'locator' argument refers to a string matching 63 the current locator_method. 64 65 :param randomize_user_agent: If True, a random useragent will be used whenever 66 the browser is opened. If False, the native useragent will be used. 67 68 :param user_agent_rotation_period: If not None, the browser window will be closed 69 and reopened with a new useragent every user_agent_rotation_period number of minutes. 70 Rotation occurs on the first call to self.get() after the time period has elapsed. 71 Ignored if randomize_user_agent is False. 72 73 :param move_window_by: The x and y amount of pixels to move the browser window by after opening. 74 75 :param download_dir: The download folder to use. 
If None, the default folder will be used. 76 77 :param driver_path: The path to the webdriver executable selenium should use. 78 If None, the system PATH will be checked for the executable. 79 If the executable isn't found, the parent directories and the immediate child directories 80 of the current working directory will be searched. 81 """ 82 self.headless = headless 83 browser_type = browser_type.lower() 84 if browser_type in ["firefox", "chrome"]: 85 self.browser_type = browser_type 86 else: 87 raise ValueError("'browser_type' parameter must be 'firefox' or 'chrome'") 88 self.browser_open = False 89 self.implicit_wait = implicit_wait 90 self.page_load_timeout = page_load_timeout 91 self.rotation_timer = Timer() 92 self.timer = Timer() 93 self.timer.start() 94 self.randomize_user_agent = randomize_user_agent 95 self.user_agent_rotation_period = user_agent_rotation_period 96 self.locator_method = locator_method 97 self.turbo() 98 self.keys = Keys 99 self.move_window_by = move_window_by 100 self.download_dir = download_dir 101 self.driver_path = driver_path 102 if not self.driver_path: 103 self.search_for_driver() 104 if open_browser: 105 self.open_browser() 106 else: 107 self.browser = None 108 atexit.register(self.close_browser) 109 110 def __enter__(self): 111 return self 112 113 def __exit__(self, *args): 114 self.close_browser() 115 116 def configure_firefox(self) -> FirefoxService: 117 """Configure options and profile for firefox.""" 118 self.options = FirefoxOptions() 119 self.options.headless = self.headless 120 self.options.set_preference( 121 "widget.windows.window_occlusion_tracking.enabled", False 122 ) 123 self.options.set_preference("dom.webaudio.enabled", False) 124 if self.randomize_user_agent: 125 self.options.set_preference("general.useragent.override", get_agent()) 126 if self.download_dir: 127 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 128 self.profile = FirefoxProfile() 129 self.profile.set_preference("browser.download.dir", 
str(self.download_dir)) 130 self.profile.set_preference("browser.download.folderList", 2) 131 else: 132 self.profile = None 133 self.service = FirefoxService( 134 executable_path=str(self.driver_path), log_path=os.devnull 135 ) 136 137 def configure_chrome(self) -> ChromeService: 138 """Configure options and profile for chrome.""" 139 self.options = ChromeOptions() 140 self.options.headless = self.headless 141 self.options.add_argument("--disable-blink-features=AutomationControlled") 142 self.options.add_argument("--mute-audio") 143 self.options.add_argument("--disable-infobars") 144 self.options.add_argument("--disable-notifications") 145 self.options.add_argument("--log-level=3") 146 if self.randomize_user_agent: 147 self.options.add_argument(f"--user-agent={get_agent()}") 148 self.options.add_experimental_option("useAutomationExtension", False) 149 if self.download_dir: 150 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 151 self.options.add_experimental_option( 152 "prefs", {"download.default_directory": str(self.download_dir)} 153 ) 154 self.service = ChromeService( 155 executable_path=str(self.driver_path), log_path=os.devnull 156 ) 157 158 def search_for_driver(self): 159 """Searches for the webdriver executable.""" 160 cwd = Path.cwd() 161 found = False 162 match self.browser_type: 163 case "firefox": 164 driver = "geckodriver.exe" 165 case "chrome": 166 driver = "chromedriver.exe" 167 # search PATH 168 env_path = os.environ["PATH"] 169 if sys.platform == "win32": 170 env_paths = env_path.split(";") 171 else: 172 env_paths = env_path.split(":") 173 driver = driver[: driver.find(".")] 174 for path in env_paths: 175 if (Path(path) / driver).exists(): 176 self.driver_path = Path(path) / driver 177 found = True 178 break 179 # check current working directory and parent folders 180 if not found: 181 while cwd != cwd.parent: 182 if (cwd / driver).exists(): 183 self.driver_path = cwd / driver 184 found = True 185 break 186 cwd = cwd.parent 187 # check 
top most level 188 if not found and (cwd / driver).exists(): 189 self.driver_path = cwd / driver 190 found = True 191 # check child folders (only 1 level down) 192 if not found: 193 for child in Path.cwd().iterdir(): 194 if child.is_dir() and (child / driver).exists(): 195 self.driver_path = child / driver 196 found = True 197 if not found: 198 warn(f"Could not find {driver}") 199 200 def set_implicit_wait(self, wait_time: int = None): 201 """Sets to default time if no arg given.""" 202 if not wait_time: 203 self.browser.implicitly_wait(self.implicit_wait) 204 else: 205 self.browser.implicitly_wait(wait_time) 206 207 def open_browser(self): 208 """Configures and opens selenium browser.""" 209 if not self.browser_open: 210 match self.browser_type: 211 case "firefox": 212 self.configure_firefox() 213 self.browser = webdriver.Firefox( 214 options=self.options, 215 service=self.service, 216 firefox_profile=self.profile, 217 ) 218 case "chrome": 219 self.configure_chrome() 220 self.browser = webdriver.Chrome( 221 options=self.options, service=self.service 222 ) 223 self.set_implicit_wait() 224 self.browser.maximize_window() 225 self.browser.set_window_position( 226 self.move_window_by[0], self.move_window_by[1] 227 ) 228 self.browser.maximize_window() 229 self.browser.set_page_load_timeout(self.page_load_timeout) 230 self.browser_open = True 231 self.tab_index = 0 232 self.rotation_timer.start() 233 else: 234 warn("Browser already open.") 235 236 def close_browser(self): 237 """Close browser window.""" 238 if self.browser_open: 239 self.browser_open = False 240 self.browser.quit() 241 242 def open_tab(self, url: str = "", switch_to_tab: bool = True): 243 """Opens new tab and, if provided, goes to url. 
244 245 New tab is inserted after currently active tab.""" 246 self.script("window.open(arguments[0]);", url) 247 if switch_to_tab: 248 self.switch_to_tab(self.tab_index + 1) 249 250 def switch_to_tab(self, tab_index: int): 251 """Switch to a tab in browser, zero indexed.""" 252 self.browser.switch_to.window(self.browser.window_handles[tab_index]) 253 self.tab_index = tab_index 254 255 def get_num_tabs(self) -> int: 256 """Returns number of tabs open.""" 257 return len(self.browser.window_handles) 258 259 def close_tab(self, tab_index: int = 1): 260 """Close specified tab and 261 switches to tab index 0.""" 262 self.switch_to_tab(tab_index) 263 self.browser.close() 264 self.switch_to_tab(0) 265 266 def get(self, url: str): 267 """Requests webpage at given url and rotates userAgent if necessary.""" 268 if not self.browser_open: 269 self.open_browser() 270 if ( 271 self.randomize_user_agent 272 and self.user_agent_rotation_period is not None 273 and self.rotation_timer.check(format=False) 274 > (60 * self.user_agent_rotation_period) 275 ): 276 self.rotation_timer.stop() 277 self.close_browser() 278 self.open_browser() 279 self.browser.get(url) 280 self.script("Object.defineProperty(navigator, 'webdriver', {get: () => false})") 281 self.chill(self.arrival_wait) 282 283 def get_soup(self) -> BeautifulSoup: 284 """Returns a BeautifulSoup object 285 of the current page source.""" 286 return BeautifulSoup(self.browser.page_source, "html.parser") 287 288 def current_url(self) -> str: 289 """Returns current url of active tab.""" 290 return self.browser.current_url 291 292 def delete_cookies(self): 293 """Delete all cookies for 294 this browser instance.""" 295 self.browser.delete_all_cookies() 296 297 def turbo(self, engage: bool = True): 298 """When engaged, strings will be sent 299 to elements all at once and there will be 300 no waiting after actions. 
301 302 When disengaged, strings will be sent to elements 303 'one key at a time' with randomized amounts of 304 time between successive keys and after actions.""" 305 if engage: 306 self.after_key_wait = (0, 0) 307 self.after_field_wait = (0, 0) 308 self.after_click_wait = (0, 0) 309 self.arrival_wait = (1, 1) 310 self.one_key_at_a_time = False 311 self.turbo_engaged = True 312 else: 313 self.after_key_wait = (0.1, 0.5) 314 self.after_field_wait = (1, 2) 315 self.after_click_wait = (0.25, 1.5) 316 self.arrival_wait = (4, 10) 317 self.one_key_at_a_time = True 318 self.turbo_engaged = False 319 320 def chill(self, min_max: tuple[float, float]): 321 """Sleeps a random amount 322 between min_max[0] and min_max[1].""" 323 time.sleep(random.uniform(min_max[0], min_max[1])) 324 325 def script(self, script: str, args: Any = None) -> Any: 326 """Execute javascript code and returns result.""" 327 return self.browser.execute_script(script, args) 328 329 def remove(self, locator: str): 330 """Removes element from DOM.""" 331 self.script("arguments[0].remove();", self.find(locator)) 332 333 def get_length(self, locator: str) -> int: 334 """Returns number of child elements for a given element.""" 335 return int(self.script("return arguments[0].length;", self.find(locator))) 336 337 def find(self, locator: str) -> WebElement: 338 """Finds and returns a WebElement.""" 339 match self.locator_method: 340 case "xpath": 341 return self.browser.find_element(By.XPATH, locator) 342 case "id": 343 return self.browser.find_element(By.ID, locator) 344 case "className": 345 return self.browser.find_element(By.CLASS_NAME, locator) 346 case "name": 347 return self.browser.find_element(By.NAME, locator) 348 case "cssSelector": 349 return self.browser.find_element(By.CSS_SELECTOR, locator) 350 351 def find_children(self, locator: str) -> list[WebElement]: 352 """Returns a list of child WebElements 353 for given locator arg.""" 354 element = self.find(locator) 355 return 
element.find_elements("xpath", "./*") 356 357 def scroll(self, amount: int = None, fraction: float = None): 358 """Scroll web page. 359 :param amount: The number of lines to scroll if not None. 360 361 :param fraction: The amount between 0.0 and 1.0 362 of the page height to scroll. 363 364 If values are provided for both arguments, 365 amount will be used. 366 367 If values are provided for neither argument, 368 the entire page length will be scrolled. 369 370 Scrolls one line at a time if self.turbo is False.""" 371 if amount: 372 amount_to_scroll = amount 373 elif fraction: 374 amount_to_scroll = int( 375 fraction 376 * ( 377 int(self.script("return document.body.scrollHeight;")) 378 - int(self.script("return window.pageYOffset;")) 379 ) 380 ) 381 else: 382 amount_to_scroll = int(self.script("return document.body.scrollHeight;")) 383 if self.turbo_engaged: 384 self.script("window.scrollBy(0,arguments[0]);", amount_to_scroll) 385 else: 386 for _ in range(abs(amount_to_scroll)): 387 if amount_to_scroll >= 0: 388 self.script("window.scrollBy(0,1);") 389 else: 390 self.script("window.scrollBy(0,-1);") 391 self.chill(self.after_click_wait) 392 393 def scroll_into_view(self, locator: str) -> WebElement: 394 """Scrolls to a given element and returns the element.""" 395 element = self.find(locator) 396 self.script("arguments[0].scroll_into_view();", element) 397 self.chill(self.after_click_wait) 398 return element 399 400 def text(self, locator: str) -> str: 401 """Returns text of WebElement.""" 402 return self.find(locator).text 403 404 def click(self, locator: str) -> WebElement: 405 """Clicks on and returns WebElement.""" 406 element = self.find(locator) 407 element.click() 408 self.chill(self.after_click_wait) 409 return element 410 411 def clear(self, locator: str) -> WebElement: 412 """Clears content of WebElement if able 413 and then returns WebElement.""" 414 element = self.find(locator) 415 element.clear() 416 self.chill(self.after_click_wait) 417 return 
element 418 419 def switch_to_iframe(self, locator: str): 420 """Switch to an iframe from given locator.""" 421 self.browser.switch_to.frame(self.find(locator)) 422 423 def switch_to_parent_frame(self): 424 """Move up a frame level from current frame.""" 425 self.browser.switch_to.parent_frame() 426 427 def select( 428 self, locator: str, method: str, choice: str | int | tuple 429 ) -> WebElement: 430 """Select a choice from Select element. 431 Returns the Select element from the locator string, 432 not the option element that is selected. 433 434 :param method: Can be 'value' or 'index' 435 436 :param choice: The option to select. 437 438 If method is 'value', then choice should be 439 the html 'value' attribute of the desired option. 440 441 If method is 'index', choice can either be a single 442 int for the desired option or it can be a two-tuple. 443 If the tuple is provided, a random option between the 444 two indicies (inclusive) will be selected.""" 445 element = self.click(locator) 446 match method: 447 case "value": 448 Select(element).select_by_value(choice) 449 case "index": 450 if type(choice) == tuple: 451 choice = random.randint(choice[0], choice[1]) 452 Select(element).select_by_index(choice) 453 self.chill(self.after_field_wait) 454 return element 455 456 def click_elements( 457 self, locators: list[str], max_selections: int = None, min_selections: int = 1 458 ) -> WebElement: 459 """Click a random number of WebElements 460 and return the last WebElement clicked. 461 462 :param locators: A list of element locators to choose from. 463 464 :param max_selections: The maximum number of elements to click. 465 If None, the maximum will be the length of the locators list. 466 467 :param min_selections: The minimum number of elements to click. 468 469 e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3) 470 will click between 1 and 3 random elements from the list. 
471 """ 472 if not max_selections: 473 max_selections = len(locators) 474 for option in random.sample( 475 locators, k=random.randint(min_selections, max_selections) 476 ): 477 element = self.click(option) 478 return element 479 480 def get_click_list( 481 self, num_options: int, max_choices: int = 1, min_choices: int = 1 482 ) -> list[str]: 483 """Similar to self.click_elements(), but for use with the self.fill_next() method. 484 485 Creates a list of length 'num_options' where every element is 'skip'. 486 487 A random number of elements in the list between 'min_choices' and 'max_choices' are 488 replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).""" 489 click_list = ["skip"] * num_options 490 selected_indexes = [] 491 for i in range(random.randint(min_choices, max_choices)): 492 index = random.randint(0, num_options - 1) 493 while index in selected_indexes: 494 index = random.randint(0, num_options - 1) 495 selected_indexes.append(index) 496 click_list[index] = self.keys.SPACE 497 return click_list 498 499 def send_keys( 500 self, 501 locator: str, 502 data: str, 503 click_first: bool = True, 504 clear_first: bool = False, 505 ) -> WebElement: 506 """Types data into element and returns the element. 507 508 :param data: The string to send to the element. 509 510 :param click_first: If True, the element is clicked on 511 before the data is sent. 
512 513 :param clear_first: If True, the current text of the element 514 is cleared before the data is sent.""" 515 element = self.click(locator) if click_first else self.find(locator) 516 if clear_first: 517 element.clear() 518 self.chill(self.after_click_wait) 519 if self.one_key_at_a_time: 520 for ch in str(data): 521 element.send_keys(ch) 522 self.chill(self.after_key_wait) 523 else: 524 element.send_keys(str(data)) 525 self.chill(self.after_field_wait) 526 return element 527 528 def fill_next( 529 self, data: list[str | tuple], start_element: WebElement = None 530 ) -> WebElement: 531 """Fills a form by tabbing from the current WebElement 532 to the next one and using the corresponding item in data. 533 Returns the last WebElement. 534 535 :param data: A list of form data. If an item is a string (except for 'skip') 536 it will be typed into the current WebElement. 537 538 An item in data can be a two-tuple of the form 539 ('downArrow', numberOfPresses:int|tuple[int, int]). 540 541 If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent 542 that many times to the WebElement. 543 544 If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random 545 number of times between numberOfPresses[0] and numberOfPresses[1] inclusive. 546 This is typically for use with Select elements. 547 548 An item in data can also be 'skip', which will perform no action on the current 549 WebElement and will continue to the next one. 550 551 An item in data can also be 'click=n', where 'n' is an integer b/t 0 and 100, 552 representing a percent chance an element will be clicked or skipped: 553 >>> user.fill_next(["click=70"]) 554 555 has a 70% chance of being 556 >>> user.fill_next([user.keys.SPACE]) 557 558 and a 30% chance of being 559 >>> user.fill_next(["skip"]) 560 561 562 :param start_element: The WebElement to start tabbing from. 563 The currently active element will be used if start_element is None. 
564 565 Note: The function tabs to the next element before sending data, 566 so the start_element should the WebElement before the one 567 that should receive data[0]. 568 """ 569 element = ( 570 self.browser.switch_to.active_element 571 if not start_element 572 else start_element 573 ) 574 for datum in data: 575 element.send_keys(Keys.TAB) 576 element = self.browser.switch_to.active_element 577 self.chill(self.after_key_wait) 578 if type(datum) == str and datum.strip().startswith("click="): 579 chance = int(datum.split("=")[1].strip()) 580 if random.randint(0, 100) <= chance: 581 datum = Keys.SPACE 582 else: 583 datum = "skip" 584 if datum[0] == "downArrow": 585 if type(datum[1]) == tuple: 586 times = random.randint(datum[1][0], datum[1][1]) 587 else: 588 times = datum[1] 589 for _ in range(times): 590 element.send_keys(Keys.ARROW_DOWN) 591 self.chill(self.after_key_wait) 592 elif datum == "skip": 593 self.chill(self.after_key_wait) 594 else: 595 596 if self.turbo_engaged: 597 element.send_keys(str(datum)) 598 else: 599 for ch in str(datum): 600 element.send_keys(ch) 601 self.chill(self.after_key_wait) 602 self.chill(self.after_field_wait) 603 return element 604 605 def wait_until( 606 self, condition: LambdaType, max_wait: float = 10, polling_interval: float = 0.1 607 ): 608 """Checks condition repeatedly until either it is true, 609 or the max_wait is exceeded. 610 611 Raises a TimeoutError if the condition doesn't success within max_wait. 612 613 Useful for determing whether a form has been successfully submitted. 614 615 :param condition: The condition function to check. 616 617 :param max_wait: Number of seconds to continue checking condition 618 before throwing a TimeoutError. 619 620 :param polling_interval: The number of seconds to sleep before 621 checking the condition function again after it fails. 622 623 e.g. 
self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]))""" 624 start_time = time.time() 625 while True: 626 try: 627 if condition(): 628 time.sleep(1) 629 break 630 elif (time.time() - start_time) > max_wait: 631 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 632 else: 633 time.sleep(polling_interval) 634 except: 635 if (time.time() - start_time) > max_wait: 636 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 637 else: 638 time.sleep(polling_interval) 639 640 def dismiss_alert(self): 641 """Dismiss alert dialog.""" 642 self.browser.switch_to.alert.dismiss() 643 644 def solve_recaptcha_v3( 645 self, 646 outer_iframe_xpath: str = '//iframe[@title="reCAPTCHA"]', 647 inner_iframe_xpath: str = '//iframe[@title="recaptcha challenge expires in two minutes"]', 648 ): 649 """Pass google recaptcha v3 by solving an audio puzzle. 650 651 :param outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox. 652 If it's the recaptcha without the initial checkbox that just shows the image puzzle, 653 pass None to this argument. 654 655 """ 656 locator_method = self.locator_method 657 self.locator_method = "xpath" 658 try: 659 if outer_iframe_xpath: 660 self.switch_to_iframe(outer_iframe_xpath) 661 self.click('//*[@id="recaptcha-anchor"]') 662 self.switch_to_parent_frame() 663 self.switch_to_iframe(inner_iframe_xpath) 664 self.click('//*[@id="recaptcha-audio-button"]') 665 mp3_url = self.find( 666 '//a[@class="rc-audiochallenge-tdownload-link"]' 667 ).get_attribute("href") 668 text = get_text_from_url(mp3_url, ".mp3") 669 self.send_keys('//*[@id="audio-response"]', text) 670 self.click('//*[@id="recaptcha-verify-button"]') 671 except Exception as e: 672 print(e) 673 raise Exception("Could not solve captcha") 674 finally: 675 self.switch_to_parent_frame() 676 self.locator_method = locator_method
Sits on top of selenium to streamline automation and scraping tasks.
def __init__(
    self,
    headless: bool = False,
    browser_type: str = "firefox",
    implicit_wait: int = 10,
    page_load_timeout: int = 60,
    open_browser: bool = True,
    locator_method: str = "xpath",
    randomize_user_agent: bool = True,
    user_agent_rotation_period: int = None,
    move_window_by: tuple[int, int] = (0, -1000),
    download_dir: str | Path = None,
    driver_path: str | Path = None,
):
    """Store the configuration and optionally launch the browser.

    :param headless: If True, browser window will not be visible.

    :param browser_type: Which browser to use. Can be 'firefox' or 'chrome'
    (case-insensitive). Any other value raises a ValueError.

    :param implicit_wait: Number of seconds to look for a specified element before
    selenium considers it missing and throws an exception.

    :param page_load_timeout: Time in seconds for selenium to wait for a page to load
    before throwing an exception.

    :param open_browser: If True, opens a browser window when a User object is created.
    If False, self.browser is set to None and a manual call to
    self.open_browser() must be made.

    :param locator_method: The locator type User should expect to be given.
    Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'.
    Every member function with a 'locator' argument refers to a string matching
    the current locator_method.

    :param randomize_user_agent: If True, a random useragent will be used whenever
    the browser is opened. If False, the native useragent will be used.

    :param user_agent_rotation_period: If not None, the browser window will be closed
    and reopened with a new useragent every user_agent_rotation_period number of minutes.
    Rotation occurs on the first call to self.get() after the time period has elapsed.
    Ignored if randomize_user_agent is False.

    :param move_window_by: The x and y amount of pixels to move the browser window by after opening.

    :param download_dir: The download folder to use. If None, the default folder will be used.

    :param driver_path: The path to the webdriver executable selenium should use.
    If None, self.search_for_driver() is called to look for it.
    """
    self.headless = headless
    requested_browser = browser_type.lower()
    if requested_browser not in ("firefox", "chrome"):
        raise ValueError("'browser_type' parameter must be 'firefox' or 'chrome'")
    self.browser_type = requested_browser
    self.browser_open = False
    self.implicit_wait = implicit_wait
    self.page_load_timeout = page_load_timeout
    # rotation_timer is (re)started by open_browser(); timer runs from construction.
    self.rotation_timer = Timer()
    self.timer = Timer()
    self.timer.start()
    self.randomize_user_agent = randomize_user_agent
    self.user_agent_rotation_period = user_agent_rotation_period
    self.locator_method = locator_method
    # Start in turbo mode (no artificial delays).
    self.turbo()
    self.keys = Keys
    self.move_window_by = move_window_by
    self.download_dir = download_dir
    self.driver_path = driver_path
    if not self.driver_path:
        self.search_for_driver()
    if open_browser:
        self.open_browser()
    else:
        self.browser = None
    # Make sure the browser process is shut down when the interpreter exits.
    atexit.register(self.close_browser)
Parameters
headless: If True, browser window will not be visible.
browser_type: Which browser to use. Can be 'firefox' or 'chrome'.
implicit_wait: Number of seconds to look for a specified element before selenium considers it missing and throws an exception.
page_load_timeout: Time in seconds for selenium to wait for a page to load before throwing an exception.
open_browser: If True, opens a browser window when a User object is created. If False, a manual call to self.open_browser() must be made.
locator_method: The locator type User should expect to be given. Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'. Every member function with a 'locator' argument refers to a string matching the current locator_method.
randomize_user_agent: If True, a random useragent will be used whenever the browser is opened. If False, the native useragent will be used.
user_agent_rotation_period: If not None, the browser window will be closed and reopened with a new useragent every user_agent_rotation_period number of minutes. Rotation occurs on the first call to self.get() after the time period has elapsed. Ignored if randomize_user_agent is False.
move_window_by: The x and y amount of pixels to move the browser window by after opening.
download_dir: The download folder to use. If None, the default folder will be used.
driver_path: The path to the webdriver executable selenium should use. If None, the system PATH will be checked for the executable. If the executable isn't found, the parent directories and the immediate child directories of the current working directory will be searched.
def configure_firefox(self) -> None:
    """Configure options, profile and service for firefox.

    Populates self.options, self.profile and self.service; nothing is returned.
    self.profile is only created when a download directory was requested,
    otherwise it is set to None.
    """
    self.options = FirefoxOptions()
    self.options.headless = self.headless
    self.options.set_preference(
        "widget.windows.window_occlusion_tracking.enabled", False
    )
    self.options.set_preference("dom.webaudio.enabled", False)
    if self.randomize_user_agent:
        self.options.set_preference("general.useragent.override", get_agent())
    if self.download_dir:
        # Create the folder up front so the browser has somewhere to write to.
        Path(self.download_dir).mkdir(parents=True, exist_ok=True)
        self.profile = FirefoxProfile()
        self.profile.set_preference("browser.download.dir", str(self.download_dir))
        # folderList is set to 2 alongside browser.download.dir
        # (presumably "use the custom directory" -- confirm against Firefox prefs).
        self.profile.set_preference("browser.download.folderList", 2)
    else:
        self.profile = None
    self.service = FirefoxService(
        executable_path=str(self.driver_path), log_path=os.devnull
    )
Configure options and profile for firefox.
def configure_chrome(self) -> None:
    """Configure options and service for chrome.

    Populates self.options and self.service; nothing is returned.
    """
    self.options = ChromeOptions()
    self.options.headless = self.headless
    for argument in (
        "--disable-blink-features=AutomationControlled",
        "--mute-audio",
        "--disable-infobars",
        "--disable-notifications",
        "--log-level=3",
    ):
        self.options.add_argument(argument)
    if self.randomize_user_agent:
        self.options.add_argument(f"--user-agent={get_agent()}")
    self.options.add_experimental_option("useAutomationExtension", False)
    if self.download_dir:
        # Create the folder up front so the browser has somewhere to write to.
        Path(self.download_dir).mkdir(parents=True, exist_ok=True)
        self.options.add_experimental_option(
            "prefs", {"download.default_directory": str(self.download_dir)}
        )
    self.service = ChromeService(
        executable_path=str(self.driver_path), log_path=os.devnull
    )
Configure options and profile for chrome.
158 def search_for_driver(self): 159 """Searches for the webdriver executable.""" 160 cwd = Path.cwd() 161 found = False 162 match self.browser_type: 163 case "firefox": 164 driver = "geckodriver.exe" 165 case "chrome": 166 driver = "chromedriver.exe" 167 # search PATH 168 env_path = os.environ["PATH"] 169 if sys.platform == "win32": 170 env_paths = env_path.split(";") 171 else: 172 env_paths = env_path.split(":") 173 driver = driver[: driver.find(".")] 174 for path in env_paths: 175 if (Path(path) / driver).exists(): 176 self.driver_path = Path(path) / driver 177 found = True 178 break 179 # check current working directory and parent folders 180 if not found: 181 while cwd != cwd.parent: 182 if (cwd / driver).exists(): 183 self.driver_path = cwd / driver 184 found = True 185 break 186 cwd = cwd.parent 187 # check top most level 188 if not found and (cwd / driver).exists(): 189 self.driver_path = cwd / driver 190 found = True 191 # check child folders (only 1 level down) 192 if not found: 193 for child in Path.cwd().iterdir(): 194 if child.is_dir() and (child / driver).exists(): 195 self.driver_path = child / driver 196 found = True 197 if not found: 198 warn(f"Could not find {driver}")
Searches for the webdriver executable.
def set_implicit_wait(self, wait_time: int = None):
    """Set the browser's implicit wait.

    :param wait_time: Number of seconds to wait. If None, self.implicit_wait
    (the default given at construction) is used. An explicit 0 is passed
    through to the browser rather than being replaced by the default.
    """
    # Compare against None so that 0 is honoured as a real value.
    seconds = self.implicit_wait if wait_time is None else wait_time
    self.browser.implicitly_wait(seconds)
Sets to default time if no arg given.
def open_browser(self):
    """Configures and opens selenium browser.

    Issues a warning and does nothing if a browser is already open.
    On success the implicit wait, window placement and page load timeout
    are applied, the tab index is reset to 0 and the rotation timer is started.
    """
    if self.browser_open:
        warn("Browser already open.")
        return
    if self.browser_type == "firefox":
        self.configure_firefox()
        self.browser = webdriver.Firefox(
            options=self.options,
            service=self.service,
            firefox_profile=self.profile,
        )
    elif self.browser_type == "chrome":
        self.configure_chrome()
        self.browser = webdriver.Chrome(options=self.options, service=self.service)
    self.set_implicit_wait()
    # Maximize, shift the window by the configured offset, then maximize again.
    self.browser.maximize_window()
    x_offset, y_offset = self.move_window_by[0], self.move_window_by[1]
    self.browser.set_window_position(x_offset, y_offset)
    self.browser.maximize_window()
    self.browser.set_page_load_timeout(self.page_load_timeout)
    self.browser_open = True
    self.tab_index = 0
    self.rotation_timer.start()
Configures and opens selenium browser.
def close_browser(self):
    """Quit the browser if one is open; otherwise do nothing."""
    if not self.browser_open:
        return
    # Flip the flag before quitting so the state reads 'closed' even if quit() raises.
    self.browser_open = False
    self.browser.quit()
Close browser window.
def open_tab(self, url: str = "", switch_to_tab: bool = True):
    """Opens new tab and, if provided, goes to url.

    :param url: The address to load in the new tab. An empty string opens a blank tab.

    :param switch_to_tab: If True, the new tab becomes the active tab.

    The new tab is located by comparing the window handles before and after
    opening it, so the switch lands on the tab that was actually created
    regardless of where its handle sits in the handle list.
    """
    handles_before = set(self.browser.window_handles)
    self.script("window.open(arguments[0]);", url)
    if not switch_to_tab:
        return
    new_indexes = [
        index
        for index, handle in enumerate(self.browser.window_handles)
        if handle not in handles_before
    ]
    # Fall back to the historical 'next index' guess if no new handle showed up.
    self.switch_to_tab(new_indexes[-1] if new_indexes else self.tab_index + 1)
Opens new tab and, if provided, goes to url.
New tab is inserted after currently active tab.
def switch_to_tab(self, tab_index: int):
    """Make the tab at `tab_index` (zero indexed) the active one.

    The index refers to a position in self.browser.window_handles and is
    remembered in self.tab_index.
    """
    target_handle = self.browser.window_handles[tab_index]
    self.browser.switch_to.window(target_handle)
    self.tab_index = tab_index
Switch to a tab in browser, zero indexed.
def get_num_tabs(self) -> int:
    """Return how many window handles the browser currently reports."""
    open_handles = self.browser.window_handles
    return len(open_handles)
Returns number of tabs open.
def close_tab(self, tab_index: int = 1):
    """Closes the tab at `tab_index` and then switches to tab index 0.

    The tab has to be made active first because browser.close()
    acts on the currently focused window.
    """
    first_tab = 0
    self.switch_to_tab(tab_index)
    self.browser.close()
    self.switch_to_tab(first_tab)
Closes the specified tab and switches to tab index 0.
def get(self, url: str):
    """Requests webpage at given url and rotates userAgent if necessary.

    Opens the browser first if it is not open. When user agent randomization
    is on, a rotation period is set and the rotation timer has run past it,
    the browser is closed and reopened before the page is requested.
    After loading, navigator.webdriver is overridden to report false and
    the method sleeps for a random time within self.arrival_wait.
    """
    if not self.browser_open:
        self.open_browser()
    rotation_due = (
        self.randomize_user_agent
        and self.user_agent_rotation_period is not None
        # The period is given in minutes; it is compared against 60 * period.
        and self.rotation_timer.check(format=False)
        > (60 * self.user_agent_rotation_period)
    )
    if rotation_due:
        self.rotation_timer.stop()
        self.close_browser()
        self.open_browser()
    self.browser.get(url)
    self.script("Object.defineProperty(navigator, 'webdriver', {get: () => false})")
    self.chill(self.arrival_wait)
Requests webpage at given url and rotates userAgent if necessary.
def get_soup(self) -> BeautifulSoup:
    """Parse the current page source with the 'html.parser' backend
    and return the resulting BeautifulSoup object."""
    page_source = self.browser.page_source
    return BeautifulSoup(page_source, "html.parser")
Returns a BeautifulSoup object of the current page source.
def current_url(self) -> str:
    """Return the url the browser reports for the active tab."""
    url = self.browser.current_url
    return url
Returns current url of active tab.
def turbo(self, engage: bool = True):
    """Toggle between fast and 'human-like' interaction timing.

    When engaged, strings are sent to elements all at once and the
    after-key, after-field and after-click waits are zero
    (the arrival wait is fixed at one second).

    When disengaged, strings are sent 'one key at a time' and every
    wait becomes a (min, max) range in seconds that is sampled randomly.
    """
    if engage:
        key_wait, field_wait, click_wait = (0, 0), (0, 0), (0, 0)
        arrival = (1, 1)
        engaged = True
    else:
        key_wait, field_wait, click_wait = (0.1, 0.5), (1, 2), (0.25, 1.5)
        arrival = (4, 10)
        engaged = False
    self.after_key_wait = key_wait
    self.after_field_wait = field_wait
    self.after_click_wait = click_wait
    self.arrival_wait = arrival
    self.one_key_at_a_time = not engaged
    self.turbo_engaged = engaged
When engaged, strings will be sent to elements all at once and there will be no waiting after actions.
When disengaged, strings will be sent to elements 'one key at a time' with randomized amounts of time between successive keys and after actions.
def chill(self, min_max: tuple[float, float]):
    """Sleep for a uniformly random number of seconds
    between min_max[0] and min_max[1]."""
    shortest, longest = min_max[0], min_max[1]
    pause = random.uniform(shortest, longest)
    time.sleep(pause)
Sleeps a random amount between min_max[0] and min_max[1].
def script(self, script: str, args: Any = None) -> Any:
    """Run javascript in the browser and return whatever it returns.

    :param script: The javascript source to execute.

    :param args: A single value passed through to the script as its one
    argument. It is always forwarded, so it is None when nothing is supplied.
    """
    result = self.browser.execute_script(script, args)
    return result
Executes JavaScript code and returns the result.
def remove(self, locator: str):
    """Locate an element and remove it from the DOM via javascript."""
    doomed = self.find(locator)
    self.script("arguments[0].remove();", doomed)
Removes element from DOM.
def get_length(self, locator: str) -> int:
    """Return the javascript `length` property of the located element as an int.

    NOTE(review): `length` is read directly off the element, so this
    presumably targets elements that expose it (e.g. a select) -- confirm.
    """
    target = self.find(locator)
    raw_length = self.script("return arguments[0].length;", target)
    return int(raw_length)
Returns number of child elements for a given element.
def find(self, locator: str) -> WebElement:
    """Finds and returns a WebElement.

    :param locator: A locator string interpreted according to self.locator_method
    ('xpath', 'id', 'className', 'name' or 'cssSelector').

    Raises a ValueError if self.locator_method is not one of the supported
    values, instead of silently returning None.
    """
    strategies = {
        "xpath": By.XPATH,
        "id": By.ID,
        "className": By.CLASS_NAME,
        "name": By.NAME,
        "cssSelector": By.CSS_SELECTOR,
    }
    try:
        strategy = strategies[self.locator_method]
    except KeyError:
        raise ValueError(
            f"Unsupported locator_method {self.locator_method!r}; "
            f"expected one of {list(strategies)}"
        ) from None
    return self.browser.find_element(strategy, locator)
Finds and returns a WebElement.
def find_children(self, locator: str) -> list[WebElement]:
    """Locate an element and return its direct children
    (everything matching the relative xpath './*')."""
    parent = self.find(locator)
    children = parent.find_elements("xpath", "./*")
    return children
Returns a list of child WebElements for given locator arg.
def scroll(self, amount: int = None, fraction: float = None):
    """Scroll web page vertically.

    :param amount: The number of pixels to scroll if not None.
    Negative values scroll up.

    :param fraction: The amount between 0.0 and 1.0
    of the remaining page height (scrollHeight minus current offset) to scroll.

    If values are provided for both arguments,
    amount will be used.

    If values are provided for neither argument,
    the entire page length will be scrolled.

    An explicit 0 for either argument means 'scroll nothing'; it is not
    treated as if the argument were omitted.

    Scrolls one pixel at a time if self.turbo_engaged is False."""
    if amount is not None:
        amount_to_scroll = amount
    elif fraction is not None:
        page_height = int(self.script("return document.body.scrollHeight;"))
        current_offset = int(self.script("return window.pageYOffset;"))
        amount_to_scroll = int(fraction * (page_height - current_offset))
    else:
        amount_to_scroll = int(self.script("return document.body.scrollHeight;"))
    if self.turbo_engaged:
        self.script("window.scrollBy(0,arguments[0]);", amount_to_scroll)
    else:
        # Pick the single-pixel step once; the direction cannot change mid-loop.
        step_script = (
            "window.scrollBy(0,1);"
            if amount_to_scroll >= 0
            else "window.scrollBy(0,-1);"
        )
        for _ in range(abs(amount_to_scroll)):
            self.script(step_script)
    self.chill(self.after_click_wait)
Scroll web page.
Parameters
amount: The number of pixels to scroll if not None.
fraction: The amount between 0.0 and 1.0 of the page height to scroll.
If values are provided for both arguments, amount will be used.
If values are provided for neither argument, the entire page length will be scrolled.
Scrolls one pixel at a time if self.turbo_engaged is False.
def scroll_into_view(self, locator: str) -> WebElement:
    """Scrolls to a given element and returns the element.

    Uses the DOM method Element.scrollIntoView(), then sleeps for a
    random time within self.after_click_wait.
    """
    element = self.find(locator)
    # The DOM method is camelCase; 'scroll_into_view' does not exist in javascript.
    self.script("arguments[0].scrollIntoView();", element)
    self.chill(self.after_click_wait)
    return element
Scrolls to a given element and returns the element.
def text(self, locator: str) -> str:
    """Locate an element and return its `text` attribute."""
    element = self.find(locator)
    return element.text
Returns text of WebElement.
def click(self, locator: str) -> WebElement:
    """Locate an element, click it, pause for a random time
    within self.after_click_wait and return the element."""
    target = self.find(locator)
    target.click()
    self.chill(self.after_click_wait)
    return target
Clicks on and returns WebElement.
def clear(self, locator: str) -> WebElement:
    """Locate an element, call its clear() method, pause for a random
    time within self.after_click_wait and return the element."""
    target = self.find(locator)
    target.clear()
    self.chill(self.after_click_wait)
    return target
Clears content of WebElement if able and then returns WebElement.
def switch_to_iframe(self, locator: str):
    """Locate an iframe element and move the browser context into it."""
    frame_element = self.find(locator)
    self.browser.switch_to.frame(frame_element)
Switch to an iframe from given locator.
def switch_to_parent_frame(self):
    """Move the browser context one frame level up from the current frame."""
    frame_switcher = self.browser.switch_to
    frame_switcher.parent_frame()
Move up a frame level from current frame.
def select(
    self, locator: str, method: str, choice: str | int | tuple
) -> WebElement:
    """Select a choice from Select element.
    Returns the Select element from the locator string,
    not the option element that is selected.

    :param method: Can be 'value' or 'index'.
    Any other value raises a ValueError before the element is touched.

    :param choice: The option to select.

    If method is 'value', then choice should be
    the html 'value' attribute of the desired option.

    If method is 'index', choice can either be a single
    int for the desired option or it can be a two-tuple.
    If the tuple is provided, a random option between the
    two indices (inclusive) will be selected."""
    if method not in ("value", "index"):
        raise ValueError(f"'method' must be 'value' or 'index', not {method!r}")
    element = self.click(locator)
    dropdown = Select(element)
    if method == "value":
        dropdown.select_by_value(choice)
    else:
        if isinstance(choice, tuple):
            choice = random.randint(choice[0], choice[1])
        dropdown.select_by_index(choice)
    self.chill(self.after_field_wait)
    return element
Select a choice from Select element. Returns the Select element from the locator string, not the option element that is selected.
Parameters
method: Can be 'value' or 'index'
choice: The option to select.
If method is 'value', then choice should be the html 'value' attribute of the desired option.
If method is 'index', choice can either be a single int for the desired option or it can be a two-tuple. If the tuple is provided, a random option between the two indices (inclusive) will be selected.
def click_elements(
    self, locators: list[str], max_selections: int = None, min_selections: int = 1
) -> WebElement | None:
    """Click a random number of WebElements
    and return the last WebElement clicked.

    :param locators: A list of element locators to choose from.

    :param max_selections: The maximum number of elements to click.
    If None (or 0), the maximum will be the length of the locators list.
    Values larger than the list are capped at its length.

    :param min_selections: The minimum number of elements to click.
    Capped at the effective maximum.

    Returns None if no element ended up being clicked
    (e.g. an empty locators list or min_selections=0).

    e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3)
    will click between 1 and 3 random elements from the list.
    """
    if not max_selections:
        max_selections = len(locators)
    # random.sample cannot draw more items than exist, so clamp both bounds.
    upper = min(max_selections, len(locators))
    lower = max(0, min(min_selections, upper))
    element = None
    for option in random.sample(locators, k=random.randint(lower, upper)):
        element = self.click(option)
    return element
Click a random number of WebElements and return the last WebElement clicked.
Parameters
locators: A list of element locators to choose from.
max_selections: The maximum number of elements to click. If None, the maximum will be the length of the locators list.
min_selections: The minimum number of elements to click.
e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3) will click between 1 and 3 random elements from the list.
def get_click_list(
    self, num_options: int, max_choices: int = 1, min_choices: int = 1
) -> list[str]:
    """Similar to self.click_elements(), but for use with the self.fill_next() method.

    Creates a list of length 'num_options' where every element is 'skip'.

    A random number of elements in the list between 'min_choices' and 'max_choices' are
    replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).

    The number of replaced elements is capped at 'num_options', so asking for
    more choices than there are options marks every option instead of
    searching forever for an unused index.

    :param num_options: The length of the returned list.

    :param max_choices: The maximum number of elements to replace.

    :param min_choices: The minimum number of elements to replace."""
    click_list = ["skip"] * num_options
    requested = random.randint(min_choices, max_choices)
    # Clamp into [0, num_options]: random.sample() rejects a negative k
    # and a k larger than the population.
    num_clicks = max(0, min(requested, num_options))
    # sample() draws distinct indexes, so no retry loop is needed.
    for index in random.sample(range(num_options), k=num_clicks):
        click_list[index] = self.keys.SPACE
    return click_list
Similar to self.click_elements(), but for use with the self.fill_next() method.
Creates a list of length 'num_options' where every element is 'skip'.
A random number of elements in the list between 'min_choices' and 'max_choices' are replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).
def send_keys(
    self,
    locator: str,
    data: str,
    click_first: bool = True,
    clear_first: bool = False,
) -> WebElement:
    """Type data into the element at locator and return that element.

    :param locator: Locator string of the element to type into.

    :param data: The value to send to the element.
    It is converted with str() before being sent.

    :param click_first: If True, the element is obtained by clicking it
    with self.click(); otherwise it is only looked up with self.find().

    :param clear_first: If True, the current text of the element
    is cleared before the data is sent."""
    if click_first:
        target = self.click(locator)
    else:
        target = self.find(locator)

    if clear_first:
        target.clear()
        self.chill(self.after_click_wait)

    if not self.one_key_at_a_time:
        # Send the whole string in a single call.
        target.send_keys(str(data))
    else:
        # Send one character per call, pausing after each one.
        for character in str(data):
            target.send_keys(character)
            self.chill(self.after_key_wait)

    self.chill(self.after_field_wait)
    return target
Types data into element and returns the element.
Parameters
data: The string to send to the element.
click_first: If True, the element is clicked on before the data is sent.
clear_first: If True, the current text of the element is cleared before the data is sent.
def fill_next(
    self, data: list[str | tuple], start_element: WebElement = None
) -> WebElement:
    """Fills a form by tabbing from the current WebElement
    to the next one and using the corresponding item in data.
    Returns the last WebElement.

    :param data: A list of form data. If an item is a string (except for 'skip')
    it will be typed into the current WebElement.
    Items that are neither strings nor 'downArrow' sequences are
    converted with str() and typed the same way.

    An item in data can be a two-tuple of the form
    ('downArrow', numberOfPresses:int|tuple[int, int]).

    If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent
    that many times to the WebElement.

    If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random
    number of times between numberOfPresses[0] and numberOfPresses[1] inclusive.
    This is typically for use with Select elements.

    An item in data can also be 'skip', which will perform no action on the current
    WebElement and will continue to the next one.

    An item in data can also be 'click=n', where 'n' is an integer between 0 and 100,
    representing a percent chance an element will be clicked or skipped:
    >>> user.fill_next(["click=70"])

    has a 70% chance of being
    >>> user.fill_next([user.keys.SPACE])

    and a 30% chance of being
    >>> user.fill_next(["skip"])


    :param start_element: The WebElement to start tabbing from.
    The currently active element will be used if start_element is None.

    Note: The function tabs to the next element before sending data,
    so the start_element should be the WebElement before the one
    that should receive data[0].
    """
    element = (
        self.browser.switch_to.active_element
        if not start_element
        else start_element
    )
    for datum in data:
        # Tab away from the current element, then pick up whichever
        # element the browser reports as active.
        element.send_keys(Keys.TAB)
        element = self.browser.switch_to.active_element
        self.chill(self.after_key_wait)
        if isinstance(datum, str) and datum.strip().startswith("click="):
            chance = int(datum.split("=")[1].strip())
            # randint(1, 100) has exactly 100 equally likely outcomes, so
            # 'click=0' never clicks and 'click=100' always clicks.
            if random.randint(1, 100) <= chance:
                datum = Keys.SPACE
            else:
                datum = "skip"
        # Only index into real sequences: an empty string or a number
        # must fall through to the typing branch instead of raising.
        if isinstance(datum, (tuple, list)) and datum[0] == "downArrow":
            presses = datum[1]
            if isinstance(presses, tuple):
                presses = random.randint(presses[0], presses[1])
            for _ in range(presses):
                element.send_keys(Keys.ARROW_DOWN)
                self.chill(self.after_key_wait)
        elif datum == "skip":
            self.chill(self.after_key_wait)
        elif self.turbo_engaged:
            element.send_keys(str(datum))
        else:
            for ch in str(datum):
                element.send_keys(ch)
                self.chill(self.after_key_wait)
        self.chill(self.after_field_wait)
    return element
Fills a form by tabbing from the current WebElement to the next one and using the corresponding item in data. Returns the last WebElement.
Parameters
- data: A list of form data. If an item is a string (except for 'skip') it will be typed into the current WebElement.
An item in data can be a two-tuple of the form ('downArrow', numberOfPresses:int|tuple[int, int]).
If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent that many times to the WebElement.
If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random number of times between numberOfPresses[0] and numberOfPresses[1] inclusive. This is typically for use with Select elements.
An item in data can also be 'skip', which will perform no action on the current WebElement and will continue to the next one.
An item in data can also be 'click=n', where 'n' is an integer between 0 and 100, representing a percent chance an element will be clicked or skipped:
>>> user.fill_next(["click=70"])
has a 70% chance of being
>>> user.fill_next([user.keys.SPACE])
and a 30% chance of being
>>> user.fill_next(["skip"])
- start_element: The WebElement to start tabbing from. The currently active element will be used if start_element is None.
Note: The function tabs to the next element before sending data, so the start_element should be the WebElement before the one that should receive data[0].
def wait_until(
    self, condition: LambdaType, max_wait: float = 10, polling_interval: float = 0.1
):
    """Checks condition repeatedly until either it is true,
    or the max_wait is exceeded.

    Raises a TimeoutError if the condition doesn't succeed within max_wait.
    If the last check raised an exception, that exception is attached
    to the TimeoutError as its cause.

    Exceptions derived from Exception that condition raises are treated as
    "not true yet"; KeyboardInterrupt and SystemExit are not intercepted.

    Useful for determining whether a form has been successfully submitted.

    :param condition: The condition function to check.

    :param max_wait: Number of seconds to continue checking condition
    before throwing a TimeoutError.

    :param polling_interval: The number of seconds to sleep before
    checking the condition function again after it fails.

    e.g. self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]'))"""
    # A monotonic clock keeps the elapsed time correct even if the
    # system clock is adjusted while waiting.
    start_time = time.monotonic()
    while True:
        last_error = None
        try:
            satisfied = bool(condition())
        except Exception as error:
            # A failing check counts as "not satisfied yet".
            satisfied = False
            last_error = error
        if satisfied:
            # Fixed one second pause once the condition holds.
            # NOTE(review): presumably gives the page time to settle - confirm.
            time.sleep(1)
            return
        if (time.monotonic() - start_time) > max_wait:
            raise TimeoutError(
                f"max_wait exceeded in wait_until({condition})"
            ) from last_error
        time.sleep(polling_interval)
Checks condition repeatedly until either it is true, or the max_wait is exceeded.
Raises a TimeoutError if the condition doesn't succeed within max_wait.
Useful for determining whether a form has been successfully submitted.
Parameters
condition: The condition function to check.
max_wait: Number of seconds to continue checking condition before throwing a TimeoutError.
polling_interval: The number of seconds to sleep before checking the condition function again after it fails.
e.g. self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]'))
def dismiss_alert(self):
    """Dismiss the alert dialog reached through self.browser.switch_to.alert."""
    alert = self.browser.switch_to.alert
    alert.dismiss()
Dismiss alert dialog.
def solve_recaptcha_v3(
    self,
    outer_iframe_xpath: str = '//iframe[@title="reCAPTCHA"]',
    inner_iframe_xpath: str = '//iframe[@title="recaptcha challenge expires in two minutes"]',
):
    """Pass google recaptcha v3 by solving an audio puzzle.

    self.locator_method is switched to 'xpath' for the duration of the call
    and is always restored afterwards, whether or not the attempt succeeds.

    Raises an Exception with the message 'Could not solve captcha' if any
    step fails; the original error is attached as its cause.

    :param outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox.
    If it's the recaptcha without the initial checkbox that just shows the image puzzle,
    pass None to this argument.

    :param inner_iframe_xpath: Xpath to the iframe containing the
    audio challenge button, the audio download link, and the response field.

    """
    locator_method = self.locator_method
    self.locator_method = "xpath"
    try:
        if outer_iframe_xpath:
            self.switch_to_iframe(outer_iframe_xpath)
            self.click('//*[@id="recaptcha-anchor"]')
            self.switch_to_parent_frame()
        self.switch_to_iframe(inner_iframe_xpath)
        self.click('//*[@id="recaptcha-audio-button"]')
        mp3_url = self.find(
            '//a[@class="rc-audiochallenge-tdownload-link"]'
        ).get_attribute("href")
        # NOTE(review): presumably transcribes the audio file at mp3_url
        # into text - confirm against voxscribe.
        text = get_text_from_url(mp3_url, ".mp3")
        self.send_keys('//*[@id="audio-response"]', text)
        self.click('//*[@id="recaptcha-verify-button"]')
    except Exception as e:
        print(e)
        # Chain explicitly so the underlying failure stays on the traceback.
        raise Exception("Could not solve captcha") from e
    finally:
        try:
            self.switch_to_parent_frame()
        finally:
            # Restore the caller's locator method even if leaving the
            # iframe fails.
            self.locator_method = locator_method
Pass google recaptcha v3 by solving an audio puzzle.
Parameters
- outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox. If it's the recaptcha without the initial checkbox that just shows the image puzzle, pass None to this argument.