seleniumuser.seleniumuser
1import atexit 2import os 3import random 4import sys 5import time 6from pathlib import Path 7from types import LambdaType 8from typing import Any 9from warnings import warn 10 11from bs4 import BeautifulSoup 12from noiftimer import Timer 13from selenium import webdriver 14from selenium.webdriver.chrome.options import Options as ChromeOptions 15from selenium.webdriver.chrome.service import Service as ChromeService 16from selenium.webdriver.common.by import By 17from selenium.webdriver.common.keys import Keys 18from selenium.webdriver.firefox.firefox_profile import FirefoxProfile 19from selenium.webdriver.firefox.options import Options as FirefoxOptions 20from selenium.webdriver.firefox.service import Service as FirefoxService 21from selenium.webdriver.remote.webelement import WebElement 22from selenium.webdriver.support.ui import Select 23from voxscribe import get_text_from_url 24from whosyouragent import get_agent 25 26 27class User: 28 """Sits on top of selenium to streamline 29 automation and scraping tasks.""" 30 31 def __init__( 32 self, 33 headless: bool = False, 34 browser_type: str = "firefox", 35 implicit_wait: int = 10, 36 page_load_timeout: int = 60, 37 open_browser: bool = True, 38 locator_method: str = "xpath", 39 randomize_user_agent: bool = True, 40 user_agent_rotation_period: int = None, 41 move_window_by: tuple[int, int] = (0, -1000), 42 download_dir: str | Path = None, 43 driver_path: str | Path = None, 44 ): 45 """ 46 :param headless: If True, browser window will not be visible. 47 48 :param browser_type: Which browser to use. Can be 'firefox' or 'chrome'. 49 50 :param implicit_wait: Number of seconds to look for a specified element before 51 selenium considers it missing and throws an exception. 52 53 :param page_load_timeout: Time in seconds for selenium to wait for a page to load 54 before throwing an exception. 55 56 :param open_browser: If True, opens a browser window when a User object is created. 
57 If False, a manual call to self.open_browser() must be made. 58 59 :param locator_method: The locator type User should expect to be given. 60 Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'. 61 Every member function with a 'locator' argument refers to a string matching 62 the current locator_method. 63 64 :param randomize_user_agent: If True, a random useragent will be used whenever 65 the browser is opened. If False, the native useragent will be used. 66 67 :param user_agent_rotation_period: If not None, the browser window will be closed 68 and reopened with a new useragent every user_agent_rotation_period number of minutes. 69 Rotation occurs on the first call to self.get() after the time period has elapsed. 70 Ignored if randomize_user_agent is False. 71 72 :param move_window_by: The x and y amount of pixels to move the browser window by after opening. 73 74 :param download_dir: The download folder to use. If None, the default folder will be used. 75 76 :param driver_path: The path to the webdriver executable selenium should use. 77 If None, the system PATH will be checked for the executable. 78 If the executable isn't found, the parent directories and the immediate child directories 79 of the current working directory will be searched. 
80 """ 81 self.headless = headless 82 browser_type = browser_type.lower() 83 if browser_type in ["firefox", "chrome"]: 84 self.browser_type = browser_type 85 else: 86 raise ValueError("'browser_type' parameter must be 'firefox' or 'chrome'") 87 self.browser_open = False 88 self.implicit_wait = implicit_wait 89 self.page_load_timeout = page_load_timeout 90 self.rotation_timer = Timer() 91 self.randomize_user_agent = randomize_user_agent 92 self.user_agent_rotation_period = user_agent_rotation_period 93 self.locator_method = locator_method 94 self.turbo() 95 self.keys = Keys 96 self.move_window_by = move_window_by 97 self.download_dir = download_dir 98 self.driver_path = driver_path 99 if not self.driver_path: 100 self.search_for_driver() 101 if open_browser: 102 self.open_browser() 103 else: 104 self.browser = None 105 atexit.register(self.close_browser) 106 107 def __enter__(self): 108 return self 109 110 def __exit__(self, *args): 111 self.close_browser() 112 113 def configure_firefox(self) -> FirefoxService: 114 """Configure options and profile for firefox.""" 115 self.options = FirefoxOptions() 116 self.options.headless = self.headless 117 self.options.set_preference( 118 "widget.windows.window_occlusion_tracking.enabled", False 119 ) 120 self.options.set_preference("dom.webaudio.enabled", False) 121 if self.randomize_user_agent: 122 self.options.set_preference("general.useragent.override", get_agent()) 123 if self.download_dir: 124 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 125 self.profile = FirefoxProfile() 126 self.profile.set_preference("browser.download.dir", str(self.download_dir)) 127 self.profile.set_preference("browser.download.folderList", 2) 128 else: 129 self.profile = None 130 self.service = FirefoxService( 131 executable_path=str(self.driver_path), log_path=os.devnull 132 ) 133 134 def configure_chrome(self) -> ChromeService: 135 """Configure options and profile for chrome.""" 136 self.options = ChromeOptions() 137 
self.options.headless = self.headless 138 self.options.add_argument("--disable-blink-features=AutomationControlled") 139 self.options.add_argument("--mute-audio") 140 self.options.add_argument("--disable-infobars") 141 self.options.add_argument("--disable-notifications") 142 self.options.add_argument("--log-level=3") 143 if self.randomize_user_agent: 144 self.options.add_argument(f"--user-agent={get_agent()}") 145 self.options.add_experimental_option("useAutomationExtension", False) 146 if self.download_dir: 147 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 148 self.options.add_experimental_option( 149 "prefs", {"download.default_directory": str(self.download_dir)} 150 ) 151 self.service = ChromeService( 152 executable_path=str(self.driver_path), log_path=os.devnull 153 ) 154 155 def search_for_driver(self): 156 """Searches for the webdriver executable.""" 157 cwd = Path.cwd() 158 found = False 159 match self.browser_type: 160 case "firefox": 161 driver = "geckodriver.exe" 162 case "chrome": 163 driver = "chromedriver.exe" 164 # search PATH 165 env_path = os.environ["PATH"] 166 if sys.platform == "win32": 167 env_paths = env_path.split(";") 168 else: 169 env_paths = env_path.split(":") 170 driver = driver[: driver.find(".")] 171 for path in env_paths: 172 if (Path(path) / driver).exists(): 173 self.driver_path = Path(path) / driver 174 found = True 175 break 176 # check current working directory and parent folders 177 if not found: 178 while cwd != cwd.parent: 179 if (cwd / driver).exists(): 180 self.driver_path = cwd / driver 181 found = True 182 break 183 cwd = cwd.parent 184 # check top most level 185 if not found and (cwd / driver).exists(): 186 self.driver_path = cwd / driver 187 found = True 188 # check child folders (only 1 level down) 189 if not found: 190 for child in Path.cwd().iterdir(): 191 if child.is_dir() and (child / driver).exists(): 192 self.driver_path = child / driver 193 found = True 194 if not found: 195 warn(f"Could not find 
{driver}") 196 197 def set_implicit_wait(self, wait_time: int = None): 198 """Sets to default time if no arg given.""" 199 if not wait_time: 200 self.browser.implicitly_wait(self.implicit_wait) 201 else: 202 self.browser.implicitly_wait(wait_time) 203 204 def open_browser(self): 205 """Configures and opens selenium browser.""" 206 if not self.browser_open: 207 match self.browser_type: 208 case "firefox": 209 self.configure_firefox() 210 self.browser = webdriver.Firefox( 211 options=self.options, 212 service=self.service, 213 firefox_profile=self.profile, 214 ) 215 case "chrome": 216 self.configure_chrome() 217 self.browser = webdriver.Chrome( 218 options=self.options, service=self.service 219 ) 220 self.set_implicit_wait() 221 self.browser.maximize_window() 222 self.browser.set_window_position( 223 self.move_window_by[0], self.move_window_by[1] 224 ) 225 self.browser.maximize_window() 226 self.browser.set_page_load_timeout(self.page_load_timeout) 227 self.browser_open = True 228 self.tab_index = 0 229 self.rotation_timer.start() 230 else: 231 warn("Browser already open.") 232 233 def close_browser(self): 234 """Close browser window.""" 235 if self.browser_open: 236 self.browser_open = False 237 self.browser.quit() 238 239 def open_tab(self, url: str = "", switch_to_tab: bool = True): 240 """Opens new tab and, if provided, goes to url. 
241 242 New tab is inserted after currently active tab.""" 243 self.script("window.open(arguments[0]);", url) 244 if switch_to_tab: 245 self.switch_to_tab(self.tab_index + 1) 246 247 def switch_to_tab(self, tab_index: int): 248 """Switch to a tab in browser, zero indexed.""" 249 self.browser.switch_to.window(self.browser.window_handles[tab_index]) 250 self.tab_index = tab_index 251 252 def get_num_tabs(self) -> int: 253 """Returns number of tabs open.""" 254 return len(self.browser.window_handles) 255 256 def close_tab(self, tab_index: int = 1): 257 """Close specified tab and 258 switches to tab index 0.""" 259 self.switch_to_tab(tab_index) 260 self.browser.close() 261 self.switch_to_tab(0) 262 263 def get(self, url: str): 264 """Requests webpage at given url and rotates userAgent if necessary.""" 265 if not self.browser_open: 266 self.open_browser() 267 if ( 268 self.randomize_user_agent 269 and self.user_agent_rotation_period is not None 270 and self.rotation_timer.elapsed > (60 * self.user_agent_rotation_period) 271 ): 272 self.rotation_timer.stop() 273 self.close_browser() 274 self.open_browser() 275 self.browser.get(url) 276 self.script("Object.defineProperty(navigator, 'webdriver', {get: () => false})") 277 self.chill(self.arrival_wait) 278 279 def get_soup(self) -> BeautifulSoup: 280 """Returns a BeautifulSoup object 281 of the current page source.""" 282 return BeautifulSoup(self.browser.page_source, "html.parser") 283 284 def current_url(self) -> str: 285 """Returns current url of active tab.""" 286 return self.browser.current_url 287 288 def delete_cookies(self): 289 """Delete all cookies for 290 this browser instance.""" 291 self.browser.delete_all_cookies() 292 293 def turbo(self, engage: bool = True): 294 """When engaged, strings will be sent 295 to elements all at once and there will be 296 no waiting after actions. 
297 298 When disengaged, strings will be sent to elements 299 'one key at a time' with randomized amounts of 300 time between successive keys and after actions.""" 301 if engage: 302 self.after_key_wait = (0, 0) 303 self.after_field_wait = (0, 0) 304 self.after_click_wait = (0, 0) 305 self.arrival_wait = (1, 1) 306 self.one_key_at_a_time = False 307 self.turbo_engaged = True 308 else: 309 self.after_key_wait = (0.1, 0.5) 310 self.after_field_wait = (1, 2) 311 self.after_click_wait = (0.25, 1.5) 312 self.arrival_wait = (4, 10) 313 self.one_key_at_a_time = True 314 self.turbo_engaged = False 315 316 def chill(self, min_max: tuple[float, float]): 317 """Sleeps a random amount 318 between min_max[0] and min_max[1].""" 319 time.sleep(random.uniform(min_max[0], min_max[1])) 320 321 def script(self, script: str, args: Any = None) -> Any: 322 """Execute javascript code and returns result.""" 323 return self.browser.execute_script(script, args) 324 325 def remove(self, locator: str): 326 """Removes element from DOM.""" 327 self.script("arguments[0].remove();", self.find(locator)) 328 329 def get_length(self, locator: str) -> int: 330 """Returns number of child elements for a given element.""" 331 return int(self.script("return arguments[0].length;", self.find(locator))) 332 333 def find(self, locator: str) -> WebElement: 334 """Finds and returns a WebElement.""" 335 match self.locator_method: 336 case "xpath": 337 return self.browser.find_element(By.XPATH, locator) 338 case "id": 339 return self.browser.find_element(By.ID, locator) 340 case "className": 341 return self.browser.find_element(By.CLASS_NAME, locator) 342 case "name": 343 return self.browser.find_element(By.NAME, locator) 344 case "cssSelector": 345 return self.browser.find_element(By.CSS_SELECTOR, locator) 346 347 def find_children(self, locator: str) -> list[WebElement]: 348 """Returns a list of child WebElements 349 for given locator arg.""" 350 element = self.find(locator) 351 return 
element.find_elements("xpath", "./*") 352 353 def scroll(self, amount: int = None, fraction: float = None): 354 """Scroll web page. 355 :param amount: The number of lines to scroll if not None. 356 357 :param fraction: The amount between 0.0 and 1.0 358 of the page height to scroll. 359 360 If values are provided for both arguments, 361 amount will be used. 362 363 If values are provided for neither argument, 364 the entire page length will be scrolled. 365 366 Scrolls one line at a time if self.turbo is False.""" 367 if amount: 368 amount_to_scroll = amount 369 elif fraction: 370 amount_to_scroll = int( 371 fraction 372 * ( 373 int(self.script("return document.body.scrollHeight;")) 374 - int(self.script("return window.pageYOffset;")) 375 ) 376 ) 377 else: 378 amount_to_scroll = int(self.script("return document.body.scrollHeight;")) 379 if self.turbo_engaged: 380 self.script("window.scrollBy(0,arguments[0]);", amount_to_scroll) 381 else: 382 for _ in range(abs(amount_to_scroll)): 383 if amount_to_scroll >= 0: 384 self.script("window.scrollBy(0,1);") 385 else: 386 self.script("window.scrollBy(0,-1);") 387 self.chill(self.after_click_wait) 388 389 def scroll_into_view(self, locator: str) -> WebElement: 390 """Scrolls to a given element and returns the element.""" 391 element = self.find(locator) 392 self.script("arguments[0].scrollIntoView();", element) 393 self.chill(self.after_click_wait) 394 return element 395 396 def text(self, locator: str) -> str: 397 """Returns text of WebElement.""" 398 return self.find(locator).text 399 400 def click(self, locator: str) -> WebElement: 401 """Clicks on and returns WebElement.""" 402 element = self.find(locator) 403 element.click() 404 self.chill(self.after_click_wait) 405 return element 406 407 def clear(self, locator: str) -> WebElement: 408 """Clears content of WebElement if able 409 and then returns WebElement.""" 410 element = self.find(locator) 411 element.clear() 412 self.chill(self.after_click_wait) 413 return element 
414 415 def switch_to_iframe(self, locator: str): 416 """Switch to an iframe from given locator.""" 417 self.browser.switch_to.frame(self.find(locator)) 418 419 def switch_to_parent_frame(self): 420 """Move up a frame level from current frame.""" 421 self.browser.switch_to.parent_frame() 422 423 def select( 424 self, locator: str, method: str, choice: str | int | tuple 425 ) -> WebElement: 426 """Select a choice from Select element. 427 Returns the Select element from the locator string, 428 not the option element that is selected. 429 430 :param method: Can be 'value' or 'index' 431 432 :param choice: The option to select. 433 434 If method is 'value', then choice should be 435 the html 'value' attribute of the desired option. 436 437 If method is 'index', choice can either be a single 438 int for the desired option or it can be a two-tuple. 439 If the tuple is provided, a random option between the 440 two indicies (inclusive) will be selected.""" 441 element = self.click(locator) 442 match method: 443 case "value": 444 Select(element).select_by_value(choice) 445 case "index": 446 if type(choice) == tuple: 447 choice = random.randint(choice[0], choice[1]) 448 Select(element).select_by_index(choice) 449 self.chill(self.after_field_wait) 450 return element 451 452 def click_elements( 453 self, locators: list[str], max_selections: int = None, min_selections: int = 1 454 ) -> WebElement: 455 """Click a random number of WebElements 456 and return the last WebElement clicked. 457 458 :param locators: A list of element locators to choose from. 459 460 :param max_selections: The maximum number of elements to click. 461 If None, the maximum will be the length of the locators list. 462 463 :param min_selections: The minimum number of elements to click. 464 465 e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3) 466 will click between 1 and 3 random elements from the list. 
467 """ 468 if not max_selections: 469 max_selections = len(locators) 470 for option in random.sample( 471 locators, k=random.randint(min_selections, max_selections) 472 ): 473 element = self.click(option) 474 return element 475 476 def get_click_list( 477 self, num_options: int, max_choices: int = 1, min_choices: int = 1 478 ) -> list[str]: 479 """Similar to self.click_elements(), but for use with the self.fill_next() method. 480 481 Creates a list of length 'num_options' where every element is 'skip'. 482 483 A random number of elements in the list between 'min_choices' and 'max_choices' are 484 replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).""" 485 click_list = ["skip"] * num_options 486 selected_indexes = [] 487 for i in range(random.randint(min_choices, max_choices)): 488 index = random.randint(0, num_options - 1) 489 while index in selected_indexes: 490 index = random.randint(0, num_options - 1) 491 selected_indexes.append(index) 492 click_list[index] = self.keys.SPACE 493 return click_list 494 495 def send_keys( 496 self, 497 locator: str, 498 data: str, 499 click_first: bool = True, 500 clear_first: bool = False, 501 ) -> WebElement: 502 """Types data into element and returns the element. 503 504 :param data: The string to send to the element. 505 506 :param click_first: If True, the element is clicked on 507 before the data is sent. 
508 509 :param clear_first: If True, the current text of the element 510 is cleared before the data is sent.""" 511 element = self.click(locator) if click_first else self.find(locator) 512 if clear_first: 513 element.clear() 514 self.chill(self.after_click_wait) 515 if self.one_key_at_a_time: 516 for ch in str(data): 517 element.send_keys(ch) 518 self.chill(self.after_key_wait) 519 else: 520 element.send_keys(str(data)) 521 self.chill(self.after_field_wait) 522 return element 523 524 def fill_next( 525 self, data: list[str | tuple], start_element: WebElement = None 526 ) -> WebElement: 527 """Fills a form by tabbing from the current WebElement 528 to the next one and using the corresponding item in data. 529 Returns the last WebElement. 530 531 :param data: A list of form data. If an item is a string (except for 'skip') 532 it will be typed into the current WebElement. 533 534 An item in data can be a two-tuple of the form 535 ('downArrow', numberOfPresses:int|tuple[int, int]). 536 537 If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent 538 that many times to the WebElement. 539 540 If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random 541 number of times between numberOfPresses[0] and numberOfPresses[1] inclusive. 542 This is typically for use with Select elements. 543 544 An item in data can also be 'skip', which will perform no action on the current 545 WebElement and will continue to the next one. 546 547 An item in data can also be 'click=n', where 'n' is an integer b/t 0 and 100, 548 representing a percent chance an element will be clicked or skipped: 549 >>> user.fill_next(["click=70"]) 550 551 has a 70% chance of being 552 >>> user.fill_next([user.keys.SPACE]) 553 554 and a 30% chance of being 555 >>> user.fill_next(["skip"]) 556 557 558 :param start_element: The WebElement to start tabbing from. 559 The currently active element will be used if start_element is None. 
560 561 Note: The function tabs to the next element before sending data, 562 so the start_element should the WebElement before the one 563 that should receive data[0]. 564 """ 565 element = ( 566 self.browser.switch_to.active_element 567 if not start_element 568 else start_element 569 ) 570 for datum in data: 571 element.send_keys(Keys.TAB) 572 element = self.browser.switch_to.active_element 573 self.chill(self.after_key_wait) 574 if type(datum) == str and datum.strip().startswith("click="): 575 chance = int(datum.split("=")[1].strip()) 576 if random.randint(0, 100) <= chance: 577 datum = Keys.SPACE 578 else: 579 datum = "skip" 580 if datum[0] == "downArrow": 581 if type(datum[1]) == tuple: 582 times = random.randint(datum[1][0], datum[1][1]) 583 else: 584 times = datum[1] 585 for _ in range(times): 586 element.send_keys(Keys.ARROW_DOWN) 587 self.chill(self.after_key_wait) 588 elif datum == "skip": 589 self.chill(self.after_key_wait) 590 else: 591 592 if self.turbo_engaged: 593 element.send_keys(str(datum)) 594 else: 595 for ch in str(datum): 596 element.send_keys(ch) 597 self.chill(self.after_key_wait) 598 self.chill(self.after_field_wait) 599 return element 600 601 def wait_until( 602 self, condition: LambdaType, max_wait: float = 10, polling_interval: float = 0.1 603 ): 604 """Checks condition repeatedly until either it is true, 605 or the max_wait is exceeded. 606 607 Raises a TimeoutError if the condition doesn't success within max_wait. 608 609 Useful for determing whether a form has been successfully submitted. 610 611 :param condition: The condition function to check. 612 613 :param max_wait: Number of seconds to continue checking condition 614 before throwing a TimeoutError. 615 616 :param polling_interval: The number of seconds to sleep before 617 checking the condition function again after it fails. 618 619 e.g. 
self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]))""" 620 start_time = time.time() 621 while True: 622 try: 623 if condition(): 624 time.sleep(1) 625 break 626 elif (time.time() - start_time) > max_wait: 627 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 628 else: 629 time.sleep(polling_interval) 630 except: 631 if (time.time() - start_time) > max_wait: 632 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 633 else: 634 time.sleep(polling_interval) 635 636 def dismiss_alert(self): 637 """Dismiss alert dialog.""" 638 self.browser.switch_to.alert.dismiss() 639 640 def solve_recaptcha_v3( 641 self, 642 outer_iframe_xpath: str = '//iframe[@title="reCAPTCHA"]', 643 inner_iframe_xpath: str = '//iframe[@title="recaptcha challenge expires in two minutes"]', 644 ): 645 """Pass google recaptcha v3 by solving an audio puzzle. 646 647 :param outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox. 648 If it's the recaptcha without the initial checkbox that just shows the image puzzle, 649 pass None to this argument. 650 651 """ 652 locator_method = self.locator_method 653 self.locator_method = "xpath" 654 try: 655 if outer_iframe_xpath: 656 self.switch_to_iframe(outer_iframe_xpath) 657 self.click('//*[@id="recaptcha-anchor"]') 658 self.switch_to_parent_frame() 659 self.switch_to_iframe(inner_iframe_xpath) 660 self.click('//*[@id="recaptcha-audio-button"]') 661 mp3_url = self.find( 662 '//a[@class="rc-audiochallenge-tdownload-link"]' 663 ).get_attribute("href") 664 text = get_text_from_url(mp3_url, ".mp3") 665 self.send_keys('//*[@id="audio-response"]', text) 666 self.click('//*[@id="recaptcha-verify-button"]') 667 except Exception as e: 668 print(e) 669 raise Exception("Could not solve captcha") 670 finally: 671 self.switch_to_parent_frame() 672 self.locator_method = locator_method
28class User: 29 """Sits on top of selenium to streamline 30 automation and scraping tasks.""" 31 32 def __init__( 33 self, 34 headless: bool = False, 35 browser_type: str = "firefox", 36 implicit_wait: int = 10, 37 page_load_timeout: int = 60, 38 open_browser: bool = True, 39 locator_method: str = "xpath", 40 randomize_user_agent: bool = True, 41 user_agent_rotation_period: int = None, 42 move_window_by: tuple[int, int] = (0, -1000), 43 download_dir: str | Path = None, 44 driver_path: str | Path = None, 45 ): 46 """ 47 :param headless: If True, browser window will not be visible. 48 49 :param browser_type: Which browser to use. Can be 'firefox' or 'chrome'. 50 51 :param implicit_wait: Number of seconds to look for a specified element before 52 selenium considers it missing and throws an exception. 53 54 :param page_load_timeout: Time in seconds for selenium to wait for a page to load 55 before throwing an exception. 56 57 :param open_browser: If True, opens a browser window when a User object is created. 58 If False, a manual call to self.open_browser() must be made. 59 60 :param locator_method: The locator type User should expect to be given. 61 Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'. 62 Every member function with a 'locator' argument refers to a string matching 63 the current locator_method. 64 65 :param randomize_user_agent: If True, a random useragent will be used whenever 66 the browser is opened. If False, the native useragent will be used. 67 68 :param user_agent_rotation_period: If not None, the browser window will be closed 69 and reopened with a new useragent every user_agent_rotation_period number of minutes. 70 Rotation occurs on the first call to self.get() after the time period has elapsed. 71 Ignored if randomize_user_agent is False. 72 73 :param move_window_by: The x and y amount of pixels to move the browser window by after opening. 74 75 :param download_dir: The download folder to use. 
If None, the default folder will be used. 76 77 :param driver_path: The path to the webdriver executable selenium should use. 78 If None, the system PATH will be checked for the executable. 79 If the executable isn't found, the parent directories and the immediate child directories 80 of the current working directory will be searched. 81 """ 82 self.headless = headless 83 browser_type = browser_type.lower() 84 if browser_type in ["firefox", "chrome"]: 85 self.browser_type = browser_type 86 else: 87 raise ValueError("'browser_type' parameter must be 'firefox' or 'chrome'") 88 self.browser_open = False 89 self.implicit_wait = implicit_wait 90 self.page_load_timeout = page_load_timeout 91 self.rotation_timer = Timer() 92 self.randomize_user_agent = randomize_user_agent 93 self.user_agent_rotation_period = user_agent_rotation_period 94 self.locator_method = locator_method 95 self.turbo() 96 self.keys = Keys 97 self.move_window_by = move_window_by 98 self.download_dir = download_dir 99 self.driver_path = driver_path 100 if not self.driver_path: 101 self.search_for_driver() 102 if open_browser: 103 self.open_browser() 104 else: 105 self.browser = None 106 atexit.register(self.close_browser) 107 108 def __enter__(self): 109 return self 110 111 def __exit__(self, *args): 112 self.close_browser() 113 114 def configure_firefox(self) -> FirefoxService: 115 """Configure options and profile for firefox.""" 116 self.options = FirefoxOptions() 117 self.options.headless = self.headless 118 self.options.set_preference( 119 "widget.windows.window_occlusion_tracking.enabled", False 120 ) 121 self.options.set_preference("dom.webaudio.enabled", False) 122 if self.randomize_user_agent: 123 self.options.set_preference("general.useragent.override", get_agent()) 124 if self.download_dir: 125 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 126 self.profile = FirefoxProfile() 127 self.profile.set_preference("browser.download.dir", str(self.download_dir)) 128 
self.profile.set_preference("browser.download.folderList", 2) 129 else: 130 self.profile = None 131 self.service = FirefoxService( 132 executable_path=str(self.driver_path), log_path=os.devnull 133 ) 134 135 def configure_chrome(self) -> ChromeService: 136 """Configure options and profile for chrome.""" 137 self.options = ChromeOptions() 138 self.options.headless = self.headless 139 self.options.add_argument("--disable-blink-features=AutomationControlled") 140 self.options.add_argument("--mute-audio") 141 self.options.add_argument("--disable-infobars") 142 self.options.add_argument("--disable-notifications") 143 self.options.add_argument("--log-level=3") 144 if self.randomize_user_agent: 145 self.options.add_argument(f"--user-agent={get_agent()}") 146 self.options.add_experimental_option("useAutomationExtension", False) 147 if self.download_dir: 148 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 149 self.options.add_experimental_option( 150 "prefs", {"download.default_directory": str(self.download_dir)} 151 ) 152 self.service = ChromeService( 153 executable_path=str(self.driver_path), log_path=os.devnull 154 ) 155 156 def search_for_driver(self): 157 """Searches for the webdriver executable.""" 158 cwd = Path.cwd() 159 found = False 160 match self.browser_type: 161 case "firefox": 162 driver = "geckodriver.exe" 163 case "chrome": 164 driver = "chromedriver.exe" 165 # search PATH 166 env_path = os.environ["PATH"] 167 if sys.platform == "win32": 168 env_paths = env_path.split(";") 169 else: 170 env_paths = env_path.split(":") 171 driver = driver[: driver.find(".")] 172 for path in env_paths: 173 if (Path(path) / driver).exists(): 174 self.driver_path = Path(path) / driver 175 found = True 176 break 177 # check current working directory and parent folders 178 if not found: 179 while cwd != cwd.parent: 180 if (cwd / driver).exists(): 181 self.driver_path = cwd / driver 182 found = True 183 break 184 cwd = cwd.parent 185 # check top most level 186 if not 
found and (cwd / driver).exists(): 187 self.driver_path = cwd / driver 188 found = True 189 # check child folders (only 1 level down) 190 if not found: 191 for child in Path.cwd().iterdir(): 192 if child.is_dir() and (child / driver).exists(): 193 self.driver_path = child / driver 194 found = True 195 if not found: 196 warn(f"Could not find {driver}") 197 198 def set_implicit_wait(self, wait_time: int = None): 199 """Sets to default time if no arg given.""" 200 if not wait_time: 201 self.browser.implicitly_wait(self.implicit_wait) 202 else: 203 self.browser.implicitly_wait(wait_time) 204 205 def open_browser(self): 206 """Configures and opens selenium browser.""" 207 if not self.browser_open: 208 match self.browser_type: 209 case "firefox": 210 self.configure_firefox() 211 self.browser = webdriver.Firefox( 212 options=self.options, 213 service=self.service, 214 firefox_profile=self.profile, 215 ) 216 case "chrome": 217 self.configure_chrome() 218 self.browser = webdriver.Chrome( 219 options=self.options, service=self.service 220 ) 221 self.set_implicit_wait() 222 self.browser.maximize_window() 223 self.browser.set_window_position( 224 self.move_window_by[0], self.move_window_by[1] 225 ) 226 self.browser.maximize_window() 227 self.browser.set_page_load_timeout(self.page_load_timeout) 228 self.browser_open = True 229 self.tab_index = 0 230 self.rotation_timer.start() 231 else: 232 warn("Browser already open.") 233 234 def close_browser(self): 235 """Close browser window.""" 236 if self.browser_open: 237 self.browser_open = False 238 self.browser.quit() 239 240 def open_tab(self, url: str = "", switch_to_tab: bool = True): 241 """Opens new tab and, if provided, goes to url. 
242 243 New tab is inserted after currently active tab.""" 244 self.script("window.open(arguments[0]);", url) 245 if switch_to_tab: 246 self.switch_to_tab(self.tab_index + 1) 247 248 def switch_to_tab(self, tab_index: int): 249 """Switch to a tab in browser, zero indexed.""" 250 self.browser.switch_to.window(self.browser.window_handles[tab_index]) 251 self.tab_index = tab_index 252 253 def get_num_tabs(self) -> int: 254 """Returns number of tabs open.""" 255 return len(self.browser.window_handles) 256 257 def close_tab(self, tab_index: int = 1): 258 """Close specified tab and 259 switches to tab index 0.""" 260 self.switch_to_tab(tab_index) 261 self.browser.close() 262 self.switch_to_tab(0) 263 264 def get(self, url: str): 265 """Requests webpage at given url and rotates userAgent if necessary.""" 266 if not self.browser_open: 267 self.open_browser() 268 if ( 269 self.randomize_user_agent 270 and self.user_agent_rotation_period is not None 271 and self.rotation_timer.elapsed > (60 * self.user_agent_rotation_period) 272 ): 273 self.rotation_timer.stop() 274 self.close_browser() 275 self.open_browser() 276 self.browser.get(url) 277 self.script("Object.defineProperty(navigator, 'webdriver', {get: () => false})") 278 self.chill(self.arrival_wait) 279 280 def get_soup(self) -> BeautifulSoup: 281 """Returns a BeautifulSoup object 282 of the current page source.""" 283 return BeautifulSoup(self.browser.page_source, "html.parser") 284 285 def current_url(self) -> str: 286 """Returns current url of active tab.""" 287 return self.browser.current_url 288 289 def delete_cookies(self): 290 """Delete all cookies for 291 this browser instance.""" 292 self.browser.delete_all_cookies() 293 294 def turbo(self, engage: bool = True): 295 """When engaged, strings will be sent 296 to elements all at once and there will be 297 no waiting after actions. 
298 299 When disengaged, strings will be sent to elements 300 'one key at a time' with randomized amounts of 301 time between successive keys and after actions.""" 302 if engage: 303 self.after_key_wait = (0, 0) 304 self.after_field_wait = (0, 0) 305 self.after_click_wait = (0, 0) 306 self.arrival_wait = (1, 1) 307 self.one_key_at_a_time = False 308 self.turbo_engaged = True 309 else: 310 self.after_key_wait = (0.1, 0.5) 311 self.after_field_wait = (1, 2) 312 self.after_click_wait = (0.25, 1.5) 313 self.arrival_wait = (4, 10) 314 self.one_key_at_a_time = True 315 self.turbo_engaged = False 316 317 def chill(self, min_max: tuple[float, float]): 318 """Sleeps a random amount 319 between min_max[0] and min_max[1].""" 320 time.sleep(random.uniform(min_max[0], min_max[1])) 321 322 def script(self, script: str, args: Any = None) -> Any: 323 """Execute javascript code and returns result.""" 324 return self.browser.execute_script(script, args) 325 326 def remove(self, locator: str): 327 """Removes element from DOM.""" 328 self.script("arguments[0].remove();", self.find(locator)) 329 330 def get_length(self, locator: str) -> int: 331 """Returns number of child elements for a given element.""" 332 return int(self.script("return arguments[0].length;", self.find(locator))) 333 334 def find(self, locator: str) -> WebElement: 335 """Finds and returns a WebElement.""" 336 match self.locator_method: 337 case "xpath": 338 return self.browser.find_element(By.XPATH, locator) 339 case "id": 340 return self.browser.find_element(By.ID, locator) 341 case "className": 342 return self.browser.find_element(By.CLASS_NAME, locator) 343 case "name": 344 return self.browser.find_element(By.NAME, locator) 345 case "cssSelector": 346 return self.browser.find_element(By.CSS_SELECTOR, locator) 347 348 def find_children(self, locator: str) -> list[WebElement]: 349 """Returns a list of child WebElements 350 for given locator arg.""" 351 element = self.find(locator) 352 return 
element.find_elements("xpath", "./*") 353 354 def scroll(self, amount: int = None, fraction: float = None): 355 """Scroll web page. 356 :param amount: The number of lines to scroll if not None. 357 358 :param fraction: The amount between 0.0 and 1.0 359 of the page height to scroll. 360 361 If values are provided for both arguments, 362 amount will be used. 363 364 If values are provided for neither argument, 365 the entire page length will be scrolled. 366 367 Scrolls one line at a time if self.turbo is False.""" 368 if amount: 369 amount_to_scroll = amount 370 elif fraction: 371 amount_to_scroll = int( 372 fraction 373 * ( 374 int(self.script("return document.body.scrollHeight;")) 375 - int(self.script("return window.pageYOffset;")) 376 ) 377 ) 378 else: 379 amount_to_scroll = int(self.script("return document.body.scrollHeight;")) 380 if self.turbo_engaged: 381 self.script("window.scrollBy(0,arguments[0]);", amount_to_scroll) 382 else: 383 for _ in range(abs(amount_to_scroll)): 384 if amount_to_scroll >= 0: 385 self.script("window.scrollBy(0,1);") 386 else: 387 self.script("window.scrollBy(0,-1);") 388 self.chill(self.after_click_wait) 389 390 def scroll_into_view(self, locator: str) -> WebElement: 391 """Scrolls to a given element and returns the element.""" 392 element = self.find(locator) 393 self.script("arguments[0].scrollIntoView();", element) 394 self.chill(self.after_click_wait) 395 return element 396 397 def text(self, locator: str) -> str: 398 """Returns text of WebElement.""" 399 return self.find(locator).text 400 401 def click(self, locator: str) -> WebElement: 402 """Clicks on and returns WebElement.""" 403 element = self.find(locator) 404 element.click() 405 self.chill(self.after_click_wait) 406 return element 407 408 def clear(self, locator: str) -> WebElement: 409 """Clears content of WebElement if able 410 and then returns WebElement.""" 411 element = self.find(locator) 412 element.clear() 413 self.chill(self.after_click_wait) 414 return element 
415 416 def switch_to_iframe(self, locator: str): 417 """Switch to an iframe from given locator.""" 418 self.browser.switch_to.frame(self.find(locator)) 419 420 def switch_to_parent_frame(self): 421 """Move up a frame level from current frame.""" 422 self.browser.switch_to.parent_frame() 423 424 def select( 425 self, locator: str, method: str, choice: str | int | tuple 426 ) -> WebElement: 427 """Select a choice from Select element. 428 Returns the Select element from the locator string, 429 not the option element that is selected. 430 431 :param method: Can be 'value' or 'index' 432 433 :param choice: The option to select. 434 435 If method is 'value', then choice should be 436 the html 'value' attribute of the desired option. 437 438 If method is 'index', choice can either be a single 439 int for the desired option or it can be a two-tuple. 440 If the tuple is provided, a random option between the 441 two indicies (inclusive) will be selected.""" 442 element = self.click(locator) 443 match method: 444 case "value": 445 Select(element).select_by_value(choice) 446 case "index": 447 if type(choice) == tuple: 448 choice = random.randint(choice[0], choice[1]) 449 Select(element).select_by_index(choice) 450 self.chill(self.after_field_wait) 451 return element 452 453 def click_elements( 454 self, locators: list[str], max_selections: int = None, min_selections: int = 1 455 ) -> WebElement: 456 """Click a random number of WebElements 457 and return the last WebElement clicked. 458 459 :param locators: A list of element locators to choose from. 460 461 :param max_selections: The maximum number of elements to click. 462 If None, the maximum will be the length of the locators list. 463 464 :param min_selections: The minimum number of elements to click. 465 466 e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3) 467 will click between 1 and 3 random elements from the list. 
468 """ 469 if not max_selections: 470 max_selections = len(locators) 471 for option in random.sample( 472 locators, k=random.randint(min_selections, max_selections) 473 ): 474 element = self.click(option) 475 return element 476 477 def get_click_list( 478 self, num_options: int, max_choices: int = 1, min_choices: int = 1 479 ) -> list[str]: 480 """Similar to self.click_elements(), but for use with the self.fill_next() method. 481 482 Creates a list of length 'num_options' where every element is 'skip'. 483 484 A random number of elements in the list between 'min_choices' and 'max_choices' are 485 replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).""" 486 click_list = ["skip"] * num_options 487 selected_indexes = [] 488 for i in range(random.randint(min_choices, max_choices)): 489 index = random.randint(0, num_options - 1) 490 while index in selected_indexes: 491 index = random.randint(0, num_options - 1) 492 selected_indexes.append(index) 493 click_list[index] = self.keys.SPACE 494 return click_list 495 496 def send_keys( 497 self, 498 locator: str, 499 data: str, 500 click_first: bool = True, 501 clear_first: bool = False, 502 ) -> WebElement: 503 """Types data into element and returns the element. 504 505 :param data: The string to send to the element. 506 507 :param click_first: If True, the element is clicked on 508 before the data is sent. 
509 510 :param clear_first: If True, the current text of the element 511 is cleared before the data is sent.""" 512 element = self.click(locator) if click_first else self.find(locator) 513 if clear_first: 514 element.clear() 515 self.chill(self.after_click_wait) 516 if self.one_key_at_a_time: 517 for ch in str(data): 518 element.send_keys(ch) 519 self.chill(self.after_key_wait) 520 else: 521 element.send_keys(str(data)) 522 self.chill(self.after_field_wait) 523 return element 524 525 def fill_next( 526 self, data: list[str | tuple], start_element: WebElement = None 527 ) -> WebElement: 528 """Fills a form by tabbing from the current WebElement 529 to the next one and using the corresponding item in data. 530 Returns the last WebElement. 531 532 :param data: A list of form data. If an item is a string (except for 'skip') 533 it will be typed into the current WebElement. 534 535 An item in data can be a two-tuple of the form 536 ('downArrow', numberOfPresses:int|tuple[int, int]). 537 538 If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent 539 that many times to the WebElement. 540 541 If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random 542 number of times between numberOfPresses[0] and numberOfPresses[1] inclusive. 543 This is typically for use with Select elements. 544 545 An item in data can also be 'skip', which will perform no action on the current 546 WebElement and will continue to the next one. 547 548 An item in data can also be 'click=n', where 'n' is an integer b/t 0 and 100, 549 representing a percent chance an element will be clicked or skipped: 550 >>> user.fill_next(["click=70"]) 551 552 has a 70% chance of being 553 >>> user.fill_next([user.keys.SPACE]) 554 555 and a 30% chance of being 556 >>> user.fill_next(["skip"]) 557 558 559 :param start_element: The WebElement to start tabbing from. 560 The currently active element will be used if start_element is None. 
561 562 Note: The function tabs to the next element before sending data, 563 so the start_element should the WebElement before the one 564 that should receive data[0]. 565 """ 566 element = ( 567 self.browser.switch_to.active_element 568 if not start_element 569 else start_element 570 ) 571 for datum in data: 572 element.send_keys(Keys.TAB) 573 element = self.browser.switch_to.active_element 574 self.chill(self.after_key_wait) 575 if type(datum) == str and datum.strip().startswith("click="): 576 chance = int(datum.split("=")[1].strip()) 577 if random.randint(0, 100) <= chance: 578 datum = Keys.SPACE 579 else: 580 datum = "skip" 581 if datum[0] == "downArrow": 582 if type(datum[1]) == tuple: 583 times = random.randint(datum[1][0], datum[1][1]) 584 else: 585 times = datum[1] 586 for _ in range(times): 587 element.send_keys(Keys.ARROW_DOWN) 588 self.chill(self.after_key_wait) 589 elif datum == "skip": 590 self.chill(self.after_key_wait) 591 else: 592 593 if self.turbo_engaged: 594 element.send_keys(str(datum)) 595 else: 596 for ch in str(datum): 597 element.send_keys(ch) 598 self.chill(self.after_key_wait) 599 self.chill(self.after_field_wait) 600 return element 601 602 def wait_until( 603 self, condition: LambdaType, max_wait: float = 10, polling_interval: float = 0.1 604 ): 605 """Checks condition repeatedly until either it is true, 606 or the max_wait is exceeded. 607 608 Raises a TimeoutError if the condition doesn't success within max_wait. 609 610 Useful for determing whether a form has been successfully submitted. 611 612 :param condition: The condition function to check. 613 614 :param max_wait: Number of seconds to continue checking condition 615 before throwing a TimeoutError. 616 617 :param polling_interval: The number of seconds to sleep before 618 checking the condition function again after it fails. 619 620 e.g. 
self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]))""" 621 start_time = time.time() 622 while True: 623 try: 624 if condition(): 625 time.sleep(1) 626 break 627 elif (time.time() - start_time) > max_wait: 628 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 629 else: 630 time.sleep(polling_interval) 631 except: 632 if (time.time() - start_time) > max_wait: 633 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 634 else: 635 time.sleep(polling_interval) 636 637 def dismiss_alert(self): 638 """Dismiss alert dialog.""" 639 self.browser.switch_to.alert.dismiss() 640 641 def solve_recaptcha_v3( 642 self, 643 outer_iframe_xpath: str = '//iframe[@title="reCAPTCHA"]', 644 inner_iframe_xpath: str = '//iframe[@title="recaptcha challenge expires in two minutes"]', 645 ): 646 """Pass google recaptcha v3 by solving an audio puzzle. 647 648 :param outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox. 649 If it's the recaptcha without the initial checkbox that just shows the image puzzle, 650 pass None to this argument. 651 652 """ 653 locator_method = self.locator_method 654 self.locator_method = "xpath" 655 try: 656 if outer_iframe_xpath: 657 self.switch_to_iframe(outer_iframe_xpath) 658 self.click('//*[@id="recaptcha-anchor"]') 659 self.switch_to_parent_frame() 660 self.switch_to_iframe(inner_iframe_xpath) 661 self.click('//*[@id="recaptcha-audio-button"]') 662 mp3_url = self.find( 663 '//a[@class="rc-audiochallenge-tdownload-link"]' 664 ).get_attribute("href") 665 text = get_text_from_url(mp3_url, ".mp3") 666 self.send_keys('//*[@id="audio-response"]', text) 667 self.click('//*[@id="recaptcha-verify-button"]') 668 except Exception as e: 669 print(e) 670 raise Exception("Could not solve captcha") 671 finally: 672 self.switch_to_parent_frame() 673 self.locator_method = locator_method
Sits on top of selenium to streamline automation and scraping tasks.
def __init__(
    self,
    headless: bool = False,
    browser_type: str = "firefox",
    implicit_wait: int = 10,
    page_load_timeout: int = 60,
    open_browser: bool = True,
    locator_method: str = "xpath",
    randomize_user_agent: bool = True,
    user_agent_rotation_period: int = None,
    move_window_by: tuple[int, int] = (0, -1000),
    download_dir: str | Path = None,
    driver_path: str | Path = None,
):
    """Store the configuration and, unless told otherwise, launch the browser.

    :param headless: If True, browser window will not be visible.

    :param browser_type: Which browser to use. Can be 'firefox' or 'chrome'
    (case-insensitive). Any other value raises ValueError.

    :param implicit_wait: Number of seconds to look for a specified element before
    selenium considers it missing and throws an exception.

    :param page_load_timeout: Time in seconds for selenium to wait for a page to load
    before throwing an exception.

    :param open_browser: If True, opens a browser window when a User object is created.
    If False, self.browser is set to None and self.open_browser() must be called manually.

    :param locator_method: The locator type User should expect to be given.
    Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'.

    :param randomize_user_agent: If True, a random useragent will be used whenever
    the browser is opened. If False, the native useragent will be used.

    :param user_agent_rotation_period: If not None, the browser is closed and reopened
    with a new useragent on the first self.get() call after this many minutes.
    Ignored if randomize_user_agent is False.

    :param move_window_by: The x and y amount of pixels to move the browser window by after opening.

    :param download_dir: The download folder to use. If None, the default folder will be used.

    :param driver_path: The path to the webdriver executable selenium should use.
    If falsy, self.search_for_driver() is used to look for one.
    """
    self.headless = headless
    normalized_type = browser_type.lower()
    if normalized_type not in ["firefox", "chrome"]:
        raise ValueError("'browser_type' parameter must be 'firefox' or 'chrome'")
    self.browser_type = normalized_type
    self.browser_open = False

    # Timing and user agent settings.
    self.implicit_wait = implicit_wait
    self.page_load_timeout = page_load_timeout
    self.rotation_timer = Timer()
    self.randomize_user_agent = randomize_user_agent
    self.user_agent_rotation_period = user_agent_rotation_period

    self.locator_method = locator_method
    # Turbo mode is the starting state; call self.turbo(False) to slow down.
    self.turbo()
    # Expose selenium's Keys so callers can write user.keys.SPACE etc.
    self.keys = Keys
    self.move_window_by = move_window_by
    self.download_dir = download_dir

    self.driver_path = driver_path
    if not self.driver_path:
        self.search_for_driver()

    if not open_browser:
        self.browser = None
    else:
        self.open_browser()
    # Make sure the browser is shut down when the interpreter exits.
    atexit.register(self.close_browser)
Parameters
headless: If True, browser window will not be visible.
browser_type: Which browser to use. Can be 'firefox' or 'chrome'.
implicit_wait: Number of seconds to look for a specified element before selenium considers it missing and throws an exception.
page_load_timeout: Time in seconds for selenium to wait for a page to load before throwing an exception.
open_browser: If True, opens a browser window when a User object is created. If False, a manual call to self.open_browser() must be made.
locator_method: The locator type User should expect to be given. Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'. Every member function with a 'locator' argument refers to a string matching the current locator_method.
randomize_user_agent: If True, a random useragent will be used whenever the browser is opened. If False, the native useragent will be used.
user_agent_rotation_period: If not None, the browser window will be closed and reopened with a new useragent every user_agent_rotation_period number of minutes. Rotation occurs on the first call to self.get() after the time period has elapsed. Ignored if randomize_user_agent is False.
move_window_by: The x and y amount of pixels to move the browser window by after opening.
download_dir: The download folder to use. If None, the default folder will be used.
driver_path: The path to the webdriver executable selenium should use. If None, the system PATH will be checked for the executable. If the executable isn't found, the parent directories and the immediate child directories of the current working directory will be searched.
def configure_firefox(self) -> FirefoxService:
    """Configure options, profile and driver service for firefox.

    Populates self.options, self.profile (None unless self.download_dir is set)
    and self.service.

    :return: The FirefoxService stored in self.service.
    """
    self.options = FirefoxOptions()
    self.options.headless = self.headless
    self.options.set_preference(
        "widget.windows.window_occlusion_tracking.enabled", False
    )
    self.options.set_preference("dom.webaudio.enabled", False)
    if self.randomize_user_agent:
        self.options.set_preference("general.useragent.override", get_agent())
    if self.download_dir:
        Path(self.download_dir).mkdir(parents=True, exist_ok=True)
        self.profile = FirefoxProfile()
        self.profile.set_preference("browser.download.dir", str(self.download_dir))
        self.profile.set_preference("browser.download.folderList", 2)
    else:
        self.profile = None
    service_kwargs = {"log_path": os.devnull}
    # Only forward a driver path that actually exists; str(None) would hand
    # selenium the literal string "None" as the executable.
    if self.driver_path:
        service_kwargs["executable_path"] = str(self.driver_path)
    self.service = FirefoxService(**service_kwargs)
    return self.service
Configure options and profile for firefox.
def configure_chrome(self) -> ChromeService:
    """Configure options and driver service for chrome.

    Populates self.options and self.service.

    :return: The ChromeService stored in self.service.
    """
    self.options = ChromeOptions()
    self.options.headless = self.headless
    for argument in (
        "--disable-blink-features=AutomationControlled",
        "--mute-audio",
        "--disable-infobars",
        "--disable-notifications",
        "--log-level=3",
    ):
        self.options.add_argument(argument)
    if self.randomize_user_agent:
        self.options.add_argument(f"--user-agent={get_agent()}")
    self.options.add_experimental_option("useAutomationExtension", False)
    if self.download_dir:
        Path(self.download_dir).mkdir(parents=True, exist_ok=True)
        self.options.add_experimental_option(
            "prefs", {"download.default_directory": str(self.download_dir)}
        )
    service_kwargs = {"log_path": os.devnull}
    # Only forward a driver path that actually exists; str(None) would hand
    # selenium the literal string "None" as the executable.
    if self.driver_path:
        service_kwargs["executable_path"] = str(self.driver_path)
    self.service = ChromeService(**service_kwargs)
    return self.service
Configure options and profile for chrome.
def search_for_driver(self):
    """Searches for the webdriver executable and stores it in self.driver_path.

    Locations are checked in this order and the first hit wins:

    1. every directory listed in the PATH environment variable,
    2. the current working directory and each of its parent folders,
    3. the immediate child folders of the current working directory.

    A warning is issued (and self.driver_path is left untouched)
    if the executable isn't found anywhere.
    """
    driver = {"firefox": "geckodriver", "chrome": "chromedriver"}[self.browser_type]
    if sys.platform == "win32":
        driver += ".exe"
    cwd = Path.cwd()
    # A missing PATH variable is treated as an empty one instead of raising;
    # empty PATH entries are dropped.
    path_dirs = [
        Path(entry)
        for entry in os.environ.get("PATH", "").split(os.pathsep)
        if entry
    ]
    # cwd.parents runs all the way up to the filesystem root.
    for directory in [*path_dirs, cwd, *cwd.parents]:
        if (directory / driver).exists():
            self.driver_path = directory / driver
            return
    # check child folders (only 1 level down)
    for child in cwd.iterdir():
        if child.is_dir() and (child / driver).exists():
            self.driver_path = child / driver
            return
    warn(f"Could not find {driver}")
Searches for the webdriver executable.
def set_implicit_wait(self, wait_time: int = None):
    """Sets the browser's implicit wait.

    :param wait_time: Number of seconds to wait. If None, self.implicit_wait
    is used. An explicit 0 is honored rather than being replaced by the default.
    """
    if wait_time is None:
        wait_time = self.implicit_wait
    self.browser.implicitly_wait(wait_time)
Sets to default time if no arg given.
def open_browser(self):
    """Configures and opens selenium browser.

    Does nothing except issue a warning when the browser is already open.
    Otherwise the driver for self.browser_type is created, the implicit wait,
    window position and page load timeout are applied, the tab index is
    reset to 0 and the user agent rotation timer is started.
    """
    if self.browser_open:
        warn("Browser already open.")
        return
    if self.browser_type == "firefox":
        self.configure_firefox()
        self.browser = webdriver.Firefox(
            options=self.options,
            service=self.service,
            firefox_profile=self.profile,
        )
    elif self.browser_type == "chrome":
        self.configure_chrome()
        self.browser = webdriver.Chrome(options=self.options, service=self.service)
    self.set_implicit_wait()
    # Maximize, shift the window by the configured offset, then maximize again.
    self.browser.maximize_window()
    x_offset = self.move_window_by[0]
    y_offset = self.move_window_by[1]
    self.browser.set_window_position(x_offset, y_offset)
    self.browser.maximize_window()
    self.browser.set_page_load_timeout(self.page_load_timeout)
    self.browser_open = True
    self.tab_index = 0
    self.rotation_timer.start()
Configures and opens selenium browser.
def close_browser(self):
    """Quit the browser if it is currently open; otherwise do nothing."""
    if not self.browser_open:
        return
    # Flip the flag first so it reads closed even if quit() raises.
    self.browser_open = False
    self.browser.quit()
Close browser window.
def open_tab(self, url: str = "", switch_to_tab: bool = True):
    """Opens new tab and, if provided, goes to url.

    :param url: The address to load in the new tab.

    :param switch_to_tab: If True, the newly opened tab becomes the active tab.
    The new tab is found by comparing the window handles before and after
    opening, so this works no matter which tab was active beforehand.
    """
    handles_before = set(self.browser.window_handles) if switch_to_tab else None
    self.script("window.open(arguments[0]);", url)
    if not switch_to_tab:
        return
    new_indexes = [
        index
        for index, handle in enumerate(self.browser.window_handles)
        if handle not in handles_before
    ]
    # Fall back to the previous "next index" behavior if no new handle shows up.
    target_index = new_indexes[-1] if new_indexes else self.tab_index + 1
    self.switch_to_tab(target_index)
Opens new tab and, if provided, goes to url.
New tab is inserted after currently active tab.
def switch_to_tab(self, tab_index: int):
    """Make the tab at tab_index (zero indexed) the active one.

    The index refers to the position in the browser's window_handles list
    and is remembered in self.tab_index.
    """
    open_handles = self.browser.window_handles
    target_handle = open_handles[tab_index]
    self.browser.switch_to.window(target_handle)
    self.tab_index = tab_index
Switch to a tab in browser, zero indexed.
def get_num_tabs(self) -> int:
    """Count the window handles the browser currently reports."""
    open_handles = self.browser.window_handles
    return len(open_handles)
Returns number of tabs open.
def close_tab(self, tab_index: int = 1):
    """Closes the tab at tab_index and then switches to tab index 0."""
    # The tab has to be active before the driver can close it.
    self.switch_to_tab(tab_index)
    self.browser.close()
    # Land on the first tab so the driver has a live window afterwards.
    first_tab = 0
    self.switch_to_tab(first_tab)
Closes the specified tab and switches to tab index 0.
def get(self, url: str):
    """Load url in the active tab, rotating the user agent first if it is due.

    Opens the browser if it isn't open yet. When user agent randomization is on,
    a rotation period is set, and more than that many minutes have elapsed on
    the rotation timer, the browser is closed and reopened before the request.
    After loading, navigator.webdriver is overridden to report false and the
    arrival wait is applied.
    """
    if not self.browser_open:
        self.open_browser()
    rotation_due = False
    if self.randomize_user_agent and self.user_agent_rotation_period is not None:
        # The period is configured in minutes; the timer reports seconds.
        period_in_seconds = 60 * self.user_agent_rotation_period
        rotation_due = self.rotation_timer.elapsed > period_in_seconds
    if rotation_due:
        self.rotation_timer.stop()
        self.close_browser()
        self.open_browser()
    self.browser.get(url)
    self.script("Object.defineProperty(navigator, 'webdriver', {get: () => false})")
    self.chill(self.arrival_wait)
Requests webpage at given url and rotates userAgent if necessary.
def get_soup(self) -> BeautifulSoup:
    """Parse the current page source with BeautifulSoup's
    "html.parser" and return the resulting object."""
    page_html = self.browser.page_source
    return BeautifulSoup(page_html, "html.parser")
Returns a BeautifulSoup object of the current page source.
def current_url(self) -> str:
    """Return the url the browser reports for the active tab."""
    url = self.browser.current_url
    return url
Returns current url of active tab.
def turbo(self, engage: bool = True):
    """Switch between fast and human-like pacing.

    When engaged, strings are sent to elements all at once and the waits
    after keys, fields and clicks are zero (arrival wait is one second).

    When disengaged, strings are sent 'one key at a time' with randomized
    waits between successive keys and after actions.
    """
    # Each entry is a (min, max) range in seconds, in the order:
    # after key, after field, after click, after arriving at a page.
    if engage:
        wait_ranges = ((0, 0), (0, 0), (0, 0), (1, 1))
    else:
        wait_ranges = ((0.1, 0.5), (1, 2), (0.25, 1.5), (4, 10))
    (
        self.after_key_wait,
        self.after_field_wait,
        self.after_click_wait,
        self.arrival_wait,
    ) = wait_ranges
    self.one_key_at_a_time = not engage
    self.turbo_engaged = bool(engage)
When engaged, strings will be sent to elements all at once and there will be no waiting after actions.
When disengaged, strings will be sent to elements 'one key at a time' with randomized amounts of time between successive keys and after actions.
def chill(self, min_max: tuple[float, float]):
    """Pause for a random number of seconds drawn uniformly
    from the range min_max[0] to min_max[1]."""
    shortest = min_max[0]
    longest = min_max[1]
    pause = random.uniform(shortest, longest)
    time.sleep(pause)
Sleeps a random amount between min_max[0] and min_max[1].
def script(self, script: str, args: Any = None) -> Any:
    """Run javascript in the browser and hand back whatever it returns.

    :param script: The javascript source to execute.

    :param args: A single value passed to the script, where it is
    available as arguments[0]. It is passed along even when it is None.
    """
    outcome = self.browser.execute_script(script, args)
    return outcome
Executes JavaScript code and returns the result.
def remove(self, locator: str):
    """Find the element for locator and delete it from the DOM
    by calling its javascript remove() method."""
    doomed = self.find(locator)
    self.script("arguments[0].remove();", doomed)
Removes element from DOM.
def get_length(self, locator: str) -> int:
    """Return the javascript `length` property of the located element as an int.

    NOTE(review): documented as the number of child elements; the script reads
    `arguments[0].length`, so the result depends on what `length` means for
    the element type - confirm against intended usage.
    """
    target = self.find(locator)
    raw_length = self.script("return arguments[0].length;", target)
    return int(raw_length)
Returns number of child elements for a given element.
334 def find(self, locator: str) -> WebElement: 335 """Finds and returns a WebElement.""" 336 match self.locator_method: 337 case "xpath": 338 return self.browser.find_element(By.XPATH, locator) 339 case "id": 340 return self.browser.find_element(By.ID, locator) 341 case "className": 342 return self.browser.find_element(By.CLASS_NAME, locator) 343 case "name": 344 return self.browser.find_element(By.NAME, locator) 345 case "cssSelector": 346 return self.browser.find_element(By.CSS_SELECTOR, locator)
Finds and returns a WebElement.
def find_children(self, locator: str) -> list[WebElement]:
    """Locate an element and return its direct child WebElements as a list."""
    parent = self.find(locator)
    # "./*" matches every element one level below the parent.
    children = parent.find_elements("xpath", "./*")
    return children
Returns a list of child WebElements for given locator arg.
def scroll(self, amount: int = None, fraction: float = None):
    """Scroll web page.

    :param amount: The distance to scroll if not None, passed to
    window.scrollBy. Negative values scroll up. An explicit 0 scrolls nothing.

    :param fraction: The amount between 0.0 and 1.0 of the remaining page
    height (document height minus the current vertical offset) to scroll.

    If values are provided for both arguments,
    amount will be used.

    If values are provided for neither argument,
    the entire page length will be scrolled.

    Scrolls one unit at a time if turbo is not engaged.
    """
    # Compare against None so that 0 and 0.0 are honored as real values
    # instead of falling through to "scroll the whole page".
    if amount is not None:
        amount_to_scroll = amount
    elif fraction is not None:
        page_height = int(self.script("return document.body.scrollHeight;"))
        current_offset = int(self.script("return window.pageYOffset;"))
        amount_to_scroll = int(fraction * (page_height - current_offset))
    else:
        amount_to_scroll = int(self.script("return document.body.scrollHeight;"))
    if self.turbo_engaged:
        self.script("window.scrollBy(0,arguments[0]);", amount_to_scroll)
    else:
        step_script = (
            "window.scrollBy(0,1);"
            if amount_to_scroll >= 0
            else "window.scrollBy(0,-1);"
        )
        for _ in range(abs(amount_to_scroll)):
            self.script(step_script)
    self.chill(self.after_click_wait)
Scroll web page.
Parameters
amount: The number of pixels to scroll if not None. Negative values scroll up.
fraction: The amount between 0.0 and 1.0 of the page height to scroll.
If values are provided for both arguments, amount will be used.
If values are provided for neither argument, the entire page length will be scrolled.
Scrolls one pixel at a time if turbo is not engaged (self.turbo_engaged is False).
def scroll_into_view(self, locator: str) -> WebElement:
    """Locate an element, call its javascript scrollIntoView(),
    pause for the after-click wait, and return the element."""
    target = self.find(locator)
    self.script("arguments[0].scrollIntoView();", target)
    self.chill(self.after_click_wait)
    return target
Scrolls to a given element and returns the element.
def text(self, locator: str) -> str:
    """Locate an element and return its text attribute."""
    element = self.find(locator)
    return element.text
Returns text of WebElement.
def click(self, locator: str) -> WebElement:
    """Locate an element, click it, pause for the after-click wait,
    and return the element."""
    target = self.find(locator)
    target.click()
    self.chill(self.after_click_wait)
    return target
Clicks on and returns WebElement.
def clear(self, locator: str) -> WebElement:
    """Locate an element, clear its content, pause for the
    after-click wait, and return the element."""
    field = self.find(locator)
    field.clear()
    self.chill(self.after_click_wait)
    return field
Clears content of WebElement if able and then returns WebElement.
def switch_to_iframe(self, locator: str):
    """Locate an iframe element and make it the browser's current frame."""
    frame_element = self.find(locator)
    self.browser.switch_to.frame(frame_element)
Switch to an iframe from given locator.
def switch_to_parent_frame(self):
    """Step out of the current frame into the one that contains it."""
    switcher = self.browser.switch_to
    switcher.parent_frame()
Move up a frame level from current frame.
def select(
    self, locator: str, method: str, choice: str | int | tuple
) -> WebElement:
    """Select a choice from Select element.
    Returns the Select element from the locator string,
    not the option element that is selected.

    :param method: Can be 'value' or 'index'

    :param choice: The option to select.

    If method is 'value', then choice should be
    the html 'value' attribute of the desired option.

    If method is 'index', choice can either be a single
    int for the desired option or it can be a two-tuple.
    If the tuple is provided, a random option between the
    two indices (inclusive) will be selected.

    :raises ValueError: If method is neither 'value' nor 'index'.
    The check happens before the element is clicked.
    """
    if method not in ("value", "index"):
        raise ValueError(f"'method' must be 'value' or 'index', not {method!r}")
    element = self.click(locator)
    dropdown = Select(element)
    if method == "value":
        dropdown.select_by_value(choice)
    else:
        # isinstance also accepts tuple subclasses such as namedtuples.
        if isinstance(choice, tuple):
            choice = random.randint(choice[0], choice[1])
        dropdown.select_by_index(choice)
    self.chill(self.after_field_wait)
    return element
Select a choice from Select element. Returns the Select element from the locator string, not the option element that is selected.
Parameters
method: Can be 'value' or 'index'
choice: The option to select.
If method is 'value', then choice should be the html 'value' attribute of the desired option.
If method is 'index', choice can either be a single int for the desired option or it can be a two-tuple. If the tuple is provided, a random option between the two indices (inclusive) will be selected.
def click_elements(
    self, locators: list[str], max_selections: int = None, min_selections: int = 1
) -> WebElement:
    """Click a random number of WebElements
    and return the last WebElement clicked.

    Returns None if no element ends up being clicked
    (e.g. an empty locators list or a selection count of 0).

    :param locators: A list of element locators to choose from.

    :param max_selections: The maximum number of elements to click.
    If None, the maximum will be the length of the locators list.
    Values larger than the list are capped at its length.

    :param min_selections: The minimum number of elements to click.
    Capped so that it never exceeds the effective maximum.

    e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3)
    will click between 1 and 3 random elements from the list.
    """
    if max_selections is None:
        max_selections = len(locators)
    # random.sample can't draw more items than exist, and randint
    # needs lower <= upper, so clamp both bounds.
    upper = max(0, min(max_selections, len(locators)))
    lower = max(0, min(min_selections, upper))
    element = None
    for option in random.sample(locators, k=random.randint(lower, upper)):
        element = self.click(option)
    return element
Click a random number of WebElements and return the last WebElement clicked.
Parameters
locators: A list of element locators to choose from.
max_selections: The maximum number of elements to click. If None, the maximum will be the length of the locators list.
min_selections: The minimum number of elements to click.
e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3) will click between 1 and 3 random elements from the list.
def get_click_list(
    self, num_options: int, max_choices: int = 1, min_choices: int = 1
) -> list[str]:
    """Similar to self.click_elements(), but for use with the self.fill_next() method.

    Creates a list of length 'num_options' where every element is 'skip'.

    A random number of elements in the list between 'min_choices' and 'max_choices' are
    replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).

    :param num_options: The length of the returned list.

    :param max_choices: The maximum number of elements to replace.
    Capped to 'num_options'.

    :param min_choices: The minimum number of elements to replace.
    Capped to the effective maximum."""
    # Clamp the bounds so a request for more choices than options terminates.
    # The old retry loop spun forever once every index had been used.
    upper = max(0, min(max_choices, num_options))
    lower = max(0, min(min_choices, upper))
    click_list = ["skip"] * num_options
    # random.sample draws distinct indexes directly, no rejection loop needed.
    for index in random.sample(range(num_options), k=random.randint(lower, upper)):
        click_list[index] = self.keys.SPACE
    return click_list
Similar to self.click_elements(), but for use with the self.fill_next() method.
Creates a list of length 'num_options' where every element is 'skip'.
A random number of elements in the list between 'min_choices' and 'max_choices' are replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).
def send_keys(
    self,
    locator: str,
    data: str,
    click_first: bool = True,
    clear_first: bool = False,
) -> WebElement:
    """Type data into the element found by locator and return that element.

    :param locator: Locator string for the element that receives the data.

    :param data: The value to send to the element.
    It is converted with str() before being sent.

    :param click_first: If True, the element is obtained by clicking it.
    If False, it is only looked up with self.find().

    :param clear_first: If True, the element's current text is cleared
    (followed by the after-click wait) before the data is sent."""
    if click_first:
        target = self.click(locator)
    else:
        target = self.find(locator)
    if clear_first:
        target.clear()
        self.chill(self.after_click_wait)
    per_key = self.one_key_at_a_time
    # Iterating a str yields single characters; the one-tuple sends it whole.
    pieces = str(data) if per_key else (str(data),)
    for piece in pieces:
        target.send_keys(piece)
        if per_key:
            self.chill(self.after_key_wait)
    self.chill(self.after_field_wait)
    return target
Types data into element and returns the element.
Parameters
data: The string to send to the element.
click_first: If True, the element is clicked on before the data is sent.
clear_first: If True, the current text of the element is cleared before the data is sent.
def fill_next(
    self, data: list[str | tuple], start_element: WebElement = None
) -> WebElement:
    """Fills a form by tabbing from the current WebElement
    to the next one and using the corresponding item in data.
    Returns the last WebElement.

    :param data: A list of form data. If an item is a string (except for 'skip')
    it will be typed into the current WebElement.

    An item in data can be a two-tuple of the form
    ('downArrow', numberOfPresses:int|tuple[int, int]).

    If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent
    that many times to the WebElement.

    If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random
    number of times between numberOfPresses[0] and numberOfPresses[1] inclusive.
    This is typically for use with Select elements.

    An item in data can also be 'skip', which will perform no action on the current
    WebElement and will continue to the next one.

    An item in data can also be 'click=n', where 'n' is an integer between 0 and 100,
    representing the percent chance that an element will be clicked rather than skipped:
    >>> user.fill_next(["click=70"])

    has a 70% chance of being
    >>> user.fill_next([user.keys.SPACE])

    and a 30% chance of being
    >>> user.fill_next(["skip"])

    Any other item is converted with str() and typed into the WebElement.

    :param start_element: The WebElement to start tabbing from.
    The currently active element will be used if start_element is None.

    Note: The function tabs to the next element before sending data,
    so the start_element should be the WebElement before the one
    that should receive data[0].
    """
    element = (
        self.browser.switch_to.active_element
        if start_element is None
        else start_element
    )
    for datum in data:
        element.send_keys(Keys.TAB)
        element = self.browser.switch_to.active_element
        self.chill(self.after_key_wait)
        if isinstance(datum, str) and datum.strip().startswith("click="):
            chance = int(datum.split("=")[1].strip())
            # Draw from 1..100 (100 outcomes) so 'click=0' never clicks and
            # 'click=n' clicks exactly n% of the time; 0..100 had 101 outcomes.
            datum = Keys.SPACE if random.randint(1, 100) <= chance else "skip"
        # Only index into sequences: an empty string or a non-subscriptable
        # item must fall through to the typing branch instead of raising.
        if isinstance(datum, (tuple, list)) and datum and datum[0] == "downArrow":
            presses = datum[1]
            if isinstance(presses, tuple):
                presses = random.randint(presses[0], presses[1])
            for _ in range(presses):
                element.send_keys(Keys.ARROW_DOWN)
                self.chill(self.after_key_wait)
        elif datum == "skip":
            self.chill(self.after_key_wait)
        else:
            # NOTE(review): send_keys() makes the same whole-string vs per-key
            # choice from self.one_key_at_a_time; confirm turbo_engaged is the
            # intended attribute here.
            if self.turbo_engaged:
                element.send_keys(str(datum))
            else:
                for ch in str(datum):
                    element.send_keys(ch)
                    self.chill(self.after_key_wait)
        self.chill(self.after_field_wait)
    return element
Fills a form by tabbing from the current WebElement to the next one and using the corresponding item in data. Returns the last WebElement.
Parameters
- data: A list of form data. If an item is a string (except for 'skip') it will be typed into the current WebElement.
An item in data can be a two-tuple of the form ('downArrow', numberOfPresses:int|tuple[int, int]).
If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent that many times to the WebElement.
If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random number of times between numberOfPresses[0] and numberOfPresses[1] inclusive. This is typically for use with Select elements.
An item in data can also be 'skip', which will perform no action on the current WebElement and will continue to the next one.
An item in data can also be 'click=n', where 'n' is an integer between 0 and 100, representing the percent chance that an element will be clicked rather than skipped:
>>> user.fill_next(["click=70"])
has a 70% chance of being
>>> user.fill_next([user.keys.SPACE])
and a 30% chance of being
>>> user.fill_next(["skip"])
- start_element: The WebElement to start tabbing from. The currently active element will be used if start_element is None.
Note: The function tabs to the next element before sending data, so the start_element should be the WebElement before the one that should receive data[0].
def wait_until(
    self,
    condition: LambdaType,
    max_wait: float = 10,
    polling_interval: float = 0.1,
    settle_time: float = 1,
):
    """Checks condition repeatedly until either it is true,
    or the max_wait is exceeded.

    Raises a TimeoutError if the condition doesn't succeed within max_wait.
    An exception raised by condition counts as a failed check; the most
    recent one is attached to the TimeoutError as its cause.

    Useful for determining whether a form has been successfully submitted.

    :param condition: The condition function to check.

    :param max_wait: Number of seconds to continue checking condition
    before throwing a TimeoutError.

    :param polling_interval: The number of seconds to sleep before
    checking the condition function again after it fails.

    :param settle_time: The number of seconds to sleep after the condition
    succeeds, before returning.

    e.g. self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]'))"""
    # Monotonic clock: elapsed time is unaffected by system clock adjustments.
    start_time = time.monotonic()
    while True:
        last_error = None
        try:
            if condition():
                time.sleep(settle_time)
                return
        # Exception, not a bare except, so KeyboardInterrupt and SystemExit
        # still stop the wait immediately.
        except Exception as error:
            last_error = error
        if (time.monotonic() - start_time) > max_wait:
            raise TimeoutError(
                f"max_wait exceeded in wait_until({condition})"
            ) from last_error
        time.sleep(polling_interval)
Checks condition repeatedly until either it is true, or the max_wait is exceeded.
Raises a TimeoutError if the condition doesn't succeed within max_wait.
Useful for determining whether a form has been successfully submitted.
Parameters
condition: The condition function to check.
max_wait: Number of seconds to continue checking condition before throwing a TimeoutError.
polling_interval: The number of seconds to sleep before checking the condition function again after it fails.
e.g. self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]'))
def dismiss_alert(self):
    """Switch to the browser's alert dialog and dismiss it."""
    alert = self.browser.switch_to.alert
    alert.dismiss()
Dismiss alert dialog.
def solve_recaptcha_v3(
    self,
    outer_iframe_xpath: str = '//iframe[@title="reCAPTCHA"]',
    inner_iframe_xpath: str = '//iframe[@title="recaptcha challenge expires in two minutes"]',
):
    """Pass google recaptcha v3 by solving an audio puzzle.

    The locator method is switched to 'xpath' for the duration of the call
    and restored afterwards, whether or not the attempt succeeds.

    :param outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox.
    If it's the recaptcha without the initial checkbox that just shows the image puzzle,
    pass None to this argument.

    :param inner_iframe_xpath: Xpath to the iframe containing the challenge,
    where the audio button, audio download link, response field
    and verify button are looked up.

    :raises Exception: If any step fails. The underlying error is printed
    and attached as the cause.
    """
    locator_method = self.locator_method
    self.locator_method = "xpath"
    try:
        if outer_iframe_xpath:
            self.switch_to_iframe(outer_iframe_xpath)
            self.click('//*[@id="recaptcha-anchor"]')
            self.switch_to_parent_frame()
        self.switch_to_iframe(inner_iframe_xpath)
        self.click('//*[@id="recaptcha-audio-button"]')
        mp3_url = self.find(
            '//a[@class="rc-audiochallenge-tdownload-link"]'
        ).get_attribute("href")
        # Transcribe the audio challenge and submit the text as the answer.
        text = get_text_from_url(mp3_url, ".mp3")
        self.send_keys('//*[@id="audio-response"]', text)
        self.click('//*[@id="recaptcha-verify-button"]')
    except Exception as e:
        print(e)
        # Chain explicitly so callers can inspect the real failure.
        raise Exception("Could not solve captcha") from e
    finally:
        # Always leave the iframe and restore the caller's locator method.
        self.switch_to_parent_frame()
        self.locator_method = locator_method
Pass google recaptcha v3 by solving an audio puzzle.
Parameters
- outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox. If it's the recaptcha without the initial checkbox that just shows the image puzzle, pass None to this argument.