Coverage for src/artemis_sg/scraper.py: 83% (623 statements)
import logging
import os.path
import re
import time  # for additional sleeps in page load. This is a smell.
import urllib.parse

from rich.console import Console
from rich.text import Text

# Selenium
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    ElementNotInteractableException,
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)

# Chrome
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys as SeleniumKeys
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

from artemis_sg import spreadsheet, vendor
from artemis_sg.config import CFG
from artemis_sg.items import Items

# Firefox
# from selenium.webdriver.firefox.service import Service as FirefoxService

MODULE = os.path.splitext(os.path.basename(__file__))[0]
console = Console()

IMG_FAILOVER_THRESHHOLD = 2


class BaseScraper:
    # {{{
    """
    Scraper objects know how to scrape a base URL.
    """

    def __init__(self, selenium_driver, base_url=None):
        self.selenium_driver = selenium_driver
        if not base_url:
            self.base_url = ""
        else:
            self.base_url = base_url

    def load_item_page(self, item_number):
        return False

    def scrape_description(self):
        description = ""
        return description

    def scrape_item_image_urls(self):
        urls = []
        return urls

    def delay(self, secs):
        time.sleep(secs)

    # }}}
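

# A minimal usage sketch (illustrative only; nothing in this module calls it).
# It assumes a local Chrome/chromedriver install. get_driver() is defined in
# the utility section at the bottom of this file, GJScraper is defined below,
# and the ISBN is made up.
def _example_scraper_usage():  # pragma: no cover
    driver = get_driver()
    try:
        # Every BaseScraper subclass follows the same load -> scrape protocol.
        scraper = GJScraper(driver)
        if scraper.load_item_page("9780000000000"):
            description = scraper.scrape_description()
            urls = scraper.scrape_item_image_urls()
            logging.info("description=%r urls=%r", description, urls)
    finally:
        driver.quit()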


class GJScraper(BaseScraper):
    # {{{
    """
    GJScraper objects know how to scrape GJ item pages
    """

    def __init__(self, selenium_driver, base_url="https://greatjonesbooks.com"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 3

    def load_item_page(self, item_number, tries=0):
        namespace = f"{type(self).__name__}.{self.load_item_page.__name__}"

        # GJ does not maintain session if the links on page are not used
        # if not logged in, then build url; else use search facility
        try:
            self.delay(1)
            WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located(
                    (By.XPATH, "//a[@href='/account' and text()='Account Summary']")
                )
            )
        except (NoSuchElementException, TimeoutException):
            start = "/product/"
            url = self.base_url + start + item_number
            self.selenium_driver.get(url)
            return True
        try:
            search = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.XPATH, "//a[@href='/search']"))
            )
            search.click()
            self.delay(2)

            # wait until Publisher list is populated
            timeout_bak = self.timeout
            self.timeout = 60
            WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located(
                    # TODO: (#163) move to CFG
                    (By.XPATH, "//option[@value='Abbeville']")
                )
            )
            self.timeout = timeout_bak
            # then get itemCode field for search
            item_field = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.XPATH, "//input[@name='itemCode']"))
            )
            search_button = self.selenium_driver.find_element(
                By.CSS_SELECTOR, ".buttonSet > button:nth-child(1)"
            )
            clear_button = self.selenium_driver.find_element(
                By.CSS_SELECTOR, ".buttonSet > button:nth-child(2)"
            )
            clear_button.click()
            item_field.send_keys(item_number)
            self.delay(2)
            search_button.click()
            self.delay(2)
            # check for No Results
            e = self.selenium_driver.find_element(
                By.XPATH, "//div[@class='formBox']/div"
            )
            if "No Results" in e.text:
                # Do not continue to try
                logging.info(f"{namespace}: No Results found for {item_number}")
                return False
            items = self.selenium_driver.find_elements(By.ID, "product.item_id")
            items[0].click()
            return True
        except (NoSuchElementException, TimeoutException, IndexError):
            tries += 1
            if tries < self.timeout:
                # return the retry's result rather than falling through to None
                return self.load_item_page(item_number, tries)
            else:
                logging.info(f"{namespace}: failed item search for {item_number}")
                return False

    def scrape_description(self):
        try:
            self.delay(1)
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "desc"))
            )
            span = elem.find_element(By.CLASS_NAME, "short-comments")
            description = span.text
        except (NoSuchElementException, TimeoutException):
            description = ""

        return description

    def scrape_item_image_urls(self):
        namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}"

        urls = []
        try:
            self.delay(1)
            # GJ appears to only have single cover images
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "cover"))
            )
            img = elem.find_element(By.TAG_NAME, "img")
            src = img.get_attribute("src")
            if src:
                urls.append(src)
        except (NoSuchElementException, TimeoutException) as e:
            logging.warning(f"{namespace}: error {e}")
        return urls

    def load_login_page(self):
        # Load search page while logged out in an attempt to get the
        # Publishers list to populate when the page is loaded after login.
        self.selenium_driver.get(self.base_url + "/search")
        self.delay(self.timeout)
        login = "/login"
        url = self.base_url + login
        self.selenium_driver.get(url)

    def login(self):
        namespace = f"{type(self).__name__}.{self.login.__name__}"

        self.delay(2)
        input_text = Text(
            """
        ******** USER INPUT REQUIRED ********
        Locate the selenium controlled browser
        and manually enter your login credentials.
        ******** WAITING FOR USER INPUT ********
        """
        )
        input_text.stylize("bold cyan")
        console.print(input_text)
        # TODO: DRY this up. It is duplicated between scrapers
        #       Take element search pattern as argument.
        # TODO: vv (#163) move to CFG
        # wait up to 90 seconds for user to manually enter credentials
        # Verify by finding "a" with attribute "href"="/account"
        try:
            timeout_bak = self.timeout
            self.timeout = 90
            WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.XPATH, "//a[@href='/account']"))
            )
            self.timeout = timeout_bak
            success_text = Text(
                """
        ******** LOGIN SUCCESSFUL ********
        ******** CONTINUING EXECUTION ********
        """
            )
            success_text.stylize("green")
            console.print(success_text)
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to login")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e

    def add_to_cart(self, qty):
        # TODO: Can we DRY this up? Some duplication between scrapers
        namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}"

        self.delay(1)
        stock_elem = self.selenium_driver.find_element(By.CLASS_NAME, "on-hand")
        m = re.search(r"([0-9]+) in stock", stock_elem.text)
        if m:
            stock = m.group(1)
            if int(stock) < int(qty):
                qty = stock
        self.delay(1)
        try:
            # gather html elements needed
            add_div = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "add"))
            )
            qty_field = add_div.find_element(By.XPATH, "//input[@name='qty']")

            qty_field.clear()
            qty_field.send_keys(qty + SeleniumKeys.ENTER)
        except (NoSuchElementException, TimeoutException) as e:
            logging.warning(f"{namespace}: error {e}")
            return 0
        return int(qty)

    def load_cart_page(self):
        # TODO: Can we DRY this up? Some duplication between scrapers
        namespace = f"{type(self).__name__}.{self.load_cart_page.__name__}"
        try:
            cart = self.selenium_driver.find_element(By.CLASS_NAME, "cart")
            cart.click()
            self.delay(1)
            cart.click()
            self.delay(1)
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return False
        return True

    def scrape_error_msg(self):
        try:
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "errorMsg")
            msg = elem.text
        except NoSuchElementException:
            msg = ""
        return msg

    # }}}
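

# Hypothetical GJ ordering flow (illustrative only; nothing calls this).
# login() blocks until credentials are entered by hand in the browser, and
# the item code and quantity shown here are made up.
def _example_gj_flow():  # pragma: no cover
    driver = get_driver()
    try:
        scraper = GJScraper(driver)
        scraper.load_login_page()
        scraper.login()
        if scraper.load_item_page("12345"):
            if scraper.add_to_cart("2") and scraper.load_cart_page():
                logging.info("cart error msg: %r", scraper.scrape_error_msg())
    finally:
        driver.quit()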


class SDScraper(BaseScraper):
    # {{{
    """
    SDScraper objects know how to scrape SD item pages
    """

    def __init__(self, selenium_driver, base_url="https://strathearndistribution.com"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 3

    def load_login_page(self):
        namespace = f"{type(self).__name__}.{self.load_login_page.__name__}"
        try:
            self.selenium_driver.get(self.base_url)
            self.delay(2)
            button = self.selenium_driver.find_element(By.ID, "styled_btn")
            button.click()
        except (
            StaleElementReferenceException,
            NoSuchElementException,
            TimeoutException,
        ) as e:
            logging.error(f"{namespace}: failed to load login page")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e

    def login(self):
        namespace = f"{type(self).__name__}.{self.login.__name__}"
        input_text = Text(
            """
        ******** USER INPUT REQUIRED ********
        Locate the selenium controlled browser
        and manually enter your login credentials.
        ******** WAITING FOR USER INPUT ********
        """
        )
        input_text.stylize("bold cyan")
        console.print(input_text)
        # TODO: DRY this up. It is duplicated between scrapers
        # TODO: vv (#163) move to CFG
        # wait up to 90 seconds for user to manually enter credentials
        # Verify by finding "span" with the text "My lists"
        try:
            timeout_bak = self.timeout
            self.timeout = 90
            WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.XPATH, "//span[text()='My lists']"))
            )
            self.timeout = timeout_bak
            success_text = Text(
                """
        ******** LOGIN SUCCESSFUL ********
        ******** CONTINUING EXECUTION ********
        """
            )
            success_text.stylize("green")
            console.print(success_text)
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to login")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e

    def load_item_page(self, item_number, tries=0):
        namespace = f"{type(self).__name__}.{self.load_item_page.__name__}"
        try:
            self.selenium_driver.get(self.base_url)
            self.delay(2)
            search = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.ID, "search"))
            )
            search.send_keys(item_number + SeleniumKeys.ENTER)
            self.delay(2)
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "listItem"))
            )
            self.delay(2)
            elem.click()
            return True
        except (
            StaleElementReferenceException,
            NoSuchElementException,
            TimeoutException,
        ) as e:
            tries += 1
            if tries < self.timeout:
                # return the retry's result rather than falling through to None
                return self.load_item_page(item_number, tries)
            else:
                logging.warning(
                    f"{namespace}: Failed to load item page '{item_number}': {e}"
                )
                return False

    def scrape_description(self):
        try:
            # rc-* IDs are dynamic, must use classes
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "ant-tabs-nav-list")
            tab_btn = elem.find_element(By.CLASS_NAME, "ant-tabs-tab-btn")
            tab_btn.click()
            pane = self.selenium_driver.find_element(By.CLASS_NAME, "ant-tabs-tabpane")
            description = pane.text
        except NoSuchElementException:
            description = ""

        return description

    def scrape_item_image_urls(self):
        namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}"
        urls = []
        try:
            # main only
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "slick-current")
            img = elem.find_element(By.TAG_NAME, "img")
            src = img.get_attribute("src")
            if src:
                urls.append(src)
            # ensure we are seeing the top of the page
            html = self.selenium_driver.find_element(By.TAG_NAME, "html")
            html.send_keys(SeleniumKeys.PAGE_UP)
            elems = self.selenium_driver.find_elements(By.CLASS_NAME, "gallery-vert")
            for elem in elems:
                src = elem.get_attribute("src")
                if src:
                    urls.append(src)
        except NoSuchElementException as e:
            logging.warning(f"{namespace}: error {e}")
        return urls

    def add_to_cart(self, qty):
        namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}"

        self.delay(1)
        # TODO: consider guarding the stock lookup with a try block as well
        stock_elem = self.selenium_driver.find_element(
            By.XPATH, "//span[contains(text(), 'in stock')]"
        )
        m = re.search(r"([0-9]+) in stock", stock_elem.get_attribute("innerHTML"))
        if m:
            stock = m.group(1)
            if int(stock) < int(qty):
                qty = stock
        self.delay(1)
        try:
            # gather html elements needed
            elems = self.selenium_driver.find_elements(By.CLASS_NAME, "ant-btn-primary")
            button = None
            for e in elems:
                if "Add to cart" in e.text:
                    button = e
                    break
            qty_field = self.selenium_driver.find_element(
                By.XPATH,
                (
                    "//input[@class='ant-input' and @type='text' "
                    "and not(ancestor::div[contains(@class, '-block')])]"
                ),
            )
            # the qty field must be clicked to highlight amount. Clearing doesn't work
            qty_field.click()
            qty_field.send_keys(qty)
            button.click()
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return 0
        return int(qty)

    def load_cart_page(self):
        namespace = f"{type(self).__name__}.{self.load_cart_page.__name__}"
        try:
            cart = "/checkout/cart"
            url = self.base_url + cart
            self.selenium_driver.get(url)
            self.delay(1)
            return True
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return False

    # }}}


class TBScraper(BaseScraper):
    # {{{
    """
    TBScraper objects know how to scrape TB item pages
    """

    def __init__(self, selenium_driver, base_url="https://texasbookman.com/"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 3

    def load_item_page(self, item_number):
        start = "p/"
        url = self.base_url + start + item_number
        self.selenium_driver.get(url)
        return True

    def scrape_description(self):
        try:
            elem = self.selenium_driver.find_element(
                By.CLASS_NAME, "variant-description"
            )
            text = elem.text
            description = text.replace("NO AMAZON SALES\n\n", "")
        except NoSuchElementException:
            description = ""

        return description

    def scrape_item_image_urls(self):
        urls = []
        try:
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "a-left"))
            )
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "picture-thumbs")
            left = elem.find_element(By.CLASS_NAME, "a-left")
            left.click()
            while True:
                self.delay(2)
                thumb = self._get_thumb_from_slimbox()
                if thumb:
                    urls.append(thumb)
                next_link = WebDriverWait(self.selenium_driver, self.timeout).until(
                    ec.presence_of_element_located((By.ID, "lbNextLink"))
                )
                self.delay(2)
                next_link.click()
        except (
            NoSuchElementException,
            ElementNotInteractableException,
            TimeoutException,
        ):
            try:
                elem = self.selenium_driver.find_element(By.CLASS_NAME, "picture")
                img = elem.find_element(By.TAG_NAME, "img")
                thumb = img.get_attribute("src")
                urls.append(thumb)
            except NoSuchElementException:
                pass

        return urls

    def _get_thumb_from_slimbox(self):
        timeout = 3
        thumb = None
        try:
            img_div = WebDriverWait(self.selenium_driver, timeout).until(
                ec.presence_of_element_located((By.ID, "lbImage"))
            )
            style = img_div.get_attribute("style")
            m = re.search('"(.*)"', style)
            if m:
                thumb = m.group(1)
        except (NoSuchElementException, TimeoutException):
            pass

        return thumb
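
    # The regex above assumes Slimbox writes the full-size image URL into the
    # element's style attribute, e.g. (hypothetical value):
    #     background-image: url("https://texasbookman.com/images/sample.jpg")
    # in which case _get_thumb_from_slimbox() returns the double-quoted URL.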

    def load_login_page(self):
        login = "login"
        url = self.base_url + login
        self.selenium_driver.get(url)

    def login(self):
        namespace = f"{type(self).__name__}.{self.login.__name__}"

        self.delay(2)
        input_text = Text(
            """
        ******** USER INPUT REQUIRED ********
        Locate the selenium controlled browser
        and manually enter your login credentials.
        ******** WAITING FOR USER INPUT ********
        """
        )
        input_text.stylize("bold cyan")
        console.print(input_text)
        # TODO: DRY this up. It is duplicated between scrapers
        # TODO: vv (#163) move to CFG
        # wait up to 90 seconds for user to manually enter credentials
        # Verify by finding "a" with attribute "href"="/admin"
        try:
            timeout_bak = self.timeout
            self.timeout = 90
            WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.XPATH, "//a[@href='/admin']"))
            )
            self.timeout = timeout_bak
            success_text = Text(
                """
        ******** LOGIN SUCCESSFUL ********
        ******** CONTINUING EXECUTION ********
        """
            )
            success_text.stylize("green")
            console.print(success_text)
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to login")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e

    def impersonate(self, email):
        namespace = f"{type(self).__name__}.{self.impersonate.__name__}"

        # Go to /Admin/Customer/List
        customers = "/Admin/Customer/List"
        url = self.base_url + customers
        self.selenium_driver.get(url)
        self.delay(1)
        try:
            # search for email
            search_email = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.ID, "SearchEmail"))
            )
            search_email.clear()
            search_email.send_keys(email + SeleniumKeys.ENTER)
            # Get customer link associated with email
            email_xpath = (
                f"//div[@id='customers-grid']/table/tbody/tr/td/a[text()='{email}']"
            )
            customer_link = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.XPATH, email_xpath))
            )
            links = self.selenium_driver.find_elements(By.XPATH, email_xpath)
            # Bail if multiple customer records for given email.
            if len(links) > 1:
                logging.error(
                    f"{namespace}: Found multiple customer records for email "
                    f"'{email}' to impersonate"
                )
                logging.error(f"{namespace}: Cannot proceed. Exiting.")
                raise Exception
            customer_link.click()
            # click "Place order (Impersonate)"
            impersonate = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located(
                    (By.XPATH, "//a[text()='Place order (Impersonate)']")
                )
            )
            impersonate.click()
            # click "Place Order" button
            button = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located(
                    (By.XPATH, "//input[@name='impersonate']")
                )
            )
            button.click()
            self.delay(1)
            WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "finish-impersonation"))
            )
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to impersonate")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e
        return True

    def add_to_cart(self, qty):
        namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}"

        qty = int(qty)
        self.delay(1)
        stock_elem = self.selenium_driver.find_element(By.CLASS_NAME, "stock")
        m = re.search(r"Availability: ([0-9]+) in stock", stock_elem.text)
        if m:
            stock = m.group(1)
            stock = int(stock)
            if stock < qty:
                qty = stock
        try:
            # gather html elements needed
            qty_field = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "qty-input"))
            )
            button = self.selenium_driver.find_element(
                By.CLASS_NAME, "add-to-cart-button"
            )
            qty_field.clear()
            # ENTERing out of the qty_field DOES NOT add to cart.
            # The button must be clicked instead.
            qty_field.send_keys(qty)
            button.click()
            self.delay(1)
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return 0
        return qty

    def load_cart_page(self):
        cart = "cart"
        url = self.base_url + cart
        self.selenium_driver.get(url)
        return True

    def search_item_num(self, search):
        namespace = f"{type(self).__name__}.{self.search_item_num.__name__}"

        item_num = ""
        search = urllib.parse.quote_plus(search)
        url = self.base_url + "search?q=" + search
        self.selenium_driver.get(url)
        self.delay(2)
        timeout_bak = self.timeout
        self.timeout = 120
        WebDriverWait(self.selenium_driver, self.timeout).until(
            ec.presence_of_element_located((By.CLASS_NAME, "search-results"))
        )
        self.timeout = timeout_bak
        links = self.selenium_driver.find_elements(
            By.XPATH, "//div[@class='search-results']//a[contains(@href, '/p/')]"
        )
        if links:
            item_url = links[0].get_attribute("href")
            m = re.search(r"\/p\/([0-9]+)\/", item_url)
            if m:
                item_num = m.group(1)
        else:
            logging.warning(f"{namespace}: Failed to find item using q='{search}'")
        return item_num

    # }}}
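

# Hypothetical TB admin flow (illustrative only; nothing calls this). login()
# blocks until credentials are entered by hand; the email address and search
# title are made up.
def _example_tb_impersonate_flow():  # pragma: no cover
    driver = get_driver()
    try:
        scraper = TBScraper(driver)
        scraper.load_login_page()
        scraper.login()
        scraper.impersonate("customer@example.com")
        item_num = scraper.search_item_num("A Sample Title")
        if item_num and scraper.load_item_page(item_num):
            scraper.add_to_cart("1")
            scraper.load_cart_page()
    finally:
        driver.quit()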


class AmznScraper(BaseScraper):
    # {{{
    """
    AmznScraper objects know how to scrape amazon item pages
    """

    def __init__(self, selenium_driver, base_url="https://www.amazon.com/"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 1

    def solve_captcha(self):
        from amazoncaptcha import AmazonCaptcha

        self.selenium_driver.get("https://www.amazon.com/errors/validateCaptcha")
        try:
            captcha = AmazonCaptcha.fromdriver(self.selenium_driver)
            solution = captcha.solve()
            elem = self.selenium_driver.find_element(By.ID, "captchacharacters")
            elem.send_keys(solution + SeleniumKeys.ENTER)
            return True
        except (NoSuchElementException, TimeoutException):
            return False

    def load_item_page(self, item_number):
        start = "dp/"
        url = self.base_url + start + item_number
        self.selenium_driver.get(url)
        return True

    def scrape_description(self):
        description = self._scrape_amazon_editorial_review()
        if not description:
            description = self._scrape_amazon_description()

        return description

    def _scrape_amazon_editorial_review(self):
        descr = ""
        try:
            elem = self.selenium_driver.find_element(
                By.ID, "editorialReviews_feature_div"
            )
            text = elem.text
            descr_lines = re.split("^.*\\n.*\\n", text)  # trim off first two lines
            descr = descr_lines[-1]
        except NoSuchElementException:
            descr = ""

        return descr

    def _scrape_amazon_description(self):
        descr = ""
        try:
            elem = self.selenium_driver.find_element(
                By.ID, "bookDescription_feature_div"
            )
            # read_more = elem.find_element(By.CLASS_NAME, 'a-expander-prompt')
            # read_more.click()
            descr = elem.text
        except NoSuchElementException:
            descr = ""

        return descr

    def get_span_type_thumb_id_prefix(self):
        """Get span_type and thumb_id_prefix from amazon images widget."""
        namespace = (
            f"{type(self).__name__}.{self.get_span_type_thumb_id_prefix.__name__}"
        )
        span_type = None
        thumb_id_prefix = None
        try:
            span = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.ID, "imgThumbs"))
            )
            span_type = "imgThumbs"
        except (NoSuchElementException, TimeoutException):
            logging.info(f"{namespace}: No imgThumbs id, trying imgTagWrapperId")
            try:
                span = WebDriverWait(self.selenium_driver, self.timeout).until(
                    ec.presence_of_element_located((By.ID, "imgTagWrapperId"))
                )
                span_type = "imgTagWrapperId"
            except (NoSuchElementException, TimeoutException):
                logging.info(f"{namespace}: No imgTagWrapperId id")
                logging.info(f"{namespace}: Returning empty urls list")
                return (span_type, thumb_id_prefix)

        if span_type == "imgThumbs":
            link = span.find_element(By.CLASS_NAME, "a-link-normal")
            thumb_id_prefix = "ig-thumb-"
        else:
            link = span
            thumb_id_prefix = "ivImage_"
        try:
            link.click()
        except ElementClickInterceptedException:
            logging.info(f"{namespace}: Failed to click images widget")
            logging.info(f"{namespace}: Returning empty urls list")
            return (span_type, thumb_id_prefix)
        return (span_type, thumb_id_prefix)

    def scrape_item_image_urls(self):
        namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}"
        counter = 0
        urls = []

        span_type, thumb_id_prefix = self.get_span_type_thumb_id_prefix()
        if thumb_id_prefix:
            logging.debug(f"{namespace}: Clicked images widget")
            # get image urls
            while True:
                try:
                    thumb = ""
                    xpath = f"//*[@id='{thumb_id_prefix}{counter}']"
                    elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                        ec.presence_of_element_located((By.XPATH, xpath))
                    )
                    if span_type == "imgThumbs":
                        thumb = elem.get_attribute("src")
                    if span_type == "imgTagWrapperId":
                        inner_elem = elem.find_element(By.CLASS_NAME, "ivThumbImage")
                        style = inner_elem.get_attribute("style")
                        m = re.search('"(.*)"', style)
                        if m:
                            thumb = m.group(1)
                    sub, suff = os.path.splitext(thumb)
                    indx = sub.find("._")
                    url = sub[:indx] + suff
                    if url:
                        urls.append(url)
                    logging.debug(f"{namespace}: Thumbnail src is {thumb}")
                    logging.debug(f"{namespace}: Full size URL is {url!r}")
                    counter += 1
                except (NoSuchElementException, TimeoutException):
                    break
        # Amazon appends a "human holding book" image; drop the last entry.
        if len(urls) > 1:
            urls.pop()

        return urls

    # }}}
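

# A standalone sketch of the thumbnail-to-full-size rewrite performed in
# AmznScraper.scrape_item_image_urls(); the sample URL is hypothetical but
# follows Amazon's "._<size modifiers>_" thumbnail convention.
def _example_full_size_url():  # pragma: no cover
    thumb = "https://m.media-amazon.com/images/I/41abcDEFgh._SX38_SY50_.jpg"
    sub, suff = os.path.splitext(thumb)
    indx = sub.find("._")
    # -> "https://m.media-amazon.com/images/I/41abcDEFgh.jpg"
    return sub[:indx] + suff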


##############################################################################
# utility functions
##############################################################################
def get_headless_driver():
    return get_driver("--headless=new")


def get_driver(option_args: str = ""):
    """Creates a new instance of the chrome driver.

    :param option_args:
        Option arguments to pass to the driver
    :returns: selenium.webdriver object
    """
    namespace = f"{MODULE}.{get_driver.__name__}"
    service = ChromeService()
    options = webdriver.ChromeOptions()
    if option_args:
        options.add_argument(option_args)
        logging.info(f"{namespace}: Setting webdriver option to '{option_args}'.")
    driver = webdriver.Chrome(service=service, options=options)
    return driver
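

# Usage sketch (illustrative only): main() below selects headless mode from
# CFG, but a caller can request it directly.
def _example_get_driver():  # pragma: no cover
    driver = get_headless_driver()  # equivalent to get_driver("--headless=new")
    try:
        driver.get("https://example.com")
    finally:
        driver.quit()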


def scrape_item(scrapr, item_id, description="", image_urls=None):
    if image_urls is None:
        image_urls = []
    namespace = f"{MODULE}.{scrape_item.__name__}"
    scrapr.load_item_page(item_id)
    logging.info(
        f"{namespace}: Getting item image urls via {scrapr.__class__.__name__}"
    )
    l_image_urls = scrapr.scrape_item_image_urls()
    if image_urls and len(l_image_urls) > 1:
        l_image_urls.pop(0)
    image_urls = image_urls + l_image_urls
    logging.info("  URLs: %r" % image_urls)
    if image_urls and not description:
        logging.info(
            f"{namespace}: Getting description via {scrapr.__class__.__name__}"
        )
        description = scrapr.scrape_description()
        logging.info("  Description: %r" % description[:140])
    return description, image_urls
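

# Illustrative use of scrape_item() with the Amazon scraper (hypothetical
# ISBN-10; assumes Chrome is available and the item page resolves).
def _example_scrape_item():  # pragma: no cover
    driver = get_driver()
    try:
        scrapr = AmznScraper(driver)
        description, image_urls = scrape_item(scrapr, "0000000000")
        return description, image_urls
    finally:
        driver.quit()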


def get_failover_scraper_item_id(driver, vendor_code, item):
    namespace = f"{MODULE}.{get_failover_scraper_item_id.__name__}"
    failover_scrapr = None
    item_id = item.isbn
    # TODO: vvvvvv (#163) add to CFG["asg"]["vendors"]
    # { code = "sample", name = "MyVendor",
    #   isbn_key = "ISBN", "2nd_scraper" = "TBScraper", },
    if vendor_code == "tb":
        try:
            url = item.data["LINK"]
            m = re.search(r"\/([0-9]+)\/", url)
            if m:
                item_id = m.group(1)
                failover_scrapr = TBScraper(driver)
        except KeyError:
            logging.error(f"{namespace}: No link found in item")
    if vendor_code == "sd":
        failover_scrapr = SDScraper(driver)
    if vendor_code == "gj":
        failover_scrapr = GJScraper(driver)
    return failover_scrapr, item_id


def main(vendor_code, sheet_id, worksheet, scraped_items_db):  # noqa: C901
    # {{{
    namespace = f"{MODULE}.{main.__name__}"
    # get vendor info from database
    logging.debug(f"{namespace}: Instantiate vendor.")
    vendr = vendor.Vendor(vendor_code)
    vendr.set_vendor_data()

    sheet_data = spreadsheet.get_sheet_data(sheet_id, worksheet)

    sheet_keys = sheet_data.pop(0)
    items_obj = Items(sheet_keys, sheet_data, vendr.isbn_key)
    items_obj.load_scraped_data(scraped_items_db)
    driver = None
    prime_scrapr = None
    failover_scrapr = None
    for item in items_obj:
        if not item.isbn:
            if "TBCODE" in item.data:
                item.isbn = item.data["TBCODE"]
            if not item.isbn:
                logging.info(f"{namespace}: No isbn for item, skipping lookup")
                continue
        description = ""
        image_urls = []
        # if scraped_item image_urls is not empty:
        #     skip scraped_item
        logging.info(f"{namespace}: Searching for {item.isbn} ...")
        if item.image_urls != []:
            logging.info(f"{namespace}: {item.isbn} found in database, skipping")
            continue

        if not driver and not prime_scrapr:
            logging.info(f"{namespace}: Opening browser...")
            if CFG["asg"]["scraper"]["headless"]:
                driver = get_headless_driver()
            else:
                driver = get_driver()
            prime_scrapr = AmznScraper(driver)
            prime_scrapr.solve_captcha()

        logging.info(f"{namespace}: No scraped data currently: {item.isbn}")
        description, image_urls = scrape_item(
            prime_scrapr, item.isbn10, description, image_urls
        )
        if len(image_urls) < IMG_FAILOVER_THRESHHOLD:
            failover_scrapr, item_id = get_failover_scraper_item_id(
                driver, vendr.vendor_code, item
            )
            if failover_scrapr:
                description, image_urls = scrape_item(
                    failover_scrapr, item_id, description, image_urls
                )

        item.data["DESCRIPTION"] = description
        item.image_urls = image_urls

    logging.info(f"{namespace}: Saving scraped item data")
    items_obj.save_scraped_data(scraped_items_db)
    if driver:
        logging.info(f"{namespace}: Closing browser...")
        driver.quit()
    # }}}
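

# Example invocation (hypothetical arguments): scrape the "gj" vendor's sheet
# and persist results to a local scraped-items database file.
#
#     main("gj", "1AbCdEfSheetId", "Sheet1", "scraped_items.json")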