Coverage for src/artemis_sg/scraper.py: 71%
599 statements
# -*- coding: utf-8 -*-

import logging
import os.path
import re
import time  # for additional sleeps in page load. This is a smell.
import urllib.parse

# Selenium
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    ElementNotInteractableException,
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)

# Chrome
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys as SeleniumKeys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import artemis_sg.spreadsheet as spreadsheet
import artemis_sg.vendor as vendor
from artemis_sg.config import CFG
from artemis_sg.items import Items

# Firefox
# from selenium.webdriver.firefox.service import Service as FirefoxService

MODULE = os.path.splitext(os.path.basename(__file__))[0]

class BaseScraper:
    # {{{
    """
    Scraper objects know how to scrape a base URL.
    """

    def __init__(self, selenium_driver, base_url=None):
        self.selenium_driver = selenium_driver
        if not base_url:
            self.base_url = ""
        else:
            self.base_url = base_url

    def load_item_page(self, item_number):
        return False

    def scrape_description(self):
        description = ""
        return description

    def scrape_item_image_urls(self):
        urls = []
        return urls

    def delay(self, secs):
        time.sleep(secs)

    # }}}
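
# A minimal usage sketch of the scraper interface (the driver value and the
# ISBN below are assumptions for illustration; see get_driver() and main()
# further down for the real wiring):
#
#     driver = get_driver(headless=False)
#     scraper = BaseScraper(driver, "https://example.com")
#     if scraper.load_item_page("9780000000000"):
#         description = scraper.scrape_description()
#         urls = scraper.scrape_item_image_urls()
#     driver.quit()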

class GJScraper(BaseScraper):
    # {{{
    """
    GJScraper objects know how to scrape GJ item pages.
    """

    def __init__(self, selenium_driver, base_url="https://greatjonesbooks.com"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 3

    def load_item_page(self, item_number, tries=0):
        namespace = f"{type(self).__name__}.{self.load_item_page.__name__}"

        # GJ does not maintain the session if the links on the page are not used.
        # If not logged in, build the URL directly; else use the search facility.
        try:
            self.delay(1)
            WebDriverWait(self.selenium_driver, 1).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//a[@href='/account' and text()='Account Summary']")
                )
            )
        except (NoSuchElementException, TimeoutException):
            start = "/product/"
            url = self.base_url + start + item_number
            self.selenium_driver.get(url)
            return True
        try:
            search = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.XPATH, "//a[@href='/search']"))
            )
            search.click()
            self.delay(2)

            # wait until the Publisher list is populated
            WebDriverWait(self.selenium_driver, 60).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//option[@value='Abbeville']")
                )
            )
            # then get the itemCode field for the search
            item_field = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.XPATH, "//input[@name='itemCode']"))
            )
            search_button = self.selenium_driver.find_element(
                By.CSS_SELECTOR, ".buttonSet > button:nth-child(1)"
            )
            clear_button = self.selenium_driver.find_element(
                By.CSS_SELECTOR, ".buttonSet > button:nth-child(2)"
            )
            clear_button.click()
            item_field.send_keys(item_number)
            self.delay(2)
            search_button.click()
            self.delay(2)
            # check for No Results
            e = self.selenium_driver.find_element(
                By.XPATH, "//div[@class='formBox']/div"
            )
            if "No Results" in e.text:
                # Do not continue to try
                logging.info(f"{namespace}: No Results found for {item_number}")
                return False
            items = self.selenium_driver.find_elements(By.ID, "product.item_id")
            items[0].click()
            return True
        except (NoSuchElementException, TimeoutException, IndexError):
            tries += 1
            if tries < self.timeout:
                # return the retry's result so callers see the final outcome
                return self.load_item_page(item_number, tries)
            logging.info(f"{namespace}: failed item search for {item_number}")
            return False
    def scrape_description(self):
        try:
            self.delay(1)
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "desc"))
            )
            span = elem.find_element(By.CLASS_NAME, "short-comments")
            description = span.text
        except NoSuchElementException:
            description = ""

        return description

    def scrape_item_image_urls(self):
        namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}"

        urls = []
        try:
            self.delay(1)
            # GJ appears to only have single cover images
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "cover"))
            )
            img = elem.find_element(By.TAG_NAME, "img")
            src = img.get_attribute("src")
            if src:
                urls.append(src)
        except NoSuchElementException as e:
            logging.warning(f"{namespace}: error {e}")
        return urls

    def load_login_page(self):
        # Load the search page while logged out in an attempt to get the
        # Publishers list to populate when the page is loaded after login.
        self.selenium_driver.get(self.base_url + "/search")
        self.delay(self.timeout)
        login = "/login"
        url = self.base_url + login
        self.selenium_driver.get(url)

    def login(self):
        namespace = f"{type(self).__name__}.{self.login.__name__}"

        self.delay(2)
        print("******** USER INPUT REQUIRED ********")
        print("Locate the selenium controlled browser")
        print("and manually enter your login credentials.")
        print("******** WAITING FOR USER INPUT ********")
        # Wait up to 90 seconds for the user to manually enter credentials.
        # Verify by finding an "a" element with the attribute href="/account".
        try:
            WebDriverWait(self.selenium_driver, 90).until(
                EC.presence_of_element_located((By.XPATH, "//a[@href='/account']"))
            )
            print("******** LOGIN SUCCESSFUL ********")
            print("******** CONTINUING EXECUTION ********")
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to login")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e

    def add_to_cart(self, qty):
        namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}"

        self.delay(1)
        stock_elem = self.selenium_driver.find_element(By.CLASS_NAME, "on-hand")
        m = re.search(r"([0-9]+) in stock", stock_elem.text)
        if m:
            stock = m.group(1)
            if int(stock) < int(qty):
                qty = stock
        self.delay(1)
        try:
            # gather the html elements needed
            add_div = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "add"))
            )
            qty_field = add_div.find_element(By.XPATH, "//input[@name='qty']")

            qty_field.clear()
            # cast qty to str: an int here would raise TypeError on concatenation
            qty_field.send_keys(str(qty) + SeleniumKeys.ENTER)
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return 0
        return int(qty)

    def load_cart_page(self):
        namespace = f"{type(self).__name__}.{self.load_cart_page.__name__}"
        try:
            cart = self.selenium_driver.find_element(By.CLASS_NAME, "cart")
            cart.click()
            self.delay(1)
            cart.click()
            self.delay(1)
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return False
        return True

    def scrape_error_msg(self):
        try:
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "errorMsg")
            msg = elem.text
        except NoSuchElementException:
            msg = ""
        return msg

    # }}}
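
# A usage sketch of the GJScraper cart workflow (the item number and quantity
# below are hypothetical; login() blocks while credentials are entered by hand):
#
#     gj = GJScraper(get_driver())
#     gj.load_login_page()
#     gj.login()
#     if gj.load_item_page("123456"):
#         added = gj.add_to_cart("2")
#         gj.load_cart_page()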

class SDScraper(BaseScraper):
    # {{{
    """
    SDScraper objects know how to scrape SD item pages.
    """

    def __init__(self, selenium_driver, base_url="https://strathearndistribution.com"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 3

    def load_login_page(self):
        self.selenium_driver.get(self.base_url)
        self.delay(2)
        button = self.selenium_driver.find_element(
            By.CSS_SELECTOR, ".ant-col:nth-child(4) span:nth-child(2)"
        )
        button.click()

    def login(self):
        namespace = f"{type(self).__name__}.{self.login.__name__}"
        print("******** USER INPUT REQUIRED ********")
        print("Locate the selenium controlled browser")
        print("and manually enter your login credentials.")
        print("******** WAITING FOR USER INPUT ********")
        # Wait up to 90 seconds for the user to manually enter credentials.
        # Verify by finding a "span" element with the text "My lists".
        try:
            WebDriverWait(self.selenium_driver, 90).until(
                EC.presence_of_element_located((By.XPATH, "//span[text()='My lists']"))
            )
            print("******** LOGIN SUCCESSFUL ********")
            print("******** CONTINUING EXECUTION ********")
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to login")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e

    def load_item_page(self, item_number, tries=0):
        namespace = f"{type(self).__name__}.{self.load_item_page.__name__}"
        try:
            self.selenium_driver.get(self.base_url)
            self.delay(2)
            search = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.ID, "search"))
            )
            search.send_keys(item_number + SeleniumKeys.ENTER)
            self.delay(2)
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "listItem"))
            )
            self.delay(2)
            elem.click()
            return True
        except (
            StaleElementReferenceException,
            NoSuchElementException,
            TimeoutException,
        ) as e:
            tries += 1
            if tries < self.timeout:
                # return the retry's result so callers see the final outcome
                return self.load_item_page(item_number, tries)
            logging.warning(
                f"{namespace}: Failed to load item page '{item_number}': {e}"
            )
            return False

    def scrape_description(self):
        try:
            # rc-* IDs are dynamic; classes must be used instead
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "ant-tabs-nav-list")
            tab_btn = elem.find_element(By.CLASS_NAME, "ant-tabs-tab-btn")
            tab_btn.click()
            pane = self.selenium_driver.find_element(By.CLASS_NAME, "ant-tabs-tabpane")
            description = pane.text
        except NoSuchElementException:
            description = ""

        return description
    def scrape_item_image_urls(self):
        namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}"
        urls = []
        try:
            # main only
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "slick-current")
            img = elem.find_element(By.TAG_NAME, "img")
            src = img.get_attribute("src")
            if src:
                urls.append(src)
            # ensure we are seeing the top of the page
            html = self.selenium_driver.find_element(By.TAG_NAME, "html")
            html.send_keys(SeleniumKeys.PAGE_UP)
            elems = self.selenium_driver.find_elements(By.CLASS_NAME, "gallery-vert")
            for elem in elems:
                src = elem.get_attribute("src")
                if src:
                    urls.append(src)
        except NoSuchElementException as e:
            logging.warning(f"{namespace}: error {e}")
        return urls
    def add_to_cart(self, qty):
        namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}"

        self.delay(1)
        # TODO: consider wrapping the stock lookup in a try/except as well
        stock_elem = self.selenium_driver.find_element(
            By.XPATH, "//span[contains(text(), 'in stock')]"
        )
        m = re.search(r"([0-9]+) in stock", stock_elem.get_attribute("innerHTML"))
        if m:
            stock = m.group(1)
            if int(stock) < int(qty):
                qty = stock
        self.delay(1)
        try:
            # gather the html elements needed
            elems = self.selenium_driver.find_elements(By.CLASS_NAME, "ant-btn-primary")
            button = None
            for e in elems:
                if "Add to cart" in e.text:
                    button = e
                    break
            qty_field = self.selenium_driver.find_element(
                By.XPATH,
                (
                    "//input[@class='ant-input' and @type='text' "
                    "and not(ancestor::div[contains(@class, '-block')])]"
                ),
            )
            # The qty field must be clicked to highlight the amount; clearing
            # it does not work.
            qty_field.click()
            qty_field.send_keys(qty)
            button.click()
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return 0
        return int(qty)

    def load_cart_page(self):
        namespace = f"{type(self).__name__}.{self.load_cart_page.__name__}"
        try:
            cart = "/checkout/cart"
            url = self.base_url + cart
            self.selenium_driver.get(url)
            self.delay(1)
            return True
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return False

    # }}}

class TBScraper(BaseScraper):
    # {{{
    """
    TBScraper objects know how to scrape TB item pages.
    """

    def __init__(self, selenium_driver, base_url="https://texasbookman.com/"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 3

    def load_item_page(self, item_number):
        start = "p/"
        url = self.base_url + start + item_number
        self.selenium_driver.get(url)
        return True

    def scrape_description(self):
        try:
            elem = self.selenium_driver.find_element(
                By.CLASS_NAME, "variant-description"
            )
            text = elem.text
            description = text.replace("NO AMAZON SALES\n\n", "")
        except NoSuchElementException:
            description = ""

        return description

    def scrape_item_image_urls(self):
        urls = []
        try:
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "a-left"))
            )
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "picture-thumbs")
            left = elem.find_element(By.CLASS_NAME, "a-left")
            left.click()
            while True:
                self.delay(2)
                thumb = self._get_thumb_from_slimbox()
                if thumb:
                    urls.append(thumb)
                next_link = WebDriverWait(self.selenium_driver, self.timeout).until(
                    EC.presence_of_element_located((By.ID, "lbNextLink"))
                )
                self.delay(2)
                next_link.click()
        except (
            NoSuchElementException,
            ElementNotInteractableException,
            TimeoutException,
        ):
            try:
                elem = self.selenium_driver.find_element(By.CLASS_NAME, "picture")
                img = elem.find_element(By.TAG_NAME, "img")
                thumb = img.get_attribute("src")
                urls.append(thumb)
            except NoSuchElementException:
                pass

        return urls

    def _get_thumb_from_slimbox(self):
        timeout = 3
        thumb = None
        try:
            img_div = WebDriverWait(self.selenium_driver, timeout).until(
                EC.presence_of_element_located((By.ID, "lbImage"))
            )
            style = img_div.get_attribute("style")
            m = re.search('"(.*)"', style)
            if m:
                thumb = m.group(1)
        except (NoSuchElementException, TimeoutException):
            pass

        return thumb
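
    # A sketch of the style value _get_thumb_from_slimbox() parses (the URL
    # below is a hypothetical example). Slimbox sets an inline style such as
    #
    #     background-image: url("https://texasbookman.com/images/foo.jpg");
    #
    # and the '"(.*)"' pattern captures the quoted URL as the thumb value.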
    def load_login_page(self):
        login = "login"
        url = self.base_url + login
        self.selenium_driver.get(url)

    def login(self):
        namespace = f"{type(self).__name__}.{self.login.__name__}"

        self.delay(2)
        print("******** USER INPUT REQUIRED ********")
        print("Locate the selenium controlled browser")
        print("and manually enter your login credentials.")
        print("******** WAITING FOR USER INPUT ********")
        # Wait up to 90 seconds for the user to manually enter credentials.
        # Verify by finding an "a" element with the attribute href="/admin".
        try:
            WebDriverWait(self.selenium_driver, 90).until(
                EC.presence_of_element_located((By.XPATH, "//a[@href='/admin']"))
            )
            print("******** LOGIN SUCCESSFUL ********")
            print("******** CONTINUING EXECUTION ********")
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to login")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e

    def impersonate(self, email):
        namespace = f"{type(self).__name__}.{self.impersonate.__name__}"

        # Go to /Admin/Customer/List
        customers = "/Admin/Customer/List"
        url = self.base_url + customers
        self.selenium_driver.get(url)
        self.delay(1)
        try:
            # search for the email
            search_email = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.ID, "SearchEmail"))
            )
            search_email.clear()
            search_email.send_keys(email + SeleniumKeys.ENTER)
            # Get the customer link associated with the email
            email_xpath = (
                "//div[@id='customers-grid']/table/tbody/tr/td/a[text()='{email}']"
            ).format(email=email)
            customer_link = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.XPATH, email_xpath))
            )
            links = self.selenium_driver.find_elements(By.XPATH, email_xpath)
            # Bail if there are multiple customer records for the given email.
            if len(links) > 1:
                logging.error(
                    (
                        "{namespace}: Found multiple customer records for email "
                        "'{email}' to impersonate"
                    ).format(namespace=namespace, email=email)
                )
                logging.error(f"{namespace}: Cannot proceed. Exiting.")
                raise Exception
            customer_link.click()
            # click "Place order (Impersonate)"
            impersonate = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//a[text()='Place order (Impersonate)']")
                )
            )
            impersonate.click()
            # click the "Place Order" button
            button = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//input[@name='impersonate']")
                )
            )
            button.click()
            self.delay(1)
            WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "finish-impersonation"))
            )
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to impersonate")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e
        return True
    def add_to_cart(self, qty):
        namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}"

        qty = int(qty)
        self.delay(1)
        stock_elem = self.selenium_driver.find_element(By.CLASS_NAME, "stock")
        m = re.search(r"Availability: ([0-9]+) in stock", stock_elem.text)
        if m:
            stock = m.group(1)
            stock = int(stock)
            if stock < qty:
                qty = stock
        try:
            # gather the html elements needed
            qty_field = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "qty-input"))
            )
            button = self.selenium_driver.find_element(
                By.CLASS_NAME, "add-to-cart-button"
            )
            qty_field.clear()
            # ENTERing out of the qty_field DOES NOT add to cart.
            # The button must be clicked instead.
            qty_field.send_keys(qty)
            button.click()
            self.delay(1)
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return 0
        return qty

    def load_cart_page(self):
        cart = "cart"
        url = self.base_url + cart
        self.selenium_driver.get(url)
        return True
    def search_item_num(self, search):
        namespace = f"{type(self).__name__}.{self.search_item_num.__name__}"

        item_num = ""
        search = urllib.parse.quote_plus(search)
        url = self.base_url + "search?q=" + search
        self.selenium_driver.get(url)
        self.delay(2)
        WebDriverWait(self.selenium_driver, 120).until(
            EC.presence_of_element_located((By.CLASS_NAME, "search-results"))
        )
        links = self.selenium_driver.find_elements(
            By.XPATH, "//div[@class='search-results']//a[contains(@href, '/p/')]"
        )
        if links:
            item_url = links[0].get_attribute("href")
            m = re.search(r"\/p\/([0-9]+)\/", item_url)
            if m:
                item_num = m.group(1)
        else:
            logging.warning(f"{namespace}: Failed to find item using q='{search}'")
        return item_num
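
    # A sketch of the URL shape search_item_num() expects (hypothetical values):
    # a result link such as https://texasbookman.com/p/12345/some-title/ yields
    # item_num == "12345" via the r"\/p\/([0-9]+)\/" pattern.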
    # }}}


class AmznScraper(BaseScraper):
    # {{{
    """
    AmznScraper objects know how to scrape Amazon item pages.
    """

    def __init__(self, selenium_driver, base_url="https://www.amazon.com/"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 1

    def load_item_page(self, item_number):
        start = "dp/"
        url = self.base_url + start + item_number
        self.selenium_driver.get(url)
        return True

    def scrape_description(self):
        description = self._scrape_amazon_editorial_review()
        if not description:
            description = self._scrape_amazon_description()

        return description
    def _scrape_amazon_editorial_review(self):
        descr = ""
        try:
            elem = self.selenium_driver.find_element(
                By.ID, "editorialReviews_feature_div"
            )
            text = elem.text
            descr_lines = re.split("^.*\\n.*\\n", text)  # trim off the first two lines
            descr = descr_lines[-1]
        except NoSuchElementException:
            descr = ""

        return descr

    def _scrape_amazon_description(self):
        descr = ""
        try:
            elem = self.selenium_driver.find_element(
                By.ID, "bookDescription_feature_div"
            )
            # read_more = elem.find_element(By.CLASS_NAME, 'a-expander-prompt')
            # read_more.click()
            descr = elem.text
        except NoSuchElementException:
            descr = ""

        return descr
    def scrape_item_image_urls(self):
        namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}"
        counter = 0
        urls = []

        # open the amazon images widget
        try:
            span = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.ID, "imgThumbs"))
            )
            span_type = "imgThumbs"
        except (NoSuchElementException, TimeoutException):
            logging.info(f"{namespace}: No imgThumbs id, trying imgTagWrapperId")
            try:
                span = WebDriverWait(self.selenium_driver, self.timeout).until(
                    EC.presence_of_element_located((By.ID, "imgTagWrapperId"))
                )
                span_type = "imgTagWrapperId"
            except (NoSuchElementException, TimeoutException):
                logging.info(f"{namespace}: No imgTagWrapperId id")
                logging.info(f"{namespace}: Returning empty urls list")
                return urls

        if span_type == "imgThumbs":
            link = span.find_element(By.CLASS_NAME, "a-link-normal")
            thumb_id_prefix = "ig-thumb-"
        else:
            link = span
            thumb_id_prefix = "ivImage_"
        try:
            link.click()
        except ElementClickInterceptedException:
            logging.info(f"{namespace}: Failed to click images widget")
            logging.info(f"{namespace}: Returning empty urls list")
            return urls

        logging.debug(f"{namespace}: Clicked images widget")
        # get the image urls
        while True:
            try:
                thumb = ""
                xpath = f"//*[@id='{thumb_id_prefix}{counter}']"
                elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                    EC.presence_of_element_located((By.XPATH, xpath))
                )
                if span_type == "imgThumbs":
                    thumb = elem.get_attribute("src")
                if span_type == "imgTagWrapperId":
                    inner_elem = elem.find_element(By.CLASS_NAME, "ivThumbImage")
                    style = inner_elem.get_attribute("style")
                    m = re.search('"(.*)"', style)
                    if m:
                        thumb = m.group(1)
                sub, suff = os.path.splitext(thumb)
                indx = sub.find("._")
                url = sub[:indx] + suff
                if url:
                    urls.append(url)
                logging.debug(f"{namespace}: Thumbnail src is {thumb}")
                logging.debug(f"{namespace}: Full size URL is {url!r}")
                counter += 1
            except (NoSuchElementException, TimeoutException):
                break

        # Amazon appends an image of a human holding the book;
        # drop it from the list.
        if len(urls) > 1:
            urls.pop()

        return urls
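
    # A sketch of the thumbnail-to-full-size rewrite above (hypothetical URL):
    # a thumb such as https://m.media-amazon.com/images/I/51abc._SX38_.jpg has
    # its "._SX38_" size suffix stripped, yielding
    # https://m.media-amazon.com/images/I/51abc.jpg as the full-size URL.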
    # }}}


##############################################################################
# utility functions
##############################################################################
def get_driver(headless: bool = False):
    """Creates a new instance of the Chrome driver.

    :param headless:
        Whether to configure the Chrome driver to be headless.
    :returns: selenium.webdriver object
    """
    namespace = f"{MODULE}.{get_driver.__name__}"
    service = ChromeService()
    options = webdriver.ChromeOptions()
    logging.debug(f"{namespace}: Received '{headless}' value for headless.")
    if headless is True:
        options.add_argument("--headless=new")
        logging.info(f"{namespace}: Setting webdriver option to 'HEADLESS'.")
    driver = webdriver.Chrome(service=service, options=options)
    return driver
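
# A minimal usage sketch for get_driver() (the ISBN-10 below is hypothetical):
#
#     driver = get_driver(headless=True)
#     scraper = AmznScraper(driver)
#     scraper.load_item_page("0123456789")
#     print(scraper.scrape_description())
#     driver.quit()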

def main(vendor_code, sheet_id, worksheet, scraped_items_db):
    # {{{
    namespace = f"{MODULE}.{main.__name__}"
    # get the vendor info from the database
    logging.debug(f"{namespace}: Instantiate vendor.")
    vendr = vendor.Vendor(vendor_code)
    vendr.set_vendor_data()

    sheet_data = spreadsheet.get_sheet_data(sheet_id, worksheet)

    sheet_keys = sheet_data.pop(0)
    items_obj = Items(sheet_keys, sheet_data, vendr.isbn_key)
    items_obj.load_scraped_data(scraped_items_db)
    driver = None
    scrapr = None
    tbscrapr = None
    sdscrapr = None
    gjscrapr = None
    for item in items_obj:
        if not item.isbn:
            if "TBCODE" in item.data:
                item.isbn = item.data["TBCODE"]
            if not item.isbn:
                logging.info(f"{namespace}: No isbn for item, skipping lookup")
                continue
        description = ""
        image_urls = []
        # if the scraped item's image_urls is not empty, skip the item
        logging.info(f"{namespace}: Searching for {item.isbn} ...")
        if item.image_urls != []:
            logging.info(f"{namespace}: {item.isbn} found in database, skipping")
            continue

        if not driver and not scrapr:
            logging.info(f"{namespace}: Opening browser...")
            driver = get_driver(CFG["asg"]["scraper"]["headless"])
            scrapr = AmznScraper(driver)
            if vendr.vendor_code == "tb":
                tbscrapr = TBScraper(driver)
            if vendr.vendor_code == "sd":
                sdscrapr = SDScraper(driver)
            if vendr.vendor_code == "gj":
                gjscrapr = GJScraper(driver)
        logging.info(f"{namespace}: No scraped data currently: {item.isbn}")
        scrapr.load_item_page(item.isbn10)
        logging.info(f"{namespace}: Getting item description")
        description = scrapr.scrape_description()
        logging.info(" Description: %r" % description[:140])
        item.data["DESCRIPTION"] = description
        logging.info(f"{namespace}: Getting item image urls")
        image_urls = scrapr.scrape_item_image_urls()
        logging.info(" URLs: %r" % image_urls)
        if tbscrapr and len(image_urls) < 2:
            logging.info(f"{namespace}: Getting item image urls via TBScraper")
            try:
                url = item.data["LINK"]
                m = re.search(r"\/([0-9]+)\/", url)
                if m:
                    web_item = m.group(1)
                    tbscrapr.load_item_page(web_item)
            except KeyError:
                logging.info(f"{namespace}: No link found in item")

            tb_image_urls = tbscrapr.scrape_item_image_urls()
            # If the primary scraper yielded any image, drop the secondary's
            # first image (the duplicate cover) before appending the rest.
            if image_urls and len(tb_image_urls) > 1:
                tb_image_urls.pop(0)
            image_urls = image_urls + tb_image_urls
            logging.info(" URLs: %r" % image_urls)
            if image_urls and not description:
                logging.info(f"{namespace}: Getting description via TBScraper")
                description = tbscrapr.scrape_description()
                logging.info(" Description: %r" % description[:140])
                item.data["DESCRIPTION"] = description
        if sdscrapr and len(image_urls) < 2:
            logging.info(f"{namespace}: Getting item image urls via SDScraper")
            sdscrapr.load_item_page(item.isbn)
            sd_image_urls = sdscrapr.scrape_item_image_urls()
            if image_urls and len(sd_image_urls) > 0:
                sd_image_urls.pop(0)
            image_urls = image_urls + sd_image_urls
            logging.info(" URLs: %r" % image_urls)
            if image_urls and not description:
                logging.info(f"{namespace}: Getting description via SDScraper")
                description = sdscrapr.scrape_description()
                logging.info(" Description: %r" % description[:140])
                item.data["DESCRIPTION"] = description
        if gjscrapr and len(image_urls) < 2:
            logging.info(f"{namespace}: Getting item image urls via GJScraper")
            gjscrapr.load_item_page(item.isbn)
            gj_image_urls = gjscrapr.scrape_item_image_urls()
            if image_urls and len(gj_image_urls) > 0:
                gj_image_urls.pop(0)
            image_urls = image_urls + gj_image_urls
            logging.info(" URLs: %r" % image_urls)
            if image_urls and not description:
                logging.info(f"{namespace}: Getting description via GJScraper")
                description = gjscrapr.scrape_description()
                logging.info(" Description: %r" % description[:140])
                item.data["DESCRIPTION"] = description

        item.image_urls = image_urls

        logging.info(f"{namespace}: Saving scraped item data")
        items_obj.save_scraped_data(scraped_items_db)
    if driver:
        logging.info(f"{namespace}: Closing browser...")
        driver.quit()
    # }}}
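
# A minimal invocation sketch for main() (all four argument values below are
# hypothetical; vendor codes come from the vendor database and sheet ids from
# Google Sheets):
#
#     main("gj", "1AbCdEfGhIjKlMnOp", "Sheet1", "scraped_items.json")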