Coverage for src/artemis_sg/scraper.py: 83%

623 statements  

coverage.py v7.3.1, created at 2023-10-12 17:31 -0700

1import logging 

2import os.path 

3import re 

4import time # for additional sleeps in page load. This is a smell. 

5import urllib.parse 

6 

7from rich.console import Console 

8from rich.text import Text 

9 

10# Selenium 

11from selenium import webdriver 

12from selenium.common.exceptions import ( 

13 ElementClickInterceptedException, 

14 ElementNotInteractableException, 

15 NoSuchElementException, 

16 StaleElementReferenceException, 

17 TimeoutException, 

18) 

19 

20# Chrome 

21from selenium.webdriver.chrome.service import Service as ChromeService 

22from selenium.webdriver.common.by import By 

23from selenium.webdriver.common.keys import Keys as SeleniumKeys 

24from selenium.webdriver.support import expected_conditions as ec 

25from selenium.webdriver.support.ui import WebDriverWait 

26 

27from artemis_sg import spreadsheet, vendor 

28from artemis_sg.config import CFG 

29from artemis_sg.items import Items 

30 

31# Firefox 

32# from selenium.webdriver.firefox.service import Service as FirefoxService 

33 

34MODULE = os.path.splitext(os.path.basename(__file__))[0] 

35console = Console() 

36 

37IMG_FAILOVER_THRESHHOLD = 2 

38 

39 

40class BaseScraper: 

41 # {{{ 

42 """ 

43 Scraper objects know how to scrape a base URL

44 """ 

45 

46 def __init__(self, selenium_driver, base_url=None): 

47 self.selenium_driver = selenium_driver 

48 if not base_url: 

49 self.base_url = "" 

50 else: 

51 self.base_url = base_url 

52 

53 def load_item_page(self, item_number): 

54 return False 

55 

56 def scrape_description(self): 

57 description = "" 

58 return description 

59 

60 def scrape_item_image_urls(self): 

61 urls = [] 

62 return urls 

63 

64 def delay(self, secs): 

65 time.sleep(secs) 

66 

67 # }}} 
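The vendor-specific scrapers below all follow this BaseScraper contract: load an item page, then pull image URLs and a description from it. A minimal sketch of that call sequence, assuming Chrome and a matching chromedriver are installed and using this module's get_driver() helper (defined near the end of the file); the item number is hypothetical:

driver = get_driver()
scraper = TBScraper(driver)          # any of the subclasses below works the same way
if scraper.load_item_page("12345"):  # hypothetical item number
    image_urls = scraper.scrape_item_image_urls()
    description = scraper.scrape_description()
driver.quit()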

68 

69 

70class GJScraper(BaseScraper): 

71 # {{{ 

72 """ 

73 GJScraper objects know how to scrape GJ item pages 

74 """ 

75 

76 def __init__(self, selenium_driver, base_url="https://greatjonesbooks.com"): 

77 super().__init__(selenium_driver, base_url) 

78 self.timeout = 3 

79 

80 def load_item_page(self, item_number, tries=0): 

81 namespace = f"{type(self).__name__}.{self.load_item_page.__name__}" 

82 

83 # GJ does not maintain the session unless the on-page links are used;

84 # if not logged in, build the item URL directly, otherwise use the search facility

85 try: 

86 self.delay(1) 

87 WebDriverWait(self.selenium_driver, self.timeout).until( 

88 ec.presence_of_element_located( 

89 (By.XPATH, "//a[@href='/account' and text()='Account Summary']") 

90 ) 

91 ) 

92 except (NoSuchElementException, TimeoutException): 

93 start = "/product/" 

94 url = self.base_url + start + item_number 

95 self.selenium_driver.get(url) 

96 return True 

97 try: 

98 search = WebDriverWait(self.selenium_driver, self.timeout).until( 

99 ec.presence_of_element_located((By.XPATH, "//a[@href='/search']")) 

100 ) 

101 search.click() 

102 self.delay(2) 

103 

104 # wait until Publisher list is populated 

105 timeout_bak = self.timeout 

106 self.timeout = 60 

107 WebDriverWait(self.selenium_driver, self.timeout).until( 

108 ec.presence_of_element_located( 

109 # TODO: (#163) move to CFG 

110 (By.XPATH, "//option[@value='Abbeville']") 

111 ) 

112 ) 

113 self.timeout = timeout_bak 

114 # then get itemCode field for search 

115 item_field = WebDriverWait(self.selenium_driver, self.timeout).until( 

116 ec.presence_of_element_located((By.XPATH, "//input[@name='itemCode']")) 

117 ) 

118 search_button = self.selenium_driver.find_element( 

119 By.CSS_SELECTOR, ".buttonSet > button:nth-child(1)" 

120 ) 

121 clear_button = self.selenium_driver.find_element( 

122 By.CSS_SELECTOR, ".buttonSet > button:nth-child(2)" 

123 ) 

124 clear_button.click() 

125 item_field.send_keys(item_number) 

126 self.delay(2) 

127 search_button.click() 

128 self.delay(2) 

129 # check for No Results 

130 e = self.selenium_driver.find_element( 

131 By.XPATH, "//div[@class='formBox']/div" 

132 ) 

133 if "No Results" in e.text: 

134 # Do not continue to try 

135 logging.info(f"{namespace}: No Results found for {item_number}") 

136 return False 

137 items = self.selenium_driver.find_elements(By.ID, "product.item_id") 

138 items[0].click() 

139 return True 

140 except (NoSuchElementException, TimeoutException, IndexError): 

141 tries += 1 

142 if tries < self.timeout:

143 return self.load_item_page(item_number, tries)

144 else: 

145 logging.info(f"{namespace}: failed item search for {item_number}") 

146 return False 

147 

148 def scrape_description(self): 

149 try: 

150 self.delay(1) 

151 elem = WebDriverWait(self.selenium_driver, self.timeout).until( 

152 ec.presence_of_element_located((By.CLASS_NAME, "desc")) 

153 ) 

154 span = elem.find_element(By.CLASS_NAME, "short-comments") 

155 description = span.text 

156 except (NoSuchElementException, TimeoutException): 

157 description = "" 

158 

159 return description 

160 

161 def scrape_item_image_urls(self): 

162 namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}" 

163 

164 urls = [] 

165 try: 

166 self.delay(1) 

167 # GJ appears to only have single cover images 

168 elem = WebDriverWait(self.selenium_driver, self.timeout).until( 

169 ec.presence_of_element_located((By.CLASS_NAME, "cover")) 

170 ) 

171 img = elem.find_element(By.TAG_NAME, "img") 

172 src = img.get_attribute("src") 

173 if src:

174 urls.append(src) 

175 except (NoSuchElementException, TimeoutException) as e: 

176 logging.warning(f"{namespace}: error {e}") 

177 return urls 

178 

179 def load_login_page(self): 

180 # Load search page while logged out in an attempt to get the 

181 # Publishers list to populate when the page is loaded after login. 

182 self.selenium_driver.get(self.base_url + "/search") 

183 self.delay(self.timeout) 

184 login = "/login" 

185 url = self.base_url + login 

186 self.selenium_driver.get(url) 

187 

188 def login(self): 

189 namespace = f"{type(self).__name__}.{self.login.__name__}" 

190 

191 self.delay(2) 

192 input_text = Text( 

193 """ 

194 ******** USER INPUT REQUIRED ******** 

195 Locate the selenium controlled browser 

196 and manually enter your login credentials. 

197 ******** WAITING FOR USER INPUT ******** 

198 """ 

199 ) 

200 input_text.stylize("bold cyan") 

201 console.print(input_text) 

202 # TODO: DRY this up. It is duplicated between scrapers 

203 # Take element search pattern as argument. 

204 # TODO: vv (#163) move to CFG 

205 # wait up to 90 seconds for user to manually enter credentials 

206 # Verify by finding "a" with attribute "href"="/account" 

207 try: 

208 timeout_bak = self.timeout 

209 self.timeout = 90 

210 WebDriverWait(self.selenium_driver, self.timeout).until( 

211 ec.presence_of_element_located((By.XPATH, "//a[@href='/account']")) 

212 ) 

213 self.timeout = timeout_bak 

214 success_text = Text( 

215 """ 

216 ******** LOGIN SUCCESSFUL ******** 

217 ******** CONTINUING EXECUTION ******** 

218 """ 

219 ) 

220 success_text.stylize("green") 

221 console.print(success_text) 

222 except (NoSuchElementException, TimeoutException) as e: 

223 logging.error(f"{namespace}: failed to login") 

224 logging.error(f"{namespace}: Cannot proceed. Exiting.") 

225 raise e 

226 

227 def add_to_cart(self, qty): 

228 # TODO: Can we DRY this up? Some duplication between scrapers 

229 namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}" 

230 

231 self.delay(1) 

232 stock_elem = self.selenium_driver.find_element(By.CLASS_NAME, "on-hand") 

233 m = re.search(r"([0-9]+) in stock", stock_elem.text) 

234 if m: 

235 stock = m.group(1) 

236 if int(stock) < int(qty):

237 qty = stock 

238 self.delay(1) 

239 try: 

240 # gather html elements needed 

241 add_div = WebDriverWait(self.selenium_driver, self.timeout).until( 

242 ec.presence_of_element_located((By.CLASS_NAME, "add")) 

243 ) 

244 qty_field = add_div.find_element(By.XPATH, "//input[@name='qty']") 

245 

246 qty_field.clear() 

247 qty_field.send_keys(qty + SeleniumKeys.ENTER) 

248 except (NoSuchElementException, TimeoutException) as e: 

249 logging.warning(f"{namespace}: error {e}") 

250 return 0 

251 return int(qty) 

252 

253 def load_cart_page(self): 

254 # TODO: Can we DRY this up? Some duplication between scrapers 

255 namespace = f"{type(self).__name__}.{self.load_cart_page.__name__}" 

256 try: 

257 cart = self.selenium_driver.find_element(By.CLASS_NAME, "cart") 

258 cart.click() 

259 self.delay(1) 

260 cart.click() 

261 self.delay(1) 

262 except Exception as e: 

263 logging.warning(f"{namespace}: error {e}") 

264 return False 

265 return True 

266 

267 def scrape_error_msg(self): 

268 try: 

269 elem = self.selenium_driver.find_element(By.CLASS_NAME, "errorMsg") 

270 msg = elem.text 

271 except NoSuchElementException: 

272 msg = "" 

273 return msg 

274 

275 # }}} 
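For ordering, a GJScraper is driven through the methods above in sequence: open the login page, wait for the manual login, load the item, add a quantity, and open the cart. A hedged sketch (hypothetical GJ item code; the quantity is passed as a string because it is typed into the qty field):

gj = GJScraper(get_driver())
gj.load_login_page()
gj.login()                      # blocks up to 90 seconds for manual credential entry
if gj.load_item_page("12345"):  # hypothetical GJ item code
    added_qty = gj.add_to_cart("2")
gj.load_cart_page()
error = gj.scrape_error_msg()   # empty string when no error is shown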

276 

277 

278class SDScraper(BaseScraper): 

279 # {{{ 

280 """ 

281 SDScraper objects know how to scrape SD item pages 

282 """ 

283 

284 def __init__(self, selenium_driver, base_url="https://strathearndistribution.com"): 

285 super().__init__(selenium_driver, base_url) 

286 self.timeout = 3 

287 

288 def load_login_page(self): 

289 namespace = f"{type(self).__name__}.{self.load_login_page.__name__}" 

290 try: 

291 self.selenium_driver.get(self.base_url) 

292 self.delay(2) 

293 button = self.selenium_driver.find_element(By.ID, "styled_btn") 

294 button.click() 

295 except ( 

296 StaleElementReferenceException, 

297 NoSuchElementException, 

298 TimeoutException, 

299 ) as e: 

300 logging.error(f"{namespace}: failed to load login page") 

301 logging.error(f"{namespace}: Cannot proceed. Exiting.") 

302 raise e 

303 

304 def login(self): 

305 namespace = f"{type(self).__name__}.{self.login.__name__}" 

306 input_text = Text( 

307 """ 

308 ******** USER INPUT REQUIRED ******** 

309 Locate the selenium controlled browser 

310 and manually enter your login credentials. 

311 ******** WAITING FOR USER INPUT ******** 

312 """ 

313 ) 

314 input_text.stylize("bold cyan") 

315 console.print(input_text) 

316 # TODO: DRY this up. It is duplicated between scrapers 

317 # TODO: vv (#163) move to CFG 

318 # wait up to 90 seconds for user to manually enter credentials 

319 # Verify by finding "span" with the text "My lists" 

320 try: 

321 timeout_bak = self.timeout 

322 self.timeout = 90 

323 WebDriverWait(self.selenium_driver, self.timeout).until( 

324 ec.presence_of_element_located((By.XPATH, "//span[text()='My lists']")) 

325 ) 

326 self.timeout = timeout_bak 

327 success_text = Text( 

328 """ 

329 ******** LOGIN SUCCESSFUL ******** 

330 ******** CONTINUING EXECUTION ******** 

331 """ 

332 ) 

333 success_text.stylize("green") 

334 console.print(success_text) 

335 except (NoSuchElementException, TimeoutException) as e: 

336 logging.error(f"{namespace}: failed to login") 

337 logging.error(f"{namespace}: Cannot proceed. Exiting.") 

338 raise e 

339 

340 def load_item_page(self, item_number, tries=0): 

341 namespace = f"{type(self).__name__}.{self.load_item_page.__name__}" 

342 try: 

343 self.selenium_driver.get(self.base_url) 

344 self.delay(2) 

345 search = WebDriverWait(self.selenium_driver, self.timeout).until( 

346 ec.presence_of_element_located((By.ID, "search")) 

347 ) 

348 search.send_keys(item_number + SeleniumKeys.ENTER) 

349 self.delay(2) 

350 elem = WebDriverWait(self.selenium_driver, self.timeout).until( 

351 ec.presence_of_element_located((By.CLASS_NAME, "listItem")) 

352 ) 

353 self.delay(2) 

354 elem.click() 

355 return True 

356 except ( 

357 StaleElementReferenceException, 

358 NoSuchElementException, 

359 TimeoutException, 

360 ) as e: 

361 tries += 1 

362 if tries < self.timeout: 

363 return self.load_item_page(item_number, tries)

364 else: 

365 logging.warning( 

366 f"{namespace}: Failed to load item page '{item_number}': {e}" 

367 ) 

368 return False 

369 

370 def scrape_description(self): 

371 try: 

372 # rc-* IDs are dynamic, must use classes 

373 elem = self.selenium_driver.find_element(By.CLASS_NAME, "ant-tabs-nav-list") 

374 tab_btn = elem.find_element(By.CLASS_NAME, "ant-tabs-tab-btn") 

375 tab_btn.click() 

376 pane = self.selenium_driver.find_element(By.CLASS_NAME, "ant-tabs-tabpane") 

377 description = pane.text 

378 except NoSuchElementException: 

379 description = "" 

380 

381 return description 

382 

383 def scrape_item_image_urls(self): 

384 namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}" 

385 urls = [] 

386 try: 

387 # main only 

388 elem = self.selenium_driver.find_element(By.CLASS_NAME, "slick-current") 

389 img = elem.find_element(By.TAG_NAME, "img") 

390 src = img.get_attribute("src") 

391 if src:

392 urls.append(src) 

393 # ensure we are seeing the top of the page 

394 html = self.selenium_driver.find_element(By.TAG_NAME, "html") 

395 html.send_keys(SeleniumKeys.PAGE_UP) 

396 elems = self.selenium_driver.find_elements(By.CLASS_NAME, "gallery-vert") 

397 for elem in elems: 

398 src = elem.get_attribute("src") 

399 if src:

400 urls.append(src) 

401 except NoSuchElementException as e: 

402 logging.warning(f"{namespace}: error {e}") 

403 return urls 

404 

405 def add_to_cart(self, qty): 

406 namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}" 

407 

408 self.delay(1) 

409 # TODO: should this stock lookup be wrapped in try/except like the other scrapers?

410 stock_elem = self.selenium_driver.find_element( 

411 By.XPATH, "//span[contains(text(), 'in stock')]" 

412 ) 

413 m = re.search(r"([0-9]+) in stock", stock_elem.get_attribute("innerHTML")) 

414 if m: 

415 stock = m.group(1) 

416 if int(stock) < int(qty):

417 qty = stock 

418 self.delay(1) 

419 try: 

420 # gather html elements needed 

421 elems = self.selenium_driver.find_elements(By.CLASS_NAME, "ant-btn-primary") 

422 button = None 

423 for e in elems:

424 if "Add to cart" in e.text: 424 ↛ 423line 424 didn't jump to line 423, because the condition on line 424 was never false

425 button = e 

426 break 

427 qty_field = self.selenium_driver.find_element( 

428 By.XPATH, 

429 ( 

430 "//input[@class='ant-input' and @type='text' " 

431 "and not(ancestor::div[contains(@class, '-block')])]" 

432 ), 

433 ) 

434 # the qty field must be clicked to highlight amount. Clearing doesn't work 

435 qty_field.click() 

436 qty_field.send_keys(qty) 

437 button.click() 

438 except Exception as e: 

439 logging.warning(f"{namespace}: error {e}") 

440 return 0 

441 return int(qty) 

442 

443 def load_cart_page(self): 

444 namespace = f"{type(self).__name__}.{self.load_cart_page.__name__}" 

445 try: 

446 cart = "/checkout/cart" 

447 url = self.base_url + cart 

448 self.selenium_driver.get(url) 

449 self.delay(1) 

450 return True 

451 except Exception as e: 

452 logging.warning(f"{namespace}: error {e}") 

453 return False 

454 

455 # }}} 

456 

457 

458class TBScraper(BaseScraper): 

459 # {{{ 

460 """ 

461 TBScraper objects know how to scrape TB item pages 

462 """ 

463 

464 def __init__(self, selenium_driver, base_url="https://texasbookman.com/"): 

465 super().__init__(selenium_driver, base_url) 

466 self.timeout = 3 

467 

468 def load_item_page(self, item_number): 

469 start = "p/" 

470 url = self.base_url + start + item_number 

471 self.selenium_driver.get(url) 

472 return True 

473 

474 def scrape_description(self): 

475 try: 

476 elem = self.selenium_driver.find_element( 

477 By.CLASS_NAME, "variant-description" 

478 ) 

479 text = elem.text 

480 description = text.replace("NO AMAZON SALES\n\n", "") 

481 except NoSuchElementException: 

482 description = "" 

483 

484 return description 

485 

486 def scrape_item_image_urls(self): 

487 urls = [] 

488 try: 

489 elem = WebDriverWait(self.selenium_driver, self.timeout).until( 

490 ec.presence_of_element_located((By.CLASS_NAME, "a-left")) 

491 ) 

492 elem = self.selenium_driver.find_element(By.CLASS_NAME, "picture-thumbs") 

493 left = elem.find_element(By.CLASS_NAME, "a-left") 

494 left.click() 

495 while True: 

496 self.delay(2) 

497 thumb = self._get_thumb_from_slimbox() 

498 if thumb: 

499 urls.append(thumb) 

500 next_link = WebDriverWait(self.selenium_driver, self.timeout).until( 

501 ec.presence_of_element_located((By.ID, "lbNextLink")) 

502 ) 

503 self.delay(2) 

504 next_link.click() 

505 except ( 

506 NoSuchElementException, 

507 ElementNotInteractableException, 

508 TimeoutException, 

509 ): 

510 try: 

511 elem = self.selenium_driver.find_element(By.CLASS_NAME, "picture") 

512 img = elem.find_element(By.TAG_NAME, "img") 

513 thumb = img.get_attribute("src") 

514 urls.append(thumb) 

515 except NoSuchElementException: 

516 pass 

517 

518 return urls 

519 

520 def _get_thumb_from_slimbox(self): 

521 timeout = 3 

522 thumb = None 

523 try: 

524 img_div = WebDriverWait(self.selenium_driver, timeout).until( 

525 ec.presence_of_element_located((By.ID, "lbImage")) 

526 ) 

527 style = img_div.get_attribute("style") 

528 m = re.search('"(.*)"', style) 

529 if m:

530 thumb = m.group(1) 

531 except (NoSuchElementException, TimeoutException): 

532 pass 

533 

534 return thumb 

535 

536 def load_login_page(self): 

537 login = "login" 

538 url = self.base_url + login 

539 self.selenium_driver.get(url) 

540 

541 def login(self): 

542 namespace = f"{type(self).__name__}.{self.login.__name__}" 

543 

544 self.delay(2) 

545 input_text = Text( 

546 """ 

547 ******** USER INPUT REQUIRED ******** 

548 Locate the selenium controlled browser 

549 and manually enter your login credentials. 

550 ******** WAITING FOR USER INPUT ******** 

551 """ 

552 ) 

553 input_text.stylize("bold cyan") 

554 console.print(input_text) 

555 # TODO: DRY this up. It is duplicated between scrapers 

556 # TODO: vv (#163) move to CFG 

557 # wait up to 90 seconds for user to manually enter credentials 

558 # Verify by finding "a" with attribute "href"="/admin" 

559 try: 

560 timeout_bak = self.timeout 

561 self.timeout = 90 

562 WebDriverWait(self.selenium_driver, 90).until( 

563 ec.presence_of_element_located((By.XPATH, "//a[@href='/admin']")) 

564 ) 

565 self.timeout = timeout_bak 

566 success_text = Text( 

567 """ 

568 ******** LOGIN SUCCESSFUL ******** 

569 ******** CONTINUING EXECUTION ******** 

570 """ 

571 ) 

572 success_text.stylize("green") 

573 console.print(success_text) 

574 except (NoSuchElementException, TimeoutException) as e: 

575 logging.error(f"{namespace}: failed to login") 

576 logging.error(f"{namespace}: Cannot proceed. Exiting.") 

577 raise e 

578 

579 def impersonate(self, email): 

580 namespace = f"{type(self).__name__}.{self.impersonate.__name__}" 

581 

582 # Go to /Admin/Customer/List 

583 customers = "/Admin/Customer/List" 

584 url = self.base_url + customers 

585 self.selenium_driver.get(url) 

586 self.delay(1) 

587 try: 

588 # search for email 

589 search_email = WebDriverWait(self.selenium_driver, self.timeout).until( 

590 ec.presence_of_element_located((By.ID, "SearchEmail")) 

591 ) 

592 search_email.clear() 

593 search_email.send_keys(email + SeleniumKeys.ENTER) 

594 # Get customer link associated with email 

595 email_xpath = ( 

596 f"//div[@id='customers-grid']/table/tbody/tr/td/a[text()='{email}']" 

597 ) 

598 customer_link = WebDriverWait(self.selenium_driver, self.timeout).until( 

599 ec.presence_of_element_located((By.XPATH, email_xpath)) 

600 ) 

601 links = self.selenium_driver.find_elements(By.XPATH, email_xpath) 

602 # Bail if multiple customer records for given email. 

603 if len(links) > 1: 

604 logging.error( 

605 f"{namespace}: Found multiple customer records for email " 

606 f"'{email}' to impersonate" 

607 ) 

608 logging.error(f"{namespace}: Cannot proceed. Exiting.") 

609 raise Exception 

610 customer_link.click() 

611 # click "Place Order (impersonate)" 

612 impersonate = WebDriverWait(self.selenium_driver, self.timeout).until( 

613 ec.presence_of_element_located( 

614 (By.XPATH, "//a[text()='Place order (Impersonate)']") 

615 ) 

616 ) 

617 impersonate.click() 

618 # click "Place Order" button 

619 button = WebDriverWait(self.selenium_driver, self.timeout).until( 

620 ec.presence_of_element_located( 

621 (By.XPATH, "//input[@name='impersonate']") 

622 ) 

623 ) 

624 button.click() 

625 self.delay(1) 

626 WebDriverWait(self.selenium_driver, self.timeout).until( 

627 ec.presence_of_element_located((By.CLASS_NAME, "finish-impersonation")) 

628 ) 

629 except (NoSuchElementException, TimeoutException) as e: 

630 logging.error(f"{namespace}: failed to impersonate") 

631 logging.error(f"{namespace}: Cannot proceed. Exiting.") 

632 raise e 

633 return True 

634 

635 def add_to_cart(self, qty): 

636 namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}" 

637 

638 qty = int(qty) 

639 self.delay(1) 

640 stock_elem = self.selenium_driver.find_element(By.CLASS_NAME, "stock") 

641 m = re.search(r"Availability: ([0-9]+) in stock", stock_elem.text) 

642 if m:

643 stock = m.group(1) 

644 stock = int(stock) 

645 if stock < qty: 

646 qty = stock 

647 try: 

648 # gather html elements needed 

649 qty_field = WebDriverWait(self.selenium_driver, self.timeout).until( 

650 ec.presence_of_element_located((By.CLASS_NAME, "qty-input")) 

651 ) 

652 button = self.selenium_driver.find_element( 

653 By.CLASS_NAME, "add-to-cart-button" 

654 ) 

655 qty_field.clear() 

656 # ENTERing out of the qty_field DOES NOT add to cart. 

657 # The button must be clicked instead. 

658 qty_field.send_keys(qty) 

659 button.click() 

660 self.delay(1) 

661 except Exception as e: 

662 logging.warning(f"{namespace}: error {e}") 

663 return 0 

664 return qty 

665 

666 def load_cart_page(self): 

667 cart = "cart" 

668 url = self.base_url + cart 

669 self.selenium_driver.get(url) 

670 return True 

671 

672 def search_item_num(self, search): 

673 namespace = f"{type(self).__name__}.{self.search_item_num.__name__}" 

674 

675 item_num = "" 

676 search = urllib.parse.quote_plus(search) 

677 url = self.base_url + "search?q=" + search 

678 self.selenium_driver.get(url) 

679 self.delay(2) 

680 timeout_bak = self.timeout 

681 self.timeout = 120 

682 WebDriverWait(self.selenium_driver, self.timeout).until( 

683 ec.presence_of_element_located((By.CLASS_NAME, "search-results")) 

684 ) 

685 self.timeout = timeout_bak 

686 links = self.selenium_driver.find_elements( 

687 By.XPATH, "//div[@class='search-results']//a[contains(@href, '/p/')]" 

688 ) 

689 if links:

690 item_url = links[0].get_attribute("href") 

691 m = re.search(r"\/p\/([0-9]+)\/", item_url) 

692 if m:

693 item_num = m.group(1) 

694 else: 

695 logging.warning(f"{namespace}: Failed to find item using q='{search}'") 

696 return item_num 

697 

698 # }}} 

699 

700 

701class AmznScraper(BaseScraper): 

702 # {{{ 

703 """ 

704 AmznScraper objects know how to scrape amazon item pages 

705 """ 

706 

707 def __init__(self, selenium_driver, base_url="https://www.amazon.com/"): 

708 super().__init__(selenium_driver, base_url) 

709 self.timeout = 1 

710 

711 def solve_captcha(self): 

712 from amazoncaptcha import AmazonCaptcha 

713 

714 self.selenium_driver.get("https://www.amazon.com/errors/validateCaptcha") 

715 try: 

716 captcha = AmazonCaptcha.fromdriver(self.selenium_driver) 

717 solution = captcha.solve() 

718 elem = self.selenium_driver.find_element(By.ID, "captchacharacters") 

719 elem.send_keys(solution + SeleniumKeys.ENTER) 

720 return True 

721 except (NoSuchElementException, TimeoutException): 

722 return False 

723 

724 def load_item_page(self, item_number): 

725 start = "dp/" 

726 url = self.base_url + start + item_number 

727 self.selenium_driver.get(url) 

728 return True 

729 

730 def scrape_description(self): 

731 description = "" 

732 description = self._scrape_amazon_editorial_review() 

733 if not description: 

734 description = self._scrape_amazon_description() 

735 

736 return description 

737 

738 def _scrape_amazon_editorial_review(self): 

739 descr = "" 

740 try: 

741 elem = self.selenium_driver.find_element( 

742 By.ID, "editorialReviews_feature_div" 

743 ) 

744 text = elem.text 

745 descr_lines = re.split("^.*\\n.*\\n", text) # trim off first two lines 

746 descr = descr_lines[-1] 

747 except NoSuchElementException: 

748 descr = "" 

749 

750 return descr 

751 

752 def _scrape_amazon_description(self): 

753 descr = "" 

754 try: 

755 elem = self.selenium_driver.find_element( 

756 By.ID, "bookDescription_feature_div" 

757 ) 

758 # read_more = elem.find_element(By.CLASS_NAME, 'a-expander-prompt') 

759 # read_more.click() 

760 descr = elem.text 

761 except NoSuchElementException: 

762 descr = "" 

763 

764 return descr 

765 

766 def get_span_type_thumb_id_prefix(self): 

767 """Get span_type and thumb_id_prefix from amazon images widget.""" 

768 namespace = ( 

769 f"{type(self).__name__}.{self.get_span_type_thumb_id_prefix.__name__}" 

770 ) 

771 span_type = None 

772 thumb_id_prefix = None 

773 try: 

774 span = WebDriverWait(self.selenium_driver, self.timeout).until( 

775 ec.presence_of_element_located((By.ID, "imgThumbs")) 

776 ) 

777 span_type = "imgThumbs" 

778 except (NoSuchElementException, TimeoutException): 

779 logging.info(f"{namespace}: No imgThumbs id, trying imgTagWrapperId")

780 try: 

781 span = WebDriverWait(self.selenium_driver, self.timeout).until( 

782 ec.presence_of_element_located((By.ID, "imgTagWrapperId")) 

783 ) 

784 span_type = "imgTagWrapperId" 

785 except (NoSuchElementException, TimeoutException): 

786 logging.info(f"{namespace}: No imgTagWrapperId id") 

787 logging.info(f"{namespace}: Returning empty urls list") 

788 return (span_type, thumb_id_prefix) 

789 

790 if span_type == "imgThumbs":

791 link = span.find_element(By.CLASS_NAME, "a-link-normal") 

792 thumb_id_prefix = "ig-thumb-" 

793 else: 

794 link = span 

795 thumb_id_prefix = "ivImage_" 

796 try: 

797 link.click() 

798 except ElementClickInterceptedException: 

799 logging.info(f"{namespace}: Failed to click images widget") 

800 logging.info(f"{namespace}: Returning empty urls list") 

801 return (span_type, thumb_id_prefix) 

802 return (span_type, thumb_id_prefix) 

803 

804 def scrape_item_image_urls(self): 

805 namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}" 

806 counter = 0 

807 urls = [] 

808 

809 span_type, thumb_id_prefix = self.get_span_type_thumb_id_prefix() 

810 if thumb_id_prefix:

811 logging.debug(f"{namespace}: Clicked images widget") 

812 # get image urls 

813 while True: 

814 try: 

815 thumb = "" 

816 xpath = f"//*[@id='{thumb_id_prefix}{counter}']" 

817 elem = WebDriverWait(self.selenium_driver, self.timeout).until( 

818 ec.presence_of_element_located((By.XPATH, xpath)) 

819 ) 

820 if span_type == "imgThumbs":

821 thumb = elem.get_attribute("src") 

822 if span_type == "imgTagWrapperId":

823 inner_elem = elem.find_element(By.CLASS_NAME, "ivThumbImage") 

824 style = inner_elem.get_attribute("style") 

825 m = re.search('"(.*)"', style) 

826 if m: 

827 thumb = m.group(1) 

828 sub, suff = os.path.splitext(thumb) 

829 indx = sub.find("._") 

830 url = sub[:indx] + suff 

831 if url:

832 urls.append(url) 

833 logging.debug(f"{namespace}: Thumbnail src is {thumb}") 

834 logging.debug(f"{namespace}: Full size URL is %r" % url) 

835 counter += 1 

836 except (NoSuchElementException, TimeoutException): 

837 break 

838 # Amazon appends an extra image of a person holding the book;

839 # drop it from the list

840 if len(urls) > 1:

841 urls.pop() 

842 

843 return urls 

844 

845 # }}} 
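AmznScraper is the primary scraper in main(): it first visits the captcha page and makes a best-effort attempt to solve it, then scrapes items by ISBN-10 (main() passes item.isbn10). A hedged sketch with a hypothetical ISBN-10:

amzn = AmznScraper(get_driver())
amzn.solve_captcha()               # best-effort; returns False when the captcha form is missing or cannot be submitted
amzn.load_item_page("0000000000")  # hypothetical ISBN-10
image_urls = amzn.scrape_item_image_urls()
description = amzn.scrape_description()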

846 

847 

848############################################################################## 

849# utility functions 

850############################################################################## 

851def get_headless_driver(): 

852 return get_driver("--headless=new") 

853 

854 

855def get_driver(option_args: str = ""): 

856 """Creates a new instance of the chrome driver. 

857 

858 :param option_args: 

859 Option arguments to pass to the driver 

860 :returns: selenium.webdriver object 

861 """ 

862 namespace = f"{MODULE}.{get_driver.__name__}" 

863 service = ChromeService() 

864 options = webdriver.ChromeOptions() 

865 if option_args:

866 options.add_argument(option_args) 

867 logging.info(f"{namespace}: Setting webdriver option to '{option_args}'.") 

868 driver = webdriver.Chrome(service=service, options=options) 

869 return driver 
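Both factories return a ready-to-use Chrome WebDriver; get_headless_driver() simply forwards the new-style "--headless=new" flag. A small sketch, assuming Chrome and a matching chromedriver are available (the URL is a placeholder):

driver = get_headless_driver() if CFG["asg"]["scraper"]["headless"] else get_driver()
try:
    driver.get("https://www.example.com")  # placeholder URL
finally:
    driver.quit()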

870 

871 

872def scrape_item(scrapr, item_id, description="", image_urls=None): 

873 if image_urls is None: 

874 image_urls = [] 

875 namespace = f"{MODULE}.{scrape_item.__name__}" 

876 scrapr.load_item_page(item_id) 

877 logging.info( 

878 f"{namespace}: Getting item image urls via {scrapr.__class__.__name__}" 

879 ) 

880 l_image_urls = scrapr.scrape_item_image_urls() 

881 if image_urls and len(l_image_urls) > 1:

882 l_image_urls.pop(0) 

883 image_urls = image_urls + l_image_urls 

884 logging.info(" URLs: %r" % image_urls) 

885 if image_urls and not description:

886 logging.info( 

887 f"{namespace}: Getting description via {scrapr.__class__.__name__}" 

888 ) 

889 description = scrapr.scrape_description() 

890 logging.info(" Description: %r" % description[:140]) 

891 return description, image_urls 
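scrape_item() is the wrapper main() uses for both the primary and failover scrapers: it loads the page, appends newly found image URLs to any already collected (dropping the first new URL when some were already present), and scrapes a description only once at least one image URL exists. A hedged usage sketch with a hypothetical ISBN-10:

prime_scrapr = AmznScraper(get_driver())
description, image_urls = scrape_item(prime_scrapr, "0000000000")  # hypothetical ISBN-10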

892 

893 

894def get_failover_scraper_item_id(driver, vendor_code, item): 

895 namespace = f"{MODULE}.{get_failover_scraper_item_id.__name__}" 

896 failover_scrapr = None 

897 item_id = item.isbn 

898 # TODO: vvvvvv (#163) add to CFG["asg"]["vendors"] 

899 # { code = "sample", name = "MyVendor", 

900 # isbn_key = "ISBN", "2nd_scraper" = "TBScraper", }, 

901 if vendor_code == "tb": 

902 try: 

903 url = item.data["LINK"] 

904 m = re.search(r"\/([0-9]+)\/", url) 

905 if m:

906 item_id = m.group(1) 

907 failover_scrapr = TBScraper(driver) 

908 except KeyError: 

909 logging.error(f"{namespace}: No link found in item") 

910 if vendor_code == "sd": 

911 failover_scrapr = SDScraper(driver) 

912 if vendor_code == "gj": 

913 failover_scrapr = GJScraper(driver) 

914 return failover_scrapr, item_id 

915 

916 

917def main(vendor_code, sheet_id, worksheet, scraped_items_db): # noqa: C901 

918 # {{{ 

919 namespace = f"{MODULE}.{main.__name__}" 

920 # get vendor info from database 

921 logging.debug(f"{namespace}: Instantiate vendor.") 

922 vendr = vendor.Vendor(vendor_code) 

923 vendr.set_vendor_data() 

924 

925 sheet_data = spreadsheet.get_sheet_data(sheet_id, worksheet) 

926 

927 sheet_keys = sheet_data.pop(0) 

928 items_obj = Items(sheet_keys, sheet_data, vendr.isbn_key) 

929 items_obj.load_scraped_data(scraped_items_db) 

930 driver = None 

931 prime_scrapr = None 

932 failover_scrapr = None 

933 for item in items_obj: 

934 if not item.isbn:

935 if "TBCODE" in item.data: 

936 item.isbn = item.data["TBCODE"] 

937 if not item.isbn: 

938 logging.info(f"{namespace}: No isbn for item, skipping lookup") 

939 continue 

940 description = "" 

941 image_urls = [] 

942 # if scraped_item image_urls is not empty: 

943 # skip scraped_item 

944 logging.info(f"{namespace}: Searching for {item.isbn} ...") 

945 if item.image_urls != []:

946 logging.info(f"{namespace}: {item.isbn} found in database, skipping") 

947 continue 

948 

949 if not driver and not prime_scrapr:

950 logging.info(f"{namespace}: Opening browser...") 

951 if CFG["asg"]["scraper"]["headless"]:

952 driver = get_headless_driver() 

953 else: 

954 driver = get_driver() 

955 prime_scrapr = AmznScraper(driver) 

956 prime_scrapr.solve_captcha() 

957 

958 logging.info(f"{namespace}: No scraped data currently: {item.isbn}") 

959 description, image_urls = scrape_item( 

960 prime_scrapr, item.isbn10, description, image_urls 

961 ) 

962 if len(image_urls) < IMG_FAILOVER_THRESHHOLD:

963 failover_scrapr, item_id = get_failover_scraper_item_id( 

964 driver, vendr.vendor_code, item 

965 ) 

966 if failover_scrapr: 

967 description, image_urls = scrape_item( 

968 failover_scrapr, item_id, description, image_urls 

969 ) 

970 

971 item.data["DESCRIPTION"] = description 

972 item.image_urls = image_urls 

973 

974 logging.info(f"{namespace}: Saving scraped item data") 

975 items_obj.save_scraped_data(scraped_items_db) 

976 if driver:

977 logging.info(f"{namespace}: Closing browser...") 

978 driver.quit() 

979 # }}}
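main() ties the module together: it loads the vendor's worksheet, skips items that already have scraped image URLs, scrapes the rest via AmznScraper with a vendor-specific failover, and saves the results back to the scraped-items database. A hedged invocation sketch; the vendor code, sheet id, worksheet name, and database path are all hypothetical:

from artemis_sg import scraper

scraper.main(
    vendor_code="tb",
    sheet_id="1AbCdEfGhIjKlMnOp",           # hypothetical Google Sheet id
    worksheet="Sheet1",                     # hypothetical worksheet name
    scraped_items_db="scraped_items.json",  # hypothetical database path
)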