Coverage for src/artemis_sg/scraper.py: 71%

599 statements  

coverage.py v7.3.2, created at 2023-10-05 11:29 -0700

# -*- coding: utf-8 -*-

import logging
import os.path
import re
import time  # for additional sleeps in page load. This is a smell.
import urllib.parse

# Selenium
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    ElementNotInteractableException,
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)

# Chrome
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys as SeleniumKeys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import artemis_sg.spreadsheet as spreadsheet
import artemis_sg.vendor as vendor
from artemis_sg.config import CFG
from artemis_sg.items import Items

# Firefox
# from selenium.webdriver.firefox.service import Service as FirefoxService

MODULE = os.path.splitext(os.path.basename(__file__))[0]
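# For this file, MODULE evaluates to "scraper"; it is used below to
# namespace log messages (e.g., "scraper.get_driver").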



class BaseScraper:
    # {{{
    """
    Base scraper; subclasses know how to scrape item pages for a given
    vendor site starting from base_url.
    """

    def __init__(self, selenium_driver, base_url=None):
        self.selenium_driver = selenium_driver
        if not base_url:
            self.base_url = ""
        else:
            self.base_url = base_url

    def load_item_page(self, item_number):
        return False

    def scrape_description(self):
        description = ""
        return description

    def scrape_item_image_urls(self):
        urls = []
        return urls

    def delay(self, secs):
        time.sleep(secs)

    # }}}
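
# A minimal sketch of how a new vendor scraper plugs in, assuming a
# hypothetical vendor "XY" with item pages at /item/<number>:
#
#     class XYScraper(BaseScraper):
#         def __init__(self, selenium_driver, base_url="https://xy.example.com"):
#             super().__init__(selenium_driver, base_url)
#
#         def load_item_page(self, item_number):
#             self.selenium_driver.get(self.base_url + "/item/" + item_number)
#             return True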


class GJScraper(BaseScraper):
    # {{{
    """
    GJScraper objects know how to scrape GJ item pages
    """

    def __init__(self, selenium_driver, base_url="https://greatjonesbooks.com"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 3

    def load_item_page(self, item_number, tries=0):
        namespace = f"{type(self).__name__}.{self.load_item_page.__name__}"

        # GJ does not maintain the session if the on-page links are not used.
        # If not logged in, build the URL directly; else use the search
        # facility.
        try:
            self.delay(1)
            WebDriverWait(self.selenium_driver, 1).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//a[@href='/account' and text()='Account Summary']")
                )
            )
        except (NoSuchElementException, TimeoutException):
            start = "/product/"
            url = self.base_url + start + item_number
            self.selenium_driver.get(url)
            return True
        try:
            search = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.XPATH, "//a[@href='/search']"))
            )
            search.click()
            self.delay(2)

            # wait until Publisher list is populated
            WebDriverWait(self.selenium_driver, 60).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//option[@value='Abbeville']")
                )
            )
            # then get itemCode field for search
            item_field = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.XPATH, "//input[@name='itemCode']"))
            )
            search_button = self.selenium_driver.find_element(
                By.CSS_SELECTOR, ".buttonSet > button:nth-child(1)"
            )
            clear_button = self.selenium_driver.find_element(
                By.CSS_SELECTOR, ".buttonSet > button:nth-child(2)"
            )
            clear_button.click()
            item_field.send_keys(item_number)
            self.delay(2)
            search_button.click()
            self.delay(2)
            # check for No Results
            e = self.selenium_driver.find_element(
                By.XPATH, "//div[@class='formBox']/div"
            )
            if "No Results" in e.text:
                # Do not continue to try
                logging.info(f"{namespace}: No Results found for {item_number}")
                return False
            items = self.selenium_driver.find_elements(By.ID, "product.item_id")
            items[0].click()
            return True
        except (NoSuchElementException, TimeoutException, IndexError):
            tries += 1
            # self.timeout doubles as the retry cap here
            if tries < self.timeout:
                return self.load_item_page(item_number, tries)
            else:
                logging.info(f"{namespace}: failed item search for {item_number}")
                return False

    def scrape_description(self):
        try:
            self.delay(1)
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "desc"))
            )
            span = elem.find_element(By.CLASS_NAME, "short-comments")
            description = span.text
        except (NoSuchElementException, TimeoutException):
            description = ""

        return description

    def scrape_item_image_urls(self):
        namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}"

        urls = []
        try:
            self.delay(1)
            # GJ appears to only have single cover images
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "cover"))
            )
            img = elem.find_element(By.TAG_NAME, "img")
            src = img.get_attribute("src")
            if src:
                urls.append(src)
        except (NoSuchElementException, TimeoutException) as e:
            logging.warning(f"{namespace}: error {e}")
        return urls

    def load_login_page(self):
        # Load the search page while logged out in an attempt to get the
        # Publisher list to populate when the page is loaded after login.
        self.selenium_driver.get(self.base_url + "/search")
        self.delay(self.timeout)
        login = "/login"
        url = self.base_url + login
        self.selenium_driver.get(url)

    def login(self):
        namespace = f"{type(self).__name__}.{self.login.__name__}"

        self.delay(2)
        print("******** USER INPUT REQUIRED ********")
        print("Locate the selenium controlled browser")
        print("and manually enter your login credentials.")
        print("******** WAITING FOR USER INPUT ********")
        # wait up to 90 seconds for user to manually enter credentials
        # Verify by finding "a" with attribute "href"="/account"
        try:
            WebDriverWait(self.selenium_driver, 90).until(
                EC.presence_of_element_located((By.XPATH, "//a[@href='/account']"))
            )
            print("******** LOGIN SUCCESSFUL ********")
            print("******** CONTINUING EXECUTION ********")
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to login")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e

    def add_to_cart(self, qty):
        namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}"

        self.delay(1)
        stock_elem = self.selenium_driver.find_element(By.CLASS_NAME, "on-hand")
        m = re.search(r"([0-9]+) in stock", stock_elem.text)
        if m:
            stock = m.group(1)
            if int(stock) < int(qty):
                qty = stock
        self.delay(1)
        try:
            # gather html elements needed
            add_div = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "add"))
            )
            qty_field = add_div.find_element(By.XPATH, "//input[@name='qty']")

            qty_field.clear()
            # coerce qty to str; it may arrive as an int or as the str
            # captured from the stock match above
            qty_field.send_keys(str(qty) + SeleniumKeys.ENTER)
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return 0
        return int(qty)
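
    # e.g., if the on-hand element reads "12 in stock" and qty is 20, the
    # quantity sent to the cart is capped at 12 and 12 is returned.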

    def load_cart_page(self):
        namespace = f"{type(self).__name__}.{self.load_cart_page.__name__}"
        try:
            cart = self.selenium_driver.find_element(By.CLASS_NAME, "cart")
            cart.click()
            self.delay(1)
            cart.click()
            self.delay(1)
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return False
        return True

    def scrape_error_msg(self):
        try:
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "errorMsg")
            msg = elem.text
        except NoSuchElementException:
            msg = ""
        return msg

    # }}}
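
# A typical GJScraper session, as a sketch (driver comes from get_driver()
# defined near the bottom of this module):
#
#     scraper = GJScraper(driver)
#     scraper.load_login_page()
#     scraper.login()  # waits for manual credential entry
#     if scraper.load_item_page("1234567"):  # hypothetical item number
#         description = scraper.scrape_description()
#         urls = scraper.scrape_item_image_urls()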


class SDScraper(BaseScraper):
    # {{{
    """
    SDScraper objects know how to scrape SD item pages
    """

    def __init__(self, selenium_driver, base_url="https://strathearndistribution.com"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 3

    def load_login_page(self):
        self.selenium_driver.get(self.base_url)
        self.delay(2)
        button = self.selenium_driver.find_element(
            By.CSS_SELECTOR, ".ant-col:nth-child(4) span:nth-child(2)"
        )
        button.click()

    def login(self):
        namespace = f"{type(self).__name__}.{self.login.__name__}"
        print("******** USER INPUT REQUIRED ********")
        print("Locate the selenium controlled browser")
        print("and manually enter your login credentials.")
        print("******** WAITING FOR USER INPUT ********")
        # wait up to 90 seconds for user to manually enter credentials
        # Verify by finding "span" with the text "My lists"
        try:
            WebDriverWait(self.selenium_driver, 90).until(
                EC.presence_of_element_located((By.XPATH, "//span[text()='My lists']"))
            )
            print("******** LOGIN SUCCESSFUL ********")
            print("******** CONTINUING EXECUTION ********")
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to login")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e

    def load_item_page(self, item_number, tries=0):
        namespace = f"{type(self).__name__}.{self.load_item_page.__name__}"
        try:
            self.selenium_driver.get(self.base_url)
            self.delay(2)
            search = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.ID, "search"))
            )
            search.send_keys(item_number + SeleniumKeys.ENTER)
            self.delay(2)
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "listItem"))
            )
            self.delay(2)
            elem.click()
            return True
        except (
            StaleElementReferenceException,
            NoSuchElementException,
            TimeoutException,
        ) as e:
            tries += 1
            # self.timeout doubles as the retry cap here
            if tries < self.timeout:
                return self.load_item_page(item_number, tries)
            else:
                logging.warning(
                    f"{namespace}: Failed to load item page '{item_number}': {e}"
                )
                return False

    def scrape_description(self):
        try:
            # rc-* IDs are dynamic, must use classes
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "ant-tabs-nav-list")
            tab_btn = elem.find_element(By.CLASS_NAME, "ant-tabs-tab-btn")
            tab_btn.click()
            pane = self.selenium_driver.find_element(By.CLASS_NAME, "ant-tabs-tabpane")
            description = pane.text
        except NoSuchElementException:
            description = ""

        return description

    def scrape_item_image_urls(self):
        namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}"
        urls = []
        try:
            # the main carousel image only
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "slick-current")
            img = elem.find_element(By.TAG_NAME, "img")
            src = img.get_attribute("src")
            if src:
                urls.append(src)
            # ensure we are seeing the top of the page
            html = self.selenium_driver.find_element(By.TAG_NAME, "html")
            html.send_keys(SeleniumKeys.PAGE_UP)
            elems = self.selenium_driver.find_elements(By.CLASS_NAME, "gallery-vert")
            for elem in elems:
                src = elem.get_attribute("src")
                if src:
                    urls.append(src)
        except NoSuchElementException as e:
            logging.warning(f"{namespace}: error {e}")
        return urls

    def add_to_cart(self, qty):
        namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}"

        self.delay(1)
        # TODO: guard this lookup with try/except as in GJScraper.add_to_cart
        stock_elem = self.selenium_driver.find_element(
            By.XPATH, "//span[contains(text(), 'in stock')]"
        )
        m = re.search(r"([0-9]+) in stock", stock_elem.get_attribute("innerHTML"))
        if m:
            stock = m.group(1)
            if int(stock) < int(qty):
                qty = stock
        self.delay(1)
        try:
            # gather html elements needed
            elems = self.selenium_driver.find_elements(By.CLASS_NAME, "ant-btn-primary")
            button = None
            for e in elems:
                if "Add to cart" in e.text:
                    button = e
                    break
            qty_field = self.selenium_driver.find_element(
                By.XPATH,
                (
                    "//input[@class='ant-input' and @type='text' "
                    "and not(ancestor::div[contains(@class, '-block')])]"
                ),
            )
            # the qty field must be clicked to highlight amount. Clearing doesn't work
            qty_field.click()
            qty_field.send_keys(qty)
            # button is None if no "Add to cart" button was found; the click
            # then raises and is caught below
            button.click()
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return 0
        return int(qty)
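
    # The qty-field XPath above matches a bare <input class="ant-input"
    # type="text"> while excluding any input nested under a div whose class
    # contains "-block" (e.g., a hypothetical <div class="search-block">),
    # which would otherwise match first.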

    def load_cart_page(self):
        namespace = f"{type(self).__name__}.{self.load_cart_page.__name__}"
        try:
            cart = "/checkout/cart"
            url = self.base_url + cart
            self.selenium_driver.get(url)
            self.delay(1)
            return True
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return False

    # }}}


class TBScraper(BaseScraper):
    # {{{
    """
    TBScraper objects know how to scrape TB item pages
    """

    def __init__(self, selenium_driver, base_url="https://texasbookman.com/"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 3

416 def load_item_page(self, item_number): 

417 start = "p/" 

418 url = self.base_url + start + item_number 

419 self.selenium_driver.get(url) 

420 return True 

421 

422 def scrape_description(self): 

423 try: 

424 elem = self.selenium_driver.find_element( 

425 By.CLASS_NAME, "variant-description" 

426 ) 

427 text = elem.text 

428 description = text.replace("NO AMAZON SALES\n\n", "") 

429 except NoSuchElementException: 

430 description = "" 

431 

432 return description 

    def scrape_item_image_urls(self):
        urls = []
        try:
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "a-left"))
            )
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "picture-thumbs")
            left = elem.find_element(By.CLASS_NAME, "a-left")
            left.click()
            # walk the slimbox slideshow until the next link can no longer
            # be found or clicked; the exception below ends the loop
            while True:
                self.delay(2)
                thumb = self._get_thumb_from_slimbox()
                if thumb:
                    urls.append(thumb)
                next_link = WebDriverWait(self.selenium_driver, self.timeout).until(
                    EC.presence_of_element_located((By.ID, "lbNextLink"))
                )
                self.delay(2)
                next_link.click()
        except (
            NoSuchElementException,
            ElementNotInteractableException,
            TimeoutException,
        ):
            # fall back to the single product picture when there is no
            # thumbnail slideshow
            try:
                elem = self.selenium_driver.find_element(By.CLASS_NAME, "picture")
                img = elem.find_element(By.TAG_NAME, "img")
                thumb = img.get_attribute("src")
                urls.append(thumb)
            except NoSuchElementException:
                pass

        return urls

    def _get_thumb_from_slimbox(self):
        timeout = 3
        thumb = None
        try:
            img_div = WebDriverWait(self.selenium_driver, timeout).until(
                EC.presence_of_element_located((By.ID, "lbImage"))
            )
            style = img_div.get_attribute("style")
            m = re.search('"(.*)"', style)
            if m:
                thumb = m.group(1)
        except (NoSuchElementException, TimeoutException):
            pass

        return thumb
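
    # The slimbox image arrives as an inline style, e.g. (hypothetical value):
    #   background-image: url("https://texasbookman.com/images/123.jpg");
    # The quoted URL captured by the regex above becomes the thumb.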

    def load_login_page(self):
        login = "login"
        url = self.base_url + login
        self.selenium_driver.get(url)

    def login(self):
        namespace = f"{type(self).__name__}.{self.login.__name__}"

        self.delay(2)
        print("******** USER INPUT REQUIRED ********")
        print("Locate the selenium controlled browser")
        print("and manually enter your login credentials.")
        print("******** WAITING FOR USER INPUT ********")
        # wait up to 90 seconds for user to manually enter credentials
        # Verify by finding "a" with attribute "href"="/admin"
        try:
            WebDriverWait(self.selenium_driver, 90).until(
                EC.presence_of_element_located((By.XPATH, "//a[@href='/admin']"))
            )
            print("******** LOGIN SUCCESSFUL ********")
            print("******** CONTINUING EXECUTION ********")
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to login")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e

    def impersonate(self, email):
        namespace = f"{type(self).__name__}.{self.impersonate.__name__}"

        # Go to /Admin/Customer/List
        customers = "/Admin/Customer/List"
        url = self.base_url + customers
        self.selenium_driver.get(url)
        self.delay(1)
        try:
            # search for email
            search_email = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.ID, "SearchEmail"))
            )
            search_email.clear()
            search_email.send_keys(email + SeleniumKeys.ENTER)
            # Get customer link associated with email
            email_xpath = (
                "//div[@id='customers-grid']/table/tbody/tr/td/a[text()='{email}']"
            ).format(email=email)
            customer_link = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.XPATH, email_xpath))
            )
            links = self.selenium_driver.find_elements(By.XPATH, email_xpath)
            # Bail if there are multiple customer records for the given email.
            if len(links) > 1:
                logging.error(
                    (
                        "{namespace}: Found multiple customer records for email "
                        "'{email}' to impersonate"
                    ).format(namespace=namespace, email=email)
                )
                logging.error(f"{namespace}: Cannot proceed. Exiting.")
                raise Exception(f"multiple customer records for email '{email}'")
            customer_link.click()
            # click "Place Order (impersonate)"
            impersonate = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//a[text()='Place order (Impersonate)']")
                )
            )
            impersonate.click()
            # click "Place Order" button
            button = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//input[@name='impersonate']")
                )
            )
            button.click()
            self.delay(1)
            WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "finish-impersonation"))
            )
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to impersonate")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e
        return True

    def add_to_cart(self, qty):
        namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}"

        qty = int(qty)
        self.delay(1)
        stock_elem = self.selenium_driver.find_element(By.CLASS_NAME, "stock")
        m = re.search(r"Availability: ([0-9]+) in stock", stock_elem.text)
        if m:
            stock = m.group(1)
            stock = int(stock)
            if stock < qty:
                qty = stock
        try:
            # gather html elements needed
            qty_field = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "qty-input"))
            )
            button = self.selenium_driver.find_element(
                By.CLASS_NAME, "add-to-cart-button"
            )
            qty_field.clear()
            # ENTERing out of the qty_field DOES NOT add to cart.
            # The button must be clicked instead.
            qty_field.send_keys(qty)
            button.click()
            self.delay(1)
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return 0
        return qty

    def load_cart_page(self):
        cart = "cart"
        url = self.base_url + cart
        self.selenium_driver.get(url)
        return True

    def search_item_num(self, search):
        namespace = f"{type(self).__name__}.{self.search_item_num.__name__}"

        item_num = ""
        search = urllib.parse.quote_plus(search)
        url = self.base_url + "search?q=" + search
        self.selenium_driver.get(url)
        self.delay(2)
        WebDriverWait(self.selenium_driver, 120).until(
            EC.presence_of_element_located((By.CLASS_NAME, "search-results"))
        )
        links = self.selenium_driver.find_elements(
            By.XPATH, "//div[@class='search-results']//a[contains(@href, '/p/')]"
        )
        if links:
            item_url = links[0].get_attribute("href")
            m = re.search(r"\/p\/([0-9]+)\/", item_url)
            if m:
                item_num = m.group(1)
        else:
            logging.warning(f"{namespace}: Failed to find item using q='{search}'")
        return item_num
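
    # e.g., a first result link like (hypothetical)
    #   https://texasbookman.com/p/90123/example-title
    # yields item_num "90123" via the pattern above.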

    # }}}


class AmznScraper(BaseScraper):
    # {{{
    """
    AmznScraper objects know how to scrape amazon item pages
    """

    def __init__(self, selenium_driver, base_url="https://www.amazon.com/"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 1

641 def load_item_page(self, item_number): 

642 start = "dp/" 

643 url = self.base_url + start + item_number 

644 self.selenium_driver.get(url) 

645 return True 

646 

647 def scrape_description(self): 

648 description = "" 

649 description = self._scrape_amazon_editorial_review() 

650 if not description: 

651 description = self._scrape_amazon_description() 

652 

653 return description 

    def _scrape_amazon_editorial_review(self):
        descr = ""
        try:
            elem = self.selenium_driver.find_element(
                By.ID, "editorialReviews_feature_div"
            )
            text = elem.text
            descr_lines = re.split("^.*\\n.*\\n", text)  # trim off first two lines
            descr = descr_lines[-1]
        except NoSuchElementException:
            descr = ""

        return descr

    def _scrape_amazon_description(self):
        descr = ""
        try:
            elem = self.selenium_driver.find_element(
                By.ID, "bookDescription_feature_div"
            )
            # read_more = elem.find_element(By.CLASS_NAME, 'a-expander-prompt')
            # read_more.click()
            descr = elem.text
        except NoSuchElementException:
            descr = ""

        return descr

    def scrape_item_image_urls(self):
        namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}"
        counter = 0
        urls = []

        # open amazon images widget
        try:
            span = WebDriverWait(self.selenium_driver, self.timeout).until(
                EC.presence_of_element_located((By.ID, "imgThumbs"))
            )
            span_type = "imgThumbs"
        except (NoSuchElementException, TimeoutException):
            logging.info(f"{namespace}: No imgThumbs id, trying imgTagWrapperId")
            try:
                span = WebDriverWait(self.selenium_driver, self.timeout).until(
                    EC.presence_of_element_located((By.ID, "imgTagWrapperId"))
                )
                span_type = "imgTagWrapperId"
            except (NoSuchElementException, TimeoutException):
                logging.info(f"{namespace}: No imgTagWrapperId id")
                logging.info(f"{namespace}: Returning empty urls list")
                return urls

        if span_type == "imgThumbs":
            link = span.find_element(By.CLASS_NAME, "a-link-normal")
            thumb_id_prefix = "ig-thumb-"
        else:
            link = span
            thumb_id_prefix = "ivImage_"
        try:
            link.click()
        except ElementClickInterceptedException:
            logging.info(f"{namespace}: Failed to click images widget")
            logging.info(f"{namespace}: Returning empty urls list")
            return urls

        logging.debug(f"{namespace}: Clicked images widget")
        # get image urls
        while True:
            try:
                thumb = ""
                xpath = f"//*[@id='{thumb_id_prefix}{counter}']"
                elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                    EC.presence_of_element_located((By.XPATH, xpath))
                )
                if span_type == "imgThumbs":
                    thumb = elem.get_attribute("src")
                if span_type == "imgTagWrapperId":
                    inner_elem = elem.find_element(By.CLASS_NAME, "ivThumbImage")
                    style = inner_elem.get_attribute("style")
                    m = re.search('"(.*)"', style)
                    if m:
                        thumb = m.group(1)
                # strip the size suffix from the thumbnail name to get the
                # full-size image URL; keep the thumb as-is if no suffix
                sub, suff = os.path.splitext(thumb)
                indx = sub.find("._")
                if indx != -1:
                    url = sub[:indx] + suff
                else:
                    url = thumb
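                # e.g., a thumb like (hypothetical)
                #   https://m.media-amazon.com/images/I/51xyz._SX38_SY50_.jpg
                # becomes the full-size
                #   https://m.media-amazon.com/images/I/51xyz.jpg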

                if url:
                    urls.append(url)
                logging.debug(f"{namespace}: Thumbnail src is {thumb}")
                logging.debug(f"{namespace}: Full size URL is {url!r}")
                counter += 1
            except (NoSuchElementException, TimeoutException):
                break

        # amazon appends an image of a human holding the book;
        # drop that final image
        if len(urls) > 1:
            urls.pop()

        return urls

    # }}}
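
# Amazon /dp/ URLs key off ISBN-10; main() below passes item.isbn10 to
# load_item_page. A sketch:
#
#     scraper = AmznScraper(driver)
#     scraper.load_item_page("0141439513")  # hypothetical ISBN-10
#     description = scraper.scrape_description()
#     urls = scraper.scrape_item_image_urls()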


##############################################################################
# utility functions
##############################################################################
def get_driver(headless: bool = False):
    """Creates a new instance of the chrome driver.

    :param headless:
        Whether to configure Chrome driver to be headless.
    :returns: selenium.webdriver object
    """
    namespace = f"{MODULE}.{get_driver.__name__}"
    service = ChromeService()
    options = webdriver.ChromeOptions()
    logging.debug(f"{namespace}: Received '{headless}' value for headless.")
    if headless is True:
        options.add_argument("--headless=new")
        logging.info(f"{namespace}: Setting webdriver option to 'HEADLESS'.")
    driver = webdriver.Chrome(service=service, options=options)
    return driver
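
# e.g., driver = get_driver(headless=True). The default (headless=False)
# keeps the browser visible so a user can complete the manual logins that
# the scrapers above require.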


def main(vendor_code, sheet_id, worksheet, scraped_items_db):
    # {{{
    namespace = f"{MODULE}.{main.__name__}"
    # get vendor info from database
    logging.debug(f"{namespace}: Instantiate vendor.")
    vendr = vendor.Vendor(vendor_code)
    vendr.set_vendor_data()

    sheet_data = spreadsheet.get_sheet_data(sheet_id, worksheet)

    sheet_keys = sheet_data.pop(0)
    items_obj = Items(sheet_keys, sheet_data, vendr.isbn_key)
    items_obj.load_scraped_data(scraped_items_db)
    driver = None
    scrapr = None
    tbscrapr = None
    sdscrapr = None
    gjscrapr = None
    for item in items_obj:
        if not item.isbn:
            if "TBCODE" in item.data:
                item.isbn = item.data["TBCODE"]
            if not item.isbn:
                logging.info(f"{namespace}: No isbn for item, skipping lookup")
                continue
        description = ""
        image_urls = []
        # skip items whose scraped image_urls are already populated
        logging.info(f"{namespace}: Searching for {item.isbn} ...")
        if item.image_urls:
            logging.info(f"{namespace}: {item.isbn} found in database, skipping")
            continue

        if not driver and not scrapr:
            logging.info(f"{namespace}: Opening browser...")
            driver = get_driver(CFG["asg"]["scraper"]["headless"])
            scrapr = AmznScraper(driver)
            if vendr.vendor_code == "tb":
                tbscrapr = TBScraper(driver)
            if vendr.vendor_code == "sd":
                sdscrapr = SDScraper(driver)
            if vendr.vendor_code == "gj":
                gjscrapr = GJScraper(driver)

        logging.info(f"{namespace}: No scraped data currently: {item.isbn}")
        scrapr.load_item_page(item.isbn10)
        logging.info(f"{namespace}: Getting item description")
        description = scrapr.scrape_description()
        logging.info(" Description: %r" % description[:140])
        item.data["DESCRIPTION"] = description
        logging.info(f"{namespace}: Getting item image urls")
        image_urls = scrapr.scrape_item_image_urls()
        logging.info(" URLs: %r" % image_urls)
        if tbscrapr and len(image_urls) < 2:
            logging.info(f"{namespace}: Getting item image urls via TBScraper")
            try:
                url = item.data["LINK"]
                m = re.search(r"\/([0-9]+)\/", url)
                if m:
                    web_item = m.group(1)
                    tbscrapr.load_item_page(web_item)
            except KeyError:
                logging.info(f"{namespace}: No link found in item")

            tb_image_urls = tbscrapr.scrape_item_image_urls()
            # If images came from the primary scraper, add the secondary
            # images, minus the first secondary image.
            if image_urls and len(tb_image_urls) > 1:
                tb_image_urls.pop(0)
                image_urls = image_urls + tb_image_urls
            logging.info(" URLs: %r" % image_urls)
            if image_urls and not description:
                logging.info(f"{namespace}: Getting description via TBScraper")
                description = tbscrapr.scrape_description()
                logging.info(" Description: %r" % description[:140])
                item.data["DESCRIPTION"] = description
        if sdscrapr and len(image_urls) < 2:
            logging.info(f"{namespace}: Getting item image urls via SDScraper")
            sdscrapr.load_item_page(item.isbn)
            sd_image_urls = sdscrapr.scrape_item_image_urls()
            if image_urls and len(sd_image_urls) > 0:
                sd_image_urls.pop(0)
                image_urls = image_urls + sd_image_urls
            logging.info(" URLs: %r" % image_urls)
            if image_urls and not description:
                logging.info(f"{namespace}: Getting description via SDScraper")
                description = sdscrapr.scrape_description()
                logging.info(" Description: %r" % description[:140])
                item.data["DESCRIPTION"] = description
        if gjscrapr and len(image_urls) < 2:
            logging.info(f"{namespace}: Getting item image urls via GJScraper")
            gjscrapr.load_item_page(item.isbn)
            gj_image_urls = gjscrapr.scrape_item_image_urls()
            if image_urls and len(gj_image_urls) > 0:
                gj_image_urls.pop(0)
                image_urls = image_urls + gj_image_urls
            logging.info(" URLs: %r" % image_urls)
            if image_urls and not description:
                logging.info(f"{namespace}: Getting description via GJScraper")
                description = gjscrapr.scrape_description()
                logging.info(" Description: %r" % description[:140])
                item.data["DESCRIPTION"] = description

        item.image_urls = image_urls

    logging.info(f"{namespace}: Saving scraped item data")
    items_obj.save_scraped_data(scraped_items_db)
    if driver:
        logging.info(f"{namespace}: Closing browser...")
        driver.quit()
    # }}}
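
# A sketch of invoking main() directly, with hypothetical arguments:
#
#     if __name__ == "__main__":
#         main(
#             vendor_code="gj",
#             sheet_id="1AbC-example",       # hypothetical spreadsheet id
#             worksheet="Sheet1",            # hypothetical worksheet name
#             scraped_items_db="scraped_items.json",  # hypothetical db path
#         )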