Source code for scitex_scholar.pdf_download.ScholarPDFDownloader

#!/usr/bin/env python3
# Timestamp: "2026-01-22 (ywatanabe)"
# File: src/scitex/scholar/pdf_download/ScholarPDFDownloader.py
"""PDF downloader with multiple fallback strategies."""

from __future__ import annotations

import asyncio
import os
import traceback
from pathlib import Path
from typing import Any, List, Optional, Union

import scitex_logging as logging
from playwright.async_api import BrowserContext

from scitex_scholar.config import ScholarConfig
from scitex_scholar.pdf_download.strategies import (
    FlexibleFilenameGenerator,
    handle_manual_download_on_page_async,
    try_download_chrome_pdf_viewer_async,
    try_download_direct_async,
    try_download_open_access_async,
    try_download_response_body_async,
)

logger = logging.getLogger(__name__)


[docs] class ScholarPDFDownloader: """Download PDFs from URLs with multiple fallback strategies. Strategies tried in order: - Chrome PDF Viewer - Direct Download (ERR_ABORTED) - Response Body Extraction - Manual Download Fallback URL resolution (DOI -> URL) should be handled by the caller. """
[docs] def __init__(self, context: BrowserContext, config: ScholarConfig = None): self.name = self.__class__.__name__ self.config = config if config else ScholarConfig() self.context = context self.output_dir = self.config.get_library_downloads_dir() self.prefer_open_access = self.config.resolve( "prefer_open_access", default=True, type=bool ) self.enable_paywall_access = self.config.resolve( "enable_paywall_access", default=False, type=bool ) self.track_paywall_attempts = self.config.resolve( "track_paywall_attempts", default=True, type=bool )
async def __aexit__(self, exc_type, exc_val, exc_tb): pass
[docs] async def download_from_urls( self, pdf_urls: List[str], output_dir: Union[str, Path] = None, max_concurrent: int = 3, ) -> List[Path]: """Download multiple PDFs with parallel processing.""" output_dir = Path(output_dir or self.output_dir) if not pdf_urls: return [] output_paths = [ output_dir / f"{ii_pdf:03d}_{os.path.basename(pdf_url)}" for ii_pdf, pdf_url in enumerate(pdf_urls) ] semaphore = asyncio.Semaphore(max_concurrent) async def download_with_semaphore(url: str, path: Path, index: int): async with semaphore: logger.info( f"{self.name}: Downloading PDF {index}/{len(pdf_urls)}: {url}" ) result = await self.download_from_url(url, path) if result: logger.info(f"{self.name}: Downloaded to {result}") return result tasks = [ download_with_semaphore(url, path, idx + 1) for idx, (url, path) in enumerate(zip(pdf_urls, output_paths)) ] results = await asyncio.gather(*tasks, return_exceptions=True) saved_paths = [] for result in results: if isinstance(result, Exception): logger.debug(f"{self.name}: Download error: {result}") elif result: saved_paths.append(result) logger.info(f"{self.name}: Downloaded {len(saved_paths)}/{len(pdf_urls)} PDFs") return saved_paths
[docs] async def download_open_access( self, oa_url: str, output_path: Union[str, Path], metadata: Optional[dict] = None, ) -> Optional[Path]: """Download PDF from an Open Access URL.""" if not oa_url: logger.debug(f"{self.name}: No OA URL provided") return None if isinstance(output_path, str): output_path = Path(output_path) if not str(output_path).endswith(".pdf"): output_path = Path(str(output_path) + ".pdf") output_path.parent.mkdir(parents=True, exist_ok=True) logger.info(f"{self.name}: Attempting OA download from {oa_url[:60]}...") result = await try_download_open_access_async( oa_url=oa_url, output_path=output_path, metadata=metadata, func_name=self.name, ) if result: logger.info(f"{self.name}: Successfully downloaded OA PDF to {result}") else: logger.debug(f"{self.name}: OA download failed") return result
[docs] async def download_smart( self, paper, output_path: Union[str, Path] ) -> Optional[Path]: """Smart download choosing best strategy based on paper metadata.""" if isinstance(output_path, str): output_path = Path(output_path) if not str(output_path).endswith(".pdf"): output_path = Path(str(output_path) + ".pdf") meta = paper.metadata if hasattr(paper, "metadata") else paper access = getattr(meta, "access", None) url_meta = getattr(meta, "url", None) id_meta = getattr(meta, "id", None) is_open_access = getattr(access, "is_open_access", False) if access else False oa_url = getattr(access, "oa_url", None) if access else None pdf_urls = getattr(url_meta, "pdfs", []) if url_meta else [] doi = getattr(id_meta, "doi", None) if id_meta else None logger.info(f"{self.name}: Smart download for DOI={doi}, OA={is_open_access}") # Strategy 1: Try Open Access if available if self.prefer_open_access and oa_url: logger.info(f"{self.name}: Trying Open Access URL first") result = await self.download_open_access(oa_url, output_path) if result: if access and self.track_paywall_attempts: access.paywall_bypass_attempted = False return result # Strategy 2: Try available PDF URLs for pdf_entry in pdf_urls: pdf_url = pdf_entry.get("url") if isinstance(pdf_entry, dict) else pdf_entry if pdf_url: logger.info(f"{self.name}: Trying PDF URL: {pdf_url[:60]}...") result = await self.download_from_url(pdf_url, output_path, doi=doi) if result: return result # Strategy 3: Try paywall access if enabled if self.enable_paywall_access and not is_open_access: logger.info(f"{self.name}: Attempting paywall access (opt-in enabled)") if access and self.track_paywall_attempts: access.paywall_bypass_attempted = True if doi: doi_url = f"https://doi.org/{doi}" result = await self.download_from_url(doi_url, output_path, doi=doi) if result: if access and self.track_paywall_attempts: access.paywall_bypass_success = True return result elif access and self.track_paywall_attempts: access.paywall_bypass_success = False logger.warning(f"{self.name}: All download strategies exhausted for DOI={doi}") return None
[docs] async def download_from_url( self, pdf_url: str, output_path: Union[str, Path], doi: Optional[str] = None, ) -> Optional[Path]: """Main download method with manual override support.""" if not pdf_url: logger.warning(f"{self.name}: PDF URL passed but not valid: {pdf_url}") return None if isinstance(output_path, str): output_path = Path(output_path) if not str(output_path).endswith(".pdf"): output_path = Path(str(output_path) + ".pdf") output_path.parent.mkdir(parents=True, exist_ok=True) target_filename = FlexibleFilenameGenerator.generate_filename( doi=doi, url=pdf_url, content_type="main" ) stop_event = asyncio.Event() self.context._scitex_is_manual_mode = False self.context._scitex_manual_mode_event = stop_event from scitex_scholar.pdf_download.strategies.manual_download_utils import ( get_manual_button_init_script, ) button_script = get_manual_button_init_script(target_filename) await self.context.add_init_script(button_script) logger.info(f"{self.name}: Manual mode button injected into browser context") button_task = None pdf_page: Any = None async def chrome_pdf_wrapper(url, path): return await try_download_chrome_pdf_viewer_async( self.context, url, path, self.name ) async def direct_download_wrapper(url, path): return await try_download_direct_async(self.context, url, path, self.name) async def response_body_wrapper(url, path): return await try_download_response_body_async( self.context, url, path, self.name ) async def manual_fallback_wrapper(url, path): return None try_download_methods = [ ("Chrome PDF", chrome_pdf_wrapper), ("Direct Download", direct_download_wrapper), ("From Response Body", response_body_wrapper), ("Manual Download", manual_fallback_wrapper), ] for method_name, method_func in try_download_methods: if stop_event.is_set(): logger.info(f"{self.name}: Manual mode - stopping automation") break logger.info(f"{self.name}: Trying method: {method_name}") try: if stop_event.is_set(): logger.info(f"{self.name}: Manual mode, skipping {method_name}") break is_downloaded = await method_func(pdf_url, output_path) if stop_event.is_set(): logger.info(f"{self.name}: Manual mode during {method_name}") break if is_downloaded: if button_task: button_task.cancel() if pdf_page: await pdf_page.close() logger.info(f"{self.name}: Downloaded via {method_name}") return is_downloaded else: logger.debug(f"{self.name}: {method_name} returned None") except Exception as e: logger.warning(f"{self.name}: {method_name} raised exception: {e}") logger.debug(f"{self.name}: Traceback: {traceback.format_exc()}") # Handle manual download if user chose it if stop_event.is_set(): self.context._scitex_is_manual_mode = True logger.info(f"{self.name}: User chose manual download - starting") if button_task: button_task.cancel() if not pdf_page: pdf_page = await self.context.new_page() await pdf_page.goto( pdf_url, timeout=30000, wait_until="domcontentloaded" ) result = await handle_manual_download_on_page_async( pdf_page, pdf_url, output_path, func_name=self.name, config=self.config, doi=doi, ) await pdf_page.close() return result # All methods failed if button_task: button_task.cancel() if pdf_page: await pdf_page.close() logger.fail(f"{self.name}: All download methods failed for {pdf_url}") return None
# CLI entry point moved to _cli.py if __name__ == "__main__": from scitex_scholar.pdf_download._cli import run_main run_main() # EOF