Source code for scitex_scholar.auth.core.AuthenticationGateway

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-10-10 03:24:07 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/auth/AuthenticationGateway.py
# ----------------------------------------
from __future__ import annotations

import os

# Repository-relative path of this module, hard-coded as a string literal.
__FILE__ = "./src/scitex/scholar/auth/core/AuthenticationGateway.py"
# Directory part of the literal above (i.e. not derived from the runtime __file__).
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------

# Rebind __FILE__ to the interpreter-provided path of this module.
# NOTE(review): __DIR__ is not recomputed after this rebinding, so it keeps the
# value derived from the literal path - confirm this is intended.
__FILE__ = __file__

"""
Authentication Gateway Pattern for Scholar Module

Provides transparent authentication layer that:
- Determines if URL requires authentication (config-based)
- Prepares authenticated browser context before URL finding
- Visits authentication gateways (OpenURL) to establish sessions
- Caches authentication state to avoid redundant operations

This keeps URL finders and PDF downloaders free of authentication logic.
"""

from dataclasses import dataclass, field
from typing import Dict, List, Optional

import scitex_logging as logging
from playwright.async_api import BrowserContext

from scitex_scholar.config import ScholarConfig

logger = logging.getLogger(__name__)


@dataclass
class URLContext:
    """
    Carrier object for a single paper's URL and authentication state.

    One instance travels through URL resolution and PDF download so that
    every step sees the same DOI, resolved URLs and authentication details.

    Attributes:
        doi: DOI of the paper (the only required field).
        title: Paper title, when known.
        url: Publisher landing page URL.
        pdf_urls: Candidate PDF URLs; each instance gets its own empty list.
        requires_auth: Whether authentication is needed; ``None`` until decided.
        auth_provider: Provider name (openathens, ezproxy, shibboleth).
        auth_gateway_url: OpenURL used for establishing a session.
    """

    doi: str
    title: Optional[str] = None
    url: Optional[str] = None
    pdf_urls: List[str] = field(default_factory=list)
    requires_auth: Optional[bool] = None
    auth_provider: Optional[str] = None
    auth_gateway_url: Optional[str] = None
class AuthenticationGateway:
    """
    Transparent authentication layer for Scholar operations.

    Responsibilities:
    - Determine if URL requires authentication (config-based, no hardcoding)
    - Prepare authenticated browser context
    - Visit authentication gateways (OpenURL) to establish publisher sessions
    - Cache authentication state for performance

    This gateway sits between Scholar and URL/Download operations,
    preparing authentication transparently before content access.
    """

    @property
    def name(self):
        """Class name; used as the prefix of every log message."""
        return self.__class__.__name__

    def __init__(
        self,
        auth_manager,  # ScholarAuthManager
        browser_manager,  # ScholarBrowserManager
        config: Optional[ScholarConfig] = None,
    ):
        """
        Initialize authentication gateway.

        Args:
            auth_manager: ScholarAuthManager instance
            browser_manager: ScholarBrowserManager instance
            config: ScholarConfig instance; a default ScholarConfig() is
                created when omitted (or falsy).
        """
        self.auth_manager = auth_manager
        self.browser_manager = browser_manager
        self.config = config or ScholarConfig()
        # Maps "<doi>" -> True once authentication succeeded and
        # "<doi>_url" -> resolved publisher URL.
        # NOTE(review): the cache is keyed by DOI only, not by browser
        # context - confirm a gateway is never reused across contexts.
        self._auth_cache: Dict[str, "bool | str"] = {}

    async def prepare_context_async(
        self, doi: str, context: BrowserContext, title: Optional[str] = None
    ) -> URLContext:
        """
        Prepare URL context with authentication if needed.

        This is the main entry point - called BEFORE URL finding.

        Flow:
        1. Build OpenURL (authentication gateway)
        2. Check if DOI needs authentication (based on configured publishers)
        3. If auth needed: visit OpenURL to establish publisher cookies
        4. Otherwise: resolve the publisher URL by direct DOI navigation,
           falling back to the OpenURL resolver when that raises
        5. Return the prepared context; ``url`` falls back to the OpenURL
           whenever no publisher URL could be resolved

        Args:
            doi: Paper DOI
            context: Browser context (will be updated with auth cookies)
            title: Optional paper title

        Returns:
            URLContext with authentication prepared and ready
        """
        url_context = URLContext(doi=doi, title=title)

        # Step 1: Build OpenURL
        from scitex_scholar.auth.gateway._OpenURLResolver import OpenURLResolver

        resolver = OpenURLResolver(config=self.config)
        # No public builder exists, so the private _build_query is used.
        openurl = resolver._build_query(url_context.doi)
        url_context.auth_gateway_url = openurl

        # Step 2: Decide from the DOI prefix whether authentication is needed
        url_context = self._check_auth_requirements_from_doi(url_context)

        # Step 3: Authenticated path - visiting the OpenURL both establishes
        # cookies and resolves the publisher URL as a side effect.
        if url_context.requires_auth:
            publisher_url = await self._establish_authentication_async(
                url_context, context
            )
            url_context.url = publisher_url or openurl
            return url_context

        # Step 4: Open-access path - try direct DOI navigation first
        from scitex_scholar.auth.gateway._resolve_functions import (
            resolve_publisher_url_by_navigating_to_doi_page,
        )

        page = await context.new_page()
        try:
            publisher_url = None
            try:
                publisher_url = await resolve_publisher_url_by_navigating_to_doi_page(
                    url_context.doi, page
                )
                logger.debug(
                    f"{self.name}: Resolved {url_context.doi} -> {publisher_url}"
                )
            except Exception as e:
                # Fallback to OpenURL resolver if direct navigation fails
                logger.debug(
                    f"{self.name}: Direct navigation failed, trying OpenURL: {e}"
                )
                try:
                    publisher_url = await resolver.resolve_doi(url_context.doi, page)
                except Exception as openurl_error:
                    logger.warning(
                        f"{self.name}: Both methods failed for {url_context.doi}: {openurl_error}"
                    )
            # Last resort: an empty result (or a double failure) leaves the
            # OpenURL as the URL, matching the authenticated branch above.
            url_context.url = publisher_url or openurl
        finally:
            await page.close()

        return url_context

    async def _resolve_publisher_url_async(
        self, url_context: URLContext, context: BrowserContext
    ) -> URLContext:
        """
        Resolve DOI to publisher landing page URL via OpenURLResolver.

        The OpenURL is the authentication gateway for paywalled content.

        Args:
            url_context: URLContext with DOI
            context: Browser context

        Returns:
            URLContext with url and auth_gateway_url populated; ``url`` is the
            OpenURL itself when resolution fails or yields nothing.
        """
        from scitex_scholar.auth.gateway._OpenURLResolver import OpenURLResolver

        resolver = OpenURLResolver(config=self.config)

        # Build OpenURL (this is the authentication gateway).
        # Use the private _build_query method since no public method exists.
        openurl = resolver._build_query(url_context.doi)
        url_context.auth_gateway_url = openurl

        # Resolve to publisher URL (may redirect through OpenAthens)
        page = await context.new_page()
        try:
            publisher_url = await resolver.resolve_doi(url_context.doi, page)
            url_context.url = publisher_url or openurl
            logger.debug(
                f"{self.name}: Resolved {url_context.doi} -> {publisher_url}"
            )
        except Exception as e:
            logger.warning(f"{self.name}: Failed to resolve DOI {url_context.doi}: {e}")
            url_context.url = openurl  # Fallback to OpenURL
        finally:
            await page.close()

        return url_context

    def _paywalled_publishers(self) -> List[dict]:
        """
        Return the ``paywalled_publishers`` config entries that are dicts.

        A non-list config value yields an empty list, and non-dict entries
        are dropped so that callers can safely use ``.get`` on every item.
        """
        publishers = self.config.resolve("paywalled_publishers", None, default=[])
        if not isinstance(publishers, list):
            return []
        return [entry for entry in publishers if isinstance(entry, dict)]

    def _check_auth_requirements_from_doi(self, url_context: URLContext) -> URLContext:
        """
        Determine if DOI requires authentication based on DOI prefix patterns.

        This allows early detection before resolving URL. The DOI is compared
        against each publisher's ``doi_prefixes`` from config, ignoring case
        and surrounding whitespace; the first match wins.

        Args:
            url_context: URLContext with doi populated

        Returns:
            The same URLContext with requires_auth set (and auth_provider set
            when a publisher matched).
        """
        doi = (url_context.doi or "").strip().lower()

        for publisher_config in self._paywalled_publishers():
            # "or []" tolerates an explicit null in the config.
            for prefix in publisher_config.get("doi_prefixes") or []:
                if not isinstance(prefix, str):
                    continue  # malformed entry; cannot be a DOI prefix
                if doi.startswith(prefix.strip().lower()):
                    url_context.requires_auth = True
                    url_context.auth_provider = publisher_config.get(
                        "preferred_provider", "openathens"
                    )
                    logger.info(
                        f"{self.name}: Authentication required for {publisher_config.get('name')} "
                        f"(DOI prefix: {prefix}, provider: {url_context.auth_provider})"
                    )
                    return url_context

        # No configured DOI prefix matched: treat as not requiring
        # authentication. (No URL-based check is performed here; see
        # _check_auth_requirements for that.)
        url_context.requires_auth = False
        return url_context

    def _check_auth_requirements(self, url_context: URLContext) -> URLContext:
        """
        Determine if URL requires authentication based on config.

        This is config-based (no hardcoded domain lists): the lowercased URL
        is searched for each publisher's ``domain_patterns``; the first match
        wins.

        Args:
            url_context: URLContext with url populated

        Returns:
            The same URLContext with requires_auth set (and auth_provider set
            when a publisher matched).
        """
        url_lower = (url_context.url or "").lower()

        for publisher_config in self._paywalled_publishers():
            # "or []" tolerates an explicit null in the config.
            for pattern in publisher_config.get("domain_patterns") or []:
                if not isinstance(pattern, str):
                    continue  # malformed entry; cannot be a domain pattern
                if pattern.lower() in url_lower:
                    url_context.requires_auth = True
                    url_context.auth_provider = publisher_config.get(
                        "preferred_provider", "openathens"
                    )
                    logger.info(
                        f"{self.name}: Authentication required for {publisher_config.get('name')} "
                        f"(provider: {url_context.auth_provider})"
                    )
                    return url_context

        # No authentication required
        url_context.requires_auth = False
        return url_context

    async def _establish_authentication_async(
        self, url_context: URLContext, context: BrowserContext
    ) -> Optional[str]:
        """
        Establish authentication by visiting gateway URL and clicking through to publisher.

        This is the KEY OPERATION that solves the IEEE issue:
        1. Visit OpenURL (library resolver)
        2. Find publisher link on resolver page
        3. Click link -> redirects through OpenAthens -> lands at publisher
        4. Publisher session cookies established in browser context

        Without this step:
        - OpenAthens cookies exist at openathens.net
        - NO cookies exist at ieee.org
        - Chrome PDF viewer opens but download fails

        With this step:
        - Visit OpenURL
        - Click IEEE link -> redirect through OpenAthens
        - Land at ieee.org -> IEEE session cookies established
        - Now ieee.org has cookies, Chrome PDF viewer works

        Successful results are cached per DOI, so a repeated call returns the
        cached publisher URL without opening a page. Failures are never
        raised: they are logged and reported as ``None``.

        Args:
            url_context: URLContext with auth_gateway_url and doi
            context: Browser context (will receive publisher cookies)

        Returns:
            Publisher URL if successful, None otherwise
        """
        gateway_url = url_context.auth_gateway_url

        if not gateway_url:
            logger.warning(f"{self.name}: No gateway URL available for authentication")
            return None

        # Check cache - avoid redundant visits
        cache_key = f"{url_context.doi}"
        if cache_key in self._auth_cache:
            logger.debug(
                f"{self.name}: Authentication already established for {url_context.doi}"
            )
            # Return cached URL if available
            cached = self._auth_cache.get(f"{cache_key}_url")
            return cached if isinstance(cached, str) else None

        logger.info(
            f"{self.name}: Establishing auth via OpenURL",
        )

        # Visit OpenURL and click through to publisher.
        # This uses the existing OpenURLResolver flow.
        from scitex_browser import browser_logger

        from scitex_scholar.auth.gateway._OpenURLResolver import OpenURLResolver

        resolver = OpenURLResolver(config=self.config)

        page = await context.new_page()
        try:
            publisher_url = await resolver.resolve_doi(url_context.doi, page)

            if publisher_url:
                logger.info(f"{self.name}: Auth established")
                await browser_logger.info(
                    page,
                    f"{self.name}: ✓ Session established at {publisher_url[:60]}",
                )
                await page.wait_for_timeout(2000)

                # Cache successful authentication
                self._auth_cache[cache_key] = True
                self._auth_cache[f"{cache_key}_url"] = publisher_url
                return publisher_url

            logger.warning(f"{self.name}: OpenURL resolution failed")
            await browser_logger.info(
                page, f"{self.name}: ✗ Could not resolve to publisher URL"
            )
            await page.wait_for_timeout(2000)
            return None

        except Exception as e:
            logger.warning(f"{self.name}: Auth setup failed: {e}")
            # Showing the error in the page is best-effort only.
            try:
                await browser_logger.info(
                    page, f"{self.name}: ✗ EXCEPTION: {str(e)[:80]}"
                )
                await page.wait_for_timeout(2000)
            except Exception as ui_exc:
                logger.debug(
                    f"{self.name}: in-page error banner failed "
                    f"({type(ui_exc).__name__}: {ui_exc})"
                )
            # Don't raise - allow downstream to try anyway
            return None
        finally:
            await page.close()
async def main_async():
    """
    Demonstration of AuthenticationGateway usage.

    Shows how to:
    1. Initialize authentication components
    2. Prepare authenticated browser context
    3. Use the context for subsequent operations

    The browser context and the browser are both closed on exit, even when
    preparing a DOI or closing the context raises.
    """
    from scitex_scholar.auth.ScholarAuthManager import ScholarAuthManager
    from scitex_scholar.browser.ScholarBrowserManager import ScholarBrowserManager
    from scitex_scholar.config import ScholarConfig

    # Initialize components
    config = ScholarConfig()
    auth_manager = ScholarAuthManager(config=config)
    browser_manager = ScholarBrowserManager(auth_manager=auth_manager, config=config)

    # Initialize gateway
    gateway = AuthenticationGateway(
        auth_manager=auth_manager,
        browser_manager=browser_manager,
        config=config,
    )

    # Example DOIs - two paywalled publishers and one open-access journal
    test_dois = [
        "10.1109/JBHI.2024.1234567",  # IEEE (paywalled)
        "10.1088/1741-2552/aaf92e",  # IOP Publishing (paywalled)
        "10.1038/s41467-020-12345-6",  # Nature Communications (open access)
    ]

    # Get authenticated browser context
    (
        browser,
        context,
    ) = await browser_manager.get_authenticated_browser_and_context_async()

    try:
        for doi in test_dois:
            logger.info(f"\n{'=' * 60}")
            logger.info(f"Testing DOI: {doi}")
            logger.info(f"{'=' * 60}")

            # Prepare authentication (this is the key operation)
            url_context = await gateway.prepare_context_async(doi=doi, context=context)

            # Show results
            logger.info(f"Publisher URL: {url_context.url}")
            logger.info(f"Requires auth: {url_context.requires_auth}")
            logger.info(f"Auth provider: {url_context.auth_provider}")
            logger.info(f"Gateway URL: {url_context.auth_gateway_url}")

            # At this point, the browser context has publisher cookies
            # You can now use it for URL finding or PDF download

    finally:
        # Nested so the browser is still closed if closing the context fails.
        try:
            await context.close()
        finally:
            await browser.close()


if __name__ == "__main__":
    import asyncio

    asyncio.run(main_async())

# EOF