Source code for scitex_browser.core.ChromeProfileManager

#!/usr/bin/env python3
# Timestamp: "2025-10-11 07:53:19 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/browser/core/ChromeProfileManager.py
# ----------------------------------------
from __future__ import annotations

import os

# Template header: a hard-coded, repo-relative path for this file and the
# directory derived from it.
__FILE__ = "./src/scitex/browser/core/ChromeProfileManager.py"
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------

# Rebind __FILE__ to the path Python actually loaded this module from.
# NOTE(review): __DIR__ is not recomputed here, so it still reflects the
# hard-coded relative path above -- confirm that is intended.
__FILE__ = __file__

import subprocess
import time
from pathlib import Path
from typing import Dict, Optional

import scitex_logging as logging
from scitex_config._ecosystem import local_state

# Module-level logger (from ``scitex_logging``), named after this module.
logger = logging.getLogger(__name__)

# Base directory that holds one subdirectory per Chrome profile; used when a
# caller passes neither ``chrome_cache_dir`` nor ``config``.
# The SCITEX_BROWSER_CHROME_CACHE_DIR environment variable wins when set;
# otherwise ``local_state.runtime_path("browser", "chrome")`` is used.
# Resolved once, at import time. The fallback expression is evaluated eagerly,
# so ``local_state.runtime_path`` is called even when the variable is set.
_DEFAULT_CHROME_CACHE = Path(
    os.environ.get(
        "SCITEX_BROWSER_CHROME_CACHE_DIR",
        str(local_state.runtime_path("browser", "chrome")),
    )
)


class ChromeProfileManager:
    """Manages Chrome profile especially extensions for automated literature search.

    A profile is a directory ``<chrome_cache_dir>/<profile_name>`` that Chrome
    uses as its ``--user-data-dir``. The manager can report which of the
    extensions listed in ``EXTENSIONS`` are unpacked inside that profile, build
    the Chrome command-line flags to load them, open Chrome so a user can
    install them by hand, and copy one profile onto another with ``rsync``.
    """

    # Extensions this manager tracks, keyed by a short local alias.
    # ``id`` is the directory name looked up under
    # ``<profile>/Default/Extensions`` and the id appended to the web-store
    # detail URL; ``name`` is only used in log messages.
    EXTENSIONS = {
        "zotero_connector": {
            "id": "ekhagklcjbdpajgpjgmbionohlpdbjgc",
            "name": "Zotero Connector",
        },
        "lean_library": {
            "id": "hghakoefmnkhamdhenpbogkeopjlkpoa",
            "name": "Lean Library",
        },
        "popup_blocker": {
            "id": "bkkbcggnhapdmkeljlodobbkopceiche",
            "name": "Pop-up Blocker",
        },
        "accept_cookies": {
            "id": "ofpnikijgfhlmmjlpkfaifhhdonchhoi",
            "name": "Accept all cookies",
        },
        "2captcha_solver": {
            "id": "ifibfemgeogfhoebkmokieepdoobkbpo",
            "name": "2Captcha Solver",
        },
        "captcha_solver": {
            "id": "hlifkpholllijblknnmbfagnkjneagid",
            "name": "CAPTCHA Solver",
        },
    }

    # Conventional profile names. Informational only: ``__init__`` does not
    # validate ``profile_name`` against this list.
    AVAILABLE_PROFILE_NAMES = ["system", "extension", "auth", "stealth"]

    def __init__(
        self,
        profile_name: str,
        chrome_cache_dir: Optional[Path] = None,
        config: Optional[object] = None,
    ):
        """Manage a Chrome profile for browser automation.

        Parameters
        ----------
        profile_name
            Subdirectory under ``chrome_cache_dir`` to use as the profile.
        chrome_cache_dir
            Base directory that holds profile subdirectories. Defaults to
            ``$SCITEX_BROWSER_CHROME_CACHE_DIR`` or
            ``$SCITEX_DIR/browser/runtime/chrome``
            (``~/.scitex/browser/runtime/chrome`` by default).
        config
            Deprecated. Back-compat shim: any object exposing
            ``get_cache_chrome_dir(profile_name) -> Path`` is accepted so
            callers passing ``ScholarConfig`` still work. Prefer
            ``chrome_cache_dir``.
        """
        self.name = self.__class__.__name__
        self.profile_name = profile_name

        if chrome_cache_dir is not None:
            # Explicit base directory takes precedence over everything else.
            self._chrome_cache_dir = Path(chrome_cache_dir)
            self.profile_dir = self._chrome_cache_dir / profile_name
            self.profile_dir.mkdir(parents=True, exist_ok=True)
        elif config is not None and hasattr(config, "get_cache_chrome_dir"):
            # Legacy path: the config object decides where the profile lives.
            # The directory is not created here (presumably the config object
            # does that -- verify against the caller's config class).
            self.profile_dir = config.get_cache_chrome_dir(profile_name)
            self._chrome_cache_dir = self.profile_dir.parent
        else:
            self._chrome_cache_dir = _DEFAULT_CHROME_CACHE
            self.profile_dir = self._chrome_cache_dir / profile_name
            self.profile_dir.mkdir(parents=True, exist_ok=True)

        logger.debug(
            f"{self.name}: profile_name={self.profile_name}, profile_dir={self.profile_dir}"
        )

    @staticmethod
    def _version_sort_key(version_dir: Path) -> tuple:
        """Return a sort key that orders version directories numerically.

        Directory names such as ``10.0.0_0`` are split on ``.`` and ``_``.
        Numeric components compare as integers, so ``10.0.0_0`` ranks above
        ``9.0.0_0`` (a plain string comparison gets this wrong). Non-numeric
        components rank below numeric ones and compare as text. The raw name
        is appended as a final tie-breaker to keep the choice deterministic.
        """
        components = []
        for part in version_dir.name.replace("_", ".").split("."):
            if part.isdecimal():
                components.append((1, int(part), ""))
            else:
                components.append((0, 0, part))
        return (tuple(components), version_dir.name)

    def _find_latest_extension_dir(
        self, extensions_dir: Path, ext_id: str
    ) -> Optional[Path]:
        """Return the newest unpacked version directory of one extension.

        Parameters
        ----------
        extensions_dir
            The profile's ``Default/Extensions`` directory.
        ext_id
            Chrome extension id (a subdirectory name of ``extensions_dir``).

        Returns
        -------
        Optional[Path]
            The highest-versioned subdirectory of ``extensions_dir/ext_id``
            when it contains ``manifest.json``; ``None`` when the extension
            directory is missing, is not a directory, has no version
            subdirectories, or its newest version has no manifest.
        """
        ext_dir = extensions_dir / ext_id
        # is_dir() (rather than exists()) so a stray file with the extension's
        # id does not make iterdir() raise.
        if not ext_dir.is_dir():
            return None

        version_dirs = [entry for entry in ext_dir.iterdir() if entry.is_dir()]
        if not version_dirs:
            return None

        latest_version = max(version_dirs, key=self._version_sort_key)
        if (latest_version / "manifest.json").exists():
            return latest_version
        return None

    def _get_extension_statuses(self, profile_dir: Path) -> Dict[str, bool]:
        """Get detailed status of each extension.

        Returns
        -------
        Dict[str, bool]
            One entry per key of ``EXTENSIONS``; ``True`` when the newest
            version directory of that extension under
            ``profile_dir/Default/Extensions`` contains ``manifest.json``.
        """
        extensions_dir = profile_dir / "Default" / "Extensions"
        return {
            key: self._find_latest_extension_dir(extensions_dir, ext_info["id"])
            is not None
            for key, ext_info in self.EXTENSIONS.items()
        }

    def check_extensions_installed(
        self, profile_dir: Optional[Path] = None, verbose: bool = True
    ) -> bool:
        """Check installation status of all extensions from profile directory.

        Parameters
        ----------
        profile_dir
            Profile directory to inspect; defaults to ``self.profile_dir``.
        verbose
            When true, log a warning for each missing extension and a
            summary line.

        Returns
        -------
        bool
            ``True`` only when every extension in ``EXTENSIONS`` is installed.
        """
        if profile_dir is None:
            profile_dir = self.profile_dir

        status = self._get_extension_statuses(profile_dir)
        installed_count = sum(status.values())
        all_installed = installed_count == len(self.EXTENSIONS)

        if verbose:
            for key, ext_info in self.EXTENSIONS.items():
                ext_id = ext_info["id"]
                if not status.get(key, False):
                    logger.warning(
                        f"{self.name}: {ext_info['name']} ({ext_id}) not installed"
                    )

            if all_installed:
                logger.debug(
                    f"{self.name}: All {installed_count}/{len(self.EXTENSIONS)} extensions installed"
                )
            else:
                logger.warning(
                    f"{self.name}: Only {installed_count}/{len(self.EXTENSIONS)} extensions installed"
                )

        return all_installed

    def _get_installed_extension_paths(self, profile_dir: Path) -> list[str]:
        """Get paths to installed extensions for --load-extension argument.

        Returns
        -------
        list[str]
            Newest version directory of every installed extension, in
            ``EXTENSIONS`` order; empty when none are installed.
        """
        extensions_dir = profile_dir / "Default" / "Extensions"
        extension_paths = []
        for ext_info in self.EXTENSIONS.values():
            latest_version = self._find_latest_extension_dir(
                extensions_dir, ext_info["id"]
            )
            if latest_version is not None:
                extension_paths.append(str(latest_version))
        return extension_paths

    def get_extension_args(self) -> list[str]:
        """Get extension args using appropriate profile directory.

        Returns
        -------
        list[str]
            Chrome command-line flags that load exactly the installed
            extensions of ``self.profile_dir``; empty when none are installed.
        """
        extension_paths = self._get_installed_extension_paths(self.profile_dir)
        extension_args = []

        if extension_paths:
            extensions_list = ",".join(extension_paths)
            extension_args.extend(
                [
                    f"--load-extension={extensions_list}",
                    f"--disable-extensions-except={extensions_list}",
                    "--enable-extensions",
                    "--disable-extensions-file-access-check",
                    # NOTE(review): this turns off the same-origin policy for
                    # the whole browser session -- confirm it is still needed.
                    "--disable-web-security",
                ]
            )
            logger.debug(
                f"Loading {len(extension_paths)} extensions from {self.profile_dir}"
            )

        return extension_args

    async def install_extensions_manually_if_not_installed_async(self, verbose=False):
        """Open Chrome for manual extension installation.

        Does nothing when all extensions are already installed. Otherwise
        launches ``google-chrome`` on this profile with one web-store tab per
        extension, waits for the user to press Enter, then re-checks.

        Parameters
        ----------
        verbose
            Forwarded to the initial ``check_extensions_installed`` call.

        Returns
        -------
        bool
            ``True`` when every extension is installed (before or after the
            manual step); ``False`` when Chrome could not be started, exited
            immediately, or extensions are still missing afterwards.
        """
        import asyncio

        if self.check_extensions_installed(verbose=verbose):
            return True

        # Build Chrome command, including one web-store URL per extension.
        chrome_cmd = [
            "google-chrome",
            f"--user-data-dir={self.profile_dir}",
            "--enable-extensions",
            "--new-window",
            "--no-sandbox",
            "--disable-dev-shm-usage",
        ]
        chrome_cmd.extend(
            f"https://chrome.google.com/webstore/detail/{ext_info['id']}"
            for ext_info in self.EXTENSIONS.values()
        )
        chrome_cmd_str = " ".join(chrome_cmd)
        logger.info(f"Chrome command: {chrome_cmd_str}")

        # Set environment for WSL2
        env = os.environ.copy()
        if "WSL_DISTRO_NAME" in env and "DISPLAY" not in env:
            env["DISPLAY"] = ":0.0"

        # Try to launch Chrome, detached from this process's session.
        try:
            process = subprocess.Popen(
                chrome_cmd,
                env=env,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                start_new_session=True,
            )
        except FileNotFoundError:
            logger.error("Chrome not found")
            return False

        logger.debug(f"Launched Chrome with PID {process.pid}")
        # Give Chrome a moment to start; asyncio.sleep keeps the event loop
        # free for other tasks while waiting.
        await asyncio.sleep(2)
        if process.poll() is not None:
            logger.error("Chrome exited immediately")
            return False

        print("\n" + "=" * 60)
        print("Chrome Extension Installation")
        print("=" * 60)
        print("Install extensions from the opened Chrome tabs, then press Enter")

        # NOTE: input() blocks the event loop while the user is working in
        # Chrome. It is kept synchronous so EOF / Ctrl-C are still handled
        # right here.
        try:
            input("Press Enter when done...")
        except (EOFError, KeyboardInterrupt):
            pass

        # Short grace period before re-checking the profile directory.
        await asyncio.sleep(2)

        if self.check_extensions_installed(verbose=False):
            logger.success("Extension installation complete!")
            return True

        logger.warning("Extension installation may be incomplete")
        return False

    async def handle_runtime_extension_dialogs_async(self, page):
        """Handle extension consent dialogs that appear at runtime.

        Waits two seconds, then clicks the first button on ``page`` matching
        one of a fixed list of consent-style labels.

        Parameters
        ----------
        page
            Object providing awaitable ``wait_for_timeout`` and
            ``query_selector`` methods (presumably a Playwright page --
            confirm against callers).

        Returns
        -------
        bool
            ``True`` when a button was clicked; ``False`` when none matched or
            any error occurred (the error is logged, not raised).
        """
        try:
            await page.wait_for_timeout(2000)

            consent_selectors = [
                'button:has-text("Agree")',
                'button:has-text("Accept")',
                'button:has-text("Continue")',
                'button:has-text("OK")',
                'button:has-text("Dismiss")',
                'button:has-text("Close")',
            ]

            # Only the first matching button is clicked.
            for selector in consent_selectors:
                element = await page.query_selector(selector)
                if element:
                    await element.click()
                    logger.debug(f"Clicked dialog: {selector}")
                    return True

            return False

        except Exception as e:
            # Best-effort helper: never let dialog handling break the caller.
            logger.error(f"Error handling dialogs: {e}")
            return False

    @staticmethod
    def _parse_rsync_stats(rsync_stdout: str) -> tuple[int, int]:
        """Extract ``(transferred_files, total_size_bytes)`` from ``rsync --stats``.

        Thousands separators (``3,301``) are accepted. A missing or
        unparsable figure yields ``0`` instead of raising, so an unexpected
        stats format cannot turn a successful sync into an error.
        """

        def leading_int(text: str) -> int:
            tokens = text.split()
            if not tokens:
                return 0
            try:
                return int(tokens[0].replace(",", ""))
            except ValueError:
                return 0

        transferred_files = 0
        total_size = 0
        for line in rsync_stdout.splitlines():
            label, colon, value = line.partition(":")
            if not colon:
                continue
            if "Number of regular files transferred" in label:
                transferred_files = leading_int(value)
            elif "Total transferred file size" in label:
                total_size = leading_int(value)
        return transferred_files, total_size

    def sync_from_profile(self, source_profile_name: str = "system") -> bool:
        """Sync extensions and cookies from source profile to this profile using rsync.

        The whole source profile directory is mirrored onto this profile's
        directory, including deletion of files absent from the source.

        Parameters
        ----------
        source_profile_name
            Name of the source profile, looked up next to this profile under
            the same base directory (default: ``"system"``).

        Returns
        -------
        bool
            ``True`` if the sync succeeded (including the case where only
            timestamps could not be preserved); ``False`` if the source
            profile does not exist, ``rsync`` failed, or ``rsync`` is not
            installed.
        """
        source_profile_dir = self._chrome_cache_dir / source_profile_name

        if not source_profile_dir.exists():
            logger.error(f"Source profile does not exist: {source_profile_dir}")
            return False

        # Create target profile directory if needed
        self.profile_dir.mkdir(parents=True, exist_ok=True)

        # "target <- source"; used in every log line below.
        sync_label = f"{self.profile_name} <- {source_profile_name}"
        logger.debug(f"Syncing profile: {sync_label}")
        logger.debug(f"  Source: {source_profile_dir}")
        logger.debug(f"  Target: {self.profile_dir}")

        # Use rsync to sync entire profile directory
        # -a: archive mode (preserves permissions, timestamps, symlinks)
        # -u: skip files newer on receiver
        # -v: verbose output
        # --stats: show transfer statistics
        # --delete: delete files not in source (keep profiles identical)
        # Trailing slashes make rsync copy directory *contents*.
        rsync_cmd = [
            "rsync",
            "-auv",
            "--stats",
            "--delete",
            f"{source_profile_dir}/",
            f"{self.profile_dir}/",
        ]

        start_time = time.time()
        try:
            result = subprocess.run(
                rsync_cmd, capture_output=True, text=True, check=True
            )
        except subprocess.CalledProcessError as e:
            # Exit code 23 = partial transfer (often just timestamp issues on WSL)
            # "failed to set times" errors are harmless: files were copied,
            # only their timestamps could not be preserved.
            if e.returncode == 23 and "failed to set times" in (e.stderr or ""):
                logger.debug("Profile sync complete (timestamps not preserved)")
                return True

            logger.error(f"rsync failed (exit code {e.returncode}): {e.stderr}")
            return False
        except FileNotFoundError:
            logger.error("rsync command not found - please install rsync")
            return False

        elapsed = time.time() - start_time
        transferred_files, total_size = self._parse_rsync_stats(result.stdout)

        # Log detailed results
        if transferred_files > 0:
            size_mb = total_size / (1024 * 1024)
            logger.success(
                f"Profile sync complete: {sync_label} "
                f"({transferred_files} files, {size_mb:.1f}MB, {elapsed:.2f}s)"
            )
        else:
            logger.debug(
                f"Profile sync complete: {sync_label} "
                f"(no changes, {elapsed:.2f}s)"
            )

        # Log verbose output at debug level
        if result.stdout:
            logger.debug(f"rsync output:\n{result.stdout}")

        return True
def main(args):
    """Demonstrate ChromeProfileManager functionality.

    Checks the extensions of the ``system`` profile, offers manual
    installation when some are missing, then tries to sync ``system`` into a
    ``test_profile``. ``args`` is accepted for the entry-point signature but
    is not used. Always returns ``0``.
    """
    import asyncio

    async def _run_demo():
        system_manager = ChromeProfileManager("system")

        # Extension check, with manual installation as a fallback.
        print("Checking system profile extensions...")
        if not system_manager.check_extensions_installed(verbose=True):
            print("\nInstalling missing extensions...")
            await system_manager.install_extensions_manually_if_not_installed_async(
                verbose=True
            )

        # Profile sync demo.
        print("\nChecking profile sync capability...")
        synced = ChromeProfileManager("test_profile").sync_from_profile("system")
        print(
            "✓ Profile sync test complete"
            if synced
            else "✓ Profile sync skipped (source profile not ready)"
        )

        print("✓ Demo complete")

    asyncio.run(_run_demo())
    return 0


def parse_args():
    """Parse command line arguments (the demo defines no options of its own)."""
    import argparse

    return argparse.ArgumentParser(description="ChromeProfileManager demo").parse_args()


def run_main() -> None:
    """Initialize scitex framework, run main function, and cleanup."""
    # The session objects are published as module globals; the imports below
    # therefore bind ``sys`` and ``plt`` at module level as well.
    global CONFIG, CC, sys, plt, rng

    import sys

    import matplotlib.pyplot as plt
    import scitex as stx

    cli_args = parse_args()

    # Start the session; it hands back replacements for stdout/stderr and plt.
    CONFIG, sys.stdout, sys.stderr, plt, CC, rng = stx.session.start(
        sys,
        plt,
        args=cli_args,
        file=__FILE__,
        sdir_suffix=None,
        verbose=False,
        agg=True,
    )

    status = main(cli_args)

    # Close the session, reporting main()'s return value as the exit status.
    stx.session.close(
        CONFIG,
        verbose=False,
        notify=False,
        message="",
        exit_status=status,
    )


if __name__ == "__main__":
    run_main()

# python -m scitex_browser.core.ChromeProfileManager

# EOF