#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-07-31 22:08:31 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/browser/remote/_ZenRowsRemoteScholarBrowserManager.py
# ----------------------------------------
from __future__ import annotations
import os
__FILE__ = __file__
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------
"""
Browser manager specifically for the ZenRows Scraping Browser service.
This provides cloud-based Chrome instances with built-in anti-bot bypass.
"""
from typing import Any, Dict, Optional
import scitex_logging as logging
from playwright.async_api import Browser, BrowserContext, async_playwright
try:
from scitex_browser.automation.CookieHandler import CookieAutoAcceptor
except ImportError:
CookieAutoAcceptor = None
from .ZenRowsAPIClient import ZenRowsAPIBrowser
logger = logging.getLogger(__name__)
class ZenRowsRemoteScholarBrowserManager:
    """Manage a connection to the remote ZenRows Scraping Browser service.

    Two access paths are held side by side:

    * a Playwright CDP session to ``wss://browser.zenrows.com`` that is
      created lazily by :meth:`get_browser_async`, and
    * a ``ZenRowsAPIBrowser`` instance used for API-based screenshots and
      HTML retrieval.

    The manager is usable as an async context manager; leaving the
    ``async with`` block calls :meth:`close`.
    """

    def __init__(
        self,
        auth_manager=None,
        zenrows_api_key: Optional[str] = os.getenv("SCITEX_SCHOLAR_ZENROWS_API_KEY"),
        proxy_country: Optional[str] = os.getenv(
            "SCITEX_SCHOLAR_ZENROWS_PROXY_COUNTRY"
        ),
        **kwargs,
    ):
        """Initialize the ZenRows browser manager.

        Args:
            auth_manager: Authentication manager used for cookie injection.
                Only required by
                :meth:`get_authenticated_browser_and_context_async`.
            zenrows_api_key: ZenRows API key. When falsy, the
                ``SCITEX_SCHOLAR_ZENROWS_API_KEY`` environment variable is
                consulted again at call time.
            proxy_country: Country code for proxy routing (e.g., 'au', 'us').
                Note: Country routing may only work with certain endpoints.
            **kwargs: Additional arguments (ignored, for compatibility).

        Raises:
            ValueError: If no API key is passed and none is found in the
                environment.
        """
        self.auth_manager = auth_manager
        # Signature defaults are evaluated once, when the class is defined.
        # Re-reading the environment here honours a key exported after import.
        self.zenrows_api_key = zenrows_api_key or os.getenv(
            "SCITEX_SCHOLAR_ZENROWS_API_KEY"
        )
        self.proxy_country = proxy_country
        if not self.zenrows_api_key:
            raise ValueError(
                "ZenRows API key required. Set SCITEX_SCHOLAR_ZENROWS_API_KEY env var "
                "or pass zenrows_api_key parameter"
            )

        # Lazily populated by get_browser_async() and
        # get_authenticated_browser_and_context_async().
        self._playwright = None
        self._browser: Optional[Browser] = None
        self._context: Optional[BrowserContext] = None

        # The module-level import falls back to ``CookieAutoAcceptor = None``
        # when scitex_browser is unavailable; calling None would raise here.
        self.cookie_acceptor = (
            CookieAutoAcceptor() if CookieAutoAcceptor is not None else None
        )

        # Also initialize API browser for reliable screenshots. The API
        # browser is given "au" when no proxy country was configured.
        self._api_browser = ZenRowsAPIBrowser(
            api_key=self.zenrows_api_key, proxy_country=self.proxy_country or "au"
        )

    async def get_browser_async(self) -> Browser:
        """Connect to the ZenRows Scraping Browser.

        An already connected browser is reused. Playwright itself is started
        on first use and kept until :meth:`close`.

        Returns:
            The connected Playwright ``Browser``.

        Raises:
            Exception: Whatever ``connect_over_cdp`` raises; the error is
                logged and re-raised unchanged.
        """
        if self._browser and self._browser.is_connected():
            return self._browser

        logger.debug("Connecting to ZenRows Scraping Browser...")
        if not self._playwright:
            self._playwright = await async_playwright().start()

        # Build connection URL with optional country parameter.
        # The URL embeds the API key, so it is deliberately never logged.
        connection_url = f"wss://browser.zenrows.com?apikey={self.zenrows_api_key}"
        # Note: Country routing via WebSocket URL is not documented
        # but we can try appending it as a parameter
        if self.proxy_country:
            connection_url += f"&proxy_country={self.proxy_country}"
            logger.debug(f"Requesting proxy country: {self.proxy_country.upper()}")

        try:
            self._browser = await self._playwright.chromium.connect_over_cdp(
                connection_url
            )
        except Exception as e:
            logger.error(f"Failed to connect to ZenRows browser: {e}")
            raise

        logger.debug("Successfully connected to ZenRows browser")
        # Log a note about country routing
        if self.proxy_country:
            logger.debug(
                "Note: Country routing via Scraping Browser is experimental. "
                "Use API mode for guaranteed country-specific IPs."
            )
        return self._browser

    async def get_authenticated_browser_and_context_async(
        self,
    ) -> tuple[Browser, BrowserContext]:
        """Get browser context with authentication cookies pre-loaded.

        The first existing context of the browser is reused when there is
        one; otherwise a new context is created. Cookie auto-acceptor
        injection and auth-cookie injection are both best effort: failures
        are logged and the context is still returned.

        Returns:
            ``(browser, context)``; the context is also cached on the
            instance.

        Raises:
            ValueError: If the manager was created without ``auth_manager``.
        """
        if self.auth_manager is None:
            err_msg = (
                "Authentication manager is not set. "
                "Initialize ScholarBrowserManager with an auth_manager to use this method."
            )
            raise ValueError(err_msg)

        browser = await self.get_browser_async()
        if browser.contexts:
            context = browser.contexts[0]
        else:
            context = await browser.new_context()

        # Inject cookie auto-acceptor (skipped when the optional dependency
        # providing it could not be imported).
        if self.cookie_acceptor is None:
            logger.debug("Cookie auto-acceptor unavailable; skipping injection")
        else:
            try:
                await self.cookie_acceptor.inject_auto_acceptor_async(context)
                logger.debug("Injected cookie auto-acceptor")
            except Exception as e:
                logger.warning(f"Failed to inject cookie acceptor: {e}")

        if self.auth_manager and await self.auth_manager.is_authenticate_async():
            try:
                cookies = await self.auth_manager.get_auth_cookies_async()
                await context.add_cookies(cookies)
                logger.success(f"Injected {len(cookies)} authentication cookies")
            except Exception as e:
                logger.error(f"Failed to inject auth cookies: {e}")

        self._context = context
        return browser, context

    async def new_page(self, context: Optional[BrowserContext] = None) -> Any:
        """Create a new page in the ZenRows browser.

        Args:
            context: Context to open the page in. When omitted, the
                authenticated context is obtained first (which requires an
                ``auth_manager``).

        Returns:
            The new page, with ``Accept-Language`` and ``Accept-Encoding``
            request headers preset.
        """
        if not context:
            _, context = await self.get_authenticated_browser_and_context_async()

        page = await context.new_page()
        await page.set_extra_http_headers(
            {
                "Accept-Language": "en-US,en;q=0.9",
                "Accept-Encoding": "gzip, deflate, br",
            }
        )
        return page

    async def close(self):
        """Close the ZenRows browser connection.

        Playwright is stopped and the cached handles are cleared even when
        closing the browser raises; the exception still propagates to the
        caller afterwards.
        """
        try:
            if self._browser and self._browser.is_connected():
                await self._browser.close()
                logger.debug("Closed ZenRows browser connection")
        finally:
            try:
                if self._playwright:
                    await self._playwright.stop()
            finally:
                # Always drop the handles so a later get_browser_async()
                # starts from a clean state.
                self._browser = None
                self._context = None
                self._playwright = None

    async def take_screenshot_reliable_async(
        self, url: str, output_path: str, use_api: bool = True, wait_ms: int = 5000
    ) -> Dict[str, Any]:
        """Take a screenshot via the ZenRows API or the WebSocket browser.

        Args:
            url: URL to screenshot.
            output_path: Path to save the screenshot.
            use_api: When True (default) delegate to the API browser;
                otherwise drive the WebSocket (CDP) browser directly.
            wait_ms: Additional wait time in milliseconds.

        Returns:
            With ``use_api=True``, whatever the API browser returns. With
            ``use_api=False``, ``{"success": True, "screenshot": {...}}`` on
            success or ``{"success": False, "error": <message>}`` on failure
            (exceptions on this path are logged, not raised).
        """
        if use_api:
            # Use API browser for reliability
            logger.debug("Using ZenRows API for screenshot (recommended)")
            return await self._api_browser.navigate_and_screenshot_async(
                url=url, screenshot_path=output_path, wait_ms=wait_ms
            )

        # Use WebSocket browser (less reliable for captchas)
        logger.debug("Using ZenRows WebSocket browser")
        context = None
        try:
            browser = await self.get_browser_async()
            context = await browser.new_context()
            page = await context.new_page()
            # Navigate
            await page.goto(url, wait_until="domcontentloaded", timeout=30000)
            # Wait for content
            await page.wait_for_load_state("networkidle", timeout=10000)
            await page.wait_for_timeout(wait_ms)
            # Take screenshot
            await page.screenshot(path=output_path, full_page=True)
            return {
                "success": True,
                "screenshot": {"saved": True, "path": output_path},
            }
        except Exception as e:
            logger.error(f"WebSocket screenshot failed: {e}")
            return {"success": False, "error": str(e)}
        finally:
            # Release the throw-away context on every path. Closing a
            # Playwright context also closes the pages that belong to it.
            if context is not None:
                try:
                    await context.close()
                except Exception as e:
                    logger.debug(f"Failed to close screenshot context: {e}")

    async def navigate_and_extract_async(
        self,
        url: str,
        extract_pdf_url: bool = True,
        take_screenshot: bool = False,
        screenshot_path: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Navigate to URL and extract information.

        Navigation, optional screenshot and HTML retrieval are delegated to
        the API browser; PDF discovery is a regex scan over the returned
        HTML.

        Args:
            url: Target URL.
            extract_pdf_url: Request the HTML and try to find a PDF URL in it.
            take_screenshot: Whether to capture a screenshot.
            screenshot_path: Where to save the screenshot; only forwarded
                when ``take_screenshot`` is True.

        Returns:
            The API browser's result, with a ``"pdf_url"`` entry added when
            one of the patterns matches.
        """
        result = await self._api_browser.navigate_and_screenshot_async(
            url=url,
            screenshot_path=screenshot_path if take_screenshot else None,
            return_html=extract_pdf_url,
            wait_ms=8000,  # Longer wait for academic sites
        )

        if extract_pdf_url and result.get("html"):
            # Try to extract PDF URL
            import re

            html = result["html"]
            # Patterns are tried in order; the first one that matches wins.
            pdf_patterns = (
                r'href="([^"]+\.pdf[^"]*)"',
                r'content="([^"]+\.pdf[^"]*)"',
                r'data-pdf-url="([^"]+)"',
                r'pdfUrl["\']?\s*:\s*["\']([^"\']+)',
            )
            for pattern in pdf_patterns:
                match = re.search(pattern, html, re.IGNORECASE)
                if match:
                    result["pdf_url"] = match.group(1)
                    logger.debug(f"Found PDF URL: {result['pdf_url']}")
                    break

        return result

    async def __aenter__(self):
        """Async context manager entry; returns the manager itself."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit; closes the browser connection."""
        await self.close()
if __name__ == "__main__":
    import asyncio

    async def main():
        """Run a manual smoke test of ZenRowsRemoteScholarBrowserManager.

        Each test site is visited through the remote browser (default
        settings, then with ``proxy_country="au"``), the API client is
        queried for its public IP, and a summary is printed. Screenshots and
        ``test_summary.json`` are written to
        ``./screenshots_remote_<timestamp>``.
        """
        import json
        from datetime import datetime
        from pathlib import Path

        # Create screenshots directory
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        screenshots_dir = Path(f"./screenshots_remote_{timestamp}")
        screenshots_dir.mkdir(exist_ok=True)

        # Test sites for comprehensive evaluation: (name, url, description)
        test_sites = [
            ("ip", "https://httpbin.org/ip", "Shows your public IP address"),
            ("headers", "https://httpbin.org/headers", "HTTP headers sent by browser"),
            (
                "bot_detection",
                "https://bot.sannysoft.com/",
                "Bot tests - green=good, red=detected",
            ),
            (
                "fingerprint",
                "https://pixelscan.net/",
                "Browser fingerprinting analysis",
            ),
            ("webrtc", "https://browserleaks.com/webrtc", "WebRTC IP leak test"),
        ]

        async def test_browser_async(browser_type, browser_manager, use_auth=False):
            """Test a browser with all test sites.

            Args:
                browser_type: Label used in the console output and in the
                    screenshot file names.
                browser_manager: Manager exposing ``get_browser_async`` (and
                    optionally the authenticated variant and ``close``).
                use_auth: Open pages through the authenticated context when
                    the manager supports it.

            Returns:
                Dict mapping test name to its outcome; ``"ip"`` holds the
                detected address, ``"error"`` is set if setup failed.
            """
            print(f"\n{'=' * 60}")
            print(f"Testing: {browser_type}")
            print("=" * 60)

            results = {}
            context = None
            try:
                if use_auth and hasattr(
                    browser_manager, "get_authenticated_browser_and_context_async"
                ):
                    # For managers with auth support
                    (
                        browser,
                        context,
                    ) = await browser_manager.get_authenticated_browser_and_context_async()
                    pages_via_context = True
                else:
                    # Direct browser access
                    browser = await browser_manager.get_browser_async()
                    pages_via_context = False

                for test_name, url, description in test_sites:
                    print(f"\n{test_name}: {description}")
                    page = None
                    try:
                        if pages_via_context:
                            page = await context.new_page()
                        else:
                            page = await browser.new_page()

                        # Navigate with timeout
                        await page.goto(
                            url, wait_until="domcontentloaded", timeout=30000
                        )

                        if test_name in ["ip", "headers"]:
                            # Extract text content; text_content() may yield
                            # None when no <pre> element carries text.
                            content = (await page.text_content("pre")) or ""
                            print(f"Result: {content.strip()[:200]}...")

                            # Parse IP if available
                            if test_name == "ip":
                                try:
                                    ip_data = json.loads(content)
                                    results["ip"] = ip_data.get("origin", "Unknown")
                                    print(f"Detected IP: {results['ip']}")
                                except (ValueError, TypeError, AttributeError):
                                    results["ip"] = "Parse error"
                        else:
                            # Wait for dynamic content
                            await page.wait_for_timeout(5000)

                            # For fingerprint test, try to click start button
                            if test_name == "fingerprint":
                                try:
                                    await page.click(
                                        'button:has-text("Start")', timeout=3000
                                    )
                                    await page.wait_for_timeout(5000)
                                except Exception as e:
                                    # Best effort: the button is optional.
                                    print(
                                        f"Start button click skipped: {str(e)[:100]}"
                                    )

                        # Take screenshot
                        screenshot_path = (
                            screenshots_dir
                            / f"{browser_type.lower().replace(' ', '_')}_{test_name}.png"
                        )
                        await page.screenshot(path=screenshot_path, full_page=True)
                        print(f"Screenshot saved: {screenshot_path}")

                        # setdefault keeps the parsed IP (or "Parse error")
                        # recorded above from being replaced by "Success".
                        results.setdefault(test_name, "Success")
                    except Exception as e:
                        print(f"Failed: {str(e)[:100]}...")
                        results[test_name] = f"Failed: {str(e)[:50]}"
                    finally:
                        if page:
                            await page.close()
            except Exception as e:
                print(f"Browser initialization failed: {str(e)}")
                results["error"] = str(e)
            finally:
                # Clean up even when connecting failed: get_browser_async()
                # starts Playwright before it attempts the connection.
                if hasattr(browser_manager, "close"):
                    try:
                        await browser_manager.close()
                    except Exception as e:
                        print(f"Cleanup failed: {e}")

            return results

        # Store all results
        all_results = {}

        # Test 1: Regular browser baseline removed during standalonization of
        # scitex-browser. A one-way dep rule means this demo cannot reach back
        # into scitex-scholar. If you want a baseline comparison, instantiate
        # a plain Playwright context in the caller and pass its results here.
        all_results["Regular Browser"] = {"error": "Not available"}

        # Test 2: ZenRows Remote Browser (default settings)
        print("\nInitializing ZenRows Remote Browser...")
        try:
            zenrows_manager = ZenRowsRemoteScholarBrowserManager()
            zenrows_results = await test_browser_async(
                "ZenRows Remote", zenrows_manager
            )
            all_results["ZenRows Remote"] = zenrows_results
        except Exception as e:
            print(f"ZenRows Remote test failed: {e}")
            all_results["ZenRows Remote"] = {"error": str(e)}

        # Test 3: ZenRows Remote Browser with country (if supported)
        print("\nInitializing ZenRows Remote Browser with AU country...")
        try:
            zenrows_au_manager = ZenRowsRemoteScholarBrowserManager(proxy_country="au")
            zenrows_au_results = await test_browser_async(
                "ZenRows Remote AU", zenrows_au_manager
            )
            all_results["ZenRows Remote AU"] = zenrows_au_results
        except Exception as e:
            print(f"ZenRows Remote AU test failed: {e}")
            all_results["ZenRows Remote AU"] = {"error": str(e)}

        # Test 4: Test the API client as well
        # NOTE(review): the manager builds ZenRowsAPIBrowser with ``api_key=``
        # and ``proxy_country=``, whereas this demo calls it without arguments
        # and with ``default_country=`` - confirm against the client's
        # constructor. ``request`` is also called synchronously here.
        print("\nTesting ZenRows API Client for comparison...")
        try:
            from .ZenRowsAPIClient import ZenRowsAPIBrowser as ZenRowsAPIClient

            print("Testing basic API request...")
            api_client = ZenRowsAPIClient()
            response = api_client.request("https://httpbin.org/ip")
            if response.status_code == 200:
                ip_data = json.loads(response.text)
                print(f"API Client IP (Basic): {ip_data.get('origin', 'Unknown')}")
                print(
                    f"API Cost: {response.headers.get('X-Request-Cost', 'Unknown')} credits"
                )
                all_results["API Client Basic"] = {
                    "ip": ip_data.get("origin", "Unknown")
                }

            print("\nTesting API with Australian proxy...")
            api_client_au = ZenRowsAPIClient(default_country="au")
            response_au = api_client_au.request("https://httpbin.org/ip")
            if response_au.status_code == 200:
                ip_data_au = json.loads(response_au.text)
                print(f"API Client IP (AU): {ip_data_au.get('origin', 'Unknown')}")
                print(
                    f"API Cost: {response_au.headers.get('X-Request-Cost', 'Unknown')} credits"
                )
                all_results["API Client AU"] = {
                    "ip": ip_data_au.get("origin", "Unknown")
                }
        except Exception as e:
            print(f"API Client test failed: {e}")
            all_results["API Client"] = {"error": str(e)}

        # Print summary
        print("\n" + "=" * 60)
        print("SUMMARY REPORT")
        print("=" * 60)
        print("\nIP Addresses detected:")
        for method, data in all_results.items():
            if isinstance(data, dict):
                ip = data.get("ip", "Not tested")
            else:
                ip = "Error"
            print(f"  {method:.<35} {ip}")
        print(f"\nScreenshots saved in: {screenshots_dir.absolute()}")

        # Save summary report
        summary_path = screenshots_dir / "test_summary.json"
        with open(summary_path, "w", encoding="utf-8") as f:
            json.dump(
                {
                    "timestamp": timestamp,
                    "results": all_results,
                    "test_sites": [
                        {"name": t[0], "url": t[1], "description": t[2]}
                        for t in test_sites
                    ],
                },
                f,
                indent=2,
            )
        print(f"Summary report saved: {summary_path}")

        # Comparison notes
        print("\n" + "=" * 60)
        print("COMPARISON NOTES:")
        print("=" * 60)
        print("1. Regular Browser: Uses your local IP, no proxy")
        print("2. ZenRows Remote: Cloud browser with built-in anti-bot")
        print("3. ZenRows Remote AU: Attempts Australian IP (experimental)")
        print("4. API Client Basic: Direct API without country routing")
        print("5. API Client AU: Guaranteed Australian IP via API mode")
        print("\nRecommendation: Use API Client for country-specific needs,")
        print("Remote Browser for complex JavaScript sites.")

    # Run the example
    asyncio.run(main())
# python -m scitex.scholar.browser.remote._ZenRowsRemoteScholarBrowserManager
# EOF