#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: 2025-07-31 17:45:00
# Author: ywatanabe
# File: /home/ywatanabe/proj/SciTeX-Code/src/scitex/scholar/browser/remote/_CaptchaHandler.py
# ----------------------------------------
from __future__ import annotations
import os
__FILE__ = "./src/scitex/scholar/browser/remote/_CaptchaHandler.py"
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------
"""
Functionalities:
- Handles CAPTCHA solving using 2Captcha service
- Detects and solves Cloudflare, reCAPTCHA, and hCaptcha challenges
- Provides automated CAPTCHA resolution for browser automation
- Demonstrates captcha handling when run standalone
Dependencies:
- packages:
- playwright
- aiohttp
IO:
- input-files:
- None
- output-files:
- None
"""
"""Imports"""
import argparse
import asyncio
import json
import time
from typing import Any, Dict, Optional, Union
import aiohttp
from playwright.async_api import Frame, Page
import scitex_logging as logging
from scitex_browser._compat import ScholarError
logger = logging.getLogger(__name__)
"""Functions & Classes"""
class CaptchaHandler:
    """Detect CAPTCHAs on a Playwright page and solve them through 2Captcha.

    Supported challenge families are Cloudflare (interstitial and Turnstile),
    reCAPTCHA v2 and hCaptcha. Every public and private coroutine reports
    failure by returning ``False``/``None`` rather than raising, so callers can
    treat CAPTCHA handling as best-effort.

    Attributes:
        api_key: 2Captcha API key, taken from the constructor argument or the
            ``SCITEX_SCHOLAR_2CAPTCHA_API_KEY`` environment variable. When it
            is empty, :meth:`handle_page_async` does nothing.
        base_url: Root URL of the 2Captcha HTTP API.
        timeout: Maximum number of seconds to poll 2Captcha for a solution.
        cloudflare_wait: Seconds to wait for a Cloudflare challenge to clear
            on its own before trying to solve it.
        poll_interval: Seconds to sleep between two polls of ``res.php``.
    """

    # Class-level defaults so they can be tuned per instance without touching
    # the constructor signature.
    cloudflare_wait = 5
    poll_interval = 5

    _TURNSTILE_IFRAME = "iframe[title*='Cloudflare']"

    _CLOUDFLARE_SELECTORS = (
        "iframe[title*='Cloudflare']",
        "#cf-challenge-running",
        ".cf-challenge",
        "div:has-text('Verifying you are human')",
        "div:has-text('Checking your browser')",
    )

    _OTHER_CAPTCHA_SELECTORS = (
        # reCAPTCHA
        "iframe[src*='recaptcha']",
        ".g-recaptcha",
        "#g-recaptcha",
        # hCaptcha
        "iframe[src*='hcaptcha']",
        ".h-captcha",
        # Generic
        "div:has-text('I am not a robot')",
        "div:has-text('Verify you are human')",
        "div:has-text('Security check')",
    )

    # JavaScript snippets. Tokens are always handed over as the function
    # argument (never interpolated into the source text) so that a token
    # containing quotes or backslashes cannot break or alter the script.
    _SITE_KEY_JS = """
        () => {
            const elem = document.querySelector('[data-sitekey]');
            return elem ? elem.getAttribute('data-sitekey') : null;
        }
    """

    # Raw string: the embedded JS regex uses backslash escapes (\\s).
    _TURNSTILE_KEY_JS = r"""
        () => {
            // Method 1: Check data attributes
            const elem = document.querySelector('[data-sitekey]');
            if (elem) return elem.getAttribute('data-sitekey');
            // Method 2: Check Turnstile config
            if (window.turnstile?.config?.sitekey) {
                return window.turnstile.config.sitekey;
            }
            // Method 3: Parse from script
            const scripts = Array.from(document.scripts);
            for (const script of scripts) {
                const match = script.textContent.match(/sitekey['"]\s*:\s*['"]([^'"]+)/);
                if (match) return match[1];
            }
            return null;
        }
    """

    _TURNSTILE_INJECT_JS = """
        (token) => {
            document.querySelectorAll('[name="cf-turnstile-response"]').forEach((field) => {
                field.value = token;
            });
            if (window.turnstile && window.turnstile.render) {
                window.turnstile.render.solutions = window.turnstile.render.solutions || [];
                window.turnstile.render.solutions.push(token);
            }
        }
    """

    _RECAPTCHA_INJECT_JS = """
        (token) => {
            const field = document.getElementById('g-recaptcha-response');
            if (field) {
                field.innerHTML = token;
                field.value = token;
            }
            if (typeof ___grecaptcha_cfg !== 'undefined') {
                Object.entries(___grecaptcha_cfg.clients).forEach(([key, client]) => {
                    if (client.callback) {
                        client.callback(token);
                    }
                });
            }
        }
    """

    _HCAPTCHA_INJECT_JS = """
        (token) => {
            for (const name of ['h-captcha-response', 'g-recaptcha-response']) {
                const field = document.querySelector('[name="' + name + '"]');
                if (field) {
                    field.value = token;
                }
            }
            if (window.hcaptcha) {
                window.hcaptcha.execute();
            }
        }
    """

    def __init__(self, api_key: Optional[str] = None):
        """Initialize with 2Captcha API key.

        Args:
            api_key: 2Captcha API key. Falls back to the
                ``SCITEX_SCHOLAR_2CAPTCHA_API_KEY`` environment variable when
                omitted or empty.
        """
        self.api_key = api_key or os.getenv("SCITEX_SCHOLAR_2CAPTCHA_API_KEY")
        if not self.api_key:
            logger.warn("2Captcha API key not configured - CAPTCHA solving disabled")
        # HTTPS: every request carries the API key, so never send it in cleartext.
        self.base_url = "https://2captcha.com"
        self.timeout = 180  # 3 minutes max wait time

    async def handle_page_async(self, page: "Page") -> bool:
        """Check and handle captcha on the current page.

        Args:
            page: Playwright page to inspect.

        Returns:
            bool: True if captcha was found and solved, False otherwise
            (including when no API key is configured).
        """
        if not self.api_key:
            return False

        # Check for common captcha indicators
        if not await self._detect_captcha_async(page):
            return False

        logger.debug("Captcha detected on page - attempting to solve")

        # Determine captcha type and solve
        if await self._is_cloudflare_challenge_async(page):
            return await self._solve_cloudflare_challenge_async(page)
        if await self._has_recaptcha_async(page):
            return await self._solve_recaptcha_async(page)
        if await self._has_hcaptcha_async(page):
            return await self._solve_hcaptcha_async(page)

        logger.warn("Unknown captcha type detected")
        return False

    @staticmethod
    async def _any_visible_async(page: "Page", selectors) -> bool:
        """Return True as soon as the first match of any selector is visible.

        A selector whose lookup raises is skipped (deliberate best-effort), but
        only ``Exception`` is caught so task cancellation still propagates.
        """
        for selector in selectors:
            try:
                if await page.locator(selector).first.is_visible():
                    return True
            except Exception:
                continue
        return False

    async def _detect_captcha_async(self, page: "Page") -> bool:
        """Detect if page has a captcha of any supported or generic kind."""
        return await self._any_visible_async(
            page, self._CLOUDFLARE_SELECTORS + self._OTHER_CAPTCHA_SELECTORS
        )

    async def _is_cloudflare_challenge_async(self, page: "Page") -> bool:
        """Check if this is a Cloudflare challenge.

        Looks for Cloudflare-specific elements first, then falls back to the
        page title. Returns False if the title cannot be read.
        """
        if await self._any_visible_async(page, self._CLOUDFLARE_SELECTORS):
            return True

        # Check page title
        try:
            title = await page.title()
        except Exception:
            return False
        return "Just a moment" in title or "Attention Required" in title

    async def _solve_cloudflare_challenge_async(self, page: "Page") -> bool:
        """Handle Cloudflare challenge/turnstile.

        Waits ``cloudflare_wait`` seconds for the challenge to clear by itself,
        then either solves a Turnstile widget through 2Captcha or waits for the
        ``#cf-challenge-running`` element to disappear.

        Returns:
            bool: True when the challenge is considered passed, False on any
            failure (errors are logged, not raised).
        """
        logger.debug("Handling Cloudflare challenge")
        try:
            # First, wait a bit to see if it auto-solves
            logger.debug("Waiting for Cloudflare auto-solve...")
            await asyncio.sleep(self.cloudflare_wait)

            # Check if still on challenge page
            if not await self._is_cloudflare_challenge_async(page):
                logger.debug("Cloudflare challenge auto-solved")
                return True

            # A locator object is always truthy, so presence must be tested by
            # counting the matching iframes.
            if await page.locator(self._TURNSTILE_IFRAME).count() > 0:
                return await self._solve_turnstile_async(page)

            # For other Cloudflare challenges, just wait
            logger.debug("Waiting for Cloudflare challenge to complete...")
            await page.wait_for_function(
                "!document.querySelector('#cf-challenge-running')", timeout=30000
            )
            return True
        except Exception as e:
            logger.error(f"Failed to solve Cloudflare challenge: {e}")
            return False

    async def _solve_turnstile_async(self, page: "Page") -> bool:
        """Solve a Cloudflare Turnstile widget with 2Captcha and inject the token.

        May raise; the caller (:meth:`_solve_cloudflare_challenge_async`)
        converts exceptions into a False result.
        """
        logger.debug("Cloudflare Turnstile detected - solving with 2Captcha")

        # Get site key
        site_key = await self._extract_turnstile_key_async(page)
        if not site_key:
            logger.error("Could not extract Turnstile site key")
            return False

        # Submit to 2Captcha
        task_id = await self._submit_turnstile_async(page.url, site_key)
        if not task_id:
            return False

        # Get solution
        solution = await self._get_captcha_result_async(task_id)
        if not solution:
            return False

        # Inject solution
        await page.evaluate(self._TURNSTILE_INJECT_JS, solution)

        # Click verify if needed (.first avoids a strict-mode error when
        # several submit inputs match).
        verify_btn = page.locator("input[type='submit'][value*='Verify']").first
        if await verify_btn.is_visible():
            await verify_btn.click()

        # Wait for navigation
        await page.wait_for_load_state("networkidle", timeout=30000)
        return not await self._is_cloudflare_challenge_async(page)

    async def _has_recaptcha_async(self, page: "Page") -> bool:
        """Check if page has reCAPTCHA."""
        try:
            return await page.locator("iframe[src*='recaptcha']").first.is_visible()
        except Exception:
            return False

    async def _solve_recaptcha_async(self, page: "Page") -> bool:
        """Solve reCAPTCHA v2.

        Returns:
            bool: True once the token has been injected (and the form
            submitted when a visible submit control exists), False otherwise.
        """
        logger.debug("Solving reCAPTCHA")
        try:
            # Get site key
            site_key = await page.evaluate(self._SITE_KEY_JS)
            if not site_key:
                logger.error("Could not find reCAPTCHA site key")
                return False

            # Submit to 2Captcha
            task_id = await self._submit_recaptcha_async(page.url, site_key)
            if not task_id:
                return False

            # Get solution
            solution = await self._get_captcha_result_async(task_id)
            if not solution:
                return False

            # Inject solution
            await page.evaluate(self._RECAPTCHA_INJECT_JS, solution)

            # Submit form if present
            submit_btn = page.locator(
                "button[type='submit'], input[type='submit']"
            ).first
            if await submit_btn.is_visible():
                await submit_btn.click()
                await page.wait_for_load_state("networkidle", timeout=10000)
            return True
        except Exception as e:
            logger.error(f"Failed to solve reCAPTCHA: {e}")
            return False

    async def _has_hcaptcha_async(self, page: "Page") -> bool:
        """Check if page has hCaptcha."""
        try:
            return await page.locator("iframe[src*='hcaptcha']").first.is_visible()
        except Exception:
            return False

    async def _solve_hcaptcha_async(self, page: "Page") -> bool:
        """Solve hCaptcha.

        Returns:
            bool: True once the token has been injected, False otherwise.
        """
        logger.debug("Solving hCaptcha")
        try:
            # Get site key
            site_key = await page.evaluate(self._SITE_KEY_JS)
            if not site_key:
                logger.error("Could not find hCaptcha site key")
                return False

            # Submit to 2Captcha
            task_id = await self._submit_hcaptcha_async(page.url, site_key)
            if not task_id:
                return False

            # Get solution
            solution = await self._get_captcha_result_async(task_id)
            if not solution:
                return False

            # Inject solution
            await page.evaluate(self._HCAPTCHA_INJECT_JS, solution)
            return True
        except Exception as e:
            logger.error(f"Failed to solve hCaptcha: {e}")
            return False

    async def _extract_turnstile_key_async(self, page: "Page") -> Optional[str]:
        """Extract Cloudflare Turnstile site key.

        Tries, in order: a ``data-sitekey`` attribute, ``window.turnstile``
        configuration, and a ``sitekey: '...'`` pattern inside inline scripts.

        Returns:
            The site key, or None if none was found or evaluation failed.
        """
        try:
            return await page.evaluate(self._TURNSTILE_KEY_JS)
        except Exception as e:
            logger.error(f"Failed to extract Turnstile key: {e}")
            return None

    async def _submit_recaptcha_async(
        self, page_url: str, site_key: str
    ) -> Optional[str]:
        """Submit reCAPTCHA to 2Captcha; returns the task ID or None."""
        return await self._submit_captcha_async(
            {
                "key": self.api_key,
                "method": "userrecaptcha",
                "googlekey": site_key,
                "pageurl": page_url,
                "json": 1,
            }
        )

    async def _submit_hcaptcha_async(
        self, page_url: str, site_key: str
    ) -> Optional[str]:
        """Submit hCaptcha to 2Captcha; returns the task ID or None."""
        return await self._submit_captcha_async(
            {
                "key": self.api_key,
                "method": "hcaptcha",
                "sitekey": site_key,
                "pageurl": page_url,
                "json": 1,
            }
        )

    async def _submit_turnstile_async(
        self, page_url: str, site_key: str
    ) -> Optional[str]:
        """Submit Turnstile to 2Captcha; returns the task ID or None."""
        return await self._submit_captcha_async(
            {
                "key": self.api_key,
                "method": "turnstile",
                "sitekey": site_key,
                "pageurl": page_url,
                "json": 1,
            }
        )

    async def _submit_captcha_async(self, params: Dict[str, Any]) -> Optional[str]:
        """Submit captcha to 2Captcha and get task ID.

        Args:
            params: Form fields posted to ``{base_url}/in.php``.

        Returns:
            The task ID from the ``request`` field when ``status`` is 1,
            otherwise None (the failure is logged).
        """
        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(
                    f"{self.base_url}/in.php", data=params
                ) as response:
                    # content_type=None: parse the body as JSON regardless of
                    # the Content-Type header the service sends.
                    result = await response.json(content_type=None)
            if result.get("status") == 1:
                task_id = result.get("request")
                logger.debug(f"Captcha submitted, task ID: {task_id}")
                return task_id
            logger.error(f"2Captcha submission failed: {result}")
            return None
        except Exception as e:
            logger.error(f"Failed to submit captcha: {e}")
            return None

    async def _get_captcha_result_async(self, task_id: str) -> Optional[str]:
        """Poll 2Captcha for result.

        Polls ``{base_url}/res.php`` every ``poll_interval`` seconds until a
        solution arrives, the service reports an error, or ``timeout`` seconds
        have elapsed. Transport/parsing errors are logged and retried.

        Args:
            task_id: ID returned by :meth:`_submit_captcha_async`.

        Returns:
            The solution token, or None on service error or timeout.
        """
        query = {
            "key": self.api_key,
            "action": "get",
            "id": task_id,
            "json": 1,
        }
        # Monotonic clock: immune to wall-clock adjustments during the wait.
        deadline = time.monotonic() + self.timeout

        # One session for the whole polling loop instead of one per request.
        async with aiohttp.ClientSession() as session:
            while time.monotonic() < deadline:
                try:
                    async with session.get(
                        f"{self.base_url}/res.php", params=query
                    ) as response:
                        result = await response.json(content_type=None)

                    if result.get("status") == 1:
                        solution = result.get("request")
                        logger.debug("Captcha solved successfully")
                        return solution
                    # "CAPCHA_NOT_READY" (sic) is the literal value the
                    # original code compares against.
                    if result.get("request") != "CAPCHA_NOT_READY":
                        logger.error(f"2Captcha error: {result}")
                        return None
                    logger.debug("Captcha not ready yet, waiting...")
                except Exception as e:
                    logger.error(f"Failed to get captcha result: {e}")
                await asyncio.sleep(self.poll_interval)

        logger.error("Captcha solving timeout")
        return None