Spaces:
Running
Running
Upload 10 files
Browse files- Dockerfile +31 -0
- browser/instance.py +208 -0
- browser/navigation.py +57 -0
- main.py +235 -0
- requirements.txt +33 -0
- utils/common.py +52 -0
- utils/cookie_handler.py +97 -0
- utils/cookie_manager.py +201 -0
- utils/logger.py +42 -0
- utils/paths.py +32 -0
Dockerfile
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use a slim official Python image as the base
FROM python:3.11-slim-bookworm

# Set the working directory; the instructions below run from here
WORKDIR /app

# Install the minimal set of system libraries needed to run Playwright
# Clean the apt cache in the same layer to keep the image small
RUN apt-get update && apt-get install -y --no-install-recommends \
    libatk1.0-0 libatk-bridge2.0-0 libcups2 libdbus-1-3 libdrm2 libgbm1 libgtk-3-0 \
    libnspr4 libnss3 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxdamage1 \
    libxext6 libxfixes3 libxrandr2 libxrender1 libxtst6 ca-certificates \
    fonts-liberation libasound2 libpangocairo-1.0-0 libpango-1.0-0 libu2f-udev xvfb \
    && rm -rf /var/lib/apt/lists/*

# Copy and install the Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download camoufox
RUN camoufox fetch

# Copy every file of the project into the working directory
COPY . .

# Expose the port Hugging Face Spaces expects (only used in server mode)
EXPOSE 7860


# Command executed when the container starts
CMD ["python", "main.py"]
|
browser/instance.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from playwright.sync_api import TimeoutError, Error as PlaywrightError
|
| 3 |
+
from utils.logger import setup_logging
|
| 4 |
+
from utils.cookie_manager import CookieManager
|
| 5 |
+
from browser.navigation import handle_successful_navigation
|
| 6 |
+
from camoufox.sync_api import Camoufox
|
| 7 |
+
from utils.paths import logs_dir
|
| 8 |
+
from utils.common import parse_headless_mode, ensure_dir
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Substrings of a navigation error message mapped to a troubleshooting hint.
# The "net::ERR_*" spellings are the ones the original code matched; the
# "NS_ERROR_*" spellings are presumably what the Firefox-based Camoufox
# reports for the same conditions -- TODO confirm against real error text.
_NETWORK_ERROR_HINTS = (
    (("net::ERR_NAME_NOT_RESOLVED", "NS_ERROR_UNKNOWN_HOST"),
     "排查建议:检查DNS设置或域名是否正确。"),
    (("net::ERR_CONNECTION_REFUSED", "NS_ERROR_CONNECTION_REFUSED"),
     "排查建议:目标服务器可能已关闭,或代理/防火墙阻止了连接。"),
    (("net::ERR_INTERNET_DISCONNECTED", "NS_ERROR_OFFLINE"),
     "排查建议:检查本机的网络连接。"),
)


def _is_visible_within(locator, timeout_ms):
    """Return True if ``locator`` becomes visible within ``timeout_ms`` milliseconds.

    ``Locator.is_visible(timeout=...)`` ignores its timeout and answers
    immediately, so a bounded wait has to go through ``Locator.wait_for``.
    """
    try:
        locator.wait_for(state='visible', timeout=timeout_ms)
    except TimeoutError:
        return False
    return True


def run_browser_instance(config):
    """Launch and supervise a single Camoufox browser instance.

    Cookies are loaded through ``CookieManager`` from the ``cookie_source``
    object in ``config``, added to a fresh browser context, and the browser
    is navigated to ``config['url']``. The landing page is then validated
    (HTTP status, final URL, loading spinner, authentication banner, login
    buttons); on success control is handed to ``handle_successful_navigation``.

    Args:
        config: Merged instance configuration (dict). Keys read here:
            ``cookie_source`` (required), ``url``, ``proxy`` (optional) and
            ``headless`` (defaults to ``'virtual'``).

    Returns:
        None. Every failure path logs the problem (and, where a page exists,
        saves a screenshot under the logs directory) and returns early.
    """
    cookie_source = config.get('cookie_source')
    if not cookie_source:
        # No instance label is available yet, so report through the default logger.
        logger = setup_logging(os.path.join(logs_dir(), 'app.log'))
        logger.error("错误: 配置中缺少cookie_source对象")
        return

    instance_label = cookie_source.display_name
    logger = setup_logging(
        os.path.join(logs_dir(), 'app.log'), prefix=instance_label
    )
    # The label ends up in screenshot file names, so path separators are replaced.
    diagnostic_tag = instance_label.replace(os.sep, "_")

    expected_url = config.get('url')
    proxy = config.get('proxy')
    headless_setting = config.get('headless', 'virtual')

    # Load the cookies for this instance through the CookieManager.
    cookie_manager = CookieManager(logger)
    try:
        cookies = list(cookie_manager.load_cookies(cookie_source))
    except Exception as e:
        logger.error(f"从cookie来源加载时出错: {e}")
        return

    if not cookies:
        logger.error("错误: 没有可用的cookie(既没有有效的JSON文件,也没有环境变量)")
        return

    headless_mode = parse_headless_mode(headless_setting)
    launch_options = {"headless": headless_mode}
    if proxy:
        logger.info(f"使用代理: {proxy} 访问")
        launch_options["proxy"] = {"server": proxy, "bypass": "localhost, 127.0.0.1"}
    # Image loading is deliberately left enabled (no "block_images" option):
    # there are few images, and blocking them may increase risk-control flags.

    screenshot_dir = logs_dir()
    ensure_dir(screenshot_dir)

    try:
        with Camoufox(**launch_options) as browser:
            context = browser.new_context()
            context.add_cookies(cookies)
            page = context.new_page()

            # ---- Navigation with detailed error handling and logging ----
            try:
                logger.info(f"正在导航到: {expected_url} (超时设置为 120 秒)")
                # page.goto() returns a response object carrying the HTTP status.
                response = page.goto(expected_url, wait_until='domcontentloaded', timeout=120000)

                if response:
                    logger.info(f"导航初步成功,服务器响应状态码: {response.status} {response.status_text}")
                    if not response.ok:  # response.ok is True for 200-299 status codes
                        logger.warning(f"警告:页面加载成功,但HTTP状态码表示错误: {response.status}")
                        # Keep a snapshot for analysis even though the status is an error.
                        page.screenshot(path=os.path.join(screenshot_dir, f"WARN_http_status_{response.status}_{diagnostic_tag}.png"))
                else:
                    # For non-http/https navigations (e.g. about:blank) the response may be None.
                    logger.warning("page.goto 未返回响应对象,可能是一个非HTTP导航。")

            except TimeoutError:
                # The most common failure: the navigation timed out.
                logger.error(f"导航到 {expected_url} 超时 (超过120秒)。")
                logger.error("可能原因:网络连接缓慢、目标网站服务器无响应、代理问题、或页面资源被阻塞。")
                # Best effort: save diagnostics showing where the page got stuck.
                try:
                    screenshot_path = os.path.join(screenshot_dir, f"FAIL_timeout_{diagnostic_tag}.png")
                    page.screenshot(path=screenshot_path, full_page=True)
                    logger.info(f"已截取超时时的屏幕快照: {screenshot_path}")

                    # The HTML helps analysing the DOM, also in headless mode.
                    html_path = os.path.join(screenshot_dir, f"FAIL_timeout_{diagnostic_tag}.html")
                    with open(html_path, 'w', encoding='utf-8') as f:
                        f.write(page.content())
                    logger.info(f"已保存超时时的页面HTML: {html_path}")
                except Exception as diag_e:
                    logger.error(f"在尝试进行超时诊断(截图/保存HTML)时发生额外错误: {diag_e}")
                return  # nothing useful can follow a timeout

            except PlaywrightError as e:
                # Other Playwright network errors: DNS failure, connection refused, ...
                error_message = str(e)
                logger.error(f"导航到 {expected_url} 时发生 Playwright 网络错误。")
                logger.error(f"错误详情: {error_message}")

                # Emit at most one hint: the first entry whose marker matches.
                for markers, hint in _NETWORK_ERROR_HINTS:
                    if any(marker in error_message for marker in markers):
                        logger.error(hint)
                        break

                # Try a screenshot as well, although the page may be unreachable.
                try:
                    screenshot_path = os.path.join(screenshot_dir, f"FAIL_network_error_{diagnostic_tag}.png")
                    page.screenshot(path=screenshot_path)
                    logger.info(f"已截取网络错误时的屏幕快照: {screenshot_path}")
                except Exception as diag_e:
                    logger.error(f"在尝试进行网络错误诊断(截图)时发生额外错误: {diag_e}")
                return  # network error, stop here

            # ---- Navigation did not raise: validate where we landed ----
            logger.info("页面初步加载完成,正在检查并处理初始弹窗...")
            page.wait_for_timeout(2000)

            final_url = page.url
            logger.info(f"导航完成。最终URL为: {final_url}")

            if "accounts.google.com/v3/signin/identifier" in final_url:
                logger.error("检测到Google登录页面(需要输入邮箱)。Cookie已完全失效。")
                page.screenshot(path=os.path.join(screenshot_dir, f"FAIL_identifier_page_{diagnostic_tag}.png"))
                return
            elif expected_url.split('?')[0] in final_url:
                logger.info("URL正确。现在等待页面完成初始加载...")

                # Wait for the loading spinner to disappear first: the error
                # banner or the real content only shows up once the initial
                # asynchronous load has finished.
                spinner_locator = page.locator('mat-spinner')
                try:
                    logger.info("正在等待加载指示器 (spinner) 消失... (最长等待30秒)")
                    spinner_locator.wait_for(state='hidden', timeout=30000)
                    logger.info("加载指示器已消失。页面已完成异步加载。")
                except TimeoutError:
                    logger.error("页面加载指示器在30秒内未消失。页面可能已卡住。")
                    page.screenshot(path=os.path.join(screenshot_dir, f"FAIL_spinner_stuck_{diagnostic_tag}.png"))
                    return  # the page is stuck loading

                # Now the authentication error banner can be checked reliably.
                auth_error_text = "authentication error"
                auth_error_locator = page.get_by_text(auth_error_text, exact=False)

                # A short bounded wait is enough because the page should be stable.
                if _is_visible_within(auth_error_locator, 2000):
                    logger.error(f"检测到认证失败的错误横幅: '{auth_error_text}'. Cookie已过期或无效。")
                    screenshot_path = os.path.join(screenshot_dir, f"FAIL_auth_error_banner_{diagnostic_tag}.png")
                    page.screenshot(path=screenshot_path)
                    return  # definitive failure

                # No banner: fall back to checking for a visible login button.
                logger.info("未检测到认证错误横幅。进行最终确认。")
                login_button_cn = page.get_by_role('button', name='登录')
                login_button_en = page.get_by_role('button', name='Login')

                if _is_visible_within(login_button_cn, 1000) or _is_visible_within(login_button_en, 1000):
                    logger.error("页面上仍显示'登录'按钮。Cookie无效。")
                    page.screenshot(path=os.path.join(screenshot_dir, f"FAIL_login_button_visible_{diagnostic_tag}.png"))
                    return

                # All checks passed: treat the session as logged in.
                logger.info("所有验证通过,确认已成功登录。")
                handle_successful_navigation(page, logger, diagnostic_tag)
            elif "accounts.google.com/v3/signin/accountchooser" in final_url:
                logger.warning("检测到Google账户选择页面。登录失败或Cookie已过期。")
                page.screenshot(path=os.path.join(screenshot_dir, f"FAIL_chooser_click_failed_{diagnostic_tag}.png"))
                return
            else:
                logger.error(f"导航到了一个意外的URL: {final_url}")
                page.screenshot(path=os.path.join(screenshot_dir, f"FAIL_unexpected_url_{diagnostic_tag}.png"))
                return

    except KeyboardInterrupt:
        logger.info("用户中断,正在关闭...")
    except Exception as e:
        # Last-resort handler for anything that was not anticipated above.
        logger.exception(f"运行 Camoufox 实例时发生未预料的严重错误: {e}")
|
browser/navigation.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import os
|
| 3 |
+
from playwright.sync_api import Page, expect
|
| 4 |
+
from utils.paths import logs_dir
|
| 5 |
+
from utils.common import ensure_dir
|
| 6 |
+
|
| 7 |
+
def handle_untrusted_dialog(page: Page, logger=None):
    """Dismiss the "Last modified by..." dialog if it shows up.

    Waits up to 10 seconds for a button named "OK" to become visible. If it
    does, the button is clicked and the dialog is expected to disappear
    within one second. Any problem is logged and swallowed so the caller
    can carry on.

    Args:
        page: Playwright page to inspect.
        logger: Logger used for progress messages. When omitted, a module
            logger is used so the function never dereferences ``None``.
    """
    # Function-scope imports: the module itself only imports Page and expect.
    import logging
    from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

    if logger is None:
        logger = logging.getLogger(__name__)

    ok_button_locator = page.get_by_role("button", name="OK")

    try:
        # Locator.is_visible() ignores its timeout argument and returns
        # immediately, so the 10 second wait has to use wait_for().
        try:
            ok_button_locator.wait_for(state="visible", timeout=10000)
        except PlaywrightTimeoutError:
            logger.info("在10秒内未检测到弹窗,继续执行...")
            return

        logger.info("检测到弹窗,正在点击 'OK' 按钮...")
        ok_button_locator.click(force=True)
        logger.info("'OK' 按钮已点击。")
        expect(ok_button_locator).to_be_hidden(timeout=1000)
        logger.info("弹窗已确认关闭。")
    except Exception as e:
        # Best effort only: a dialog problem must not stop the instance.
        logger.info(f"检查弹窗时发生意外:{e},将继续执行...")
|
| 26 |
+
|
| 27 |
+
def handle_successful_navigation(page: Page, logger, cookie_file_config):
    """Run the post-login routine: dismiss the dialog, then keep the page alive.

    After the target page has been reached, the page body is clicked to give
    it focus, the "Last modified by..." dialog is handled, and after a
    15 second pause the body is clicked every 10 seconds until a click (or
    the pause) raises. The first such error is logged, a full-page
    screenshot is attempted, and the function returns.

    Args:
        page: Playwright page that reached the target URL.
        logger: Logger for progress and error messages.
        cookie_file_config: Value embedded in the failure screenshot's file name.
    """
    logger.info("已成功到达目标页面。")
    page.click('body')  # give the page focus

    # Check for and dismiss the "Last modified by..." dialog.
    handle_untrusted_dialog(page, logger=logger)

    # Give the page time to load and render.
    logger.info("等待15秒以便页面完全渲染...")
    time.sleep(15)

    logger.info("实例将保持运行状态。每10秒点击一次页面以保持活动。")
    keep_alive_error = None
    while keep_alive_error is None:
        try:
            page.click('body')
            time.sleep(10)
        except Exception as e:
            # The page was closed or broke: leave the loop and report below.
            keep_alive_error = e

    logger.error(f"在保持活动循环中出错: {keep_alive_error}")
    # Capture what the page looked like when the keep-alive loop failed.
    try:
        screenshot_dir = logs_dir()
        ensure_dir(screenshot_dir)
        screenshot_filename = os.path.join(screenshot_dir, f"FAIL_keep_alive_error_{cookie_file_config}.png")
        page.screenshot(path=screenshot_filename, full_page=True)
        logger.info(f"已在保持活动循环出错时截屏: {screenshot_filename}")
    except Exception as screenshot_e:
        logger.error(f"在保持活动循环出错时截屏失败: {screenshot_e}")
|
main.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import threading
|
| 3 |
+
import multiprocessing
|
| 4 |
+
import signal
|
| 5 |
+
import sys
|
| 6 |
+
import time
|
| 7 |
+
|
| 8 |
+
from browser.instance import run_browser_instance
|
| 9 |
+
from utils.logger import setup_logging
|
| 10 |
+
from utils.paths import cookies_dir, logs_dir
|
| 11 |
+
from utils.cookie_manager import CookieManager
|
| 12 |
+
from utils.common import clean_env_value, ensure_dir
|
| 13 |
+
|
| 14 |
+
# 全局变量
|
| 15 |
+
# Browser worker processes; start_browser_instances() appends to this list and drops entries that have exited.
browser_processes = []
|
| 16 |
+
# Run flag: set to True by the run_*_mode() functions and back to False by signal_handler().
app_running = False
|
| 17 |
+
# Flask application object; only assigned by run_server_mode().
flask_app = None
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def load_instance_configurations(logger):
    """Build the shared settings and one instance profile per cookie source.

    Reads ``CAMOUFOX_INSTANCE_URL`` (required), ``CAMOUFOX_HEADLESS`` and
    ``CAMOUFOX_PROXY`` from the environment, then asks ``CookieManager`` for
    every available cookie source.

    Args:
        logger: Logger used for error and summary messages.

    Returns:
        tuple: ``(global_settings, instances)``. ``global_settings`` is a dict
        with ``headless``, ``url`` and, when configured, ``proxy``.
        ``instances`` is a list of per-source dicts. ``(None, None)`` is
        returned when the URL is missing or no cookie source was found.
    """
    # 1. The target URL shared by every instance.
    shared_url = clean_env_value(os.getenv("CAMOUFOX_INSTANCE_URL"))
    if not shared_url:
        logger.error("错误: 缺少环境变量 CAMOUFOX_INSTANCE_URL。所有实例需要一个共享的目标URL。")
        return None, None

    # 2. Global settings applied to every instance.
    headless_value = clean_env_value(os.getenv("CAMOUFOX_HEADLESS"))
    global_settings = {
        "headless": headless_value or "virtual",
        "url": shared_url,
    }
    proxy_value = clean_env_value(os.getenv("CAMOUFOX_PROXY"))
    if proxy_value:
        global_settings["proxy"] = proxy_value

    # 3. Discover every cookie source through the CookieManager.
    sources = CookieManager(logger).detect_all_sources()
    if not sources:
        logger.error("错误: 未找到任何cookie来源(既没有JSON文件,也没有环境变量cookie)。")
        return None, None

    # 4. One profile per cookie source; other source types are ignored.
    instances = []
    for source in sources:
        if source.type == "file":
            profile = {
                "cookie_file": source.identifier,
                "cookie_source": source,
            }
        elif source.type == "env_var":
            # The index is the text after the last underscore, e.g. "USER_COOKIE_1" -> 1.
            index_text = source.identifier.rsplit("_", 1)[-1]
            profile = {
                "cookie_file": None,
                "env_cookie_index": int(index_text),
                "cookie_source": source,
            }
        else:
            continue
        instances.append(profile)

    logger.info(f"将启动 {len(instances)} 个浏览器实例")
    return global_settings, instances
|
| 69 |
+
|
| 70 |
+
def start_browser_instances():
    """Start one browser process per instance profile and wait for them.

    Loads the configuration, starts a ``multiprocessing.Process`` running
    ``run_browser_instance`` for each profile (30 seconds apart), then polls
    the processes until ``app_running`` is cleared or all of them have exited.
    """
    global browser_processes, app_running

    logger = setup_logging(str(logs_dir() / 'app.log'))
    logger.info("---------------------Camoufox 实例管理器开始启动---------------------")

    global_settings, instance_profiles = load_instance_configurations(logger)
    if not instance_profiles:
        logger.error("错误: 环境变量中未找到任何实例配置。")
        return

    total = len(instance_profiles)
    # Label used in the start-up message for each known source type.
    source_labels = {"file": "file", "env_var": "env"}

    for position, profile in enumerate(instance_profiles, 1):
        if not app_running:
            break

        # Per-instance values override the shared settings.
        final_config = {**global_settings, **profile}

        if 'url' not in final_config:
            logger.warning(f"警告: 跳过一个无效的配置项 (缺少 url): {profile}")
            continue

        cookie_source = final_config.get('cookie_source')
        if not cookie_source:
            logger.error("错误: 配置中缺少cookie_source对象")
            continue

        label = source_labels.get(cookie_source.type)
        if label is not None:
            logger.info(
                f"正在启动第 {position}/{total} 个浏览器实例 ({label}: {cookie_source.display_name})..."
            )

        worker = multiprocessing.Process(target=run_browser_instance, args=(final_config,))
        browser_processes.append(worker)
        worker.start()

        # Stagger start-ups by 30 seconds to avoid a CPU spike from
        # several browsers launching at once; no wait after the last one.
        if position < total:
            logger.info("等待 30 秒后启动下一个实例...")
            time.sleep(30)

    # Wait for the worker processes.
    try:
        while app_running and browser_processes:
            # Iterate over a copy because finished workers are removed in place.
            for worker in list(browser_processes):
                if worker.is_alive():
                    worker.join(timeout=1)
                else:
                    browser_processes.remove(worker)
            time.sleep(1)
    except KeyboardInterrupt:
        logger.info("捕获到终止信号,正在关闭所有浏览器进程...")
        for worker in browser_processes:
            worker.terminate()
            worker.join()
|
| 132 |
+
|
| 133 |
+
def run_standalone_mode():
    """Standalone mode: mark the application as running and start the browsers.

    ``start_browser_instances`` is called directly, so this function only
    returns when that call does.
    """
    global app_running

    app_running = True
    start_browser_instances()
|
| 139 |
+
|
| 140 |
+
def run_server_mode():
    """Server mode: run the browsers in a background thread behind a Flask app.

    Flask is imported lazily so it is only needed in this mode. The browser
    instances are started from a daemon thread, and two JSON endpoints
    (``/health`` and ``/``) report how many browser processes exist and how
    many are alive. The Flask server listens on 0.0.0.0:7860.
    """
    global app_running, flask_app

    server_logger = setup_logging(str(logs_dir() / 'app.log'), prefix="server")

    # Import Flask only when server mode is actually requested.
    try:
        from flask import Flask, jsonify
        flask_app = Flask(__name__)
    except ImportError:
        server_logger.error("错误: 服务器模式需要 Flask,请安装: pip install flask")
        return

    app_running = True

    # Start the browser instances in a background thread.
    threading.Thread(target=start_browser_instances, daemon=True).start()

    def _count_alive():
        # Number of tracked browser processes that are still alive.
        return sum(1 for proc in browser_processes if proc.is_alive())

    @flask_app.route('/health')
    def health_check():
        """Health-check endpoint."""
        running_count = _count_alive()
        payload = {
            'status': 'healthy',
            'browser_instances': len(browser_processes),
            'running_instances': running_count,
            'message': f'Application is running with {running_count} active browser instances',
        }
        return jsonify(payload)

    @flask_app.route('/')
    def index():
        """Home-page endpoint."""
        running_count = _count_alive()
        payload = {
            'status': 'running',
            'browser_instances': len(browser_processes),
            'running_instances': running_count,
            'run_mode': 'server',
            'message': 'Camoufox Browser Automation is running in server mode',
        }
        return jsonify(payload)

    # Silence Flask's default request log (werkzeug) below ERROR level.
    import logging
    logging.getLogger('werkzeug').setLevel(logging.ERROR)

    # Start the Flask server; this blocks until the server stops.
    try:
        flask_app.run(host='0.0.0.0', port=7860, debug=False)
    except KeyboardInterrupt:
        server_logger.info("服务器正在关闭...")
|
| 195 |
+
|
| 196 |
+
def signal_handler(signum, frame):
    """Shut the application down when SIGTERM or SIGINT is received.

    In the main process the run flag is cleared and every live browser
    process is terminated. A process still alive 5 seconds after
    ``terminate()`` is killed. The interpreter then exits with status 0.

    Args:
        signum: Number of the received signal (used in the log message).
        frame: Current stack frame supplied by the ``signal`` module (unused).
    """
    global app_running

    # Browser workers started via fork inherit this handler. They must not
    # manage sibling Process objects (only the parent may call is_alive /
    # terminate on them), so a worker simply exits.
    if multiprocessing.parent_process() is not None:
        sys.exit(0)

    logger = setup_logging(str(logs_dir() / 'app.log'), prefix="signal")
    logger.info(f"接收到信号 {signum},正在关闭应用...")
    app_running = False

    # Iterate over a snapshot: another thread may still be editing the list.
    for process in list(browser_processes):
        if not process.is_alive():
            continue
        process.terminate()
        # join() returns silently when the timeout elapses, so liveness must
        # be re-checked explicitly before escalating to kill().
        process.join(timeout=5)
        if process.is_alive():
            process.kill()
            process.join(timeout=5)

    logger.info("所有进程已关闭")
    sys.exit(0)
|
| 214 |
+
|
| 215 |
+
def main():
    """Program entry point.

    Creates the logs and cookies directories, installs ``signal_handler``
    for SIGTERM and SIGINT, and then runs server mode when the ``HG``
    environment variable equals "true" (case-insensitive), standalone mode
    otherwise.
    """
    # Make sure the required directories exist.
    for locate_dir in (logs_dir, cookies_dir):
        ensure_dir(locate_dir())

    # Install the shared signal handler.
    for signum in (signal.SIGTERM, signal.SIGINT):
        signal.signal(signum, signal_handler)

    # The HG environment variable selects the run mode.
    server_requested = os.getenv('HG', '').lower() == 'true'
    run_mode = run_server_mode if server_requested else run_standalone_mode
    run_mode()


if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
camoufox[geoip]==0.4.11
|
| 2 |
+
aiohappyeyeballs==2.6.1
|
| 3 |
+
aiohttp==3.12.13
|
| 4 |
+
aiosignal==1.3.2
|
| 5 |
+
attrs==25.3.0
|
| 6 |
+
browserforge==1.2.3
|
| 7 |
+
certifi==2025.6.15
|
| 8 |
+
charset-normalizer==3.4.2
|
| 9 |
+
click==8.2.1
|
| 10 |
+
frozenlist==1.7.0
|
| 11 |
+
flask==3.0.0
|
| 12 |
+
geoip2==5.1.0
|
| 13 |
+
greenlet==3.2.3
|
| 14 |
+
idna==3.10
|
| 15 |
+
language-tags==1.2.0
|
| 16 |
+
lxml==5.4.0
|
| 17 |
+
maxminddb==2.7.0
|
| 18 |
+
multidict==6.5.0
|
| 19 |
+
numpy==2.3.0
|
| 20 |
+
orjson==3.10.18
|
| 21 |
+
platformdirs==4.3.8
|
| 22 |
+
playwright==1.52.0
|
| 23 |
+
propcache==0.3.2
|
| 24 |
+
pyee==13.0.0
|
| 25 |
+
PySocks==1.7.1
|
| 26 |
+
requests==2.32.4
|
| 27 |
+
screeninfo==0.8.1
|
| 28 |
+
tqdm==4.67.1
|
| 29 |
+
typing_extensions==4.14.0
|
| 30 |
+
ua-parser==1.0.1
|
| 31 |
+
ua-parser-builtins==0.18.0.post1
|
| 32 |
+
urllib3==2.4.0
|
| 33 |
+
yarl==1.20.1
|
utils/common.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
通用工具函数
|
| 3 |
+
提供项目中常用的基础功能
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
def clean_env_value(value):
    """Strip leading and trailing whitespace from an environment-variable value.

    Args:
        value: Raw value of the environment variable (a string or ``None``).

    Returns:
        str or None: The stripped value, or ``None`` when the input is
        ``None`` or nothing is left after stripping.
    """
    trimmed = value.strip() if value is not None else ""
    return trimmed if trimmed else None
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def parse_headless_mode(headless_setting):
    """Translate the headless configuration value into a launch option.

    The value is converted with ``str()`` and lower-cased before comparison;
    it is not stripped.

    Args:
        headless_setting: Configured headless value.

    Returns:
        bool or str: ``True`` for "true" (headless), ``False`` for "false"
        (with a window), and ``'virtual'`` for anything else.
    """
    known_modes = {'true': True, 'false': False}
    return known_modes.get(str(headless_setting).lower(), 'virtual')
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def ensure_dir(path):
    """Create the directory (and missing parents) if it does not exist yet.

    Args:
        path: Directory path, given as a string or a ``Path`` object.
    """
    target = Path(path) if isinstance(path, str) else path
    os.makedirs(target, exist_ok=True)
|
utils/cookie_handler.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def convert_cookie_editor_to_playwright(cookies_from_editor, logger=None):
    """Convert cookies exported by the Cookie-Editor extension to Playwright's format.

    For every input cookie the keys ``name``, ``value``, ``domain``, ``path``,
    ``httpOnly`` and ``secure`` are copied when present. ``expires`` is set to
    -1 for session cookies or a null ``expirationDate``, otherwise to the
    integer part of ``expirationDate``. ``sameSite`` is mapped as
    ``no_restriction`` -> ``None``, ``lax`` -> ``Lax``, ``strict`` -> ``Strict``
    and ``unspecified`` -> ``Lax``; any other value is left out.

    Args:
        cookies_from_editor: Iterable of cookie dicts exported by Cookie-Editor.
        logger: Optional logger; when given, skipped cookies are reported
            as warnings.

    Returns:
        list: Converted cookie dicts. Cookies lacking any of ``name``,
        ``value``, ``domain`` or ``path`` are skipped.
    """
    copied_keys = ('name', 'value', 'domain', 'path', 'httpOnly', 'secure')
    required_keys = ('name', 'value', 'domain', 'path')
    same_site_names = {
        'no_restriction': 'None',
        'lax': 'Lax',
        'strict': 'Strict',
        'unspecified': 'Lax',
    }

    playwright_cookies = []
    for cookie in cookies_from_editor:
        pw_cookie = {key: cookie[key] for key in copied_keys if key in cookie}

        if cookie.get('session', False):
            pw_cookie['expires'] = -1
        elif 'expirationDate' in cookie:
            expiration = cookie['expirationDate']
            pw_cookie['expires'] = int(expiration) if expiration is not None else -1

        if 'sameSite' in cookie:
            same_site = same_site_names.get(str(cookie['sameSite']).lower())
            if same_site is not None:
                pw_cookie['sameSite'] = same_site

        if all(key in pw_cookie for key in required_keys):
            playwright_cookies.append(pw_cookie)
        elif logger:
            logger.warning(f"跳过一个格式不完整的 Cookie: {cookie}")

    return playwright_cookies
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def convert_kv_to_playwright(kv_string, default_domain=".google.com", logger=None):
    """Convert a "name=value; name=value" cookie string to Playwright's format.

    Args:
        kv_string (str): Cookie string of the form
            "name1=value1; name2=value2; ...".
        default_domain (str): Domain assigned to every cookie; defaults to
            ".google.com".
        logger: Optional logger; skipped fragments are reported as warnings
            and converted cookies at debug level.

    Returns:
        list: Playwright-compatible cookie dicts. Empty fragments, fragments
        without "=" and fragments with an empty name are skipped.
    """
    playwright_cookies = []

    for fragment in kv_string.split(';'):
        pair = fragment.strip()
        if not pair:
            continue

        # Split on the first "=" only, so values may themselves contain "=".
        name, separator, value = pair.partition('=')
        if not separator:
            if logger:
                logger.warning(f"跳过无效的 Cookie 格式: '{pair}'")
            continue

        name = name.strip()
        value = value.strip()
        if not name:
            if logger:
                logger.warning(f"跳过空名称的 Cookie: '{pair}'")
            continue

        playwright_cookies.append({
            'name': name,
            'value': value,
            'domain': default_domain,
            'path': '/',
            'expires': -1,      # session cookie by default
            'httpOnly': False,  # the key=value format cannot express httpOnly
            'secure': True,     # assumed to be a secure cookie
            'sameSite': 'Lax',  # default SameSite policy
        })

        if logger:
            logger.debug(f"成功转换 Cookie: {name} -> domain={default_domain}")

    return playwright_cookies
|
utils/cookie_manager.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
统一的Cookie管理器
|
| 3 |
+
整合JSON文件和环境变量cookie的检测、加载和管理功能
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import json
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
from typing import List, Dict, Optional
|
| 10 |
+
from utils.paths import cookies_dir
|
| 11 |
+
from utils.cookie_handler import convert_cookie_editor_to_playwright, convert_kv_to_playwright
|
| 12 |
+
from utils.common import clean_env_value
|
| 13 |
+
|
| 14 |
+
@dataclass
class CookieSource:
    """Uniform description of one place cookies can be loaded from."""

    # Kind of source: "file" or "env_var".
    type: str
    # File name, or an environment variable name such as "USER_COOKIE_1".
    identifier: str
    # Label used when the source is shown in log messages.
    display_name: str
    # Defaults to True; nothing in this class reads it.
    exists: bool = True

    def __str__(self):
        # "<type>:<identifier>" form of the source.
        return "{}:{}".format(self.type, self.identifier)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class CookieManager:
    """
    Unified cookie manager.

    Detects cookie sources (JSON files in the cookies directory and
    ``USER_COOKIE_<n>`` environment variables), loads them as
    Playwright-compatible cookie lists and caches both the detection result
    and the loaded cookies.
    """

    def __init__(self, logger=None):
        """
        Args:
            logger: Optional logger; when None, no log output is produced.
        """
        self.logger = logger
        # None means "not scanned yet"; an empty list is a valid cached result.
        self._detected_sources: Optional[List[CookieSource]] = None
        # Loaded cookies keyed by str(source).
        self._cookie_cache: Dict[str, List[Dict]] = {}

    def detect_all_sources(self) -> List[CookieSource]:
        """
        Detect every available cookie source (JSON files + environment variables).

        The result is cached, so the directory and the environment are only
        scanned on the first call.

        Returns:
            File sources (sorted by file name) followed by environment
            variable sources in index order.
        """
        if self._detected_sources is not None:
            return self._detected_sources

        sources = []

        # 1. Scan the cookies directory for JSON files.
        try:
            cookie_path = cookies_dir()
            if os.path.isdir(cookie_path):
                # os.listdir() returns entries in arbitrary order; sort them so
                # the detected sources are ordered identically on every run.
                cookie_files = sorted(
                    f for f in os.listdir(cookie_path) if f.lower().endswith('.json')
                )

                for cookie_file in cookie_files:
                    sources.append(CookieSource(
                        type="file",
                        identifier=cookie_file,
                        display_name=cookie_file
                    ))

                if cookie_files and self.logger:
                    self.logger.info(f"发现 {len(cookie_files)} 个 Cookie 文件")
                elif self.logger:
                    self.logger.info(f"在 {cookie_path} 目录下未找到任何 .json 格式的 Cookie 文件")
            else:
                if self.logger:
                    self.logger.error(f"Cookie 目录不存在: {cookie_path}")

        except Exception as e:
            # Best effort: a failing directory scan must not prevent the
            # environment variable scan below.
            if self.logger:
                self.logger.error(f"扫描 Cookie 目录时出错: {e}")

        # 2. Scan USER_COOKIE_1, USER_COOKIE_2, ... and stop at the first
        #    variable that is missing or empty.
        cookie_index = 1
        env_cookie_count = 0

        while True:
            env_var_name = f"USER_COOKIE_{cookie_index}"
            env_value = clean_env_value(os.getenv(env_var_name))

            if not env_value:
                if cookie_index == 1 and self.logger:
                    self.logger.info("未检测到任何 USER_COOKIE 环境变量")
                break

            sources.append(CookieSource(
                type="env_var",
                identifier=env_var_name,
                display_name=env_var_name
            ))

            env_cookie_count += 1
            cookie_index += 1

        if env_cookie_count > 0 and self.logger:
            self.logger.info(f"发现 {env_cookie_count} 个 Cookie 环境变量")

        # Cache the result for later calls.
        self._detected_sources = sources
        return sources

    def load_cookies(self, source: CookieSource) -> List[Dict]:
        """
        Load the cookies of a single source.

        Args:
            source: Cookie source to load.

        Returns:
            Playwright-compatible cookie list. An empty list is returned when
            the source type is unknown or loading fails; such results are not
            cached.
        """
        cache_key = str(source)

        # Serve repeated requests from the cache.
        if cache_key in self._cookie_cache:
            if self.logger:
                self.logger.debug(f"从缓存加载 Cookie: {source.display_name}")
            return self._cookie_cache[cache_key]

        cookies = []

        try:
            if source.type == "file":
                cookies = self._load_from_file(source.identifier)
            elif source.type == "env_var":
                cookies = self._load_from_env(source.identifier)
            else:
                if self.logger:
                    self.logger.error(f"未知的 Cookie 来源类型: {source.type}")
                return []

            # Only successful loads are cached.
            self._cookie_cache[cache_key] = cookies

            if self.logger:
                self.logger.info(f"从 {source.display_name} 加载了 {len(cookies)} 个 Cookie 数据")

        except Exception as e:
            if self.logger:
                self.logger.error(f"从 {source.display_name} 加载 Cookie 时出错: {e}")
            return []

        return cookies

    def _load_from_file(self, filename: str) -> List[Dict]:
        """
        Load cookies from a JSON file inside the cookies directory.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        cookie_path = cookies_dir() / filename

        if not os.path.exists(cookie_path):
            raise FileNotFoundError(f"Cookie 文件不存在: {cookie_path}")

        with open(cookie_path, 'r', encoding='utf-8') as f:
            cookies_from_file = json.load(f)

        return convert_cookie_editor_to_playwright(cookies_from_file, logger=self.logger)

    def _load_from_env(self, env_var_name: str) -> List[Dict]:
        """
        Load cookies from a key/value string stored in an environment variable.

        Raises:
            ValueError: If the variable is unset or empty after cleaning.
        """
        env_value = clean_env_value(os.getenv(env_var_name))

        if not env_value:
            raise ValueError(f"环境变量 {env_var_name} 不存在或为空")

        return convert_kv_to_playwright(
            env_value,
            default_domain=".google.com",
            logger=self.logger
        )

    def get_all_sources(self) -> List[CookieSource]:
        """Return every detected cookie source (alias of detect_all_sources)."""
        return self.detect_all_sources()

    def clear_cache(self):
        """Drop all cached cookie lists; the detected source list is kept."""
        self._cookie_cache.clear()
        if self.logger:
            self.logger.debug("Cookie 缓存已清空")

    def get_source_summary(self) -> Dict[str, int]:
        """
        Count the detected cookie sources per type.

        Returns:
            Dict with the keys "total", "files" and "env_vars".
        """
        sources = self.detect_all_sources()
        summary = {
            "total": len(sources),
            "files": 0,
            "env_vars": 0
        }

        for source in sources:
            if source.type == "file":
                summary["files"] += 1
            elif source.type == "env_var":
                summary["env_vars"] += 1

        return summary
|
utils/logger.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
def setup_logging(log_file, prefix=None, level=logging.INFO):
    """
    Configure the application logger to write to a file and to the console.

    Every call reconfigures the handlers of the logger named
    'my_app_logger' (to suit multi-process use): handlers installed by a
    previous call are removed and closed before the new ones are attached.

    :param log_file: Path of the log file.
    :param prefix: (optional) Literal text inserted before each log message
        to identify the log origin.
    :param level: Log level applied to the logger and to both handlers.
    :return: The configured logger.
    """
    logger = logging.getLogger('my_app_logger')
    logger.setLevel(level)

    # Remove AND close the previous handlers. Clearing the handler list alone
    # would leave the old log file handle open after every reconfiguration.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    log_format = '%(asctime)s - %(process)d - %(levelname)s - '
    if prefix:
        # The prefix is plain text: escape '%' so the %-style formatter does
        # not read it as a format directive.
        log_format += str(prefix).replace('%', '%%') + ' - '
    log_format += '%(message)s'

    formatter = logging.Formatter(log_format)

    # Log messages in this project contain non-ASCII text, so do not depend on
    # the locale's default encoding for the log file.
    fh = logging.FileHandler(log_file, encoding='utf-8')
    ch = logging.StreamHandler()

    for handler in (fh, ch):
        handler.setLevel(level)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    # Do not pass records on to ancestor loggers (avoids duplicated output).
    logger.propagate = False

    return logger
|
utils/paths.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from functools import lru_cache
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@lru_cache(maxsize=1)
def project_root() -> Path:
    """
    Locate the repository root so callers can build absolute paths that are
    independent of the current working directory.

    Resolution order:
      1. The CAMOUFOX_PROJECT_ROOT environment variable, when set and non-empty.
      2. The nearest ancestor of this file that contains a "cookies" entry.
      3. The third ancestor of this file (or the topmost one if there are fewer).

    The result is memoised; later changes to the environment are not seen.
    """
    override = os.getenv("CAMOUFOX_PROJECT_ROOT")
    if override:
        return Path(override).expanduser().resolve()

    ancestors = Path(__file__).resolve().parents

    # Walk upwards from this file and stop at the first marker directory.
    marked = next(
        (candidate for candidate in ancestors if (candidate / "cookies").exists()),
        None,
    )
    if marked is not None:
        return marked

    # Fallback to the original behaviour if the marker directory is missing.
    fallback_index = min(2, len(ancestors) - 1)
    return ancestors[fallback_index]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def logs_dir() -> Path:
    """Return the root-level "logs" directory (log files and screenshots)."""
    root = project_root()
    return root / "logs"
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def cookies_dir() -> Path:
    """Return the root-level "cookies" directory (persistent cookie JSON files)."""
    root = project_root()
    return root / "cookies"
|