|
|
@@ -1242,19 +1242,118 @@ async def browser_extract_content(query: str, extract_links: bool = False,
|
|
|
long_term_memory=f"提取内容失败: {query}"
|
|
|
)
|
|
|
|
|
|
async def _detect_and_download_pdf_via_cdp(browser) -> Optional[str]:
    """Detect whether the current page is a PDF and, if so, download it via CDP.

    The download runs as an in-browser ``fetch`` through the CDP session, so it
    automatically carries the browser's cookies/session and can therefore reach
    PDFs that require a login.

    Args:
        browser: Browser session object exposing ``get_current_page_url()`` and
            ``get_or_create_cdp_session()``.

    Returns:
        Local file path of the saved PDF, or ``None`` when the current page is
        not a PDF or the download fails.
    """
    try:
        current_url = await browser.get_current_page_url()
        if not current_url:
            return None

        parsed = urlparse(current_url)
        is_pdf = parsed.path.lower().endswith('.pdf')

        # When the URL does not obviously point at a PDF, probe the document's
        # content-type through CDP instead.
        if not is_pdf:
            try:
                cdp = await browser.get_or_create_cdp_session()
                ct_result = await cdp.cdp_client.send.Runtime.evaluate(
                    params={'expression': 'document.contentType'},
                    session_id=cdp.session_id
                )
                # CDP may report value=None; coerce so .lower() cannot crash.
                content_type = ct_result.get('result', {}).get('value', '') or ''
                is_pdf = 'pdf' in content_type.lower()
            except Exception:
                # Best-effort probe only; keep the URL-based verdict.
                pass

        if not is_pdf:
            return None

        # Download the PDF through the in-browser fetch API (carries cookies).
        cdp = await browser.get_or_create_cdp_session()
        js_code = """
        (async () => {
            try {
                const resp = await fetch(window.location.href);
                if (!resp.ok) return JSON.stringify({error: 'HTTP ' + resp.status});
                const blob = await resp.blob();
                return new Promise((resolve, reject) => {
                    const reader = new FileReader();
                    reader.onloadend = () => resolve(JSON.stringify({data: reader.result}));
                    reader.onerror = () => resolve(JSON.stringify({error: 'FileReader failed'}));
                    reader.readAsDataURL(blob);
                });
            } catch(e) {
                return JSON.stringify({error: e.message});
            }
        })()
        """
        result = await cdp.cdp_client.send.Runtime.evaluate(
            params={
                'expression': js_code,
                'awaitPromise': True,
                'returnByValue': True,
                'timeout': 60000
            },
            session_id=cdp.session_id
        )

        value = result.get('result', {}).get('value', '')
        if not value:
            print("⚠️ CDP fetch PDF: 无返回值")
            return None

        data = json.loads(value)
        if 'error' in data:
            print(f"⚠️ CDP fetch PDF 失败: {data['error']}")
            return None

        # Extract and decode the base64 payload of the data URL, e.g.
        # data:application/pdf;base64,JVBERi0...
        data_url = data.get('data', '')
        if ',' not in data_url:
            # A missing 'data' key or a malformed data URL previously raised
            # KeyError/IndexError here and was masked by the outer handler.
            print("⚠️ CDP fetch PDF: data URL 格式异常")
            return None
        pdf_bytes = base64.b64decode(data_url.split(',', 1)[1])
        if not pdf_bytes:
            # Avoid silently writing a 0-byte "PDF" to disk.
            print("⚠️ CDP fetch PDF: 内容为空")
            return None

        # Persist under the working directory so downstream tools can read it.
        save_dir = Path.cwd() / ".browser_use_files"
        save_dir.mkdir(parents=True, exist_ok=True)

        filename = Path(parsed.path).name if parsed.path else ""
        if not filename or not filename.lower().endswith('.pdf'):
            import time
            filename = f"downloaded_{int(time.time())}.pdf"
        save_path = str(save_dir / filename)

        with open(save_path, 'wb') as f:
            f.write(pdf_bytes)

        print(f"📄 PDF 已通过 CDP 下载到: {save_path} ({len(pdf_bytes)} bytes)")
        return save_path

    except Exception as e:
        print(f"⚠️ PDF 检测/下载异常: {e}")
        return None
|
|
|
@tool()
|
|
|
async def browser_read_long_content(
|
|
|
- goal: Any,
|
|
|
- source: str = "page",
|
|
|
- context: Any = "",
|
|
|
- **kwargs
|
|
|
+ goal: Any,
|
|
|
+ source: str = "page",
|
|
|
+ context: Any = "",
|
|
|
+ **kwargs
|
|
|
) -> ToolResult:
|
|
|
"""
|
|
|
- 智能读取长内容。已修复参数嵌套导致的 Field Missing 报错。
|
|
|
+ 智能读取长内容。支持自动检测并读取网页上的 PDF 文件。
|
|
|
+
|
|
|
+ 当 source="page" 且当前页面是 PDF 时,会通过 CDP 下载 PDF 并用 pypdf 解析,
|
|
|
+ 而非使用 DOM 提取(DOM 无法读取浏览器内置 PDF Viewer 的内容)。
|
|
|
+ 通过 CDP 下载可自动携带浏览器的 cookies/session,支持需要登录的 PDF。
|
|
|
"""
|
|
|
try:
|
|
|
browser, tools = await get_browser_session()
|
|
|
-
|
|
|
+
|
|
|
# 1. 提取目标文本 (针对 GoalTree 字典结构)
|
|
|
final_goal_text = ""
|
|
|
if isinstance(goal, dict):
|
|
|
@@ -1265,27 +1364,32 @@ async def browser_read_long_content(
|
|
|
# 2. 清洗业务背景 (过滤框架注入的 dict 类型 context)
|
|
|
business_context = context if isinstance(context, str) else ""
|
|
|
|
|
|
- # 3. 验证并实例化
|
|
|
+ # 3. PDF 自动检测:当 source="page" 时检查是否为 PDF 页面
|
|
|
+ available_files = []
|
|
|
+ if source.lower() == "page":
|
|
|
+ pdf_path = await _detect_and_download_pdf_via_cdp(browser)
|
|
|
+ if pdf_path:
|
|
|
+ source = pdf_path
|
|
|
+ available_files.append(pdf_path)
|
|
|
+
|
|
|
+ # 4. 验证并实例化
|
|
|
action_params = ReadContentAction(
|
|
|
- goal=final_goal_text,
|
|
|
+ goal=final_goal_text,
|
|
|
source=source,
|
|
|
context=business_context
|
|
|
)
|
|
|
|
|
|
- # --- 4. 核心修复:解包参数 (Unpacking) ---
|
|
|
- # 使用 ** 将字典解包为平铺参数:goal='...', source='...', context='...'
|
|
|
- # 这样底层函数就能直接识别到 goal 字段了
|
|
|
+ # 5. 解包参数调用底层方法
|
|
|
result = await tools.read_long_content(
|
|
|
- **action_params.model_dump(),
|
|
|
+ **action_params.model_dump(),
|
|
|
browser_session=browser,
|
|
|
page_extraction_llm=RunnableLambda(extraction_adapter),
|
|
|
- available_file_paths=[]
|
|
|
+ available_file_paths=available_files
|
|
|
)
|
|
|
|
|
|
return action_result_to_tool_result(result, f"深度读取: {source}")
|
|
|
|
|
|
except Exception as e:
|
|
|
- # 补全 output 参数确保报错链路不崩溃
|
|
|
return ToolResult(
|
|
|
title="深度读取失败",
|
|
|
output="",
|