guantao 2 days ago
Parent
Commit
2b6e974588

+ 86 - 7
agent/tools/builtin/content/platforms/aigc_channel.py

@@ -116,10 +116,32 @@ async def search(
 
     # 构建概览摘要
     summary_list = []
+    
+    # 动态导入评价模块
+    try:
+        from examples.process_pipeline.script.evaluate_source_quality import SourceQualityEvaluator
+        evaluator = SourceQualityEvaluator()
+    except ImportError:
+        evaluator = None
+        
     for idx, post in enumerate(posts, 1):
         body = post.get("body_text", "") or ""
         title = post.get("title") or body[:20] or ""
-        summary_list.append({
+        
+        score_info = {}
+        if evaluator:
+            try:
+                eval_res = evaluator.evaluate_post(post)
+                score_info = {
+                    "quality_score": eval_res["total_score"],
+                    "quality_grade": eval_res["grade"]
+                }
+                post["_quality_score"] = eval_res["total_score"]
+                post["_quality_grade"] = eval_res["grade"]
+            except Exception:
+                pass
+                
+        summary_item = {
             "index": idx,
             "title": title,
             "body_text": body[:100] + ("..." if len(body) > 100 else ""),
@@ -128,7 +150,9 @@ async def search(
             "channel": post.get("channel"),
             "link": post.get("link"),
             "content_type": post.get("content_type"),
-        })
+        }
+        summary_item.update(score_info)
+        summary_list.append(summary_item)
 
     # 封面拼图
     images = []
@@ -151,18 +175,67 @@ async def search(
 
 # ── 详情实现(从缓存获取,不需要额外 HTTP) ──
 
+MAX_DETAIL_IMAGES = 10  # detail 中保留的图片总数上限(含拼图)
+KEEP_INDIVIDUAL = 8     # 单张图片保留数量;剩余图片合并为 1 张拼图
+
+
+async def _build_images_collage(urls: List[str]) -> Optional[Dict[str, Any]]:
+    """将一组图片 URL 拼成单张网格图"""
+    if not urls:
+        return None
+
+    loaded = await load_images(urls)
+    valid_images = [img for (_, img) in loaded if img is not None]
+    if not valid_images:
+        return None
+
+    grid = build_image_grid(images=valid_images, labels=None)
+    import io
+    buf = io.BytesIO()
+    grid.save(buf, format="PNG")
+    img_bytes = buf.getvalue()
+
+    try:
+        from agent.tools.builtin.file.image_cdn import _upload_bytes_to_oss
+        import hashlib
+
+        md5_hash = hashlib.md5(img_bytes).hexdigest()[:12]
+        filename = f"collage_detail_{md5_hash}.png"
+        cdn_url = await _upload_bytes_to_oss(img_bytes, filename)
+        return {"type": "url", "url": cdn_url}
+    except Exception as e:
+        import logging
+        logging.getLogger(__name__).warning("Failed to upload detail collage to CDN: %s", e)
+        b64, _ = encode_base64(grid, format="PNG")
+        return {"type": "base64", "media_type": "image/png", "data": b64}
+
+
 async def detail(post: Dict[str, Any], extras: Optional[Dict[str, Any]] = None) -> ToolResult:
     """返回单条帖子的完整内容"""
     title = post.get("title") or post.get("body_text", "")[:30] or "无标题"
 
+    img_urls = [u for u in post.get("images", []) if u]
     images = []
-    for img_url in post.get("images", []):
-        if img_url:
-            images.append({"type": "url", "url": img_url})
+    if len(img_urls) > MAX_DETAIL_IMAGES:
+        # 保留前 KEEP_INDIVIDUAL 张原图,剩余拼成 1 张网格图
+        for u in img_urls[:KEEP_INDIVIDUAL]:
+            images.append({"type": "url", "url": u})
+        collage = await _build_images_collage(img_urls[KEEP_INDIVIDUAL:])
+        if collage:
+            images.append(collage)
+    else:
+        for u in img_urls:
+            images.append({"type": "url", "url": u})
+
+    output_json = json.dumps(post, ensure_ascii=False, indent=2)
+    output_text = (
+        output_json
+        + "\n\n---\n请基于以上内容,从信息完整度、内容质量和实用价值三个角度,给出一句简短的内容评价。"
+    )
 
     return ToolResult(
         title=f"详情: {title}",
-        output=json.dumps(post, ensure_ascii=False, indent=2),
+        output=output_text,
         long_term_memory=f"Viewed detail: {title}",
         images=images,
     )
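
The capping rule above (at most `MAX_DETAIL_IMAGES` image entries per detail result, with everything beyond `KEEP_INDIVIDUAL` collapsed into a single collage slot) can be read as one small pure function. A minimal sketch for illustration only; `plan_detail_images` is a hypothetical helper, not part of this commit:

```python
# Sketch of the image-capping rule used in detail(); plan_detail_images is
# a hypothetical helper for illustration, not part of this commit.
from typing import List, Tuple

MAX_DETAIL_IMAGES = 10  # cap on image entries in a detail result (collage included)
KEEP_INDIVIDUAL = 8     # originals kept as-is; the rest are merged into one grid


def plan_detail_images(img_urls: List[str]) -> Tuple[List[str], List[str]]:
    """Split URLs into (kept as individual images, merged into a single collage)."""
    if len(img_urls) > MAX_DETAIL_IMAGES:
        return img_urls[:KEEP_INDIVIDUAL], img_urls[KEEP_INDIVIDUAL:]
    return list(img_urls), []


# 12 input URLs -> 8 individual entries plus 1 collage of the remaining 4,
# so the detail payload never exceeds MAX_DETAIL_IMAGES image entries.
print(plan_detail_images([f"https://example.com/{i}.jpg" for i in range(12)]))
```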
@@ -201,7 +274,13 @@ async def _build_collage(posts: List[Dict[str, Any]]) -> Optional[str]:
         imgs = post.get("images", [])
         if imgs and imgs[0]:
             urls.append(imgs[0])
-            titles.append(post.get("title", "") or "")
+            base_title = post.get("title", "") or ""
+            score = post.get("_quality_score")
+            if score is not None:
+                title_with_score = f"[{score}分] {base_title}"
+            else:
+                title_with_score = base_title
+            titles.append(title_with_score)
 
     if not urls:
         return None
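
For reference, a minimal sketch of the enrichment path that both `search()` and `_build_collage()` rely on. `SourceQualityEvaluator.evaluate_post()` returns at least `total_score` and `grade` (see `evaluate_source_quality.py` later in this commit); the sample post dict below is made up for illustration:

```python
# Sketch: enrich a summary item and a collage label with the evaluator's output.
# The post dict is illustrative; field names match what evaluate_post() inspects.
from examples.process_pipeline.script.evaluate_source_quality import SourceQualityEvaluator

evaluator = SourceQualityEvaluator()
post = {
    "title": "AI 写真教程",
    "body_text": "详细步骤……" * 30,
    "like_count": 256,
    "publish_timestamp": 1760000000,
    "images": ["https://example.com/1.jpg"],
}

eval_res = evaluator.evaluate_post(post)
summary_item = {"index": 1, "title": post["title"]}
summary_item.update({
    "quality_score": eval_res["total_score"],
    "quality_grade": eval_res["grade"],
})

# The score is also cached on the post so collage labels can show it,
# e.g. "[86.5分] AI 写真教程".
post["_quality_score"] = eval_res["total_score"]
label = f"[{post['_quality_score']}分] {post['title']}"
print(summary_item, label)
```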

+ 87 - 6
agent/tools/builtin/content/platforms/x.py

@@ -36,16 +36,39 @@ async def search(
         result_data = data.get("data", {})
         tweets = result_data.get("data", []) if isinstance(result_data, dict) else []
 
+        # 动态导入评价模块
+        try:
+            from examples.process_pipeline.script.evaluate_source_quality import SourceQualityEvaluator
+            evaluator = SourceQualityEvaluator()
+        except ImportError:
+            evaluator = None
+
         summary_list = []
         for idx, tweet in enumerate(tweets[:max_count], 1):
             text = tweet.get("body_text", "")
-            summary_list.append({
+
+            score_info = {}
+            if evaluator:
+                try:
+                    eval_res = evaluator.evaluate_post(tweet)
+                    score_info = {
+                        "quality_score": eval_res["total_score"],
+                        "quality_grade": eval_res["grade"]
+                    }
+                    tweet["_quality_score"] = eval_res["total_score"]
+                except Exception:
+                    pass
+
+            summary_item = {
                 "index": idx,
                 "author": tweet.get("channel_account_name", ""),
                 "body_text": text[:100] + ("..." if len(text) > 100 else ""),
                 "like_count": tweet.get("like_count"),
                 "comment_count": tweet.get("comment_count"),
-            })
+                "link": tweet.get("link"),
+            }
+            summary_item.update(score_info)
+            summary_list.append(summary_item)
 
         # 拼图
         images = []
@@ -65,20 +88,72 @@ async def search(
         return ToolResult(title="X 搜索异常", output="", error=str(e))
 
 
+MAX_DETAIL_IMAGES = 10
+KEEP_INDIVIDUAL = 8
+
+
+async def _build_images_collage(urls: List[str]) -> Optional[Dict[str, Any]]:
+    """将一组图片 URL 拼成单张网格图"""
+    if not urls:
+        return None
+
+    loaded = await load_images(urls)
+    valid_images = [img for (_, img) in loaded if img is not None]
+    if not valid_images:
+        return None
+
+    grid = build_image_grid(images=valid_images, labels=None)
+    import io
+    buf = io.BytesIO()
+    grid.save(buf, format="PNG")
+    img_bytes = buf.getvalue()
+
+    try:
+        from agent.tools.builtin.file.image_cdn import _upload_bytes_to_oss
+        import hashlib
+
+        md5_hash = hashlib.md5(img_bytes).hexdigest()[:12]
+        filename = f"x_detail_collage_{md5_hash}.png"
+        cdn_url = await _upload_bytes_to_oss(img_bytes, filename)
+        return {"type": "url", "url": cdn_url}
+    except Exception as e:
+        import logging
+        logging.getLogger(__name__).warning("Failed to upload x detail collage to CDN: %s", e)
+        b64, _ = encode_base64(grid, format="PNG")
+        return {"type": "base64", "media_type": "image/png", "data": b64}
+
+
 async def detail(post: Dict[str, Any], extras: Optional[Dict[str, Any]] = None) -> ToolResult:
     """X 的详情直接从缓存的搜索结果取完整数据"""
     author = post.get("channel_account_name", "")
     text = post.get("body_text", "")[:30]
 
-    all_images = []
+    img_urls = []
     for img_item in post.get("image_url_list", []):
         url = img_item.get("image_url") if isinstance(img_item, dict) else img_item
         if url:
-            all_images.append({"type": "url", "url": url})
+            img_urls.append(url)
+
+    all_images = []
+    if len(img_urls) > MAX_DETAIL_IMAGES:
+        for u in img_urls[:KEEP_INDIVIDUAL]:
+            all_images.append({"type": "url", "url": u})
+        collage = await _build_images_collage(img_urls[KEEP_INDIVIDUAL:])
+        if collage:
+            all_images.append(collage)
+    else:
+        for u in img_urls:
+            all_images.append({"type": "url", "url": u})
+
+    output_json = json.dumps(post, ensure_ascii=False, indent=2)
+    output_text = (
+        output_json
+        + "\n\n---\n请基于以上内容,从信息完整度、内容质量和实用价值三个角度,给出一句简短的内容评价。"
+    )
 
     return ToolResult(
         title=f"X 详情: @{author}",
-        output=json.dumps(post, ensure_ascii=False, indent=2),
+        output=output_text,
         long_term_memory=f"Viewed X post by @{author}: {text}",
         images=all_images,
     )
@@ -97,7 +172,13 @@ async def _build_tweet_collage(tweets: List[Dict[str, Any]]) -> Optional[str]:
             thumb = tweet.get("cover_url")
         if thumb:
             urls.append(thumb)
-            titles.append(f"@{tweet.get('channel_account_name', '')}")
+            base_title = f"@{tweet.get('channel_account_name', '')}"
+            score = tweet.get("_quality_score")
+            if score is not None:
+                title_with_score = f"[{score}分] {base_title}"
+            else:
+                title_with_score = base_title
+            titles.append(title_with_score)
 
     if not urls:
         return None

+ 38 - 4
agent/tools/builtin/content/platforms/youtube.py

@@ -43,15 +43,37 @@ async def search(
         result_data = data.get("data", {})
         videos = result_data.get("data", []) if isinstance(result_data, dict) else []
 
+        # 动态导入评价模块
+        try:
+            from examples.process_pipeline.script.evaluate_source_quality import SourceQualityEvaluator
+            evaluator = SourceQualityEvaluator()
+        except ImportError:
+            evaluator = None
+
         # 概览
         summary_list = []
         for idx, video in enumerate(videos[:max_count], 1):
-            summary_list.append({
+            score_info = {}
+            if evaluator:
+                try:
+                    eval_res = evaluator.evaluate_post(video)
+                    score_info = {
+                        "quality_score": eval_res["total_score"],
+                        "quality_grade": eval_res["grade"]
+                    }
+                    video["_quality_score"] = eval_res["total_score"]
+                    video["_quality_grade"] = eval_res["grade"]
+                except Exception:
+                    pass
+            
+            summary_item = {
                 "index": idx,
                 "title": video.get("title", ""),
                 "author": video.get("author", ""),
                 "video_id": video.get("video_id", ""),
-            })
+            }
+            summary_item.update(score_info)
+            summary_list.append(summary_item)
 
         # 拼图
         images = []
@@ -142,9 +164,15 @@ async def detail(post: Dict[str, Any], extras: Optional[Dict[str, Any]] = None)
             output_data["video_path"] = video_path
             output_data["video_outline"] = video_outline
 
+        output_json = json.dumps(output_data, ensure_ascii=False, indent=2)
+        output_text = (
+            output_json
+            + "\n\n---\n请基于以上内容,从信息完整度、内容质量和实用价值三个角度,给出一句简短的内容评价。"
+        )
+
         return ToolResult(
             title=f"YouTube 详情: {video_info.get('title', content_id)}",
-            output=json.dumps(output_data, ensure_ascii=False, indent=2),
+            output=output_text,
             long_term_memory=f"YouTube detail for {content_id}" + (" with captions" if captions_text else ""),
         )
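
The same evaluation-prompt suffix is appended to the JSON output in all three `detail()` implementations (aigc_channel, x, youtube). A hypothetical shared helper, not part of this commit, would capture the pattern in one place:

```python
# Hypothetical shared helper (not in this commit) for the output_text pattern that
# aigc_channel.py, x.py and youtube.py each repeat inline in detail().
import json
from typing import Any, Dict

EVAL_SUFFIX = (
    "\n\n---\n请基于以上内容,从信息完整度、内容质量和实用价值三个角度,"
    "给出一句简短的内容评价。"
)


def to_detail_output(data: Dict[str, Any]) -> str:
    """Serialize the cached post/result and append the evaluation instruction."""
    return json.dumps(data, ensure_ascii=False, indent=2) + EVAL_SUFFIX
```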
 
@@ -167,7 +195,13 @@ async def _build_video_collage(videos: List[Dict[str, Any]]) -> Optional[str]:
 
         if thumb:
             urls.append(thumb)
-            titles.append(video.get("title", ""))
+            base_title = video.get("title", "")
+            score = video.get("_quality_score")
+            if score is not None:
+                title_with_score = f"[{score}分] {base_title}"
+            else:
+                title_with_score = base_title
+            titles.append(title_with_score)
 
     if not urls:
         return None

+ 3 - 1
examples/process_pipeline/db_requirements.json

@@ -107,5 +107,7 @@
   "test",
   "用AI生成人物写真组图",
   "按url搜索到的6条帖子",
-  "用AI生成真实感live图"
+  "用AI生成真实感live图",
+  "用ai生成真实摄影的美女写真组图,要求具有真实感,氛围感,人物一致性保持",
+  "图文排版(优先选择使用终端/API/agent友好工具的帖子)"
 ]

+ 169 - 28
examples/process_pipeline/prompts/extract_capability.schema.json

@@ -2,19 +2,41 @@
   "$schema": "http://json-schema.org/draft-07/schema#",
   "title": "extract_capability_output_v5",
   "type": "object",
-  "required": ["skip", "skip_reason", "capabilities"],
+  "required": [
+    "skip",
+    "skip_reason",
+    "capabilities"
+  ],
   "properties": {
-    "skip": { "type": "boolean" },
-    "skip_reason": { "type": "string" },
+    "skip": {
+      "type": "boolean"
+    },
+    "skip_reason": {
+      "type": "string"
+    },
     "capabilities": {
       "type": "array",
       "items": {
         "type": "object",
-        "required": ["action", "inputs", "outputs", "body", "effects", "stage", "tools", "criterion", "apply_to_draft", "unstructured_what"],
+        "required": [
+          "action",
+          "inputs",
+          "outputs",
+          "body",
+          "effects",
+          "stage",
+          "tools",
+          "criterion",
+          "apply_to_draft",
+          "unstructured_what"
+        ],
         "properties": {
           "action": {
             "type": "object",
-            "required": ["main_action", "mechanism"],
+            "required": [
+              "main_action",
+              "mechanism"
+            ],
             "properties": {
               "main_action": {
                 "type": "string",
@@ -32,7 +54,17 @@
             "type": "array",
             "items": {
               "type": "object",
-              "required": ["role", "modality", "artifact_type", "control_target", "target_scope", "constraint_strength", "source", "lifecycle", "description"],
+              "required": [
+                "role",
+                "modality",
+                "artifact_type",
+                "control_target",
+                "target_scope",
+                "constraint_strength",
+                "source",
+                "lifecycle",
+                "description"
+              ],
               "properties": {
                 "role": {
                   "type": "string",
@@ -51,12 +83,16 @@
                 },
                 "control_target": {
                   "type": "array",
-                  "items": { "type": "string" },
+                  "items": {
+                    "type": "string"
+                  },
                   "description": "控制目标:主体、身份一致性、脸部特征、姿势、构图、场景、光线、色彩、质感、画质、局部区域、瑕疵排除"
                 },
                 "target_scope": {
                   "type": "array",
-                  "items": { "type": "string" },
+                  "items": {
+                    "type": "string"
+                  },
                   "description": "作用范围:整图、人物、脸部、身体、背景、局部区域"
                 },
                 "constraint_strength": {
@@ -83,48 +119,153 @@
             "type": "array",
             "items": {
               "type": "object",
-              "required": ["role", "modality", "artifact_type", "control_target", "target_scope", "constraint_strength", "source", "lifecycle", "description"],
+              "required": [
+                "role",
+                "modality",
+                "artifact_type",
+                "control_target",
+                "target_scope",
+                "constraint_strength",
+                "source",
+                "lifecycle",
+                "description"
+              ],
               "properties": {
-                "role": { "type": "string", "minLength": 1 },
-                "modality": { "type": "string", "minLength": 1 },
-                "artifact_type": { "type": "string", "minLength": 1 },
-                "control_target": { "type": "array", "items": { "type": "string" } },
-                "target_scope": { "type": "array", "items": { "type": "string" } },
-                "constraint_strength": { "type": "string" },
-                "source": { "type": "string" },
-                "lifecycle": { "type": "string" },
-                "description": { "type": "string", "minLength": 1 }
+                "role": {
+                  "type": "string",
+                  "minLength": 1
+                },
+                "modality": {
+                  "type": "string",
+                  "minLength": 1
+                },
+                "artifact_type": {
+                  "type": "string",
+                  "minLength": 1
+                },
+                "control_target": {
+                  "type": "array",
+                  "items": {
+                    "type": "string"
+                  }
+                },
+                "target_scope": {
+                  "type": "array",
+                  "items": {
+                    "type": "string"
+                  }
+                },
+                "constraint_strength": {
+                  "type": "string"
+                },
+                "source": {
+                  "type": "string"
+                },
+                "lifecycle": {
+                  "type": "string"
+                },
+                "description": {
+                  "type": "string",
+                  "minLength": 1
+                }
               }
             }
           },
-          "body": { "type": "string", "minLength": 1 },
+          "body": {
+            "type": "string",
+            "minLength": 1
+          },
           "effects": {
             "type": "array",
-            "items": { "type": "string", "pattern": "^实现" }
+            "items": {
+              "type": "object",
+              "required": [
+                "statement",
+                "criteria",
+                "judge_method",
+                "negative_examples"
+              ],
+              "properties": {
+                "statement": {
+                  "type": "string",
+                  "pattern": "^实现"
+                },
+                "criteria": {
+                  "type": "string",
+                  "minLength": 1
+                },
+                "judge_method": {
+                  "type": "string",
+                  "enum": [
+                    "llm",
+                    "vlm",
+                    "rule",
+                    "human"
+                  ]
+                },
+                "negative_examples": {
+                  "type": "array",
+                  "items": {
+                    "type": "string",
+                    "minLength": 1
+                  },
+                  "default": []
+                }
+              }
+            }
           },
           "stage": {
             "type": "array",
-            "items": { "type": "string", "enum": ["preprocess", "generate", "refine"] }
+            "items": {
+              "type": "string",
+              "enum": [
+                "preprocess",
+                "generate",
+                "refine"
+              ]
+            }
           },
           "tools": {
             "type": "array",
-            "items": { "type": "string" }
+            "items": {
+              "type": "string"
+            }
+          },
+          "criterion": {
+            "type": [
+              "string",
+              "null"
+            ]
           },
-          "criterion": { "type": ["string", "null"] },
           "apply_to_draft": {
             "type": "object",
-            "required": ["实质", "形式"],
+            "required": [
+              "实质",
+              "形式"
+            ],
             "properties": {
-              "实质": { "type": "array", "items": { "type": "string" } },
-              "形式": { "type": "array", "items": { "type": "string" } }
+              "实质": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                }
+              },
+              "形式": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                }
+              }
             }
           },
           "unstructured_what": {
             "type": "array",
-            "items": { "type": "string" }
+            "items": {
+              "type": "string"
+            }
           }
         }
       }
     }
   }
-}
+}
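
For a quick sanity check of the new `effects` item shape, the fragment below validates one instance against a sub-schema copied from the definition above. It assumes the third-party `jsonschema` package is installed; the sample instance is illustrative:

```python
# Sketch: validate one effects item against the object shape introduced above.
from jsonschema import validate

effect_item_schema = {
    "type": "object",
    "required": ["statement", "criteria", "judge_method", "negative_examples"],
    "properties": {
        "statement": {"type": "string", "pattern": "^实现"},
        "criteria": {"type": "string", "minLength": 1},
        "judge_method": {"type": "string", "enum": ["llm", "vlm", "rule", "human"]},
        "negative_examples": {"type": "array", "items": {"type": "string", "minLength": 1}},
    },
}

# Raises jsonschema.ValidationError if the instance violates the schema.
validate(
    instance={
        "statement": "实现人物一致性保持",
        "criteria": "多张输出图中人物五官与发型保持一致",
        "judge_method": "vlm",
        "negative_examples": ["不同图中人物明显换脸"],
    },
    schema=effect_item_schema,
)
```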

+ 66 - 23
examples/process_pipeline/prompts/extract_workflow.prompt

@@ -26,31 +26,30 @@ $system$
 
 - order:步骤序号,整数。
 - phase: 该步骤所属阶段,取值为「非制作」/「预处理」/「生成」/「编辑」之一;「非制作」指创作与运营层面的行为,如故事构思、选题策划、热点参考、发布节奏等,该阶段不含 action;「预处理」是产出物的目的,不面向最终成品;生成和编辑可复用 action 的定义。
-- action: 该步骤的核心动作,格式为「一级动作:二级动作」,如「编辑:局部重绘」、「生成:融合」;若二级动作与一级相同可省略,如「生成」;非制作阶段的 action 值固定为 null。
+- action: 该步骤的核心动作,格式为「一级动作:二级动作」;若二级动作与一级相同可省略;非制作阶段的 action 值固定为 null。
 - body:具体做法,包含 prompt 写法、参数配置、操作细节等;从帖子原文中提取,未提及则为 null。
-- inputs:该步骤的输入,数组。
-- outputs:该步骤的产出,数组。
+- inputs:该步骤的输入,包含来源,数组。
+- outputs:该步骤的产出,包含去向,数组。
 - tools:使用的工具或平台,数组;未提及则为 []。
 
-# inputs / outputs
+# action 字段
 
-每个输入 / 输出项写成:
+## 格式规定
 
-```json
-{
-  "modality": "文本",
-  "description": "该项在当前步骤中实际起到的作用,用简短名词短语表达,如:场景参考、角色参考、故事情节与镜头要求等",不要写指令,中间产物,等没有信息量的词汇
-}
-```
+- 一级动作,即main_action,应从以下五个选项中选择:
+  - 1.生成。此时output为新内容,input不延续到output,只作为约束或参考
+  - 2.编辑。input延续到output,output只是input的修改
+  - 3.提取。output是来源的子集。(来源可以是input或者外部库)
+  - 4.组织。多项素材的结构化集合/索引/模板
+  - 5.筛选。output是候选集的子集(基于评估)。
+- 每个main_action应该有不同的二级动作,即mechanism:
+  - 生成:直接生成/参考引导/一致性保持/动画化/多模态合成/多候选生成/......
+  - 编辑:局部重绘/风格迁移/颜色调整/蒙板重绘/拼接组合/裁切扩展/......
+  - 提取:提示词反推/关键帧提取/蒙板提取/知识库检索/特征向量化/......
+  - 组织:分类入库/模板化/标签化/变量抽象/结构抽象/......
+  - 筛选:抽卡选优/评分排序top k/人工挑选/阈值过滤/......
 
-要求:
-
-- modality 是数据形态,如 文本 / 图片 / 视频 / 音频 / 特征点 / 参数 / 模型 / 向量。
-- 同一次提交给模型的所有文字描述统一合并为一个输入项,不得按语义功能拆分
-
-# action
-
-action 写成对象:
+## action 写成对象:
 
 ```json
 { "main_action": "编辑", "mechanism": "局部重绘" }
@@ -63,6 +62,31 @@ action 写成对象:
 
 {interface_vocab}
 
+# body 字段
+**重要**:在描述具体做法时,应注意结合已有的架构。遇到相关内容时,*必须*使用已有的术语和架构。已有架构如下:
+- 制作表,是制作帖子的原始输入,代表了制作一个贴子所需要的全部信息。所有对帖子解构的最终结果都应该是**制作表**。
+- 知识库,这是一个庞大的数据库系统,用于保存得到的数据,知识是对数据的抽象提炼,由Agent负责获取。
+- 不能新建、生成知识库或其他数据库,**系统里只有一个知识库**。不同种类的数据都存入这一个知识库中,不额外构建新的数据库,但支持根据标签进行筛选,从而隔离不同知识。
+- 业务Agent,是一个自建的智能体系统,可以从知识库中获取知识;也可以处理数据,将其变成知识存入知识库;处理制作表,完成任务。**所有需要的智能体系统都应该被业务Agent所替代**
+
+# inputs / outputs
+
+每个输入 / 输出项写成:
+
+```json
+{
+  "modality": "文本",
+  "description": "该项在当前步骤中实际起到的作用,用简短名词短语表达,如:场景参考、角色参考、故事情节与镜头要求等",不要写指令,中间产物,等没有信息量的词汇。
+  "relation": "文本",该输入/输出的来源/去向。
+}
+```
+
+要求:
+
+- modality 是数据形态,如 文本 / 图片 / 视频 / 音频 / 特征点 / 参数 / 模型 / 向量。
+- 同一次提交给模型的所有文字描述统一合并为一个输入项,不得按语义功能拆分
+- relation 格式应为:[来源.1O]、[去向.2I]、[来源.原始输入]、[去向.最终成品](含义分别为:来源是序号1的output、去向是序号2的input、从原贴得到的信息、最终的结果,不需要额外的文字描述或标点符号)。input只需要来源,output只需要去向,来源和去向可以有多个。
+
 $user$
 
 # 输入:原帖
@@ -87,20 +111,39 @@ $user$
         "inputs": [
           {
             "modality": "...",
-            "description": "..."
+            "description": "...",
+            "relation": "..."
           }
         ],
         "outputs": [
           {
             "modality": "...",
-            "description": "..."
+            "description": "...",
+            "relation": "..."
           }
         ],
         "tools": [],
         "apply_to_draft": { "实质": ["该步骤操作的内容点"], "形式": ["该步骤的呈现方式"] }
       }
     ],
-    "effects": ["实现 XX 效果", "实现 YY 效果"],
+    "effects": [
+      {
+        "statement": "实现XXX",  // 解决了什么具体需求
+        "criteria": "...",  // 结果是否成功的判定标准
+        "judge_method": "...",  // 可选: llm / vlm / rule / human
+        "negative_examples": [
+          "...",
+          "..."
+        ]
+      },
+      {
+        "statement": "实现YYY",
+        "criteria": "...",
+        "judge_method": "...",
+        "negative_examples": [
+          "..."
+        ]
+      }
+    ],
     "criterion": null,
     "unstructured_what": []
   }
@@ -113,4 +156,4 @@ $user$
 - strategy 顶层不要输出 inputs / outputs / tools / stage。
 - 不要任何前言、解释、标题。
 - 字符串值内禁止出现 ASCII 双引号;需要引号请用中文书名号。
-- **effects 数组中的每个字符串都必须以"实现"开头**,如 "实现快速生成"、"实现风格统一"。
+- **effects 的 statement 都必须以"实现"开头**,如 "实现快速生成"、"实现风格统一"。
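
The relation strings defined above ([来源.1O], [去向.2I], [来源.原始输入], [去向.最终成品]) are compact enough to parse mechanically. A minimal sketch under the assumption that the format is exactly as specified; `parse_relation` is a hypothetical helper, not part of this commit:

```python
# Hypothetical parser for the relation format defined above (not part of this commit).
# "[来源.1O]"       -> direction=来源, target = step 1's output
# "[去向.2I]"       -> direction=去向, target = step 2's input
# "[来源.原始输入]" / "[去向.最终成品]" -> special endpoints without a step number
import re
from typing import Optional, Tuple

_STEP_REF = re.compile(r"^(\d+)([IO])$")


def parse_relation(rel: str) -> Tuple[str, Optional[int], str]:
    """Return (direction, step number or None, endpoint: 'I' / 'O' / 原始输入 / 最终成品)."""
    direction, _, ref = rel.strip("[]").partition(".")
    m = _STEP_REF.match(ref)
    if m:
        return direction, int(m.group(1)), m.group(2)
    return direction, None, ref


print(parse_relation("[来源.1O]"))        # ('来源', 1, 'O')
print(parse_relation("[去向.最终成品]"))  # ('去向', None, '最终成品')
```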

+ 50 - 6
examples/process_pipeline/prompts/extract_workflow.schema.json

@@ -67,12 +67,12 @@
                       "main_action": {
                         "type": "string",
                         "minLength": 1,
-                        "description": "主动作:生成、编辑、提取、改写、合成、修复、增强、训练、评估、剪辑、模板化、排版、转写、配音、匹配、扩展、导出"
+                        "description": "主动作:生成、编辑、提取、组织、筛选"
                       },
                       "mechanism": {
                         "type": "string",
                         "minLength": 1,
-                        "description": "动作方式:直接生成、一致性保持、结构约束、质量收束、局部重绘、扩图、换背景等"
+                        "description": "动作方式:直接生成、局部重绘、提示词反推、分类入库、抽卡选优等"
                       }
                     }
                   },
@@ -88,7 +88,8 @@
                       "type": "object",
                       "required": [
                         "modality",
-                        "description"
+                        "description",
+                        "relation"
                       ],
                       "properties": {
                         "modality": {
@@ -100,6 +101,11 @@
                           "type": "string",
                           "minLength": 1,
                           "description": "功能性描述,不写具体内容what"
+                        },
+                        "relation": {
+                          "type": "string",
+                          "minLength": 1,
+                          "description": "来源与去向,有特定格式"
                         }
                       }
                     }
@@ -110,7 +116,8 @@
                       "type": "object",
                       "required": [
                         "modality",
-                        "description"
+                        "description",
+                        "relation"
                       ],
                       "properties": {
                         "modality": {
@@ -120,6 +127,11 @@
                         "description": {
                           "type": "string",
                           "minLength": 1
+                        },
+                        "relation": {
+                          "type": "string",
+                          "minLength": 1,
+                          "description": "来源与去向,有特定格式"
                         }
                       }
                     }
@@ -157,8 +169,40 @@
             "effects": {
               "type": "array",
               "items": {
-                "type": "string",
-                "pattern": "^实现"
+                "type": "object",
+                "required": [
+                  "statement",
+                  "criteria",
+                  "judge_method",
+                  "negative_examples"
+                ],
+                "properties": {
+                  "statement": {
+                    "type": "string",
+                    "pattern": "^实现"
+                  },
+                  "criteria": {
+                    "type": "string",
+                    "minLength": 1
+                  },
+                  "judge_method": {
+                    "type": "string",
+                    "enum": [
+                      "llm",
+                      "vlm",
+                      "rule",
+                      "human"
+                    ]
+                  },
+                  "negative_examples": {
+                    "type": "array",
+                    "items": {
+                      "type": "string",
+                      "minLength": 1
+                    },
+                    "default": []
+                  }
+                }
               }
             },
             "criterion": {

+ 24 - 12
examples/process_pipeline/prompts/researcher.prompt

@@ -36,13 +36,8 @@ $system$
   - 禁止搜索具体的软件名称,如 MJ,controlnet
 2. **搜索要求**:仅搜索/查看近半年的结果,不要查看过时的帖子
 3. **适度查看内容**:对点赞数高或标题符合业务需求的帖子查看详情(可看图片)。
-4. **结构化提取**:
-    提取规则
-    - 步骤粒度是"做了什么",而非"怎么做"
-    - 若涉及输入内容,描述其目的而非具体内容(如"输入将照片转化为X风格的提示词",而非"输入提示词包含A/B/C参数")
-    - 若涉及上传素材,说明素材类型和用途
-    - 同一工具内的参数配置不拆分为多步
-    - 若本质上只有一步,就输出一步
+    - 当调用 `content_search` 时,你会看到每条结果附带了 `quality_score`(质量得分)。**必须主动剔除得分低于 80 分的结果,只提取高质量帖子**。
+    - 在写入 case 前,你需要针对帖子执行多维度评估,包括:内容本身的知识质量(指导性与可信度)、是否包含多步骤执行、以及需求匹配度(具体解决的需求细分)。
 
 ### 第三步:存储结果文件
 🚨 **绝对不能更改任务规定的 `output_file` 路径名**!
@@ -63,9 +58,26 @@ write_file("{output_file}", 更新后的完整 JSON)
   "采集时间": "string - ISO 8601",
   "cases": [
     {
-      "case_id": "string - 格式:{platform}_{channel_content_id},如 bili_BV1xxx、xhs_694e17e9000000001e006669",
+      "case_id": "string - 格式:{platform}_{channel_content_id}",
       "source_url": "string - 帖子链接",
-      "title": "string - 帖子标题"
+      "title": "string - 帖子标题",
+      "evaluation": {
+        "quality": {
+          "overall_score": "number (0-100) - 总体的知识质量分数",
+          "instructive_score": "number (0-10) - 指导性评分(是否包含可参考的、详细的做法和教程)",
+          "credibility": {
+            "on_feedback": "number (0-10) - 基于互动反馈(如点赞/评论等)的可信度评分",
+            "on_content": "number (0-10) - 基于内容特征的可信度评分(警惕AI水帖,甄别真实分享)",
+            "on_author": "number (0-10) - 基于作者特征的可信度评分(根据提供的作者信息评估,若无则酌情打分)"
+          }
+        },
+        "multi_step": "boolean - 是否为多步骤执行(仅做判断,不影响质量分,用于后续筛选)",
+        "requirement": {
+          "match_score": "number (0-10) - 与原始业务需求的匹配度评分",
+          "description": "string - 简述该内容具体解决了什么需求,是否对需求进行了细分"
+        },
+        "reason": "string - 一句话简述给出上述评分的核心理由"
+      }
     }
   ]
 }
@@ -73,9 +85,9 @@ write_file("{output_file}", 更新后的完整 JSON)
 
 **重要说明**:
 - `case_id` 必须使用 `{platform}_{channel_content_id}` 格式,不要自己编号
-- 只需要输出 case_id、source_url、title 三个字段
-- 不需要提取工序步骤(后续会由专门的模块处理)
-- 每收集到 2~3 个 case,应立即持久化一次
+- 必须基于帖子的真实内容给出 evaluation(评价分数)
+- 不需要提取具体的工序步骤(后续会由专门的模块处理)
+- 每收集到 2~3 个高质量 case,应立即持久化追加一次
 
 $user$
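
A minimal sketch of the ≥80 filter the researcher is instructed to apply to `content_search` results. Field names follow the summary items produced by the platform `search()` implementations in this commit; keeping unscored items (evaluator unavailable) is an assumption for illustration, not something the prompt specifies:

```python
# Sketch: drop search results whose quality_score is below 80, as instructed above.
from typing import Any, Dict, List


def keep_high_quality(results: List[Dict[str, Any]], min_score: float = 80) -> List[Dict[str, Any]]:
    kept = []
    for item in results:
        score = item.get("quality_score")
        # Assumption: items without a score are kept for manual judgement.
        if score is None or score >= min_score:
            kept.append(item)
    return kept
```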
 

+ 41 - 0
examples/process_pipeline/run_metrics.json

@@ -2294,5 +2294,46 @@
     "trace_ids": {},
     "errors": [],
     "timestamp": "2026-05-06T15:46:19.875101"
+  },
+  {
+    "index": 108,
+    "requirement": "用AI生成真实感live图...",
+    "duration_seconds": 915.27,
+    "total_cost_usd": 1.983,
+    "costs_breakdown": {
+      "P1_Research_zhihu": 0.3943,
+      "P1.6a_WorkflowExtraction": 0.3264,
+      "P1.6b_CapabilityExtraction": 0.489,
+      "P1.7_ApplyGrounding": 0.7733
+    },
+    "trace_ids": {
+      "P1_Research_zhihu": "44fdfdff-056e-4cde-93c8-bccbced38c82"
+    },
+    "errors": [
+      "Case generation failed: ImportError: cannot import name 'generate_case' from 'examples.process_pipeline.script.generate_case' (C:\\Users\\11304\\gitlab\\cybertogether\\Agent\\examples\\process_pipeline\\script\\generate_case.py)"
+    ],
+    "timestamp": "2026-05-06T21:15:01.947405"
+  },
+  {
+    "index": 109,
+    "requirement": "用ai生成真实摄影的美女写真组图,要求具有真实感,氛围感,人物一致性保持...",
+    "duration_seconds": 186.76,
+    "total_cost_usd": 0.0,
+    "costs_breakdown": {},
+    "trace_ids": {},
+    "errors": [
+      "Case generation failed: ImportError: cannot import name 'generate_case' from 'examples.process_pipeline.script.generate_case' (C:\\Users\\11304\\gitlab\\cybertogether\\Agent\\examples\\process_pipeline\\script\\generate_case.py)"
+    ],
+    "timestamp": "2026-05-07T13:36:22.884612"
+  },
+  {
+    "index": 109,
+    "requirement": "用ai生成真实摄影的美女写真组图,要求具有真实感,氛围感,人物一致性保持...",
+    "duration_seconds": 0.33,
+    "total_cost_usd": 0.0,
+    "costs_breakdown": {},
+    "trace_ids": {},
+    "errors": [],
+    "timestamp": "2026-05-07T15:48:22.454758"
   }
 ]

+ 157 - 60
examples/process_pipeline/run_pipeline.py

@@ -37,13 +37,15 @@ from examples.process_research.config import (
 )
 from agent.utils import setup_logging
 
-async def run_agent_task(runner: AgentRunner, prompt_name: str, kwargs: dict, task_name: str, model_name: str, skip_validation: bool = False):
+async def run_agent_task(runner: AgentRunner, prompt_name: str, kwargs: dict, task_name: str, model_name: str, skip_validation: bool = False, start_trace_id: str = None, additional_messages: list = None):
     from examples.process_pipeline.script.validate_schema import validate_case, validate_blueprint, validate_capabilities, validate_strategy
     base_dir = Path(__file__).parent
     prompt_path = base_dir / "prompts" / f"{prompt_name}.prompt"
     prompt = SimplePrompt(prompt_path)
 
     messages = prompt.build_messages(**kwargs)
+    if additional_messages:
+        messages.extend(additional_messages)
     target_tools = []
     if prompt_name == "extract_capabilities":
         target_tools = ["capability_search", "capability_list", "tool_search"]
@@ -61,7 +63,7 @@ async def run_agent_task(runner: AgentRunner, prompt_name: str, kwargs: dict, ta
     out_file = kwargs.get("output_file")
 
     max_retries = 3
-    last_trace_id = None
+    last_trace_id = start_trace_id
     last_validation_error = None
     final_trace_id = None  # 用于返回最终成功的 trace_id
 
@@ -199,7 +201,8 @@ async def run_agent_task(runner: AgentRunner, prompt_name: str, kwargs: dict, ta
                 agent_type=prompt_name,
                 tools=target_tools,
                 tool_groups=tool_groups_map.get(prompt_name, ["core"]),
-                knowledge=KnowledgeConfig(enable_completion_extraction=False, enable_extraction=False, enable_injection=False)
+                knowledge=KnowledgeConfig(enable_completion_extraction=False, enable_extraction=False, enable_injection=False),
+                trace_id=last_trace_id
             )
 
             print(f"🚀 [Launch] {task_name} (Attempt {attempt+1})")
@@ -790,6 +793,7 @@ async def main():
         costs_breakdown = {}
         global_errors = []
         strategy_file = None
+        TARGET_QUALIFIED_CASES = 15
 
         # ── --only-step 单步执行模式 ──────────────────────────────
         if args.only_step:
@@ -814,7 +818,7 @@ async def main():
                 single_is_single = args.restart_mode.startswith("single_")
                 phase1_tasks = []
                 for p in single_platforms:
-                    task_desc = f"渠道:{p.upper()}。核心需求:{requirement}"
+                    task_desc = f"渠道:{p.upper()}。核心需求:{requirement}。目标:至少收集 {TARGET_QUALIFIED_CASES} 条高质量案例(评分>=70、正文充实)。"
                     out_file = str(raw_cases_dir / f"case_{p}.json")
                     kwargs = {
                         "task": task_desc,
@@ -832,7 +836,10 @@ async def main():
                 # Phase 1.5: 提取原始 source.json
                 from examples.process_pipeline.script.extract_sources import extract_sources_to_json
                 result = extract_sources_to_json(raw_cases_dir)
-                print(f"   ✓ source.json: matched={result['total_matched']}")
+                print(f"   ✓ source.json: matched={result['total_matched']}, filtered={result['filtered_total']}")
+                if result.get("filtered_reasons"):
+                    for reason, cnt in result["filtered_reasons"].items():
+                        print(f"      - {reason}: {cnt}")
 
             elif step == "generate-case":
                 # Phase 1.5.5: 生成标准化 case.json
@@ -1064,73 +1071,163 @@ async def main():
             return
 
         # ── 正常 pipeline 流程 ──────────────────────────────
-        existing_platforms = []
-        if raw_cases_dir.exists():
-            for f in raw_cases_dir.glob("case_*.json"):
-                plat = f.stem.replace("case_", "")
-                if plat in ["xhs", "youtube", "bili", "x", "zhihu", "gzh"]:
-                    existing_platforms.append(plat)
 
         platforms = [p.strip() for p in args.platforms.split(",") if p.strip()]
 
         # Phase 0: Platform Selection (controlled by --platforms)
         if should_run("research"):
-            new_platforms = [p for p in platforms if p not in existing_platforms]
-            if not new_platforms:
-                print(f"\n--- Phase 0: Skipping Research (All specified platforms already exist: {existing_platforms}) ---")
-                platforms = []
-            else:
-                print(f"\n--- Phase 0: Using specified platforms ---")
-                print(f"📡 Found existing cases: {existing_platforms}. Will research new platforms: {new_platforms}")
-                platforms = new_platforms
+            print(f"\n--- Phase 0: Using specified platforms ---")
+            print(f"📡 Will research platforms: {platforms}")
+            # 注:不再跳过已有平台,因为 agent 是增量追加模式
 
         is_single_step = args.restart_mode.startswith("single_")
 
-        # Phase 1: MAP (Parallel Search) uses Qwen
-        if should_run("research"):
-            print(f"\n--- Phase 1: Distributed Research Map ({qwen_model}) ---")
-            phase1_tasks = []
-            for p in platforms:
-                task_desc = f"渠道:{p.upper()}。核心需求:{requirement}"
-                out_file = str(raw_cases_dir / f"case_{p}.json")
-                kwargs = {
-                    "task": task_desc,
-                    "output_file": out_file
-                }
-                phase1_tasks.append(run_agent_task(runner_qwen, "researcher", kwargs, f"P1_Research_{p}", qwen_model, skip_validation=is_single_step))
-                
-            phase1_results = await asyncio.gather(*phase1_tasks)
+        # Phase 1 & 1.5: MAP (Parallel Search) and Source Extraction Loop
+        if should_run("research") or should_run("source"):
+            print(f"\n--- Phase 1: Distributed Research Map with Source Filter Loop ({qwen_model}) ---")
+
+            from examples.process_pipeline.script.extract_sources import extract_sources_to_json
+
+            MAX_ROUNDS = 50
+
+            platform_traces = {p: None for p in platforms}
+            active_platforms = platforms.copy()
             phase1_trace_ids = {}
-            for (task_cost, task_errors, trace_id), p in zip(phase1_results, platforms):
-                total_cost += task_cost
-                costs_breakdown[f"P1_Research_{p}"] = round(task_cost, 4)
-                phase1_trace_ids[f"P1_Research_{p}"] = trace_id
-                global_errors.extend(task_errors)
-
-                # Check if cases actually got written
-                expected_file = Path(raw_cases_dir / f"case_{p}.json")
-                if not expected_file.exists():
-                    err_msg = f"Missing case file for {p}! Agent likely hit max_iterations without saving."
+
+            if not should_run("research") and should_run("source"):
+                try:
+                    src_stats = extract_sources_to_json(raw_cases_dir, trace_ids=None)
+                    print(f"📎 [Source Extraction] matched={src_stats['total_matched']}, filtered={src_stats['filtered_total']} → {raw_cases_dir / 'source.json'}")
+                    if src_stats.get("filtered_reasons"):
+                        for reason, cnt in src_stats["filtered_reasons"].items():
+                            print(f"      - {reason}: {cnt}")
+                except Exception as e:
+                    err_msg = f"Source extraction failed: {type(e).__name__}: {e}"
                     print(f"⚠️ [Warning] {err_msg}")
                     global_errors.append(err_msg)
-        else:
-            print("\n⏭️  [Skip] Phase 1 Skipped. Using existing cases...")
+            elif should_run("research"):
+                round_idx = 0
+                last_src_stats = None  # 保存上一轮的过滤统计
+                last_platform_count = {}  # 保存上一轮每个平台的合格数
+                while active_platforms and round_idx < MAX_ROUNDS:
+                    print(f"\n   >>> [Research Loop] Round {round_idx+1} - Active platforms: {active_platforms}")
+
+                    if active_platforms:
+                        phase1_tasks = []
+                        for p in active_platforms:
+                            task_desc = f"渠道:{p.upper()}。核心需求:{requirement}。目标:至少收集 {TARGET_QUALIFIED_CASES} 条高质量案例(评分>=70、正文充实)。"
+                            out_file = str(raw_cases_dir / f"case_{p}.json")
+
+                            additional_msgs = None
+                            if round_idx > 0 and last_src_stats:
+                                # 构建带过滤详情的 feedback message
+                                p_count = last_platform_count.get(p, 0)
+                                # 筛选出该平台被过滤的条目
+                                p_filtered = [d for d in last_src_stats.get("filtered_details", []) if d.get("platform") == p]
+                                reason_summary = last_src_stats.get("filtered_reasons", {})
+
+                                feedback_lines = [
+                                    f"【系统反馈】你在上一轮提取的有效案例数量未达标。",
+                                    f"当前 {p.upper()} 合格案例:{p_count}/{TARGET_QUALIFIED_CASES}",
+                                    f"过滤统计:{dict(reason_summary)}" if reason_summary else "",
+                                ]
+
+                                if p_filtered:
+                                    feedback_lines.append(f"\n以下是你提交的被过滤掉的帖子(共{len(p_filtered)}条):")
+                                    for item in p_filtered[:10]:
+                                        feedback_lines.append(f"  - [{item['case_id']}] {item['title']} → 原因: {item['filter_reason']}")
+                                    if len(p_filtered) > 10:
+                                        feedback_lines.append(f"  ... 还有 {len(p_filtered) - 10} 条未列出")
+
+                                feedback_lines.append(
+                                    f"\n请继续搜索并提取更多**全新的、不同的**高质量案例,**追加**写入到你一直在维护的文件中。"
+                                    f"注意:不要重复之前已找过的案例!针对上述被过滤的原因,请确保新案例有详实的正文内容且评分准确。"
+                                )
+
+                                additional_msgs = [{
+                                    "role": "user",
+                                    "content": "\n".join(line for line in feedback_lines if line)
+                                }]
+
+                            kwargs = {
+                                "task": task_desc,
+                                "output_file": out_file
+                            }
+                            phase1_tasks.append(
+                                run_agent_task(
+                                    runner_qwen, "researcher", kwargs,
+                                    f"P1_Research_{p}_R{round_idx+1}", qwen_model,
+                                    skip_validation=is_single_step,
+                                    start_trace_id=platform_traces[p],
+                                    additional_messages=additional_msgs
+                                )
+                            )
+
+                        phase1_results = await asyncio.gather(*phase1_tasks)
+                        for (task_cost, task_errors, trace_id), p in zip(phase1_results, active_platforms):
+                            total_cost += task_cost
+                            cost_key = f"P1_Research_{p}"
+                            costs_breakdown[cost_key] = costs_breakdown.get(cost_key, 0.0) + round(task_cost, 4)
+                            platform_traces[p] = trace_id
+                            phase1_trace_ids[f"P1_Research_{p}"] = trace_id
+                            global_errors.extend(task_errors)
+
+                            expected_file = Path(raw_cases_dir / f"case_{p}.json")
+                            if not expected_file.exists():
+                                err_msg = f"Missing case file for {p}! Agent likely hit max_iterations without saving."
+                                print(f"⚠️ [Warning] {err_msg}")
+                                global_errors.append(err_msg)
+
+                    if should_run("source"):
+                        try:
+                            trace_id_list = [tid for tid in phase1_trace_ids.values() if tid]
+                            src_stats = extract_sources_to_json(raw_cases_dir, trace_ids=trace_id_list)
+                            last_src_stats = src_stats
+                            print(f"   📎 [Source Extraction] matched={src_stats['total_matched']}, filtered={src_stats['filtered_total']} → {raw_cases_dir / 'source.json'}")
+                            if src_stats.get("filtered_reasons"):
+                                for reason, cnt in src_stats["filtered_reasons"].items():
+                                    print(f"      - {reason}: {cnt}")
+                        except Exception as e:
+                            err_msg = f"Source extraction failed: {type(e).__name__}: {e}"
+                            print(f"   ⚠️ [Warning] {err_msg}")
+                            global_errors.append(err_msg)
+
+                    # 判断是否达标:直接看 source.json 里每个平台的条目数
+                    source_file = raw_cases_dir / "source.json"
+                    if source_file.exists():
+                        with open(source_file, "r", encoding="utf-8") as f:
+                            source_data = json.load(f)
+
+                        platform_count = {}
+                        for s in source_data.get("sources", []):
+                            p = s.get("platform")
+                            if p:
+                                platform_count[p] = platform_count.get(p, 0) + 1
+                        last_platform_count = platform_count
+
+                        print(f"   📊 [Source Count] Target: >={TARGET_QUALIFIED_CASES} qualified items per platform")
+                        platforms_to_keep = []
+                        for p in platforms:
+                            count = platform_count.get(p, 0)
+                            if p in active_platforms:
+                                print(f"      - {p}: {count}/{TARGET_QUALIFIED_CASES}")
+                            if count < TARGET_QUALIFIED_CASES:
+                                platforms_to_keep.append(p)
+
+                        active_platforms = platforms_to_keep
+
+                        if not active_platforms:
+                            print(f"   ✅ All platforms reached the target {TARGET_QUALIFIED_CASES} qualified items!")
+                            break
+                    else:
+                        print(f"   ⚠️ source.json not found, continuing loop to retry...")
 
-        # Phase 1.5: Extract raw post data from cache → raw_cases/source.json
-        if should_run("source"):
-            try:
-                from examples.process_pipeline.script.extract_sources import extract_sources_to_json
-                trace_id_list = [tid for tid in phase1_trace_ids.values() if tid] if 'phase1_trace_ids' in dir() else None
-                src_stats = extract_sources_to_json(raw_cases_dir, trace_ids=trace_id_list)
-                print(
-                    f"📎 [Source Extraction] "
-                    f"matched={src_stats['total_matched']} "
-                    f"→ {raw_cases_dir / 'source.json'}"
-                )
-            except Exception as e:
-                err_msg = f"Source extraction failed: {type(e).__name__}: {e}"
-                print(f"⚠️ [Warning] {err_msg}")
-                global_errors.append(err_msg)
+                    round_idx += 1
+
+                if round_idx >= MAX_ROUNDS and active_platforms:
+                    print(f"   ⚠️ [Max Rounds] Reached {MAX_ROUNDS} rounds. Remaining platforms: {active_platforms}")
+        else:
+            print("\n⏭️  [Skip] Phase 1 & 1.5 Skipped. Using existing cases...")
 
         # Phase 1.3: Generate case.json from source.json
         if should_run("generate-case"):

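The loop's stop condition boils down to counting qualified sources per platform in source.json and keeping only the platforms still below the target. A standalone sketch of that check, assuming source.json carries the `{"sources": [{"platform": ...}, ...]}` shape used above:

```python
# Sketch of the per-platform target check driving the research loop above.
import json
from pathlib import Path
from typing import Dict, List

TARGET_QUALIFIED_CASES = 15


def remaining_platforms(source_file: Path, platforms: List[str]) -> List[str]:
    """Return the platforms with fewer than TARGET_QUALIFIED_CASES qualified sources."""
    counts: Dict[str, int] = {}
    if source_file.exists():
        data = json.loads(source_file.read_text(encoding="utf-8"))
        for s in data.get("sources", []):
            p = s.get("platform")
            if p:
                counts[p] = counts.get(p, 0) + 1
    return [p for p in platforms if counts.get(p, 0) < TARGET_QUALIFIED_CASES]
```
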
+ 443 - 0
examples/process_pipeline/script/evaluate_source_quality.py

@@ -0,0 +1,443 @@
+"""
+Source.json 质量评估模块
+
+基于字段完整性和文本量对调研结果进行评分,识别低质量内容并支持二次筛选。
+
+评分维度:
+1. 字段完整性(40分):有效字段占比
+2. 文本质量(40分):body_text 长度和信息密度
+3. 互动数据(20分):点赞数、时间戳等
+"""
+
+import json
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+from datetime import datetime, timedelta
+
+
+class SourceQualityEvaluator:
+    """Source 数据质量评估器"""
+
+    # 字段权重配置
+    FIELD_WEIGHTS = {
+        "title": 5,
+        "body_text": 15,
+        "like_count": 5,
+        "publish_timestamp": 5,
+        "images": 3,
+        "videos": 3,
+        "link": 2,
+        "content_type": 2,
+    }
+
+    # 文本质量阈值
+    TEXT_LENGTH_THRESHOLDS = {
+        "excellent": 200,   # 优秀:200字以上
+        "good": 100,        # 良好:100-200字
+        "fair": 50,         # 一般:50-100字
+        "poor": 20,         # 较差:20-50字
+        # < 20字:极差
+    }
+
+    def __init__(self, time_window_days: int = 180):
+        """
+        Args:
+            time_window_days: 时效性窗口(天),默认180天(半年)
+        """
+        self.time_window_days = time_window_days
+        self.cutoff_timestamp = (
+            datetime.now() - timedelta(days=time_window_days)
+        ).timestamp()
+
+    def evaluate_post(self, post: dict) -> Dict[str, Any]:
+        """
+        评估单个 post 的质量
+
+        Returns:
+            {
+                "field_score": float,      # 字段完整性得分 (0-40)
+                "text_score": float,       # 文本质量得分 (0-40)
+                "engagement_score": float, # 互动数据得分 (0-20)
+                "total_score": float,      # 总分 (0-100)
+                "grade": str,              # 等级 A/B/C/D/F
+                "issues": List[str],       # 问题列表
+                "valid_fields": int,       # 有效字段数
+                "total_fields": int,       # 总字段数
+            }
+        """
+        result = {
+            "field_score": 0.0,
+            "text_score": 0.0,
+            "engagement_score": 0.0,
+            "total_score": 0.0,
+            "grade": "F",
+            "issues": [],
+            "valid_fields": 0,
+            "total_fields": len(self.FIELD_WEIGHTS),
+        }
+
+        # 1. 字段完整性评分 (0-40分)
+        field_score, valid_count = self._evaluate_fields(post)
+        result["field_score"] = field_score
+        result["valid_fields"] = valid_count
+
+        # 2. 文本质量评分 (0-40分)
+        text_score, text_issues = self._evaluate_text(post)
+        result["text_score"] = text_score
+        result["issues"].extend(text_issues)
+
+        # 3. 互动数据评分 (0-20分)
+        engagement_score, engagement_issues = self._evaluate_engagement(post)
+        result["engagement_score"] = engagement_score
+        result["issues"].extend(engagement_issues)
+
+        # 计算总分和等级
+        result["total_score"] = round(
+            result["field_score"] + result["text_score"] + result["engagement_score"], 2
+        )
+        result["grade"] = self._calculate_grade(result["total_score"])
+
+        return result
+
+    def _evaluate_fields(self, post: dict) -> Tuple[float, int]:
+        """评估字段完整性"""
+        total_weight = sum(self.FIELD_WEIGHTS.values())
+        earned_weight = 0.0
+        valid_count = 0
+
+        for field, weight in self.FIELD_WEIGHTS.items():
+            value = post.get(field)
+            is_valid = False
+
+            if field == "title":
+                is_valid = bool(value and len(str(value).strip()) > 0)
+            elif field == "body_text":
+                is_valid = bool(value and len(str(value).strip()) > 0)
+            elif field == "like_count":
+                is_valid = isinstance(value, (int, float)) and value > 0
+            elif field == "publish_timestamp":
+                is_valid = isinstance(value, (int, float)) and value > 0
+            elif field in ("images", "videos"):
+                is_valid = isinstance(value, list) and len(value) > 0
+            elif field == "link":
+                is_valid = bool(value and len(str(value).strip()) > 0)
+            elif field == "content_type":
+                is_valid = bool(value and len(str(value).strip()) > 0)
+
+            if is_valid:
+                earned_weight += weight
+                valid_count += 1
+
+        # 转换为 0-40 分
+        field_score = (earned_weight / total_weight) * 40
+        return round(field_score, 2), valid_count
+
+    def _evaluate_text(self, post: dict) -> Tuple[float, List[str]]:
+        """评估文本质量"""
+        issues = []
+        body_text = post.get("body_text", "")
+        title = post.get("title", "")
+
+        # 清理 HTML 标签(如 <em class="keyword">)
+        import re
+        body_text_clean = re.sub(r'<[^>]+>', '', body_text)
+        title_clean = re.sub(r'<[^>]+>', '', title)
+
+        body_len = len(body_text_clean.strip())
+        title_len = len(title_clean.strip())
+
+        # 标题评分 (0-10分)
+        if title_len == 0:
+            title_score = 0
+            issues.append("标题为空")
+        elif title_len < 10:
+            title_score = 3
+            issues.append(f"标题过短 ({title_len}字)")
+        elif title_len < 20:
+            title_score = 6
+        else:
+            title_score = 10
+
+        # 正文评分 (0-30分)
+        if body_len == 0:
+            body_score = 0
+            issues.append("正文为空")
+        elif body_len < self.TEXT_LENGTH_THRESHOLDS["poor"]:
+            body_score = 5
+            issues.append(f"正文极短 ({body_len}字)")
+        elif body_len < self.TEXT_LENGTH_THRESHOLDS["fair"]:
+            body_score = 12
+            issues.append(f"正文较短 ({body_len}字)")
+        elif body_len < self.TEXT_LENGTH_THRESHOLDS["good"]:
+            body_score = 20
+        elif body_len < self.TEXT_LENGTH_THRESHOLDS["excellent"]:
+            body_score = 26
+        else:
+            body_score = 30
+
+        text_score = title_score + body_score
+        return round(text_score, 2), issues
+
+    def _evaluate_engagement(self, post: dict) -> Tuple[float, List[str]]:
+        """评估互动数据"""
+        issues = []
+        score = 0.0
+
+        # 点赞数评分 (0-10分)
+        like_count = post.get("like_count", 0)
+        if not isinstance(like_count, (int, float)):
+            like_count = 0
+
+        if like_count == 0:
+            issues.append("无点赞数据")
+        elif like_count < 10:
+            score += 3
+        elif like_count < 100:
+            score += 6
+        elif like_count < 1000:
+            score += 8
+        else:
+            score += 10
+
+        # 时间戳评分 (0-10分)
+        timestamp = post.get("publish_timestamp", 0)
+        if not isinstance(timestamp, (int, float)):
+            timestamp = 0
+
+        if timestamp == 0:
+            issues.append("无发布时间")
+        elif timestamp < self.cutoff_timestamp:
+            issues.append(f"内容过时(超过{self.time_window_days}天)")
+            score += 2
+        else:
+            score += 10
+
+        return round(score, 2), issues
+
+    def _calculate_grade(self, score: float) -> str:
+        """计算等级"""
+        if score >= 80:
+            return "A"
+        elif score >= 60:
+            return "B"
+        elif score >= 40:
+            return "C"
+        elif score >= 20:
+            return "D"
+        else:
+            return "F"
+
+    def evaluate_source_file(self, source_file: Path) -> Dict[str, Any]:
+        """
+        评估整个 source.json 文件
+
+        Returns:
+            {
+                "total_sources": int,
+                "grade_distribution": Dict[str, int],  # A/B/C/D/F 的数量分布
+                "avg_score": float,
+                "low_quality_count": int,  # C/D/F 的数量
+                "low_quality_indices": List[int],  # 低质量条目的索引
+                "details": List[Dict],  # 每条的详细评分
+            }
+        """
+        with open(source_file, "r", encoding="utf-8") as f:
+            data = json.load(f)
+
+        sources = data.get("sources", [])
+        total = len(sources)
+
+        grade_dist = {"A": 0, "B": 0, "C": 0, "D": 0, "F": 0}
+        scores = []
+        low_quality_indices = []
+        details = []
+
+        for idx, source in enumerate(sources):
+            post = source.get("post", {})
+            eval_result = self.evaluate_post(post)
+            eval_result["index"] = idx
+            eval_result["case_id"] = source.get("case_id", "")
+            eval_result["platform"] = source.get("platform", "")
+
+            grade_dist[eval_result["grade"]] += 1
+            scores.append(eval_result["total_score"])
+            details.append(eval_result)
+
+            # C/D/F 视为低质量
+            if eval_result["grade"] in ("C", "D", "F"):
+                low_quality_indices.append(idx)
+
+        avg_score = round(sum(scores) / total, 2) if total > 0 else 0.0
+
+        return {
+            "total_sources": total,
+            "grade_distribution": grade_dist,
+            "avg_score": avg_score,
+            "low_quality_count": len(low_quality_indices),
+            "low_quality_indices": low_quality_indices,
+            "details": details,
+        }
+
+    def filter_low_quality(
+        self, source_file: Path, output_file: Path, min_score: float = 40.0
+    ) -> Dict[str, any]:
+        """
+        过滤低质量内容,生成新的 source.json
+
+        Args:
+            source_file: 原始 source.json 路径
+            output_file: 输出文件路径
+            min_score: 最低分数阈值(默认40分,即C级以上)
+
+        Returns:
+            {
+                "original_count": int,
+                "filtered_count": int,
+                "removed_count": int,
+                "removed_cases": List[str],  # 被移除的 case_id
+            }
+        """
+        with open(source_file, "r", encoding="utf-8") as f:
+            data = json.load(f)
+
+        sources = data.get("sources", [])
+        original_count = len(sources)
+
+        filtered_sources = []
+        removed_sources = []
+        removed_cases = []
+
+        for source in sources:
+            post = source.get("post", {})
+            eval_result = self.evaluate_post(post)
+
+            if eval_result["total_score"] >= min_score:
+                filtered_sources.append(source)
+            else:
+                source["filter_reason"] = f"完备性评分不足 (得分: {eval_result['total_score']} < {min_score})"
+                removed_sources.append(source)
+                removed_cases.append(source.get("case_id", "unknown"))
+
+        # 将被过滤的数据保存到 filtered_cases.json
+        if removed_sources:
+            filtered_cases_file = output_file.parent / "filtered_cases.json"
+            filtered_data = {"total": len(removed_sources), "sources": removed_sources}
+            with open(filtered_cases_file, "w", encoding="utf-8") as f:
+                json.dump(filtered_data, f, ensure_ascii=False, indent=2)
+
+        # 更新数据
+        data["sources"] = filtered_sources
+        data["total"] = len(filtered_sources)
+        data["quality_filter"] = {
+            "min_score": min_score,
+            "original_count": original_count,
+            "filtered_count": len(filtered_sources),
+            "removed_count": len(removed_cases),
+            "filter_timestamp": datetime.now().isoformat(),
+        }
+
+        # 写入新文件
+        with open(output_file, "w", encoding="utf-8") as f:
+            json.dump(data, f, ensure_ascii=False, indent=2)
+
+        return {
+            "original_count": original_count,
+            "filtered_count": len(filtered_sources),
+            "removed_count": len(removed_cases),
+            "removed_cases": removed_cases,
+        }
+
+
+def generate_quality_report(source_file: Path, output_report: Path = None):
+    """生成质量评估报告"""
+    evaluator = SourceQualityEvaluator()
+    result = evaluator.evaluate_source_file(source_file)
+
+    # 生成报告文本
+    report_lines = [
+        "=" * 60,
+        f"Source 质量评估报告",
+        f"文件:{source_file}",
+        f"评估时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
+        "=" * 60,
+        "",
+        f"📊 总体统计",
+        f"   总条目数:{result['total_sources']}",
+        f"   平均得分:{result['avg_score']}/100",
+        f"   低质量数:{result['low_quality_count']} ({result['low_quality_count']/result['total_sources']*100:.1f}%)",
+        "",
+        f"📈 等级分布",
+    ]
+
+    for grade in ["A", "B", "C", "D", "F"]:
+        count = result["grade_distribution"][grade]
+        pct = count / result["total_sources"] * 100 if result["total_sources"] > 0 else 0
+        bar = "█" * int(pct / 2)
+        report_lines.append(f"   {grade}: {count:3d} ({pct:5.1f}%) {bar}")
+
+    report_lines.extend([
+        "",
+        f"⚠️  低质量条目详情 (C/D/F 级)",
+        "",
+    ])
+
+    # 只显示低质量条目
+    low_quality_details = [d for d in result["details"] if d["grade"] in ("C", "D", "F")]
+    low_quality_details.sort(key=lambda x: x["total_score"])
+
+    for detail in low_quality_details[:20]:  # 最多显示20条
+        report_lines.extend([
+            f"[{detail['index']:3d}] {detail['case_id']} | 得分: {detail['total_score']}/100 ({detail['grade']})",
+            f"      字段: {detail['field_score']}/40 ({detail['valid_fields']}/{detail['total_fields']}个有效)",
+            f"      文本: {detail['text_score']}/40",
+            f"      互动: {detail['engagement_score']}/20",
+        ])
+        if detail["issues"]:
+            report_lines.append(f"      问题: {', '.join(detail['issues'])}")
+        report_lines.append("")
+
+    if len(low_quality_details) > 20:
+        report_lines.append(f"   ... 还有 {len(low_quality_details) - 20} 条低质量条目未显示")
+
+    report_text = "\n".join(report_lines)
+
+    # 输出到控制台
+    print(report_text)
+
+    # 保存到文件
+    if output_report:
+        with open(output_report, "w", encoding="utf-8") as f:
+            f.write(report_text)
+        print(f"\n报告已保存到:{output_report}")
+
+    return result
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="评估 source.json 的质量")
+    parser.add_argument("source_file", type=Path, help="source.json 文件路径")
+    parser.add_argument("--report", type=Path, help="输出报告文件路径")
+    parser.add_argument("--filter", type=Path, help="过滤后输出文件路径")
+    parser.add_argument("--min-score", type=float, default=40.0, help="最低分数阈值(默认40)")
+    args = parser.parse_args()
+
+    if not args.source_file.exists():
+        print(f"错误:文件不存在 {args.source_file}")
+        exit(1)
+
+    # 生成评估报告
+    result = generate_quality_report(args.source_file, args.report)
+
+    # 如果指定了过滤输出
+    if args.filter:
+        evaluator = SourceQualityEvaluator()
+        filter_result = evaluator.filter_low_quality(
+            args.source_file, args.filter, args.min_score
+        )
+        print(f"\n🔍 质量过滤完成")
+        print(f"   原始条目:{filter_result['original_count']}")
+        print(f"   保留条目:{filter_result['filtered_count']}")
+        print(f"   移除条目:{filter_result['removed_count']}")
+        print(f"   输出文件:{args.filter}")

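A minimal usage sketch of the evaluator added above, for orientation. It assumes the script directory is on sys.path (the same assumption integrate_quality_check.py below makes) and that SourceQualityEvaluator / generate_quality_report behave as shown in this diff; the source.json path and the 40-point threshold are illustrative.

    from pathlib import Path
    from evaluate_source_quality import SourceQualityEvaluator, generate_quality_report

    source = Path("output/001/raw_cases/source.json")  # illustrative path

    evaluator = SourceQualityEvaluator()

    # Whole-file evaluation: grade distribution, average score, low-quality indices
    report = evaluator.evaluate_source_file(source)
    print(report["avg_score"], report["grade_distribution"])

    # Drop entries scoring below 40 (grade C cut-off); removed entries go to filtered_cases.json
    stats = evaluator.filter_low_quality(source, source.parent / "source_filtered.json", min_score=40.0)
    print(f"removed {stats['removed_count']} of {stats['original_count']}")

    # Console + file report, equivalent to the CLI's --report flag
    generate_quality_report(source, source.parent / "quality_report.txt")
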
+ 181 - 56
examples/process_pipeline/script/extract_sources.py

@@ -49,30 +49,130 @@ def parse_url(url: str) -> Optional[Tuple[str, str]]:
     return None
 
 
-# ── 从 case 文件中抽取所有链接 ────────────────────────────────
+# ── 从 case 文件中抽取所有条目(URL + evaluation) ────────────────────────────────
 
-def extract_urls_from_case(case_data: Any) -> List[str]:
-    """兼容新旧两种格式,返回 case 文件里出现的所有 URL。"""
-    urls: List[str] = []
+def extract_entries_from_case(case_data: Any) -> List[Dict[str, Any]]:
+    """
+    从 case 数据中抽取条目,每条包含 url 和 agent 的 evaluation。
+
+    返回: [{"url": str, "evaluation": dict | None, "title": str | None, "case_id": str | None}]
+    """
+    entries: List[Dict[str, Any]] = []
 
     if not isinstance(case_data, dict):
-        return urls
+        return entries
 
     # 新格式:工序发现[].帖子链接
     for item in case_data.get("工序发现", []) or []:
         if isinstance(item, dict):
             link = item.get("帖子链接") or item.get("source_url")
             if link:
-                urls.append(link)
+                entries.append({
+                    "url": link,
+                    "evaluation": item.get("evaluation"),
+                    "title": item.get("title"),
+                    "case_id": item.get("case_id"),
+                })
 
-    # 旧格式:cases[].source_url
+    # 主格式:cases[]
     for item in case_data.get("cases", []) or []:
         if isinstance(item, dict):
             link = item.get("source_url") or item.get("帖子链接")
             if link:
-                urls.append(link)
+                entries.append({
+                    "url": link,
+                    "evaluation": item.get("evaluation"),
+                    "title": item.get("title"),
+                    "case_id": item.get("case_id"),
+                })
 
-    return urls
+    return entries
+
+
+def extract_urls_from_case(case_data: Any) -> List[str]:
+    """[Legacy] 旧接口,保留供可能的外部调用。内部改用 extract_entries_from_case。"""
+    return [e["url"] for e in extract_entries_from_case(case_data)]
+
+
+# ── 过滤规则(统一入口) ────────────────────────────────
+
+# 默认阈值:body_text 最少字符数、agent 评分下限
+DEFAULT_MIN_BODY_LEN = 30
+DEFAULT_MIN_SCORE = 70.0
+DEFAULT_CUTOFF_DATE = (2025, 10, 1)
+
+
+def _is_before_cutoff(source: Dict[str, Any], cutoff_ts: int) -> bool:
+    """判断帖子是否早于截止时间戳(秒级)
+
+    如果 timestamp 为 0 或不存在,返回 False(保留)
+    """
+    post = source.get("post", {})
+    if not isinstance(post, dict):
+        return False
+
+    ts = post.get("publish_timestamp")
+    # 没有 timestamp 或为 0,保留
+    if not ts or ts == 0:
+        return False
+
+    try:
+        ts = int(ts)
+        # 毫秒级转秒级
+        if ts > 1000000000000:
+            ts = ts / 1000
+        return ts < cutoff_ts
+    except (ValueError, TypeError):
+        pass
+
+    # 尝试解析字符串格式 "2025-05-02 19:25:30"
+    try:
+        from datetime import datetime
+        dt = datetime.strptime(str(ts), "%Y-%m-%d %H:%M:%S")
+        return dt.timestamp() < cutoff_ts
+    except Exception:
+        # 解析失败,保留
+        return False
+
+
+def _check_filters(
+    source: Dict[str, Any],
+    cutoff_ts: int,
+    min_body_len: int,
+    min_score: float,
+) -> Optional[str]:
+    """
+    对一条 source 逐条检查过滤规则。
+
+    返回:
+        None          —— 条目合格
+        非 None 字符串 —— 不合格的原因(写入 filter_reason)
+    """
+    post = source.get("post", {}) or {}
+
+    # 1. body_text 完整性
+    body = post.get("body_text") or post.get("desc") or ""
+    if not isinstance(body, str) or len(body.strip()) < min_body_len:
+        return f"missing_body_text:len={len(body.strip()) if isinstance(body, str) else 0}"
+
+    # 2. agent 评分
+    evaluation = source.get("evaluation")
+    if not isinstance(evaluation, dict):
+        return "missing_evaluation"
+    quality = evaluation.get("quality")
+    if not isinstance(quality, dict):
+        return "missing_evaluation"
+    score = quality.get("overall_score")
+    if not isinstance(score, (int, float)):
+        return "invalid_score"
+    if score < min_score:
+        return f"low_score:{score}"
+
+    # 3. 过时(复用原 _is_before_cutoff 对时间戳的解析)
+    if _is_before_cutoff(source, cutoff_ts):
+        return "outdated"
+
+    return None
 
 
 # ── 从 cache 中构建 (platform, content_id) → post 索引 ────────────────────────────────
@@ -172,10 +272,23 @@ def extract_sources_to_json(
     cache_dir: Optional[Path] = None,
     output_name: str = "source.json",
     trace_ids: Optional[List[str]] = None,
+    min_body_len: int = DEFAULT_MIN_BODY_LEN,
+    min_score: float = DEFAULT_MIN_SCORE,
+    cutoff_date: Tuple[int, int, int] = DEFAULT_CUTOFF_DATE,
 ) -> Dict[str, Any]:
     """
     扫描 raw_cases_dir 下的 case_*.json,
-    从 cache 中找出原始帖子,输出到 {raw_cases_dir}/{output_name}。
+    从 cache 中找出原始帖子,并对结果执行统一过滤(body_text / 评分 / 时效)。
+
+    过滤规则(均可通过参数调整):
+      - body_text 为空或少于 min_body_len 字符  → filter_reason=missing_body_text
+      - agent evaluation 缺失或 quality.overall_score < min_score → filter_reason=missing_evaluation / invalid_score / low_score
+      - publish_timestamp 早于 cutoff_date → filter_reason=outdated
+
+    参数:
+        min_body_len: body_text 最少字符数,默认 30
+        min_score:    agent 评分下限,默认 70
+        cutoff_date:  过时截止日期 (year, month, day),默认 (2025, 10, 1)
 
     返回统计信息 dict。
     """
@@ -253,8 +366,10 @@ def extract_sources_to_json(
                 unmatched.append({"case_file": case_file.name, "error": str(e)})
                 continue
 
-        urls = extract_urls_from_case(case_data)
-        for url in urls:
+        entries = extract_entries_from_case(case_data)
+        for entry in entries:
+            url = entry["url"]
+            evaluation = entry.get("evaluation")
             # 解析 URL 得到 platform 和 content_id
             parsed = parse_url(url)
             if not parsed:
@@ -299,6 +414,7 @@ def extract_sources_to_json(
                     "platform": platform,
                     "channel_content_id": str(actual_cid),
                     "source_url": url,
+                    "evaluation": evaluation,
                     "post": post,
                 })
             else:
@@ -314,13 +430,29 @@ def extract_sources_to_json(
     # 4. 合并已有数据和新匹配的数据
     all_sources = existing_sources + matched
 
-    # 5. 过滤掉 2025-10 之前的过时帖子
+    # 5. 统一过滤:body_text 完整性 / agent 评分 / 时效
     from datetime import datetime as _dt
-    cutoff_ts = int(_dt(2025, 10, 1).timestamp())  # 本地时区的 2025-10-01
-    before_filter = len(all_sources)
-    filtered_sources = [s for s in all_sources if _is_before_cutoff(s, cutoff_ts)]
-    all_sources = [s for s in all_sources if not _is_before_cutoff(s, cutoff_ts)]
-    filtered_count = before_filter - len(all_sources)
+    cutoff_ts = int(_dt(*cutoff_date).timestamp())
+
+    kept_sources: List[Dict[str, Any]] = []
+    filtered_sources: List[Dict[str, Any]] = []
+    reason_counts: Dict[str, int] = {}
+
+    for s in all_sources:
+        reason = _check_filters(s, cutoff_ts, min_body_len, min_score)
+        if reason is None:
+            kept_sources.append(s)
+        else:
+            # 记录过滤原因
+            s_copy = dict(s)
+            s_copy["filter_reason"] = reason
+            filtered_sources.append(s_copy)
+            # 统计原因类型(只取冒号前的类别)
+            category = reason.split(":", 1)[0]
+            reason_counts[category] = reason_counts.get(category, 0) + 1
+
+    all_sources = kept_sources
+    filtered_count = len(filtered_sources)
 
     # 6. 转换 timestamp 为可读格式
     _convert_timestamps(all_sources)
@@ -337,7 +469,7 @@ def extract_sources_to_json(
     with open(output_file, "w", encoding="utf-8") as f:
         json.dump(output, f, ensure_ascii=False, indent=2)
 
-    # 8. 写 filtered_cases.json(被过滤掉的帖子,去重后追加
+    # 8. 写 filtered_cases.json(被过滤掉的帖子,按原因分组)
     if filtered_sources:
         for fs in filtered_sources:
             key = (fs.get("platform"), fs.get("channel_content_id"))
@@ -345,10 +477,22 @@ def extract_sources_to_json(
                 existing_filtered_sources.append(fs)
                 existing_filtered_ids.add(key)
 
+        # 按原因类别分组
+        by_reason: Dict[str, List[Dict[str, Any]]] = {}
+        for fs in existing_filtered_sources:
+            reason = fs.get("filter_reason", "unknown")
+            category = reason.split(":", 1)[0]
+            by_reason.setdefault(category, []).append(fs)
+
         filtered_output = {
             "total": len(existing_filtered_sources),
-            "reason": "publish_timestamp before 2025-10-01",
-            "sources": existing_filtered_sources,
+            "by_reason": {
+                category: {
+                    "count": len(items),
+                    "sources": items,
+                }
+                for category, items in by_reason.items()
+            },
         }
         with open(filtered_output_file, "w", encoding="utf-8") as f:
             json.dump(filtered_output, f, ensure_ascii=False, indent=2)
@@ -356,12 +500,26 @@ def extract_sources_to_json(
     # 9. 下载图片到 raw_cases/images/{case_id}/
     images_downloaded = download_images_for_sources(matched, raw_cases_dir)
 
-    # 返回统计信息(包含 unmatched 用于日志输出)
+    # 10. 构建被过滤条目的摘要(供续跑 feedback 使用)
+    filtered_details: List[Dict[str, Any]] = []
+    for fs in filtered_sources:
+        post = fs.get("post", {}) or {}
+        title = post.get("title") or fs.get("source_url", "")
+        filtered_details.append({
+            "case_id": fs.get("case_id", ""),
+            "platform": fs.get("platform", ""),
+            "title": title[:60] if title else "",
+            "filter_reason": fs.get("filter_reason", ""),
+        })
+
+    # 返回统计信息
     return {
         "total_matched": len(matched),
         "total_existing": len(existing_sources),
         "total_unmatched": len(unmatched),
-        "filtered_outdated": filtered_count,
+        "filtered_total": filtered_count,
+        "filtered_reasons": reason_counts,
+        "filtered_details": filtered_details,
         "images_downloaded": images_downloaded,
         "output_file": str(output_file),
     }
@@ -389,39 +547,6 @@ def _get_image_urls_from_post(post: Dict[str, Any]) -> List[str]:
     return urls
 
 
-def _is_before_cutoff(source: Dict[str, Any], cutoff_ts: int) -> bool:
-    """判断帖子是否早于截止时间戳(秒级)
-
-    如果 timestamp 为 0 或不存在,返回 False(保留)
-    """
-    post = source.get("post", {})
-    if not isinstance(post, dict):
-        return False
-
-    ts = post.get("publish_timestamp")
-    # 没有 timestamp 或为 0,保留
-    if not ts or ts == 0:
-        return False
-
-    try:
-        ts = int(ts)
-        # 毫秒级转秒级
-        if ts > 1000000000000:
-            ts = ts / 1000
-        return ts < cutoff_ts
-    except (ValueError, TypeError):
-        pass
-
-    # 尝试解析字符串格式 "2025-05-02 19:25:30"
-    try:
-        from datetime import datetime
-        dt = datetime.strptime(str(ts), "%Y-%m-%d %H:%M:%S")
-        return dt.timestamp() < cutoff_ts
-    except Exception:
-        # 解析失败,保留
-        return False
-
-
 def _format_timestamp(ts: Any) -> Optional[str]:
     """将时间戳(秒/毫秒)转换为可读格式"""
     from datetime import datetime

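A sketch of calling the reworked extraction entry point with the new filter thresholds. The keyword names come from this diff; the raw_cases_dir positional argument is inferred from the docstring, the path is illustrative, and the remaining parameters are assumed to keep their defaults.

    from pathlib import Path
    from extract_sources import extract_sources_to_json

    stats = extract_sources_to_json(
        Path("output/001/raw_cases"),    # illustrative raw_cases_dir
        min_body_len=30,                 # shorter body_text -> filter_reason=missing_body_text
        min_score=70.0,                  # evaluation.quality.overall_score below this -> low_score
        cutoff_date=(2025, 10, 1),       # publish_timestamp before this -> outdated
    )

    print(stats["total_matched"], "matched;", stats["filtered_total"], "filtered")
    for category, count in stats["filtered_reasons"].items():
        print(f"  {category}: {count}")
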
+ 183 - 0
examples/process_pipeline/script/integrate_quality_check.py

@@ -0,0 +1,183 @@
+"""
+将质量评估集成到 Pipeline 中
+
+在 Phase 1.5 (source.json 生成后) 自动执行质量检查,
+对低质量内容进行二次筛选或触发重新调研。
+"""
+
+import json
+from pathlib import Path
+from typing import Any, Dict, List
+from evaluate_source_quality import SourceQualityEvaluator, generate_quality_report
+
+
+def check_and_filter_source(
+    source_file: Path,
+    min_score: float = 40.0,
+    min_pass_rate: float = 0.6,
+    auto_filter: bool = True,
+) -> Dict[str, Any]:
+    """
+    检查 source.json 质量并决定是否需要重新调研
+
+    Args:
+        source_file: source.json 路径
+        min_score: 单条内容的最低分数阈值(默认40分,C级)
+        min_pass_rate: 整体通过率阈值(默认60%)
+        auto_filter: 是否自动过滤低质量内容
+
+    Returns:
+        {
+            "status": str,  # "pass" | "filtered" | "failed"
+            "message": str,
+            "stats": dict,
+            "action": str,  # "continue" | "rerun_research" | "manual_review"
+        }
+    """
+    evaluator = SourceQualityEvaluator()
+
+    # 1. 生成质量报告
+    print(f"\n{'='*60}")
+    print(f"📊 质量评估:{source_file.name}")
+    print(f"{'='*60}")
+
+    eval_result = evaluator.evaluate_source_file(source_file)
+
+    total = eval_result["total_sources"]
+    low_quality_count = eval_result["low_quality_count"]
+    pass_count = total - low_quality_count
+    pass_rate = pass_count / total if total > 0 else 0.0
+    avg_score = eval_result["avg_score"]
+
+    print(f"\n总条目数:{total}")
+    print(f"平均得分:{avg_score:.1f}/100")
+    print(f"通过率:{pass_rate*100:.1f}% ({pass_count}/{total})")
+    print(f"\n等级分布:")
+    for grade in ["A", "B", "C", "D", "F"]:
+        count = eval_result["grade_distribution"][grade]
+        pct = count / total * 100 if total > 0 else 0
+        print(f"  {grade}: {count:3d} ({pct:5.1f}%)")
+
+    # 2. 判断质量是否达标
+    result = {
+        "status": "unknown",
+        "message": "",
+        "stats": {
+            "total": total,
+            "pass_count": pass_count,
+            "pass_rate": pass_rate,
+            "avg_score": avg_score,
+            "grade_distribution": eval_result["grade_distribution"],
+        },
+        "action": "continue",
+    }
+
+    # 3. 决策逻辑
+    if pass_rate >= min_pass_rate and avg_score >= 50:
+        # 质量良好,直接通过
+        result["status"] = "pass"
+        result["message"] = f"✅ 质量达标(通过率 {pass_rate*100:.1f}%)"
+        result["action"] = "continue"
+        print(f"\n{result['message']}")
+
+    elif pass_rate >= 0.4 and auto_filter:
+        # 质量一般,自动过滤低质量内容
+        print(f"\n⚠️  质量一般(通过率 {pass_rate*100:.1f}%),执行自动过滤...")
+
+        filtered_file = source_file.parent / "source_filtered.json"
+        filter_result = evaluator.filter_low_quality(
+            source_file, filtered_file, min_score
+        )
+
+        # 备份原文件
+        backup_file = source_file.parent / "source_original.json"
+        source_file.rename(backup_file)
+        filtered_file.rename(source_file)
+
+        result["status"] = "filtered"
+        result["message"] = (
+            f"🔍 已过滤 {filter_result['removed_count']} 条低质量内容\n"
+            f"   保留:{filter_result['filtered_count']}/{filter_result['original_count']}\n"
+            f"   原始文件备份至:{backup_file.name}"
+        )
+        result["action"] = "continue"
+        result["stats"]["filtered_count"] = filter_result["filtered_count"]
+        result["stats"]["removed_count"] = filter_result["removed_count"]
+
+        print(f"\n{result['message']}")
+
+    else:
+        # 质量太差,建议重新调研
+        result["status"] = "failed"
+        result["message"] = (
+            f"❌ 质量不达标(通过率 {pass_rate*100:.1f}%,平均分 {avg_score:.1f})\n"
+            f"   建议:重新调研或人工审核"
+        )
+        result["action"] = "manual_review"
+
+        print(f"\n{result['message']}")
+
+        # 显示主要问题
+        print(f"\n主要问题:")
+        issue_summary = _summarize_issues(eval_result["details"])
+        for issue, count in issue_summary.items():
+            print(f"  - {issue}: {count} 条")
+
+    return result
+
+
+def _summarize_issues(details: List[Dict]) -> Dict[str, int]:
+    """汇总常见问题"""
+    issue_counts = {}
+    for detail in details:
+        for issue in detail.get("issues", []):
+            issue_counts[issue] = issue_counts.get(issue, 0) + 1
+    # 按频率排序
+    return dict(sorted(issue_counts.items(), key=lambda x: x[1], reverse=True)[:5])
+
+
+def integrate_into_pipeline(output_dir: Path, min_score: float = 40.0):
+    """
+    集成到 pipeline 的入口函数
+
+    在 Phase 1.5 之后调用,检查 source.json 质量
+    """
+    source_file = output_dir / "raw_cases" / "source.json"
+
+    if not source_file.exists():
+        print(f"⚠️  source.json 不存在,跳过质量检查")
+        return {"status": "skip", "action": "continue"}
+
+    # 执行质量检查
+    result = check_and_filter_source(
+        source_file,
+        min_score=min_score,
+        min_pass_rate=0.6,
+        auto_filter=True,
+    )
+
+    # 保存质量报告
+    report_file = output_dir / "quality_report.txt"
+    generate_quality_report(source_file, report_file)
+    print(f"\n📄 详细报告已保存:{report_file}")
+
+    return result
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="集成质量检查到 pipeline")
+    parser.add_argument("output_dir", type=Path, help="输出目录(如 output/001)")
+    parser.add_argument("--min-score", type=float, default=40.0, help="最低分数阈值")
+    args = parser.parse_args()
+
+    result = integrate_into_pipeline(args.output_dir, args.min_score)
+
+    # 根据结果决定是否继续
+    if result["action"] == "manual_review":
+        print(f"\n⚠️  需要人工介入,pipeline 暂停")
+        exit(1)
+    else:
+        print(f"\n✅ 质量检查完成,pipeline 继续")
+        exit(0)

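A hedged sketch of wiring the Phase 1.5 quality gate above into the calling pipeline; the surrounding pipeline code is outside this diff, the output directory is illustrative, and the import again assumes the script directory is on sys.path.

    from pathlib import Path
    from integrate_quality_check import integrate_into_pipeline

    result = integrate_into_pipeline(Path("output/001"), min_score=40.0)

    if result["action"] == "manual_review":
        # quality below threshold: stop and hand off for re-research / manual review
        raise SystemExit("source.json failed the quality gate")
    # action == "continue": source.json either passed or was auto-filtered in place
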
+ 1 - 2
examples/process_pipeline/server.py

@@ -174,11 +174,10 @@ def get_requirements():
         case_json_path = dir_path / "case.json"
         if case_json_path.exists():
             try:
-                import json
                 with open(case_json_path, 'r', encoding='utf-8') as f:
                     case_data = json.load(f)
                     raw_cases_count = len(case_data.get('cases', []))
-            except:
+            except Exception:
                 pass
         
         if raw_cases_count == 0:

+ 177 - 13
examples/process_pipeline/ui/app.js

@@ -235,9 +235,11 @@ function renderRawCases(rawCasesObj) {
     const renderPaneContent = (pList) => {
         let paneHtml = '';
         let totalCases = 0;
-        let gridHtml = '';
         let seenIds = new Set();
         
+        let groupedHtml = {};
+        const getGroupKey = (c, p) => (p === 'filtered_cases' && c.filter_reason) ? `🚫 过滤原因: ${c.filter_reason}` : 'default';
+        
         pList.forEach(p => {
             if (!rawCasesObj[p]) return;
             if (rawCasesObj[p].error) {
@@ -248,11 +250,27 @@ function renderRawCases(rawCasesObj) {
                 </div>`;
                 return;
             }
-            if (rawCasesObj[p].reason) {
-                paneHtml += `<div style="padding:0.5rem 1rem; background:rgba(239, 68, 68, 0.1); border:1px solid var(--danger); border-radius:6px; margin-bottom:1rem; color:var(--danger); font-size:0.9rem;">🛑 ${p} 过滤原因: ${rawCasesObj[p].reason}</div>`;
+            if (rawCasesObj[p].reason && p !== 'filtered_cases') {
+                paneHtml += `<div style="padding:0.5rem 1rem; background:rgba(239, 68, 68, 0.1); border:1px solid var(--danger); border-radius:6px; margin-bottom:1rem; color:var(--danger); font-size:0.9rem;">🛑 ${p} 提示: ${rawCasesObj[p].reason}</div>`;
             }
 
-            const cases = Array.isArray(rawCasesObj[p]) ? rawCasesObj[p] : (rawCasesObj[p].cases || rawCasesObj[p].sources || []);
+            let cases = [];
+            if (Array.isArray(rawCasesObj[p])) {
+                cases = rawCasesObj[p];
+            } else if (rawCasesObj[p].cases) {
+                cases = rawCasesObj[p].cases;
+            } else if (rawCasesObj[p].sources) {
+                cases = rawCasesObj[p].sources;
+            } else if (rawCasesObj[p].by_reason) {
+                Object.entries(rawCasesObj[p].by_reason).forEach(([reasonKey, reasonObj]) => {
+                    if (reasonObj.sources && Array.isArray(reasonObj.sources)) {
+                        reasonObj.sources.forEach(src => {
+                            if (!src.filter_reason) src.filter_reason = reasonKey;
+                            cases.push(src);
+                        });
+                    }
+                });
+            }
             if (cases.length > 0) {
                 if (!rawCasesObj['source'] && p !== 'source_ex' && p !== 'filtered_cases' && p !== 'source') {
                     paneHtml += `<div style="padding:1rem; background:rgba(0, 0, 0, 0.05); border:1px solid rgba(0, 0, 0, 0.1); border-radius:8px; margin-bottom:1rem;">
@@ -266,6 +284,10 @@ function renderRawCases(rawCasesObj) {
                     totalCases++;
                     const cId = c.case_id || (c._raw && c._raw.case_id) || (c.post && c.post.channel_content_id) || `temp_${p}_${idx}`;
                     const cUrl = c.source_url || c.url || (c.post && c.post.link) || '';
+                    
+                    const groupKey = getGroupKey(c, p);
+                    if (!groupedHtml[groupKey]) groupedHtml[groupKey] = '';
+
                     if (cId || cUrl || c.post) {
                         const mappedS = sourceMap[cId] || sourceMap[cUrl] || (c._raw && sourceMap[c._raw.case_id]);
                         if (p !== 'filtered_cases' && p !== 'source' && p !== 'source_ex' && !mappedS) return;
@@ -300,7 +322,7 @@ function renderRawCases(rawCasesObj) {
                             }
                         }
                         
-                        gridHtml += `<div class="masonry-card" style="position:relative;" onclick="openCaseDetail('${p}', ${idx})">
+                        groupedHtml[groupKey] += `<div class="masonry-card" style="position:relative;" onclick="openCaseDetail('${p}', ${idx})">
                             ${platBadge}
                             ${actionBtn}
                             ${allImages.length > 0 ? `<img class="cover-img" src="${coverImgUrl}" onerror="this.onerror=null; this.src='${fallbackImgUrl}';">` : ''}
@@ -314,7 +336,7 @@ function renderRawCases(rawCasesObj) {
                             </div>
                         </div>`;
                     } else {
-                        gridHtml += `<div class="masonry-card" style="padding:12px; font-family:monospace; font-size:0.8em;" onclick="openCaseDetail('${p}', ${idx})">
+                        groupedHtml[groupKey] += `<div class="masonry-card" style="padding:12px; font-family:monospace; font-size:0.8em;" onclick="openCaseDetail('${p}', ${idx})">
                             📝 旧版格式 / 解析失败<br>点击查看详情
                         </div>`;
                     }
@@ -324,8 +346,15 @@ function renderRawCases(rawCasesObj) {
         
         if (totalCases === 0 && pList.length > 0 && !paneHtml.includes('解析失败') && !paneHtml.includes('未进行')) {
             paneHtml += `<div style="padding:1rem; color:var(--text-muted); text-align:center;">暂无数据</div>`;
-        } else if (gridHtml) {
-            paneHtml += `<div class="masonry-grid">${gridHtml}</div>`;
+        } else {
+            Object.entries(groupedHtml).forEach(([groupName, gHtml]) => {
+                if (gHtml) {
+                    if (groupName !== 'default') {
+                        paneHtml += `<h3 style="margin-top: 1rem; margin-bottom: 0.5rem; font-size: 1.1rem; color: var(--text-main); border-left: 4px solid var(--accent-primary); padding-left: 8px;">${groupName}</h3>`;
+                    }
+                    paneHtml += `<div class="masonry-grid">${gHtml}</div>`;
+                }
+            });
         }
         return paneHtml;
     };
@@ -1824,9 +1853,29 @@ window.openCaseDetail = function(p, initialIdx) {
     
     platformsToAggregate.forEach(plat => {
         if (!ctx.rawCasesObj[plat]) return;
-        const platCases = ctx.rawCasesObj[plat].cases || ctx.rawCasesObj[plat].sources || [];
+        let platCases = [];
+        if (Array.isArray(ctx.rawCasesObj[plat])) {
+            platCases = ctx.rawCasesObj[plat];
+        } else if (ctx.rawCasesObj[plat].cases) {
+            platCases = ctx.rawCasesObj[plat].cases;
+        } else if (ctx.rawCasesObj[plat].sources) {
+            platCases = ctx.rawCasesObj[plat].sources;
+        } else if (ctx.rawCasesObj[plat].by_reason) {
+            Object.entries(ctx.rawCasesObj[plat].by_reason).forEach(([reasonKey, reasonObj]) => {
+                if (reasonObj.sources && Array.isArray(reasonObj.sources)) {
+                    reasonObj.sources.forEach(src => {
+                        if (!src.filter_reason) src.filter_reason = reasonKey;
+                        platCases.push(src);
+                    });
+                }
+            });
+        }
         platCases.forEach((c, idx) => {
             const cId = c.case_id || (c._raw && c._raw.case_id) || (c.post && c.post.channel_content_id) || `temp_${plat}_${idx}`;
+            const cUrl = c.source_url || c.url || (c.post && c.post.link) || '';
+            
+            const mappedS = ctx.sourceMap[cId] || ctx.sourceMap[cUrl] || (c._raw && ctx.sourceMap[c._raw.case_id]);
+            if (plat !== 'filtered_cases' && plat !== 'source' && plat !== 'source_ex' && !mappedS) return;
             
             if (seenIds.has(cId)) return;
             seenIds.add(cId);
@@ -1914,13 +1963,40 @@ window.renderSingleCaseDetail = function(idx) {
     const title = post.title || c.title || '无标题';
     const workflowUrl = s.source_url || s.url || cUrl;
     
-    // Header
+    const publishedTime = post.publish_timestamp || post.published_at || '-';
+    const likeCount = post.like_count !== undefined ? post.like_count : (post.likes !== undefined ? post.likes : '-');
+    const collectCount = post.collect_count !== undefined ? post.collect_count : (post.collects !== undefined ? post.collects : '-');
+    const commentCount = post.comment_count !== undefined ? post.comment_count : (post.comments !== undefined ? post.comments : '-');
+    const shareCount = post.share_count !== undefined ? post.share_count : (post.shares !== undefined ? post.shares : '-');
+
     const headerHtml = `
         <h2 style="margin: 0 0 0.5rem 0; font-size: 1.4em; color: var(--text-main);">${title}</h2>
-        <div style="display: flex; gap: 12px; margin-bottom: 1rem;">
+        <div style="display: flex; gap: 12px; margin-bottom: 0.8rem;">
             ${workflowUrl ? `<a href="${workflowUrl}" target="_blank" style="color: var(--accent-primary); text-decoration: none; font-size: 0.9em;">原文 ↗</a>` : ''}
             <span style="color: var(--text-muted); font-size: 0.9em;">平台: ${platformName}</span>
         </div>
+        <div style="display: flex; gap: 10px; margin-bottom: 1rem; flex-wrap: wrap;">
+            <div style="display: flex; flex-direction: column; background: rgba(0,0,0,0.02); border: 1px solid rgba(0,0,0,0.06); padding: 4px 10px; border-radius: 6px;">
+                <span style="font-size: 0.7em; color: var(--text-muted); text-transform: uppercase;">Published</span>
+                <span style="font-size: 0.9rem; font-weight: 500; color: var(--text-main);">${publishedTime}</span>
+            </div>
+            <div style="display: flex; flex-direction: column; background: rgba(0,0,0,0.02); border: 1px solid rgba(0,0,0,0.06); padding: 4px 10px; border-radius: 6px;">
+                <span style="font-size: 0.7em; color: var(--text-muted); text-transform: uppercase;">Likes</span>
+                <span style="font-size: 0.9rem; font-weight: 500; color: var(--text-main);">${likeCount}</span>
+            </div>
+            <div style="display: flex; flex-direction: column; background: rgba(0,0,0,0.02); border: 1px solid rgba(0,0,0,0.06); padding: 4px 10px; border-radius: 6px;">
+                <span style="font-size: 0.7em; color: var(--text-muted); text-transform: uppercase;">Collects</span>
+                <span style="font-size: 0.9rem; font-weight: 500; color: var(--text-main);">${collectCount}</span>
+            </div>
+            <div style="display: flex; flex-direction: column; background: rgba(0,0,0,0.02); border: 1px solid rgba(0,0,0,0.06); padding: 4px 10px; border-radius: 6px;">
+                <span style="font-size: 0.7em; color: var(--text-muted); text-transform: uppercase;">Comments</span>
+                <span style="font-size: 0.9rem; font-weight: 500; color: var(--text-main);">${commentCount}</span>
+            </div>
+            <div style="display: flex; flex-direction: column; background: rgba(0,0,0,0.02); border: 1px solid rgba(0,0,0,0.06); padding: 4px 10px; border-radius: 6px;">
+                <span style="font-size: 0.7em; color: var(--text-muted); text-transform: uppercase;">Shares</span>
+                <span style="font-size: 0.9rem; font-weight: 500; color: var(--text-main);">${shareCount}</span>
+            </div>
+        </div>
     `;
     document.getElementById('modal-main-header').innerHTML = headerHtml;
     
@@ -1969,6 +2045,40 @@ window.renderSingleCaseDetail = function(idx) {
         </div>
     `;
     
+    // Evaluation Panel
+    if (s.evaluation && Object.keys(s.evaluation).length > 0) {
+        const renderEvalNode = (node, indent = 0) => {
+            let html = '';
+            if (typeof node === 'object' && node !== null) {
+                Object.entries(node).forEach(([k, v]) => {
+                    html += `<div style="display: flex; flex-direction: column; padding-left: ${indent}px; margin-bottom: 8px;">
+                        <span style="color: var(--text-muted); font-size: 0.85rem; font-weight: bold; text-transform: uppercase;">${k.replace(/_/g, ' ')}</span>`;
+                    if (typeof v === 'object' && v !== null) {
+                        html += `<div style="margin-top: 4px; border-left: 2px solid rgba(0,0,0,0.1); padding-left: 8px;">${renderEvalNode(v, 0)}</div>`;
+                    } else {
+                        const valColor = typeof v === 'number' ? '#3b82f6' : 'var(--text-main)';
+                        html += `<span style="font-weight: 500; font-size: 0.95rem; color: ${valColor}; margin-top: 2px;">${String(v).replace(/</g, '&lt;').replace(/>/g, '&gt;')}</span>`;
+                    }
+                    html += `</div>`;
+                });
+            }
+            return html;
+        };
+        
+        mainScrollableHtml += `
+        <div class="case-section" style="margin-top: 1rem;">
+            <div style="background: rgba(0,0,0,0.02); border: 1px solid rgba(0,0,0,0.05); border-radius: 8px; padding: 1rem;">
+                <h3 style="margin: 0 0 1rem 0; color: var(--text-main); font-size: 1.1rem; display: flex; align-items: center; gap: 10px;">
+                    <span style="color: #10b981;">📊</span> 质量评估 (Evaluation)
+                </h3>
+                <div style="display: flex; flex-direction: column; gap: 4px;">
+                    ${renderEvalNode(s.evaluation)}
+                </div>
+            </div>
+        </div>`;
+    }
+
+    
     // Extracted Data
     const wf = ctx.detailMap[cId] || (workflowUrl ? ctx.detailMapByUrl[workflowUrl] : null) || c;
     
@@ -2236,11 +2346,57 @@ window.renderStructuredData = function(items, type, parentItem = null) {
         // Stage rendering removed per request
         // Render effects
         if (item.effects && Array.isArray(item.effects) && item.effects.length > 0) {
+            let effectsHtml = '';
+            item.effects.forEach(effectItem => {
+                if (typeof effectItem === 'string') {
+                    effectsHtml += `<li style="margin-bottom: 4px;">${effectItem.replace(/</g, '&lt;').replace(/>/g, '&gt;')}</li>`;
+                } else if (typeof effectItem === 'object' && effectItem !== null) {
+                    const stmt = effectItem.statement ? effectItem.statement.replace(/</g, '&lt;').replace(/>/g, '&gt;') : 'Effect';
+                    let detailsHtml = '';
+                    const excludeKeys = ['statement'];
+                    Object.entries(effectItem).forEach(([k, v]) => {
+                        if (!excludeKeys.includes(k) && v !== null && v !== undefined && v !== '' && (!Array.isArray(v) || v.length > 0)) {
+                            let valStr = '';
+                            if (Array.isArray(v)) {
+                                valStr = `<ul style="margin: 2px 0 0 20px; padding: 0;">` + v.map(vi => `<li>${String(vi).replace(/</g, '&lt;').replace(/>/g, '&gt;')}</li>`).join('') + `</ul>`;
+                            } else if (typeof v === 'object') {
+                                valStr = JSON.stringify(v);
+                            } else {
+                                valStr = String(v).replace(/</g, '&lt;').replace(/>/g, '&gt;');
+                            }
+                            
+                            if (Array.isArray(v)) {
+                                detailsHtml += `<div style="margin-top: 6px; font-size: 0.9em;">
+                                    <div style="color: var(--text-muted); font-weight: 500; margin-bottom: 2px; text-transform: capitalize;">${k.replace(/_/g, ' ')}:</div> 
+                                    <div style="color: var(--text-main);">${valStr}</div>
+                                </div>`;
+                            } else {
+                                detailsHtml += `<div style="margin-top: 4px; font-size: 0.9em;">
+                                    <span style="color: var(--text-muted); font-weight: 500; text-transform: capitalize;">${k.replace(/_/g, ' ')}:</span> 
+                                    <span style="color: var(--text-main);">${valStr}</span>
+                                </div>`;
+                            }
+                        }
+                    });
+                    
+                    effectsHtml += `<li style="list-style: none; margin-bottom: 8px; margin-left: -20px;">
+                        <details style="background: rgba(0,0,0,0.02); border: 1px solid rgba(0,0,0,0.06); border-radius: 6px; padding: 6px 10px;">
+                            <summary style="cursor: pointer; font-weight: 500; color: var(--accent-primary); outline: none;">
+                                ${stmt}
+                            </summary>
+                            <div style="padding-top: 8px; margin-top: 6px; border-top: 1px dashed rgba(0,0,0,0.1);">
+                                ${detailsHtml}
+                            </div>
+                        </details>
+                    </li>`;
+                }
+            });
+            
             html += `<div class="structured-row">
                 <div class="structured-label">effects</div>
                 <div class="structured-value">
                     <ul class="effects-list">
-                        ${item.effects.map(li => `<li>${li.replace(/</g, '&lt;').replace(/>/g, '&gt;')}</li>`).join('')}
+                        ${effectsHtml}
                     </ul>
                 </div>
             </div>`;
@@ -2334,6 +2490,8 @@ window.renderStructuredData = function(items, type, parentItem = null) {
             return list.map(io => {
                 const desc = isValid(io.description) ? io.description.replace(/</g, '&lt;').replace(/>/g, '&gt;') : '';
                 const mod = isValid(io.modality) ? io.modality : '';
+                const relation = isValid(io.relation) ? io.relation.replace(/</g, '&lt;').replace(/>/g, '&gt;') : '';
+                
                 let content = '';
                 if (mod) {
                     content += `<span class="data-type-badge" style="background:#e0e7ff;color:#3730a3;font-weight:normal;margin-right:6px;margin-bottom:2px;display:inline-block;">${mod}</span>`;
@@ -2341,6 +2499,12 @@ window.renderStructuredData = function(items, type, parentItem = null) {
                 if (desc) {
                     content += desc;
                 }
+                
+                let extraHtml = '';
+                if (relation) {
+                    extraHtml = `<div style="font-size: 0.85em; color: #94a3b8; margin-top: 2px; font-weight: normal;">↳ relation: ${relation}</div>`;
+                }
+                
                 if (!content) {
                     const keys = Object.keys(io);
                     if (keys.length === 1 && typeof io[keys[0]] === 'string') {
@@ -2349,7 +2513,7 @@ window.renderStructuredData = function(items, type, parentItem = null) {
                         content = `<span class="data-type-badge">未知</span>`;
                     }
                 }
-                return `<div style="margin-bottom: 6px; color: var(--text-main); font-weight: bold; line-height: 1.5; word-wrap: break-word;">${content}</div>`;
+                return `<div style="margin-bottom: 6px; color: var(--text-main); font-weight: bold; line-height: 1.5; word-wrap: break-word;">${content}${extraHtml}</div>`;
             }).join('');
         };
 

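For reference, a Python-literal sketch of the filtered_cases.json shape that the by_reason handling above consumes. Only the keys mirror what extract_sources.py now writes; platforms, IDs, URLs and titles are illustrative.

    filtered_cases = {
        "total": 2,
        "by_reason": {
            "low_score": {
                "count": 1,
                "sources": [
                    {"platform": "gzh", "channel_content_id": "111", "source_url": "https://example.com/a",
                     "filter_reason": "low_score:55.0", "post": {"title": "示例帖子 A"}},
                ],
            },
            "outdated": {
                "count": 1,
                "sources": [
                    {"platform": "gzh", "channel_content_id": "222", "source_url": "https://example.com/b",
                     "filter_reason": "outdated", "post": {"title": "示例帖子 B"}},
                ],
            },
        },
    }
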
+ 39 - 0
examples/test_gzh_api.py

@@ -0,0 +1,39 @@
+import asyncio
+import json
+import os
+import sys
+
+# 将项目根目录加入 sys.path,以便导入 agent 包
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from agent.tools.builtin.content.platforms.aigc_channel import search
+
+async def main():
+    keyword = "人工智能"
+    print(f"=====================================")
+    print(f"Testing GZH (公众号) search API")
+    print(f"Keyword: {keyword}")
+    print(f"=====================================\n")
+    
+    result = await search(platform_id="gzh", keyword=keyword, max_count=5)
+    
+    if result.error:
+        print(f"❌ [Error] 搜索失败: {result.error}")
+    else:
+        print(f"✅ [Success] 搜索成功!返回结果如下:")
+        try:
+            parsed = json.loads(result.output)
+            print(json.dumps(parsed, ensure_ascii=False, indent=2))
+        except Exception:
+            print(result.output)
+            
+        if result.metadata and "posts" in result.metadata:
+            posts = result.metadata["posts"]
+            print(f"\n[Metadata] 成功获取到底层 posts 完整数据条数: {len(posts)}")
+            if len(posts) > 0:
+                print(f"[Metadata Sample] 第一条数据的标题: {posts[0].get('title', '无标题')}")
+                if "_quality_score" in posts[0]:
+                    print(f"[Quality] 第一条数据的质量评分: {posts[0]['_quality_score']} ({posts[0]['_quality_grade']})")
+
+if __name__ == "__main__":
+    asyncio.run(main())