yangxiaohui 1 mēnesi atpakaļ
vecāks
revīzija
d1cd7f924e
3 mainīti faili ar 147 papildinājumiem un 124 dzēšanām
  1. 32 1
      script/search/xiaohongshu_search.py
  2. 52 62
      sug_v6_1_2_3.py
  3. 63 61
      visualize_steps.py

+ 32 - 1
script/search/xiaohongshu_search.py

@@ -79,11 +79,42 @@ class XiaohongshuSearch:
                 headers={"Content-Type": "application/json"}
             )
             response.raise_for_status()
-            return response.json()
+            result = response.json()
+
+            # 预处理返回数据:提取 image_list 中的 URL 字符串
+            self._preprocess_response(result)
+
+            return result
         except requests.exceptions.RequestException as e:
             print(f"请求失败: {e}")
             raise
 
+    def _preprocess_response(self, result: Dict[str, Any]) -> None:
+        """
+        Preprocess a search result in place, turning each note's image_list entries from dicts into URL strings.
+
+        Args:
+            result: Raw result dict returned by the API (modified directly; nothing is returned).
+        """
+        # Notes are read from result["data"]["data"]; missing keys fall back to an empty list
+        notes = result.get("data", {}).get("data", [])
+
+        for note in notes:
+            note_card = note.get("note_card", {})
+            image_list_raw = note_card.get("image_list", [])
+
+            # Collect URL strings; entries that are neither a dict with "image_url" nor a str are dropped
+            image_list = []
+            for img in image_list_raw:
+                if isinstance(img, dict) and "image_url" in img:
+                    image_list.append(img["image_url"])
+                elif isinstance(img, str):
+                    # Already a string, so use it as is
+                    image_list.append(img)
+
+            # Replace the raw list with the preprocessed one (no effect on the note if "note_card" was absent: the default dict is a temporary)
+            note_card["image_list"] = image_list
+
     def save_result(self, keyword: str, result: Dict[str, Any], page: int = 1) -> str:
         """
         保存结果到文件

+ 52 - 62
sug_v6_1_2_3.py

@@ -481,6 +481,41 @@ def add_step(context: RunContext, step_name: str, step_type: str, data: dict):
     return step
 
 
+def process_note_data(note: dict) -> dict:
+    """
+    Normalize a note returned by the search API into a uniform format.
+
+    Args:
+        note: Raw note data as returned by the search API.
+
+    Returns:
+        Dict holding the normalized note fields.
+    """
+    note_card = note.get("note_card", {})
+    image_list = note_card.get("image_list", [])  # original note: already preprocessed into a list of URL strings at the search API layer
+    interact_info = note_card.get("interact_info", {})
+    user_info = note_card.get("user", {})
+
+    return {
+        "note_id": note.get("id", ""),
+        "title": note_card.get("display_title", ""),
+        "desc": note_card.get("desc", ""),
+        "image_list": image_list,  # first entry is treated as the cover; original note says the list is preprocessed to URL strings in XiaohongshuSearch.search()
+        "interact_info": {
+            "liked_count": interact_info.get("liked_count", 0),
+            "collected_count": interact_info.get("collected_count", 0),
+            "comment_count": interact_info.get("comment_count", 0),
+            "shared_count": interact_info.get("shared_count", 0)
+        },
+        "user": {
+            "nickname": user_info.get("nickname", ""),
+            "user_id": user_info.get("user_id", "")
+        },
+        "type": note_card.get("type", "normal"),
+        "note_url": f"https://www.xiaohongshu.com/explore/{note.get('id', '')}"
+    }
+
+
 # ============================================================================
 # 核心函数
 # ============================================================================
@@ -701,7 +736,7 @@ async def evaluate_candidates(candidates: list[str], original_question: str, con
 
                         # 对每个帖子进行独立评估
                         note_evaluations = []
-                        for note_idx, note in enumerate(notes[:10], 1):  # 只评估前10条
+                        for note_idx, note in enumerate(notes, 1):  # 评估所有帖子
                             note_card = note.get("note_card", {})
                             title = note_card.get("display_title", "")
                             desc = note_card.get("desc", "")
@@ -728,7 +763,7 @@ async def evaluate_candidates(candidates: list[str], original_question: str, con
                                 "note_index": note_idx,
                                 "note_id": note_id,
                                 "title": title,
-                                "desc": desc[:200],  # 只保存前200字
+                                "desc": desc,  # 保存完整描述
                                 "evaluation": {
                                     "title_relevance": note_eval.title_relevance,
                                     "content_expectation": note_eval.content_expectation,
@@ -740,8 +775,8 @@ async def evaluate_candidates(candidates: list[str], original_question: str, con
                             note_evaluations.append(note_evaluation_record)
 
                             # 简单打印进度
-                            if note_idx % 3 == 0 or note_idx == len(notes[:10]):
-                                print(f"         已评估 {note_idx}/{len(notes[:10])} 个帖子")
+                            if note_idx % 3 == 0 or note_idx == len(notes):
+                                print(f"         已评估 {note_idx}/{len(notes)} 个帖子")
 
                         # 统计满足需求的帖子数量
                         satisfied_count = sum(1 for ne in note_evaluations if ne["evaluation"]["need_satisfaction"])
@@ -1129,30 +1164,7 @@ async def step_search_qualified_queries(qualified_queries: list[dict], context:
             print(f"  → 搜索成功,获得 {len(notes)} 个帖子")
 
             # ⭐ 提取帖子摘要信息用于steps.json
-            notes_summary = []
-            for note in notes[:10]:  # 只保存前10个
-                note_card = note.get("note_card", {})
-                image_list = note_card.get("image_list", [])
-                interact_info = note_card.get("interact_info", {})
-                user_info = note_card.get("user", {})
-
-                notes_summary.append({
-                    "note_id": note.get("id", ""),
-                    "title": note_card.get("display_title", ""),
-                    "desc": note_card.get("desc", "")[:200],
-                    "cover_image": image_list[0] if image_list else {},
-                    "interact_info": {
-                        "liked_count": interact_info.get("liked_count", 0),
-                        "collected_count": interact_info.get("collected_count", 0),
-                        "comment_count": interact_info.get("comment_count", 0),
-                        "shared_count": interact_info.get("shared_count", 0)
-                    },
-                    "user": {
-                        "nickname": user_info.get("nickname", ""),
-                        "user_id": user_info.get("user_id", "")
-                    },
-                    "type": note_card.get("type", "normal")
-                })
+            notes_summary = [process_note_data(note) for note in notes]
 
             return {
                 "query": query,
@@ -1162,7 +1174,7 @@ async def step_search_qualified_queries(qualified_queries: list[dict], context:
                 "reason": query_info['reason'],
                 "search_result_file": search_result_file,
                 "note_count": len(notes),
-                "notes": notes[:10],  # 只保存前10个用于评估
+                "notes": notes,  # 保存所有帖子用于评估
                 "notes_summary": notes_summary  # ⭐ 保存到steps.json
             }
 
@@ -1253,16 +1265,10 @@ async def step_evaluate_search_notes(search_data: dict, original_question: str,
         # 评估每个帖子
         note_evaluations = []
         for note_idx, note in enumerate(notes, 1):
-            note_card = note.get("note_card", {})
-            title = note_card.get("display_title", "")
-            desc = note_card.get("desc", "")
-            note_id = note.get("id", "")
-
-            # ⭐ 提取完整帖子信息用于可视化
-            image_list = note_card.get("image_list", [])
-            cover_image = image_list[0] if image_list else {}
-            interact_info = note_card.get("interact_info", {})
-            user_info = note_card.get("user", {})
+            # 使用标准化函数处理帖子数据
+            note_data = process_note_data(note)
+            title = note_data["title"]
+            desc = note_data["desc"]
 
             # 调用评估Agent
             eval_input = f"""
@@ -1280,26 +1286,10 @@ async def step_evaluate_search_notes(search_data: dict, original_question: str,
             eval_result_run = await Runner.run(note_evaluator, eval_input)
             note_eval: NoteEvaluation = eval_result_run.final_output
 
+            # 合并标准化的帖子数据和评估结果
             note_evaluations.append({
+                **note_data,  # 包含所有标准化字段
                 "note_index": note_idx,
-                "note_id": note_id,
-                "title": title,
-                "desc": desc[:200],
-                # ⭐ 新增:完整帖子信息
-                "image_list": image_list,
-                "cover_image": cover_image,
-                "interact_info": {
-                    "liked_count": interact_info.get("liked_count", 0),
-                    "collected_count": interact_info.get("collected_count", 0),
-                    "comment_count": interact_info.get("comment_count", 0),
-                    "shared_count": interact_info.get("shared_count", 0)
-                },
-                "user": {
-                    "nickname": user_info.get("nickname", ""),
-                    "user_id": user_info.get("user_id", "")
-                },
-                "type": note_card.get("type", "normal"),
-                "note_url": f"https://www.xiaohongshu.com/explore/{note_id}",
                 "evaluation": {
                     "title_relevance": note_eval.title_relevance,
                     "content_expectation": note_eval.content_expectation,
@@ -1576,7 +1566,7 @@ async def progressive_exploration(context: RunContext, max_levels: int = 4) -> d
     candidates_to_evaluate = []
 
     # Level 1:单个关键词
-    level_1_queries = context.keywords[:7]
+    level_1_queries = context.keywords  # 使用所有关键词
     level_1_data = await explore_level(level_1_queries, current_level, context)
     analysis_1 = await analyze_level(level_1_data, context.exploration_levels, context.q, context)
 
@@ -1719,7 +1709,7 @@ def format_output(optimization_result: dict, context: RunContext) -> str:
         for cited_note in final_answer.get("cited_notes", []):
             output += f"[{cited_note['index']}] {cited_note['title']}\n"
             output += f"    置信度: {cited_note['confidence_score']:.2f}\n"
-            output += f"    描述: {cited_note['desc'][:100]}...\n"
+            output += f"    描述: {cited_note['desc']}\n"
             output += f"    note_id: {cited_note['note_id']}\n\n"
     else:
         output += "未能生成答案\n"
@@ -1804,7 +1794,7 @@ async def main(input_dir: str, max_levels: int = 4, visualize: bool = False):
                 "title": note["title"],
                 "confidence_score": note["confidence_score"]
             }
-            for note in satisfied_notes[:10]  # 只保存前10个摘要
+            for note in satisfied_notes  # 保存所有满足条件的帖子摘要
         ] if satisfied_notes else [],
         "final_output": final_output
     })
@@ -1852,8 +1842,8 @@ if __name__ == "__main__":
     parser.add_argument(
         "--max-levels",
         type=int,
-        default=4,
-        help="最大探索层数,默认: 4"
+        default=10,
+        help="最大探索层数,默认: 10"
     )
     parser.add_argument(
         "--visualize",

+ 63 - 61
visualize_steps.py

@@ -687,8 +687,8 @@ HTML_TEMPLATE = """<!DOCTYPE html>
 
         .modal-images-grid {
             display: grid;
-            grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
-            gap: 15px;
+            grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
+            gap: 12px;
         }
 
         .modal-image-item {
@@ -696,6 +696,7 @@ HTML_TEMPLATE = """<!DOCTYPE html>
             overflow: hidden;
             border: 2px solid #e5e7eb;
             transition: border-color 0.2s;
+            cursor: pointer;
         }
 
         .modal-image-item:hover {
@@ -706,6 +707,8 @@ HTML_TEMPLATE = """<!DOCTYPE html>
             width: 100%;
             height: auto;
             display: block;
+            max-height: 250px;
+            object-fit: cover;
         }
 
         .modal-section {
@@ -1150,9 +1153,15 @@ def build_post_json_data(note, evaluation=None):
 
     image_list = note.get('image_list', [])
     if not image_list and note.get('cover_image'):
-        image_list = [note.get('cover_image')]
+        cover = note.get('cover_image')
+        # cover_image 可能是字典或字符串
+        if isinstance(cover, dict):
+            image_list = [cover.get('image_url', '')]
+        else:
+            image_list = [cover]
 
-    images = [img.get('image_url', '') for img in image_list if img.get('image_url')]
+    # image_list 现在已经是 URL 字符串列表(由搜索API预处理)
+    images = [img if isinstance(img, str) else img.get('image_url', '') for img in image_list if img]
 
     interact = note.get('interact_info', {})
     user = note.get('user', {})
@@ -1406,16 +1415,20 @@ def render_search_results(step):
         # 渲染该query的帖子
         posts_cards = ""
         for note in notes_summary:
-            cover = note.get("cover_image", {})
-            cover_url = cover.get("image_url", "") if cover else ""
+            # 获取封面图
+            image_list = note.get('image_list', [])
+            if image_list:
+                # image_list 已经是 URL 字符串列表,第一张就是封面
+                cover_url = image_list[0] if isinstance(image_list[0], str) else image_list[0].get('image_url', '')
+            else:
+                cover = note.get("cover_image", {})
+                cover_url = cover.get("image_url", "") if isinstance(cover, dict) else cover if cover else ""
+
             interact = note.get("interact_info", {})
             user = note.get("user", {})
 
-            # 获取所有图片用于轮播
-            image_list = note.get('image_list', [])
-            if not image_list and cover:
-                image_list = [cover]
-            images = [img.get('image_url', '') for img in image_list if img.get('image_url')]
+            # image_list 现在已经是 URL 字符串列表
+            images = [img if isinstance(img, str) else img.get('image_url', '') for img in image_list if img]
 
             # 构建帖子数据用于模态框
             post_data = build_post_json_data(note)
@@ -1507,18 +1520,21 @@ def render_note_evaluations(step):
         # 渲染满足需求的帖子
         satisfied_cards = ""
         for note in satisfied_notes:
-            cover = note.get("cover_image", {})
-            cover_url = cover.get("image_url", "") if cover else ""
+            # 获取封面图
+            image_list = note.get('image_list', [])
+            if image_list:
+                cover_url = image_list[0] if isinstance(image_list[0], str) else image_list[0].get('image_url', '')
+            else:
+                cover = note.get("cover_image", {})
+                cover_url = cover.get("image_url", "") if isinstance(cover, dict) else cover if cover else ""
+
             interact = note.get("interact_info", {})
             user = note.get("user", {})
             evaluation = note.get("evaluation", {})
             confidence = evaluation.get("confidence_score", 0)
 
-            # 获取所有图片用于轮播
-            image_list = note.get('image_list', [])
-            if not image_list and cover:
-                image_list = [cover]
-            images = [img.get('image_url', '') for img in image_list if img.get('image_url')]
+            # image_list 现在已经是 URL 字符串列表
+            images = [img if isinstance(img, str) else img.get('image_url', '') for img in image_list if img]
 
             # 构建帖子数据用于模态框
             post_data = build_post_json_data(note, evaluation)
@@ -1544,16 +1560,7 @@ def render_note_evaluations(step):
             title_rel = evaluation.get("title_relevance", 0)
             content_exp = evaluation.get("content_expectation", 0)
 
-            eval_details = f"""
-            <div class="evaluation-reason">
-                <strong>💡 评估理由:</strong><br>
-                {eval_reason}
-                <div class="evaluation-scores">
-                    <span class="score-item">📌 标题相关性: {title_rel:.2f}</span>
-                    <span class="score-item">📄 内容期望: {content_exp:.2f}</span>
-                </div>
-            </div>
-            """ if eval_reason else ""
+            eval_details = ""
 
             # 置信度百分比
             confidence_percent = int(confidence * 100)
@@ -1588,17 +1595,21 @@ def render_note_evaluations(step):
         # 渲染不满足需求的帖子
         unsatisfied_cards = ""
         for note in unsatisfied_notes:
-            cover = note.get("cover_image", {})
-            cover_url = cover.get("image_url", "") if cover else ""
+            # 获取封面图
+            image_list = note.get('image_list', [])
+            if image_list:
+                cover_url = image_list[0] if isinstance(image_list[0], str) else image_list[0].get('image_url', '')
+            else:
+                cover = note.get("cover_image", {})
+                cover_url = cover.get("image_url", "") if isinstance(cover, dict) else cover if cover else ""
+
             interact = note.get("interact_info", {})
             user = note.get("user", {})
             evaluation = note.get("evaluation", {})
             confidence = evaluation.get("confidence_score", 0)
 
-            image_list = note.get('image_list', [])
-            if not image_list and cover:
-                image_list = [cover]
-            images = [img.get('image_url', '') for img in image_list if img.get('image_url')]
+            # image_list 现在已经是 URL 字符串列表
+            images = [img if isinstance(img, str) else img.get('image_url', '') for img in image_list if img]
 
             post_data = build_post_json_data(note, evaluation)
             images_json = json.dumps(images)
@@ -1621,16 +1632,7 @@ def render_note_evaluations(step):
             title_rel = evaluation.get("title_relevance", 0)
             content_exp = evaluation.get("content_expectation", 0)
 
-            eval_details = f"""
-            <div class="evaluation-reason">
-                <strong>💡 评估理由:</strong><br>
-                {eval_reason}
-                <div class="evaluation-scores">
-                    <span class="score-item">📌 标题相关性: {title_rel:.2f}</span>
-                    <span class="score-item">📄 内容期望: {content_exp:.2f}</span>
-                </div>
-            </div>
-            """ if eval_reason else ""
+            eval_details = ""
 
             confidence_percent = int(confidence * 100)
 
@@ -1728,16 +1730,19 @@ def render_answer_generation(step):
     # 渲染引用的帖子
     cited_html = ""
     for note in cited_notes:
-        cover = note.get("cover_image", {})
-        cover_url = cover.get("image_url", "") if cover else ""
+        # 获取封面图
+        image_list = note.get('image_list', [])
+        if image_list:
+            cover_url = image_list[0] if isinstance(image_list[0], str) else image_list[0].get('image_url', '')
+        else:
+            cover = note.get("cover_image", {})
+            cover_url = cover.get("image_url", "") if isinstance(cover, dict) else cover if cover else ""
+
         interact = note.get("interact_info", {})
         user = note.get("user", {})
 
-        # 获取所有图片用于轮播
-        image_list = note.get('image_list', [])
-        if not image_list and cover:
-            image_list = [cover]
-        images = [img.get('image_url', '') for img in image_list if img.get('image_url')]
+        # image_list 现在已经是 URL 字符串列表
+        images = [img if isinstance(img, str) else img.get('image_url', '') for img in image_list if img]
 
         # 构建帖子数据用于模态框(包含评估信息)
         eval_data = {
@@ -1751,6 +1756,11 @@ def render_answer_generation(step):
 
         image_html = f'<img src="{cover_url}" class="post-image" alt="{note.get("title", "")}">' if cover_url else '<div class="no-image">无图片</div>'
 
+        # 类型标识
+        type_badge = ""
+        if note.get("type") == "video":
+            type_badge = '<div class="post-type-badge">📹 视频</div>'
+
         # 轮播指示器
         dots_html = ""
         if len(images) > 1:
@@ -1765,16 +1775,7 @@ def render_answer_generation(step):
         title_rel = note.get("title_relevance", 0)
         content_exp = note.get("content_expectation", 0)
 
-        eval_details = f"""
-        <div class="evaluation-reason">
-            <strong>💡 评估理由:</strong><br>
-            {eval_reason}
-            <div class="evaluation-scores">
-                <span class="score-item">📌 标题相关性: {title_rel:.2f}</span>
-                <span class="score-item">📄 内容期望: {content_exp:.2f}</span>
-            </div>
-        </div>
-        """ if eval_reason else ""
+        eval_details = ""
 
         # 置信度百分比
         note_confidence = note.get('confidence_score', 0)
@@ -1784,6 +1785,7 @@ def render_answer_generation(step):
         <div class="post-card" onclick='openModal({post_data})' data-images='{images_json}'>
             <div class="post-image-wrapper">
                 {image_html}
+                {type_badge}
                 {dots_html}
             </div>
             <div class="post-info">