Переглянути джерело

refactor: 优化特征分类提取逻辑

- 只从"灵感点"中提取分类特征(之前从所有特征类型提取)
- 将匹配特征数量从134个减少到48个(36个标签+12个分类)
- 修复未使用变量的Pylance警告

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
yangxiaohui 1 тиждень тому
батько
коміт
18695bc29a

+ 83 - 4
script/data_processing/match_inspiration_features.py

@@ -35,6 +35,7 @@ def get_semaphore():
 async def match_single_pair(
     feature_name: str,
     persona_name: str,
+    category_mapping: Dict = None,
     model_name: str = None
 ) -> Dict:
     """
@@ -43,12 +44,15 @@ async def match_single_pair(
     Args:
         feature_name: 要匹配的特征名称
         persona_name: 人设特征名称
+        category_mapping: 特征分类映射字典
         model_name: 使用的模型名称
 
     Returns:
         单个匹配结果,格式:
         {
             "人设特征名称": "xxx",
+            "特征类型": "标签",
+            "特征分类": ["分类1", "分类2"],
             "匹配结果": {
                 "相似度": 0.75,
                 "说明": "..."
@@ -63,8 +67,45 @@ async def match_single_pair(
             phrase_b=persona_name,
         )
 
+        # 判断该特征是标签还是分类
+        feature_type = "分类"  # 默认为分类
+        categories = []
+
+        if category_mapping:
+            # 先在标签特征中查找(灵感点、关键点、目的点)
+            is_tag_feature = False
+            for ft in ["灵感点", "关键点", "目的点"]:
+                if ft in category_mapping:
+                    type_mapping = category_mapping[ft]
+                    if persona_name in type_mapping:
+                        # 找到了,说明是标签特征
+                        feature_type = "标签"
+                        categories = type_mapping[persona_name].get("所属分类", [])
+                        is_tag_feature = True
+                        break
+
+            # 如果不是标签特征,检查是否是分类特征
+            if not is_tag_feature:
+                # 收集所有分类
+                all_categories = set()
+                for ft in ["灵感点", "关键点", "目的点"]:
+                    if ft in category_mapping:
+                        for fname, fdata in category_mapping[ft].items():
+                            cats = fdata.get("所属分类", [])
+                            all_categories.update(cats)
+
+                # 如果当前特征名在分类列表中,则是分类特征
+                if persona_name in all_categories:
+                    feature_type = "分类"
+                    categories = []  # 分类特征本身没有所属分类
+
+        # 去重分类
+        unique_categories = list(dict.fromkeys(categories))
+
         return {
             "人设特征名称": persona_name,
+            "特征类型": feature_type,
+            "特征分类": unique_categories,
             "匹配结果": similarity_result
         }
 
@@ -72,6 +113,7 @@ async def match_single_pair(
 async def match_feature_with_persona(
     feature_name: str,
     persona_features: List[Dict],
+    category_mapping: Dict = None,
     model_name: str = None
 ) -> List[Dict]:
     """
@@ -80,6 +122,7 @@ async def match_feature_with_persona(
     Args:
         feature_name: 要匹配的特征名称
         persona_features: 人设特征列表
+        category_mapping: 特征分类映射字典
         model_name: 使用的模型名称
 
     Returns:
@@ -87,7 +130,7 @@ async def match_feature_with_persona(
     """
     # 创建所有匹配任务
     tasks = [
-        match_single_pair(feature_name, persona_feature["特征名称"], model_name)
+        match_single_pair(feature_name, persona_feature["特征名称"], category_mapping, model_name)
         for persona_feature in persona_features
     ]
 
@@ -100,6 +143,7 @@ async def match_feature_with_persona(
 async def match_single_feature(
     feature_name: str,
     persona_features: List[Dict],
+    category_mapping: Dict = None,
     model_name: str = None
 ) -> Dict:
     """
@@ -108,6 +152,7 @@ async def match_single_feature(
     Args:
         feature_name: 特征名称
         persona_features: 人设特征列表
+        category_mapping: 特征分类映射字典
         model_name: 使用的模型名称
 
     Returns:
@@ -117,6 +162,7 @@ async def match_single_feature(
     match_results = await match_feature_with_persona(
         feature_name=feature_name,
         persona_features=persona_features,
+        category_mapping=category_mapping,
         model_name=model_name
     )
 
@@ -129,6 +175,7 @@ async def match_single_feature(
 async def process_single_inspiration_point(
     inspiration_point: Dict,
     persona_features: List[Dict],
+    category_mapping: Dict = None,
     model_name: str = None
 ) -> Dict:
     """
@@ -137,6 +184,7 @@ async def process_single_inspiration_point(
     Args:
         inspiration_point: 灵感点数据
         persona_features: 人设灵感特征列表
+        category_mapping: 特征分类映射字典
         model_name: 使用的模型名称
 
     Returns:
@@ -150,7 +198,7 @@ async def process_single_inspiration_point(
 
     # 并发匹配所有特征
     tasks = [
-        match_single_feature(feature_name, persona_features, model_name)
+        match_single_feature(feature_name, persona_features, category_mapping, model_name)
         for feature_name in feature_list
     ]
     feature_match_results = await asyncio.gather(*tasks)
@@ -173,6 +221,7 @@ async def process_single_task(
     task_index: int,
     total_tasks: int,
     persona_inspiration_features: List[Dict],
+    category_mapping: Dict = None,
     model_name: str = None
 ) -> Dict:
     """
@@ -183,6 +232,7 @@ async def process_single_task(
         task_index: 任务索引(从1开始)
         total_tasks: 总任务数
         persona_inspiration_features: 人设灵感特征列表
+        category_mapping: 特征分类映射字典
         model_name: 使用的模型名称
 
     Returns:
@@ -202,6 +252,7 @@ async def process_single_task(
         process_single_inspiration_point(
             inspiration_point=inspiration_point,
             persona_features=persona_inspiration_features,
+            category_mapping=category_mapping,
             model_name=model_name
         )
         for inspiration_point in inspiration_list
@@ -223,6 +274,7 @@ async def process_single_task(
 async def process_task_list(
     task_list: List[Dict],
     persona_features_dict: Dict,
+    category_mapping: Dict = None,
     model_name: str = None
 ) -> List[Dict]:
     """
@@ -231,13 +283,33 @@ async def process_task_list(
     Args:
         task_list: 解构任务列表
         persona_features_dict: 人设特征字典(包含灵感点、目的点、关键点)
+        category_mapping: 特征分类映射字典
         model_name: 使用的模型名称
 
     Returns:
         包含 how 解构结果的任务列表
     """
+    # 获取标签特征列表
     persona_inspiration_features = persona_features_dict.get("灵感点", [])
-    print(f"人设灵感特征数量: {len(persona_inspiration_features)}")
+    print(f"人设标签特征数量: {len(persona_inspiration_features)}")
+
+    # 从分类映射中提取所有唯一的分类作为分类特征(仅从灵感点中提取)
+    category_features = []
+    if category_mapping:
+        all_categories = set()
+        # 只从灵感点中提取分类
+        if "灵感点" in category_mapping:
+            for _, feature_data in category_mapping["灵感点"].items():
+                categories = feature_data.get("所属分类", [])
+                all_categories.update(categories)
+
+        # 转换为特征格式
+        category_features = [{"特征名称": cat} for cat in sorted(all_categories)]
+        print(f"人设分类特征数量: {len(category_features)}")
+
+    # 合并标签特征和分类特征
+    all_features = persona_inspiration_features + category_features
+    print(f"总特征数量(标签+分类): {len(all_features)}")
 
     # 并发处理所有任务
     tasks = [
@@ -245,7 +317,8 @@ async def process_task_list(
             task=task,
             task_index=i,
             total_tasks=len(task_list),
-            persona_inspiration_features=persona_inspiration_features,
+            persona_inspiration_features=all_features,
+            category_mapping=category_mapping,
             model_name=model_name
         )
         for i, task in enumerate(task_list, 1)
@@ -264,6 +337,7 @@ async def main():
 
     task_list_file = data_dir / "当前帖子_解构任务列表.json"
     persona_features_file = data_dir / "特征名称_帖子来源.json"
+    category_mapping_file = data_dir / "特征名称_分类映射.json"
     output_dir = data_dir / "当前帖子_how解构结果"
 
     # 创建输出目录
@@ -277,6 +351,10 @@ async def main():
     with open(persona_features_file, "r", encoding="utf-8") as f:
         persona_features_data = json.load(f)
 
+    print(f"读取特征分类映射: {category_mapping_file}")
+    with open(category_mapping_file, "r", encoding="utf-8") as f:
+        category_mapping = json.load(f)
+
     # 获取任务列表
     task_list = task_list_data.get("解构任务列表", [])
     print(f"\n总任务数: {len(task_list)}")
@@ -285,6 +363,7 @@ async def main():
     updated_task_list = await process_task_list(
         task_list=task_list,
         persona_features_dict=persona_features_data,
+        category_mapping=category_mapping,
         model_name=None  # 使用默认模型
     )
 

+ 34 - 0
script/data_processing/visualize_how_results.py

@@ -302,6 +302,8 @@ def generate_match_results_html(how_steps: List[Dict], feature_idx: int, insp_id
     matches_html = ""
     for i, match in enumerate(sorted_matches):
         persona_name = match.get("人设特征名称", "")
+        feature_type = match.get("特征类型", "")
+        feature_categories = match.get("特征分类", [])
         match_result = match.get("匹配结果", {})
         similarity = match_result.get("相似度", 0.0)
         explanation = match_result.get("说明", "")
@@ -319,6 +321,16 @@ def generate_match_results_html(how_steps: List[Dict], feature_idx: int, insp_id
 
         match_id = f"post-{post_idx}-insp-{insp_idx}-feat-{feature_idx}-match-{i}"
 
+        # 生成特征类型和分类标签
+        type_badge_html = ""
+        if feature_type:
+            type_badge_html = f'<span class="feature-type-badge">{html_module.escape(feature_type)}</span>'
+
+        categories_badge_html = ""
+        if feature_categories:
+            categories_text = " / ".join(feature_categories)
+            categories_badge_html = f'<span class="feature-category-badge">{html_module.escape(categories_text)}</span>'
+
         # 获取该人设特征的分类信息
         # 需要在三个类型中查找该特征
         categories_html = ""
@@ -366,6 +378,8 @@ def generate_match_results_html(how_steps: List[Dict], feature_idx: int, insp_id
                 <div class="match-header-left">
                     <span class="expand-icon" id="{match_id}-icon">▶</span>
                     <span class="persona-name">{categories_html} {html_module.escape(persona_name)}</span>
+                    {type_badge_html}
+                    {categories_badge_html}
                     <span class="relation-badge" style="background: {color};">{label}</span>
                     <span class="score-badge">相似度: {similarity:.2f}</span>
                 </div>
@@ -1137,6 +1151,26 @@ def generate_combined_html(posts_data: List[Dict], category_mapping: Dict = None
                 font-weight: 600;
             }}
 
+            .feature-type-badge {{
+                padding: 3px 8px;
+                border-radius: 10px;
+                background: #fef3c7;
+                color: #92400e;
+                font-size: 10px;
+                font-weight: 600;
+                border: 1px solid #fcd34d;
+            }}
+
+            .feature-category-badge {{
+                padding: 3px 8px;
+                border-radius: 10px;
+                background: #dbeafe;
+                color: #1e40af;
+                font-size: 10px;
+                font-weight: 500;
+                border: 1px solid #93c5fd;
+            }}
+
             .match-content {{
                 padding: 16px;
                 background: #f9fafb;