瀏覽代碼

组词和可视化

刘立冬 3 周之前
父節點
當前提交
4e3252e2af
共有 3 個文件被更改,包括 223 次插入 和 160 次删除
  1. 40 19
      sug_v6_1_2_121.py
  2. 100 137
      visualization/sug_v6_1_2_121/convert_v8_to_graph_v3.js
  3. 83 4
      visualization/sug_v6_1_2_121/index.js

+ 40 - 19
sug_v6_1_2_121.py

@@ -109,6 +109,8 @@ class Q(BaseModel):
     reason: str = ""  # 评分理由
     from_source: str = ""  # v120: seg/sug/add; v121新增: segment/domain_comb/sug
     type_label: str = ""  # v121新增:域类型标签(仅用于domain_comb来源)
+    domain_index: int = -1  # v121新增:域索引(word来源时有效,-1表示无域)
+    domain_type: str = ""  # v121新增:域类型(word来源时表示所属segment的type,如"中心名词")
 
 
 class Sug(BaseModel):
@@ -252,7 +254,7 @@ word_segmentation_instructions = """
 1. 保留有搜索意义的词汇
 2. 拆分成独立的概念
 3. 保留专业术语的完整性
-4. 去除虚词(的、吗、呢等)
+4. 去除虚词(的、吗、呢等),但保留疑问词(如何、为什么、怎样等)
 
 ## 输出要求
 返回分词列表和分词理由。
@@ -817,6 +819,10 @@ def generate_domain_combinations(segments: list[Segment], n_domains: int) -> lis
     for domain_indices in combinations(range(n), n_domains):
         selected_segments = [segments[i] for i in domain_indices]
 
+        # 新增:如果所有域都只有1个词,跳过(单段落单词不组合)
+        if all(len(seg.words) == 1 for seg in selected_segments):
+            continue
+
         # 2. 为每个选中的域生成其words的所有有序子集
         domain_subsets = []
         for seg in selected_segments:
@@ -882,7 +888,9 @@ def extract_words_from_segments(segments: list[Segment]) -> list[Q]:
                 score_with_o=word_score,
                 reason=word_reason,
                 from_source="word",  # 标记来源为 word
-                type_label=f"[{segment.type}]"  # 保留域信息
+                type_label=f"[{segment.type}]",  # 保留域信息
+                domain_index=seg_idx,  # 添加域索引
+                domain_type=segment.type  # 添加域类型(如"中心名词"、"核心动作")
             )
             q_list.append(q)
 
@@ -1139,7 +1147,7 @@ async def initialize(o: str, context: RunContext) -> tuple[list[Seg], list[Word]
     # 2. 分词评估:seg_list -> 每个seg与o进行评分(使用信号量限制并发数)
     print(f"\n[步骤2] 评估每个分词与原始问题的相关度...")
 
-    MAX_CONCURRENT_SEG_EVALUATIONS = 5
+    MAX_CONCURRENT_SEG_EVALUATIONS = 10
     seg_semaphore = asyncio.Semaphore(MAX_CONCURRENT_SEG_EVALUATIONS)
 
     async def evaluate_seg(seg: Seg) -> Seg:
@@ -2030,12 +2038,12 @@ async def run_round_v2(
     eval_tasks = [evaluate_combination(comb) for comb in domain_combinations]
     await asyncio.gather(*eval_tasks)
 
-    # 排序
-    domain_combinations.sort(key=lambda x: x.score_with_o, reverse=True)
+    # 排序 - 已注释,保持原始顺序
+    # domain_combinations.sort(key=lambda x: x.score_with_o, reverse=True)
 
-    # 打印Top 10
-    print(f"  评估完成,Top 10:")
-    for i, comb in enumerate(domain_combinations[:10], 1):
+    # 打印所有组合(保持原始顺序)
+    print(f"  评估完成,共{len(domain_combinations)}个组合:")
+    for i, comb in enumerate(domain_combinations, 1):
         print(f"    {i}. {comb.text} {comb.type_label} (分数: {comb.score_with_o:.2f})")
 
     # 步骤6: 构建 q_list_next(组合 + 高分SUG)
@@ -2045,12 +2053,16 @@ async def run_round_v2(
     # 6.1 添加高分组合
     high_score_combinations = [comb for comb in domain_combinations if comb.score_with_o > REQUIRED_SCORE_GAIN]
     for comb in high_score_combinations:
+        # 生成域字符串,如 "D0,D3"
+        domains_str = ','.join([f'D{d}' for d in comb.domains]) if comb.domains else ''
+
         q = Q(
             text=comb.text,
             score_with_o=comb.score_with_o,
             reason=comb.reason,
             from_source="domain_comb",
-            type_label=comb.type_label
+            type_label=comb.type_label,
+            domain_type=domains_str  # 添加域信息
         )
         q_list_next.append(q)
 
@@ -2072,20 +2084,29 @@ async def run_round_v2(
 
     print(f"  添加 {len(high_gain_sugs)} 个高增益SUG(增益 > {REQUIRED_SCORE_GAIN})")
 
-    # 保存round数据
-    search_results_data = [
-        {
+    # 保存round数据(包含完整帖子信息)
+    search_results_data = []
+    for search in search_list:
+        search_results_data.append({
             "text": search.text,
             "score_with_o": search.score_with_o,
-            "post_count": len(search.post_list)
-        }
-        for search in search_list
-    ]
+            "post_list": [
+                {
+                    "note_id": post.note_id,
+                    "note_url": post.note_url,
+                    "title": post.title,
+                    "body_text": post.body_text,
+                    "images": post.images,
+                    "interact_info": post.interact_info
+                }
+                for post in search.post_list
+            ]
+        })
 
     round_data.update({
-        "input_queries": [{"text": q.text, "score": q.score_with_o, "from_source": q.from_source, "type": "input"} for q in query_input],
+        "input_queries": [{"text": q.text, "score": q.score_with_o, "from_source": q.from_source, "type": "input", "domain_index": q.domain_index, "domain_type": q.domain_type} for q in query_input],
         "domain_combinations_count": len(domain_combinations),
-        "domain_combinations_top10": [
+        "domain_combinations": [
             {
                 "text": comb.text,
                 "type_label": comb.type_label,
@@ -2095,7 +2116,7 @@ async def run_round_v2(
                 "source_words": comb.source_words,
                 "from_segments": comb.from_segments
             }
-            for comb in domain_combinations[:10]
+            for comb in domain_combinations
         ],
         "high_score_combinations": [{"text": q.text, "score": q.score_with_o, "type_label": q.type_label, "type": "combination"} for q in q_list_next if q.from_source == "domain_comb"],
         "sug_count": len(all_sugs),

+ 100 - 137
visualization/sug_v6_1_2_121/convert_v8_to_graph_v3.js

@@ -88,7 +88,8 @@ function convertV8ToGraphV2(runContext, searchResults) {
             iteration: roundNum,
             is_selected: true,
             segment_type: seg.type,
-            domain_index: seg.domain_index
+            domain_index: seg.domain_index,
+            domain_type: seg.type  // 新增:让可视化显示类型而不是D0
           };
 
           edges.push({
@@ -238,13 +239,13 @@ function convertV8ToGraphV2(runContext, searchResults) {
         // 为每个 Q 创建节点
         Object.keys(round.sug_details).forEach((qText, qIndex) => {
           // 从q_list_1中查找对应的q获取分数和理由
-          // Round 0: 从q_list_1查找; Round 1+: 从input_q_list查找
+          // Round 0: 从q_list_1查找; Round 1+: 从input_queries查找
           let qData = {};
           if (roundNum === 0) {
             qData = round.q_list_1?.find(q => q.text === qText) || {};
           } else {
-            // 从当前轮的input_q_list中查找
-            qData = round.input_q_list?.find(q => q.text === qText) || {};
+            // 从当前轮的input_queries中查找
+            qData = round.input_queries?.find(q => q.text === qText) || {};
           }
           const qId = `q_${qText}_r${roundNum}_${qIndex}`;
           nodes[qId] = {
@@ -256,7 +257,9 @@ function convertV8ToGraphV2(runContext, searchResults) {
             strategy: 'Query',
             iteration: roundNum,
             is_selected: true,
-            type_label: qData.type_label || qData.typeLabel || ''
+            type_label: qData.type_label || qData.typeLabel || '',
+            domain_index: qData.domain_index,
+            domain_type: qData.domain_type || ''
           };
 
           edges.push({
@@ -305,7 +308,64 @@ function convertV8ToGraphV2(runContext, searchResults) {
         });
       }
 
-      // 步骤2: 筛选并执行搜索
+      // 步骤2: 域内组词(Round 1+)
+      // 兼容旧字段名 domain_combinations_top10
+      const domainCombinations = round.domain_combinations || round.domain_combinations_top10 || [];
+      if (domainCombinations.length > 0) {
+        const combStepId = `step_comb_r${roundNum}`;
+        nodes[combStepId] = {
+          type: 'step',
+          query: `步骤2: 跨${roundNum}个域组合 (${domainCombinations.length}个组合)`,
+          level: roundNum * 10 + 1,
+          relevance_score: 0,
+          strategy: '域内组词',
+          iteration: roundNum,
+          is_selected: true
+        };
+
+        edges.push({
+          from: roundId,
+          to: combStepId,
+          edge_type: 'round_to_step',
+          strategy: '域内组词'
+        });
+
+        iterations[roundNum * 10].push(combStepId);
+
+        // 为每个域内组合创建节点
+        domainCombinations.forEach((comb, combIndex) => {
+          const combId = `comb_${comb.text}_r${roundNum}_${combIndex}`;
+          const domainsStr = comb.domains ? comb.domains.map(d => `D${d}`).join(',') : '';
+
+          nodes[combId] = {
+            type: 'domain_combination',
+            query: `${comb.text}`,  // 移除 type_label,稍后在UI中单独显示
+            level: roundNum * 10 + 2,
+            relevance_score: comb.score || 0,
+            evaluationReason: comb.reason || '',
+            strategy: '域内组合',
+            iteration: roundNum,
+            is_selected: true,
+            type_label: comb.type_label || '',
+            source_words: comb.source_words || [],
+            from_segments: comb.from_segments || [],
+            domains: comb.domains || [],
+            domains_str: domainsStr
+          };
+
+          edges.push({
+            from: combStepId,
+            to: combId,
+            edge_type: 'step_to_comb',
+            strategy: '域内组合'
+          });
+
+          if (!iterations[roundNum * 10 + 2]) iterations[roundNum * 10 + 2] = [];
+          iterations[roundNum * 10 + 2].push(combId);
+        });
+      }
+
+      // 步骤3: 筛选并执行搜索
       const searchStepId = `step_search_r${roundNum}`;
       const searchCountText = round.search_count > 0
         ? `筛选${round.high_score_sug_count}个高分词,搜索${round.search_count}次,${round.total_posts}个帖子`
@@ -313,7 +373,7 @@ function convertV8ToGraphV2(runContext, searchResults) {
 
       nodes[searchStepId] = {
         type: 'step',
-        query: `步骤2: 筛选并执行搜索 (${searchCountText})`,
+        query: `步骤3: 筛选并执行搜索 (${searchCountText})`,
         level: roundNum * 10 + 1,
         relevance_score: 0,
         strategy: '筛选并执行搜索',
@@ -545,7 +605,7 @@ function convertV8ToGraphV2(runContext, searchResults) {
                 if (roundNum === 0) {
                   qData = round.q_list_1?.find(q => q.text === qText) || {};
                 } else {
-                  qData = round.input_q_list?.find(q => q.text === qText) || {};
+                  qData = round.input_queries?.find(q => q.text === qText) || {};
                 }
                 parentQScore = qData.score || 0;
                 break;
@@ -581,153 +641,56 @@ function convertV8ToGraphV2(runContext, searchResults) {
         });
       }
 
-      // 步骤5: 构建下一轮
-      const nextRoundStepId = `step_next_round_r${roundNum}`;
-      const nextQCount = round.output_q_list?.length || 0;
-      const nextSeedCount = round.seed_list_next_size || 0;
-
-      nodes[nextRoundStepId] = {
-        type: 'step',
-        query: `步骤5: 构建下一轮 (${nextQCount}个查询, ${nextSeedCount}个种子)`,
-        level: roundNum * 10 + 1,
-        relevance_score: 0,
-        strategy: '构建下一轮',
-        iteration: roundNum,
-        is_selected: true
-      };
+      // 步骤4: 构建下一轮(Round 1+)
+      const highScoreCombinations = round.high_score_combinations || [];
+      const highGainSugs = round.high_gain_sugs || [];
+      const nextRoundItems = [...highScoreCombinations, ...highGainSugs];
 
-      edges.push({
-        from: roundId,
-        to: nextRoundStepId,
-        edge_type: 'round_to_step',
-        strategy: '构建下一轮'
-      });
-
-      iterations[roundNum * 10].push(nextRoundStepId);
-
-      // 5.1: 构建下轮查询
-      if (round.output_q_list && round.output_q_list.length > 0) {
-        const nextQStepId = `step_next_q_r${roundNum}`;
-        nodes[nextQStepId] = {
-          type: 'step',
-          query: `构建下轮查询 (${nextQCount}个)`,
-          level: roundNum * 10 + 2,
-          relevance_score: 0,
-          strategy: '下轮查询',
-          iteration: roundNum,
-          is_selected: true
-        };
-
-        edges.push({
-          from: nextRoundStepId,
-          to: nextQStepId,
-          edge_type: 'step_to_step',
-          strategy: '查询'
-        });
-
-        if (!iterations[roundNum * 10 + 2]) iterations[roundNum * 10 + 2] = [];
-        iterations[roundNum * 10 + 2].push(nextQStepId);
-
-        // 添加下轮查询列表
-        round.output_q_list.forEach((q, qIndex) => {
-          const nextQId = `next_q_${q.text}_r${roundNum}_${qIndex}`;
-
-          // 根据来源设置strategy
-          let strategy;
-          if (q.from === 'seg') {
-            strategy = '初始分词';
-          } else if (q.from === 'add') {
-            strategy = '加词';
-          } else if (q.from === 'sug') {
-            strategy = '调用sug';
-          } else {
-            strategy = 'Query'; // 默认
-          }
-
-          nodes[nextQId] = {
-            type: 'next_q',
-            query: '[Q] ' + q.text,
-            level: roundNum * 10 + 3,
-            relevance_score: q.score || 0,
-            evaluationReason: q.reason || '',
-            strategy: strategy,
-            iteration: roundNum,
-            is_selected: true,
-            from_source: q.from,
-            type_label: q.type_label || q.typeLabel || ''
-          };
-
-          edges.push({
-            from: nextQStepId,
-            to: nextQId,
-            edge_type: 'step_to_next_q',
-            strategy: strategy
-          });
-
-          if (!iterations[roundNum * 10 + 3]) iterations[roundNum * 10 + 3] = [];
-          iterations[roundNum * 10 + 3].push(nextQId);
-        });
-      }
-
-      // 5.2: 构建下轮种子(如果有数据的话)
-      if (nextSeedCount > 0 && round.seed_list_next) {
-        const nextSeedStepId = `step_next_seed_r${roundNum}`;
-        nodes[nextSeedStepId] = {
+      if (nextRoundItems.length > 0) {
+        const nextRoundStepId = `step_next_round_r${roundNum}`;
+        nodes[nextRoundStepId] = {
           type: 'step',
-          query: `构建下轮种子 (${nextSeedCount}个)`,
-          level: roundNum * 10 + 2,
+          query: `步骤4: 构建下一轮 (${nextRoundItems.length}个查询)`,
+          level: roundNum * 10 + 1,
           relevance_score: 0,
-          strategy: '下轮种子',
+          strategy: '构建下一轮',
           iteration: roundNum,
           is_selected: true
         };
 
         edges.push({
-          from: nextRoundStepId,
-          to: nextSeedStepId,
-          edge_type: 'step_to_step',
-          strategy: '种子'
+          from: roundId,
+          to: nextRoundStepId,
+          edge_type: 'round_to_step',
+          strategy: '构建下一轮'
         });
 
-        if (!iterations[roundNum * 10 + 2]) iterations[roundNum * 10 + 2] = [];
-        iterations[roundNum * 10 + 2].push(nextSeedStepId);
+        iterations[roundNum * 10].push(nextRoundStepId);
 
-        // 添加下轮种子列表
-        round.seed_list_next.forEach((seed, seedIndex) => {
-          const nextSeedId = `next_seed_${seed.text}_r${roundNum}_${seedIndex}`;
-
-          // 根据来源设置strategy
-          let strategy;
-          if (seed.from === 'seg') {
-            strategy = '初始分词';
-          } else if (seed.from === 'add') {
-            strategy = '加词';
-          } else if (seed.from === 'sug') {
-            strategy = '调用sug';
-          } else {
-            strategy = 'Seed'; // 默认
-          }
-
-          nodes[nextSeedId] = {
-            type: 'next_seed',
-            query: seed.text,
-            level: roundNum * 10 + 3,
-            relevance_score: seed.score || 0,
-            strategy: strategy,
+        // 创建查询节点
+        nextRoundItems.forEach((item, index) => {
+          const itemId = `next_round_${item.text}_r${roundNum}_${index}`;
+          nodes[itemId] = {
+            type: 'next_round_item',
+            query: '[Q] ' + item.text,
+            level: roundNum * 10 + 2,
+            relevance_score: item.score || 0,
+            strategy: item.type === 'combination' ? '域内组合' : '高增益SUG',
             iteration: roundNum,
             is_selected: true,
-            from_source: seed.from
+            type_label: item.type_label || '',
+            item_type: item.type
           };
 
           edges.push({
-            from: nextSeedStepId,
-            to: nextSeedId,
-            edge_type: 'step_to_next_seed',
-            strategy: strategy
+            from: nextRoundStepId,
+            to: itemId,
+            edge_type: 'step_to_next_round',
+            strategy: item.type === 'combination' ? '域内组合' : 'SUG'
           });
 
-          if (!iterations[roundNum * 10 + 3]) iterations[roundNum * 10 + 3] = [];
-          iterations[roundNum * 10 + 3].push(nextSeedId);
+          if (!iterations[roundNum * 10 + 2]) iterations[roundNum * 10 + 2] = [];
+          iterations[roundNum * 10 + 2].push(itemId);
         });
       }
     }

+ 83 - 4
visualization/sug_v6_1_2_121/index.js

@@ -881,18 +881,64 @@ function TreeNode({ node, level, children, isCollapsed, onToggle, isSelected, on
             alignItems: 'center',
             gap: '8px',
           }}>
+            {/* 文本标题 - 左侧 */}
             <div style={{
               fontWeight: level === 0 ? '600' : '400',
-              maxWidth: '180px',
               flex: 1,
               minWidth: 0,
               color: node.data.scoreColor || fontColor,
+              overflow: 'hidden',
+              textOverflow: 'ellipsis',
+              whiteSpace: 'nowrap',
             }}
             title={node.data.title || node.id}
             >
-              {truncateMiddle(node.data.title || node.id, 18)}
+              {node.data.title || node.id}
             </div>
 
+            {/* 域标识 - 右侧,挨着分数,优先显示域类型,否则显示域索引或域字符串,但domain_combination节点不显示 */}
+            {(node.data.domain_type || node.data.domains_str || (node.data.domain_index !== null && node.data.domain_index !== undefined)) && nodeActualType !== 'domain_combination' && (
+              <span style={{
+                fontSize: '9px',
+                color: '#fff',
+                background: '#6366f1',
+                padding: '2px 5px',
+                borderRadius: '3px',
+                flexShrink: 0,
+                fontWeight: '600',
+                marginLeft: '4px',
+              }}
+              title={
+                node.data.domain_type ? '域: ' + node.data.domain_type + ' (D' + node.data.domain_index + ')' :
+                node.data.domains_str ? '域: ' + node.data.domains_str :
+                '域 D' + node.data.domain_index
+              }
+              >
+                {node.data.domain_type || node.data.domains_str || ('D' + node.data.domain_index)}
+              </span>
+            )}
+
+            {/* 类型标签 - 显示在右侧靠近分数,蓝色背景 */}
+            {node.data.type_label && (
+              <span style={{
+                fontSize: '9px',
+                color: '#fff',
+                background: '#3b82f6',
+                padding: '2px 5px',
+                borderRadius: '3px',
+                flexShrink: 0,
+                fontWeight: '600',
+                maxWidth: '80px',
+                overflow: 'hidden',
+                textOverflow: 'ellipsis',
+                whiteSpace: 'nowrap',
+              }}
+              title={'类型: ' + node.data.type_label}
+              >
+                {node.data.type_label}
+              </span>
+            )}
+
             {/* 分数显示 - 步骤和轮次节点不显示分数 */}
             {nodeActualType !== 'step' && nodeActualType !== 'round' && (
               <span style={{
@@ -1109,6 +1155,13 @@ function transformData(data) {
             selectedWord: node.selected_word || '', // 加词节点特有 - 显示选择的词
             scoreColor: node.scoreColor || null,        // SUG节点的颜色标识
             parentQScore: node.parentQScore || 0,       // 父Q得分(用于调试)
+            domain_index: node.domain_index !== undefined ? node.domain_index : null, // 域索引
+            domain_type: node.domain_type || '', // 域类型(如"中心名词"、"核心动作"),只有Q节点有,segment节点不显示
+            segment_type: node.segment_type || '', // segment类型(只有segment节点才有)
+            type_label: node.type_label || '', // 类型标签
+            domains: node.domains || [], // 域索引数组(domain_combination节点特有)
+            domains_str: node.domains_str || '', // 域标识字符串(如"D0,D1")
+            from_segments: node.from_segments || [], // 来源segments(domain_combination节点特有)
           },
           position: { x: 0, y: 0 },
         });
@@ -1932,15 +1985,40 @@ function FlowContent() {
                                   flexShrink: 0,
                                 }} />
 
-                                {/* 节点文字 */}
+                                {/* 节点文字 - 左侧 */}
                                 <span style={{
                                   flex: 1,
                                   fontSize: '12px',
                                   color: pathFontColor,
+                                  overflow: 'hidden',
+                                  textOverflow: 'ellipsis',
+                                  whiteSpace: 'nowrap',
                                 }}>
-                                  {truncateMiddle(node.data.title || node.id, 18)}
+                                  {node.data.title || node.id}
                                 </span>
 
+                                {/* 域标识 - 右侧,挨着分数 */}
+                                {(node.data.domain_type || node.data.domains_str || (node.data.domain_index !== null && node.data.domain_index !== undefined)) && (
+                                  <span style={{
+                                    fontSize: '8px',
+                                    color: '#fff',
+                                    background: '#6366f1',
+                                    padding: '1px 4px',
+                                    borderRadius: '2px',
+                                    flexShrink: 0,
+                                    fontWeight: '600',
+                                    marginLeft: '4px',
+                                  }}
+                                  title={
+                                    node.data.domain_type ? '域: ' + node.data.domain_type + ' (D' + node.data.domain_index + ')' :
+                                    node.data.domains_str ? '域: ' + node.data.domains_str :
+                                    '域 D' + node.data.domain_index
+                                  }
+                                  >
+                                    {node.data.domain_type || node.data.domains_str || ('D' + node.data.domain_index)}
+                                  </span>
+                                )}
+
                                 {/* 分数显示 - 步骤和轮次节点不显示分数 */}
                                 {nodeActualType !== 'step' && nodeActualType !== 'round' && (
                                   <span style={{
@@ -1950,6 +2028,7 @@ function FlowContent() {
                                     flexShrink: 0,
                                     minWidth: '35px',
                                     textAlign: 'right',
+                                    marginLeft: '4px',
                                   }}>
                                     {nodeScore.toFixed(2)}
                                   </span>