@@ -36,22 +36,37 @@ class WordLibrary(BaseModel):
     """Dynamic word-segmentation library"""
 
     words: set[str] = Field(default_factory=set)
     word_sources: dict[str, str] = Field(default_factory=dict)  # Origin of each word: word -> source (a note_id, or "initial")
+    core_words: set[str] = Field(default_factory=set)  # Core words (the first-pass initial segmentation)
 
-    def add_word(self, word: str, source: str = "unknown"):
+    def add_word(self, word: str, source: str = "unknown", is_core: bool = False):
         """Add a single word to the library"""
         if word and word.strip():
             word = word.strip()
             self.words.add(word)
             if word not in self.word_sources:
                 self.word_sources[word] = source
+            if is_core:
+                self.core_words.add(word)
 
-    def add_words(self, words: list[str], source: str = "unknown"):
+    def add_words(self, words: list[str], source: str = "unknown", is_core: bool = False):
         """Add multiple words at once"""
         for word in words:
-            self.add_word(word, source)
-
-    def get_unused_word(self, current_query: str) -> str | None:
-        """Return a word from the library that is absent from the current query"""
+            self.add_word(word, source, is_core)
+
+    def get_unused_word(self, current_query: str, prefer_core: bool = True) -> str | None:
+        """Return a word from the library that is absent from the current query.
+
+        Args:
+            current_query: the query being extended
+            prefer_core: whether to prefer core words (default True)
+        """
+        # Try core words first
+        if prefer_core and self.core_words:
+            for word in self.core_words:
+                if word not in current_query:
+                    return word
+
+        # Core words exhausted, or core preference disabled: scan the full library
         for word in self.words:
             if word not in current_query:
                 return word
@@ -61,7 +76,8 @@ class WordLibrary(BaseModel):
         """Serialize to a dict"""
         return {
             "words": list(self.words),
-            "word_sources": self.word_sources
+            "word_sources": self.word_sources,
+            "core_words": list(self.core_words)
         }
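
Taken together, the patched class prefers core words and only then falls back to the full library. A minimal usage sketch (assuming the patched WordLibrary above is importable; the example words and the "note_123" source are invented, and set iteration order is nondeterministic, though both lookups below are order-independent):

    lib = WordLibrary()
    # The initial segmentation becomes the core vocabulary.
    lib.add_words(["pet", "airline"], source="initial", is_core=True)
    # A word learned later from a note stays non-core.
    lib.add_word("crate", source="note_123")

    # Core words are tried first: "airline" is absent from the query,
    # so it is returned ahead of the non-core "crate".
    assert lib.get_unused_word("pet travel tips") == "airline"
    # Once every core word already appears in the query, the lookup
    # falls back to the full library.
    assert lib.get_unused_word("pet airline rules") == "crate"

    # Serialization now carries core_words alongside words and word_sources.
    print(lib.to_dict())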

@@ -458,9 +474,10 @@ async def initialize_word_library(original_query: str, context: RunContext) -> W
     segmentation: WordSegmentation = result.final_output
 
     word_lib = WordLibrary()
-    word_lib.add_words(segmentation.words, source="initial")
+    # Flag the initial segmentation as core words (is_core=True)
+    word_lib.add_words(segmentation.words, source="initial", is_core=True)
 
-    print(f"Initial word library: {list(word_lib.words)}")
+    print(f"Initial word library (core words): {list(word_lib.words)}")
     print(f"Segmentation rationale: {segmentation.reasoning}")
 
     # Save to context
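
The save step that follows presumably stores the serialized dict rather than the model itself, since main later reads run_context.word_library.get('core_words', []). A one-line sketch under that assumption:

    # Persist the plain-dict form; core_words now rides along via to_dict().
    context.word_library = word_lib.to_dict()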

@@ -713,9 +730,12 @@ async def process_suggestions(
         else:
             print(f"  ✗ Rewrite (synonyms): {rewrite_syn.rewritten_query} (score: {rewrite_syn_eval.relevance_score:.2f}, no improvement)")
 
-        # 4. Word-insertion strategy
-        unused_word = word_lib.get_unused_word(query)
+        # 4. Word-insertion strategy (prefer core words)
+        unused_word = word_lib.get_unused_word(query, prefer_core=True)
+        is_core_word = unused_word in word_lib.core_words if unused_word else False
+
         if unused_word and len(new_queries) < 5:
+            word_type = "core word" if is_core_word else "regular word"
             insertion_input = f"""
 <Current Query>
 {query}

@@ -733,10 +753,11 @@ async def process_suggestions(
             # Collect the word-insertion agent's input and output
             insertion_agent_call = {
                 "agent": "word-insertion position evaluation expert",
-                "action": "insert word",
+                "action": f"insert word ({word_type})",
                 "input": {
                     "query": query,
-                    "word_to_add": unused_word
+                    "word_to_add": unused_word,
+                    "is_core_word": is_core_word
                 },
                 "output": {
                     "new_query": insertion.new_query,

@@ -769,10 +790,10 @@ async def process_suggestions(
             )
 
             if insertion_eval.is_improved:
-                print(f"  ✓ Insert word: {insertion.new_query} (score: {insertion_eval.relevance_score:.2f})")
+                print(f"  ✓ Insert word ({word_type}): {insertion.new_query} [+{unused_word}] (score: {insertion_eval.relevance_score:.2f})")
                 new_queries.append(new_state)
             else:
-                print(f"  ✗ Insert word: {insertion.new_query} (score: {insertion_eval.relevance_score:.2f}, no improvement)")
+                print(f"  ✗ Insert word ({word_type}): {insertion.new_query} [+{unused_word}] (score: {insertion_eval.relevance_score:.2f}, no improvement)")
 
         # Record the full suggestion-branch result (hierarchical)
         add_step(context, f"Suggestion branch - {query}", "suggestion_branch", {

@@ -1333,6 +1354,7 @@ async def iterative_search_loop(
     print(f"Iterative search complete")
     print(f"  Total iterations: {iteration}")
     print(f"  Final satisfied note count: {len(all_satisfied_notes)}")
+    print(f"  Core word library: {list(word_lib.core_words)}")
     print(f"  Final word library size: {len(word_lib.words)}")
     print(f"{'='*60}")

@@ -1340,6 +1362,7 @@ async def iterative_search_loop(
     add_step(context, "Iterative search complete", "loop_complete", {
         "total_iterations": iteration,
         "total_satisfied_notes": len(all_satisfied_notes),
+        "core_words": list(word_lib.core_words),
         "final_word_library_size": len(word_lib.words),
         "final_word_library": list(word_lib.words)
     })

@@ -1405,6 +1428,7 @@ async def main(input_dir: str, max_iterations: int = 20, visualize: bool = False
     # Format the output
     output = f"Original question: {run_context.q}\n"
     output += f"Notes satisfying the request: {len(satisfied_notes)} found\n"
+    output += f"Core word library: {', '.join(run_context.word_library.get('core_words', []))}\n"
     output += f"Word library size: {len(run_context.word_library.get('words', []))} words\n"
     output += "\n" + "="*60 + "\n"