|
@@ -13,6 +13,8 @@ from pydantic import BaseModel, Field
|
|
|
from lib.utils import read_file_as_string
|
|
from lib.utils import read_file_as_string
|
|
|
from lib.client import get_model
|
|
from lib.client import get_model
|
|
|
MODEL_NAME = "google/gemini-2.5-flash"
|
|
MODEL_NAME = "google/gemini-2.5-flash"
|
|
|
|
|
+# 得分提升阈值:sug或组合词必须比来源query提升至少此幅度才能进入下一轮
|
|
|
|
|
+REQUIRED_SCORE_GAIN = 0.05
|
|
|
from script.search_recommendations.xiaohongshu_search_recommendations import XiaohongshuSearchRecommendations
|
|
from script.search_recommendations.xiaohongshu_search_recommendations import XiaohongshuSearchRecommendations
|
|
|
from script.search.xiaohongshu_search import XiaohongshuSearch
|
|
from script.search.xiaohongshu_search import XiaohongshuSearch
|
|
|
|
|
|
|
@@ -141,7 +143,6 @@ word_segmentation_instructions = """
|
|
|
2. 拆分成独立的概念
|
|
2. 拆分成独立的概念
|
|
|
3. 保留专业术语的完整性
|
|
3. 保留专业术语的完整性
|
|
|
4. 去除虚词(的、吗、呢等)
|
|
4. 去除虚词(的、吗、呢等)
|
|
|
-如果是双标行为,单独分词 不拆分,如果有如何两个字 不要
|
|
|
|
|
|
|
|
|
|
## 输出要求
|
|
## 输出要求
|
|
|
返回分词列表和分词理由。
|
|
返回分词列表和分词理由。
|
|
@@ -1306,9 +1307,9 @@ async def run_round(
|
|
|
|
|
|
|
|
# 将Top 5全部加入q_list_next(去重检查 + 得分过滤)
|
|
# 将Top 5全部加入q_list_next(去重检查 + 得分过滤)
|
|
|
for comb in top_5:
|
|
for comb in top_5:
|
|
|
- # 得分过滤:只有得分大于种子得分的组合词才加入下一轮
|
|
|
|
|
- if comb['score'] <= seed.score_with_o:
|
|
|
|
|
- print(f" ⊗ 跳过低分: {comb['query']} (分数{comb['score']:.2f} ≤ 种子{seed.score_with_o:.2f})")
|
|
|
|
|
|
|
+ # 得分过滤:组合词必须比种子提升至少REQUIRED_SCORE_GAIN才能加入下一轮
|
|
|
|
|
+ if comb['score'] < seed.score_with_o + REQUIRED_SCORE_GAIN:
|
|
|
|
|
+ print(f" ⊗ 跳过低分: {comb['query']} (分数{comb['score']:.2f} < 种子{seed.score_with_o:.2f} + {REQUIRED_SCORE_GAIN:.2f})")
|
|
|
continue
|
|
continue
|
|
|
|
|
|
|
|
# 去重检查
|
|
# 去重检查
|
|
@@ -1357,7 +1358,8 @@ async def run_round(
|
|
|
print(f" ⊗ 跳过来自被剪枝query的sug: {sug.text} (来源: {sug.from_q.text})")
|
|
print(f" ⊗ 跳过来自被剪枝query的sug: {sug.text} (来源: {sug.from_q.text})")
|
|
|
continue
|
|
continue
|
|
|
|
|
|
|
|
- if sug.from_q and sug.score_with_o > sug.from_q.score_with_o:
|
|
|
|
|
|
|
+ # sug必须比来源query提升至少REQUIRED_SCORE_GAIN才能加入下一轮
|
|
|
|
|
+ if sug.from_q and sug.score_with_o >= sug.from_q.score_with_o + REQUIRED_SCORE_GAIN:
|
|
|
# 去重检查
|
|
# 去重检查
|
|
|
if sug.text in existing_q_texts:
|
|
if sug.text in existing_q_texts:
|
|
|
print(f" ⊗ 跳过重复: {sug.text}")
|
|
print(f" ⊗ 跳过重复: {sug.text}")
|
|
@@ -1371,7 +1373,7 @@ async def run_round(
|
|
|
)
|
|
)
|
|
|
q_list_next.append(new_q)
|
|
q_list_next.append(new_q)
|
|
|
existing_q_texts.add(sug.text) # 记录到去重集合
|
|
existing_q_texts.add(sug.text) # 记录到去重集合
|
|
|
- print(f" ✓ {sug.text} (分数: {sug.score_with_o:.2f} > {sug.from_q.score_with_o:.2f})")
|
|
|
|
|
|
|
+ print(f" ✓ {sug.text} (分数: {sug.score_with_o:.2f} >= 来源query: {sug.from_q.score_with_o:.2f} + {REQUIRED_SCORE_GAIN:.2f})")
|
|
|
|
|
|
|
|
# 5. 构建seed_list_next(关键修改:不保留上一轮的seed)
|
|
# 5. 构建seed_list_next(关键修改:不保留上一轮的seed)
|
|
|
print(f"\n[步骤5] 构建seed_list_next(不保留上轮seed)...")
|
|
print(f"\n[步骤5] 构建seed_list_next(不保留上轮seed)...")
|
|
@@ -1381,10 +1383,10 @@ async def run_round(
|
|
|
# 5.1 加入本轮所有组合词(只加入得分提升的)
|
|
# 5.1 加入本轮所有组合词(只加入得分提升的)
|
|
|
print(f" 5.1 加入本轮所有组合词(得分过滤)...")
|
|
print(f" 5.1 加入本轮所有组合词(得分过滤)...")
|
|
|
for comb in all_seed_combinations:
|
|
for comb in all_seed_combinations:
|
|
|
- # 得分过滤:只有得分大于种子得分的组合词才作为下一轮种子
|
|
|
|
|
|
|
+ # 得分过滤:组合词必须比种子提升至少REQUIRED_SCORE_GAIN才作为下一轮种子
|
|
|
seed_score = comb.get('seed_score', 0)
|
|
seed_score = comb.get('seed_score', 0)
|
|
|
- if comb['score'] <= seed_score:
|
|
|
|
|
- print(f" ⊗ 跳过低分: {comb['query']} (分数{comb['score']:.2f} ≤ 种子{seed_score:.2f})")
|
|
|
|
|
|
|
+ if comb['score'] < seed_score + REQUIRED_SCORE_GAIN:
|
|
|
|
|
+ print(f" ⊗ 跳过低分: {comb['query']} (分数{comb['score']:.2f} < 种子{seed_score:.2f} + {REQUIRED_SCORE_GAIN:.2f})")
|
|
|
continue
|
|
continue
|
|
|
|
|
|
|
|
if comb['query'] not in existing_seed_texts:
|
|
if comb['query'] not in existing_seed_texts:
|
|
@@ -1396,7 +1398,7 @@ async def run_round(
|
|
|
)
|
|
)
|
|
|
seed_list_next.append(new_seed)
|
|
seed_list_next.append(new_seed)
|
|
|
existing_seed_texts.add(comb['query'])
|
|
existing_seed_texts.add(comb['query'])
|
|
|
- print(f" ✓ {comb['query']} (分数: {comb['score']:.2f} > 种子: {seed_score:.2f})")
|
|
|
|
|
|
|
+ print(f" ✓ {comb['query']} (分数: {comb['score']:.2f} >= 种子: {seed_score:.2f} + {REQUIRED_SCORE_GAIN:.2f})")
|
|
|
|
|
|
|
|
# 5.2 加入高分sug
|
|
# 5.2 加入高分sug
|
|
|
print(f" 5.2 加入高分sug...")
|
|
print(f" 5.2 加入高分sug...")
|
|
@@ -1405,8 +1407,8 @@ async def run_round(
|
|
|
if sug.from_q and sug.from_q.text in pruned_query_texts:
|
|
if sug.from_q and sug.from_q.text in pruned_query_texts:
|
|
|
continue
|
|
continue
|
|
|
|
|
|
|
|
- # sug分数 > 对应query分数
|
|
|
|
|
- if sug.from_q and sug.score_with_o > sug.from_q.score_with_o and sug.text not in existing_seed_texts:
|
|
|
|
|
|
|
+ # sug必须比来源query提升至少REQUIRED_SCORE_GAIN才作为下一轮种子
|
|
|
|
|
+ if sug.from_q and sug.score_with_o >= sug.from_q.score_with_o + REQUIRED_SCORE_GAIN and sug.text not in existing_seed_texts:
|
|
|
new_seed = Seed(
|
|
new_seed = Seed(
|
|
|
text=sug.text,
|
|
text=sug.text,
|
|
|
added_words=[],
|
|
added_words=[],
|
|
@@ -1415,7 +1417,7 @@ async def run_round(
|
|
|
)
|
|
)
|
|
|
seed_list_next.append(new_seed)
|
|
seed_list_next.append(new_seed)
|
|
|
existing_seed_texts.add(sug.text)
|
|
existing_seed_texts.add(sug.text)
|
|
|
- print(f" ✓ {sug.text} (分数: {sug.score_with_o:.2f} > 来源query: {sug.from_q.score_with_o:.2f})")
|
|
|
|
|
|
|
+ print(f" ✓ {sug.text} (分数: {sug.score_with_o:.2f} >= 来源query: {sug.from_q.score_with_o:.2f} + {REQUIRED_SCORE_GAIN:.2f})")
|
|
|
|
|
|
|
|
# 序列化搜索结果数据(包含帖子详情)
|
|
# 序列化搜索结果数据(包含帖子详情)
|
|
|
search_results_data = []
|
|
search_results_data = []
|