|
|
@@ -97,7 +97,7 @@ DEFAULT_BATCH_PROMPT_TEMPLATE = """
|
|
|
# Embedding 相似度 API
|
|
|
EMBEDDING_SIMILARITY_URL = "http://61.48.133.26:8187/cartesian_similarity"
|
|
|
# LLM 模型
|
|
|
-LLM_MODEL = "openai/gpt-4.1-mini"
|
|
|
+LLM_MODEL = "google/gemini-3-flash-preview"
|
|
|
|
|
|
|
|
|
def _phrase_pairs(phrases_a: List[str], phrases_b: List[str]) -> List[Tuple[str, str]]:
|
|
|
@@ -165,9 +165,12 @@ def _extract_json_array(content: str) -> List[dict]:
|
|
|
return json.loads(content)
|
|
|
|
|
|
|
|
|
-async def _llm_similarity(phrases_a: List[str], phrases_b: List[str]) -> List[List[float]]:
|
|
|
+async def _llm_similarity(
|
|
|
+ phrases_a: List[str], phrases_b: List[str], *, use_cache: bool = True
|
|
|
+) -> List[List[float]]:
|
|
|
"""
|
|
|
- 用 LLM 对短语对打分,返回 M×N 矩阵。先查原子缓存,仅对未命中的短语对调用 API。
|
|
|
+ 用 LLM 对短语对打分,返回 M×N 矩阵。use_cache=True 时先查原子缓存,仅对未命中的短语对调用 API;
|
|
|
+ use_cache=False 时不读不写缓存。
|
|
|
"""
|
|
|
if not phrases_a or not phrases_b:
|
|
|
return []
|
|
|
@@ -177,16 +180,18 @@ async def _llm_similarity(phrases_a: List[str], phrases_b: List[str]) -> List[Li
|
|
|
missing_indices: List[Tuple[int, int]] = []
|
|
|
for i in range(M):
|
|
|
for j in range(N):
|
|
|
- score = _read_atomic_score("llm", phrases_a[i], phrases_b[j])
|
|
|
- if score is not None:
|
|
|
- matrix[i][j] = score
|
|
|
- else:
|
|
|
- missing_indices.append((i, j))
|
|
|
+ if use_cache:
|
|
|
+ score = _read_atomic_score("llm", phrases_a[i], phrases_b[j])
|
|
|
+ if score is not None:
|
|
|
+ matrix[i][j] = score
|
|
|
+ continue
|
|
|
+ missing_indices.append((i, j))
|
|
|
|
|
|
total = M * N
|
|
|
- hit_count = total - len(missing_indices)
|
|
|
- if hit_count > 0:
|
|
|
- logger.info("[similarity_matrix] LLM 原子缓存命中 %d/%d", hit_count, total)
|
|
|
+ if use_cache:
|
|
|
+ hit_count = total - len(missing_indices)
|
|
|
+ if hit_count > 0:
|
|
|
+ logger.info("[similarity_matrix] LLM 原子缓存命中 %d/%d", hit_count, total)
|
|
|
if not missing_indices:
|
|
|
return matrix
|
|
|
|
|
|
@@ -213,8 +218,9 @@ async def _llm_similarity(phrases_a: List[str], phrases_b: List[str]) -> List[Li
|
|
|
score = float(items[idx].get("score", 0.0))
|
|
|
score = max(0.0, min(1.0, score))
|
|
|
matrix[i][j] = score
|
|
|
- a, b = phrases_a[i], phrases_b[j]
|
|
|
- _write_atomic_score("llm", a, b, score)
|
|
|
+ if use_cache:
|
|
|
+ a, b = phrases_a[i], phrases_b[j]
|
|
|
+ _write_atomic_score("llm", a, b, score)
|
|
|
return matrix
|
|
|
|
|
|
|
|
|
@@ -224,6 +230,7 @@ async def similarity_matrix(
|
|
|
*,
|
|
|
embedding_weight: float = 0.5,
|
|
|
llm_weight: float = 0.5,
|
|
|
+ use_llm_cache: bool = True,
|
|
|
) -> List[SimilarityItem]:
|
|
|
"""
|
|
|
计算两组短语的相似度,返回对象列表(每条含 phrase_a, phrase_b, embedding_score, llm_score, combined_score)。
|
|
|
@@ -236,6 +243,7 @@ async def similarity_matrix(
|
|
|
phrases_b: 第二组短语列表(N 个)
|
|
|
embedding_weight: embedding 权重,默认 0.5
|
|
|
llm_weight: LLM 权重,默认 0.5
|
|
|
+ use_llm_cache: 是否使用 LLM 相似度缓存,默认 True
|
|
|
|
|
|
Returns:
|
|
|
对象列表,长度 M×N,顺序与短语对 (a0,b0),(a0,b1),...,(aM-1,bN-1) 一致。
|
|
|
@@ -256,7 +264,7 @@ async def similarity_matrix(
|
|
|
|
|
|
async def _run_llm() -> List[List[float]]:
|
|
|
t0 = time.perf_counter()
|
|
|
- out = await _llm_similarity(phrases_a, phrases_b)
|
|
|
+ out = await _llm_similarity(phrases_a, phrases_b, use_cache=use_llm_cache)
|
|
|
logger.info("[similarity_matrix] LLM 耗时: %.3fs", time.perf_counter() - t0)
|
|
|
return out
|
|
|
|
|
|
@@ -328,22 +336,28 @@ def test_extract_json_array() -> None:
|
|
|
|
|
|
|
|
|
async def test_similarity_matrix(*, use_llm_cache: bool = False) -> None:
    """Integration test: score two phrase lists via embedding + LLM and validate the result.

    Calls ``similarity_matrix`` end to end and checks the shape and value
    ranges of what comes back, plus one semantic sanity check.

    Args:
        use_llm_cache: Forwarded to ``similarity_matrix``. Defaults to
            ``False`` so the LLM scores are computed without reading or
            writing the LLM score cache; pass ``True`` to allow the cache.

    Raises:
        AssertionError: If the number of items is not ``M * N``, an item lacks
            an expected key, a combined score falls outside ``[0, 1]``, or
            "犬"-"狗" does not score higher than "犬"-"手机".
    """
    phrases_a = ["犬", "猫咪", "夸张"]
    phrases_b = ["狗", "手机", "夸张堆叠"]
    items = await similarity_matrix(phrases_a, phrases_b, use_llm_cache=use_llm_cache)

    # One item per (a, b) pair. Derive the count from the inputs so the check
    # stays correct whenever the phrase lists above are edited.
    expected_count = len(phrases_a) * len(phrases_b)
    assert len(items) == expected_count, f"expected {expected_count} items, got {len(items)}"

    for row in items:
        assert "phrase_a" in row and "phrase_b" in row
        assert "embedding_score" in row and "llm_score" in row and "combined_score" in row
        assert 0 <= row["combined_score"] <= 1, f"combined_score 应在 [0,1],得到 {row['combined_score']}"

    # Semantic sanity check: "犬"-"狗" (synonyms) must outrank "犬"-"手机".
    by_pair = {(r["phrase_a"], r["phrase_b"]): r for r in items}
    dog_dog = by_pair[("犬", "狗")]
    dog_phone = by_pair[("犬", "手机")]
    assert dog_dog["combined_score"] > dog_phone["combined_score"], "犬-狗 应高于 犬-手机"

    # Print the scores so a manual run can eyeball them.
    print("test_similarity_matrix: ok")
    for r in items:
        print(f"  {r['phrase_a']}-{r['phrase_b']}: emb={r['embedding_score']:.4f} llm={r['llm_score']:.4f} combined={r['combined_score']:.4f}")
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|