|
|
@@ -107,6 +107,25 @@ def _resolve_demand_name(
|
|
|
return demand_lookup.get(value) or demand_lookup.get(_normalize_demand_key(value))
|
|
|
|
|
|
|
|
|
def _collect_matched_demand_names(matched_word_rows: list[Any]) -> list[str]:
    """Collect the distinct demand names referenced by matched word rows.

    Each row is expected to be a dict whose ``"匹配需求列表"`` entry holds a
    list of match dicts, each carrying a ``"demand_name"`` value. Anything
    that does not have that shape (non-dict rows, a missing or non-list
    match list, non-dict matches, blank names) is skipped silently.

    Args:
        matched_word_rows: Rows to scan. ``None`` is treated as empty.

    Returns:
        Demand names with surrounding whitespace stripped, de-duplicated,
        in order of first appearance.
    """
    demand_names: list[str] = []
    seen: set[str] = set()
    # `or ()` keeps a missing (None) container from raising, consistent with
    # the per-row tolerance below.
    for row in matched_word_rows or ():
        if not isinstance(row, dict):
            continue
        match_rows = row.get("匹配需求列表") or []
        if not isinstance(match_rows, list):
            continue
        for match in match_rows:
            if not isinstance(match, dict):
                continue
            # Falsy values (None, "", 0) collapse to "" and are dropped;
            # other non-string values are stringified.
            demand_name = str(match.get("demand_name") or "").strip()
            if not demand_name or demand_name in seen:
                continue
            seen.add(demand_name)
            demand_names.append(demand_name)
    return demand_names
|
|
|
+
|
|
|
+
|
|
|
class ContributionPostprocessService:
|
|
|
def __init__(
|
|
|
self,
|
|
|
@@ -488,75 +507,112 @@ class ContributionPostprocessService:
|
|
|
if not isinstance(matched_word_rows, list) or not matched_word_rows:
|
|
|
return None
|
|
|
|
|
|
- candidate_words = [
|
|
|
+ contribution_words = [
|
|
|
str(row.get("词") or "").strip()
|
|
|
for row in matched_word_rows
|
|
|
if isinstance(row, dict) and str(row.get("词") or "").strip()
|
|
|
]
|
|
|
- if not candidate_words:
|
|
|
+ if not contribution_words:
|
|
|
return None
|
|
|
|
|
|
channel_content_id = str(
|
|
|
match_result.get("channelContentId") or record.get("unique_key") or ""
|
|
|
)
|
|
|
article_title, body_text = self.extract_article_text(record)
|
|
|
- if len(candidate_words) == 1:
|
|
|
- pick = {
|
|
|
- "selected_word": candidate_words[0],
|
|
|
- "reason": "only one candidate word",
|
|
|
- }
|
|
|
- else:
|
|
|
- pick = self.llm_pick_best_word(
|
|
|
- channel_content_id=channel_content_id,
|
|
|
- article_title=article_title,
|
|
|
- body_text=body_text,
|
|
|
- candidate_words=candidate_words,
|
|
|
- )
|
|
|
- selected_word = pick["selected_word"]
|
|
|
+ matched_demands = _collect_matched_demand_names(matched_word_rows)
|
|
|
+
|
|
|
+ pick = self.llm_extract_wxindex_words(
|
|
|
+ channel_content_id=channel_content_id,
|
|
|
+ article_title=article_title,
|
|
|
+ body_text=body_text,
|
|
|
+ contribution_words=contribution_words,
|
|
|
+ matched_demands=matched_demands,
|
|
|
+ )
|
|
|
+ selected_words = pick["selected_words"]
|
|
|
start_ymd, end_ymd = _get_recent_range(self.config.wxindex_lookback_days)
|
|
|
- wx_payload = {
|
|
|
- "keyword": selected_word,
|
|
|
- "start_ymd": start_ymd,
|
|
|
- "end_ymd": end_ymd,
|
|
|
- }
|
|
|
- wx_resp = self.api_client.post_json(self.config.wxindex_api_url, wx_payload)
|
|
|
- series = _parse_total_scores(wx_resp)
|
|
|
- latest_score = series[-1]["total_score"] if series else None
|
|
|
threshold = float(self.config.wxindex_score_threshold)
|
|
|
+
|
|
|
+ wxindex_searches: list[dict[str, Any]] = []
|
|
|
+ for keyword in selected_words:
|
|
|
+ wx_payload = {
|
|
|
+ "keyword": keyword,
|
|
|
+ "start_ymd": start_ymd,
|
|
|
+ "end_ymd": end_ymd,
|
|
|
+ }
|
|
|
+ wx_resp = self.api_client.post_json(self.config.wxindex_api_url, wx_payload)
|
|
|
+ series = _parse_total_scores(wx_resp)
|
|
|
+ latest_score = series[-1]["total_score"] if series else None
|
|
|
+ wxindex_searches.append(
|
|
|
+ {
|
|
|
+ "keyword": keyword,
|
|
|
+ "start_ymd": start_ymd,
|
|
|
+ "end_ymd": end_ymd,
|
|
|
+ "total_score_7d": series,
|
|
|
+ "latest_total_score": latest_score,
|
|
|
+ "threshold": threshold,
|
|
|
+ "latest_gt_threshold": (
|
|
|
+ False
|
|
|
+ if latest_score is None
|
|
|
+ else latest_score >= threshold
|
|
|
+ ),
|
|
|
+ "trend": calc_wxindex_trend(series),
|
|
|
+ }
|
|
|
+ )
|
|
|
+
|
|
|
+ searchable = [
|
|
|
+ item
|
|
|
+ for item in wxindex_searches
|
|
|
+ if item.get("latest_total_score") is not None
|
|
|
+ ]
|
|
|
+ if not searchable:
|
|
|
+ raise WxindexSelectionSkipped(
|
|
|
+ f"no wxindex score for any keyword in {channel_content_id}: "
|
|
|
+ f"{selected_words}"
|
|
|
+ )
|
|
|
+
|
|
|
+ best = max(searchable, key=lambda item: float(item["latest_total_score"]))
|
|
|
+ selected_word = str(best["keyword"])
|
|
|
+ latest_score = best["latest_total_score"]
|
|
|
+ series = best["total_score_7d"]
|
|
|
return {
|
|
|
"channelContentId": channel_content_id,
|
|
|
"article_title": article_title,
|
|
|
+ "llm_selected_words": selected_words,
|
|
|
"llm_selected_word": selected_word,
|
|
|
"llm_reason": pick["reason"],
|
|
|
+ "wxindex_searches": wxindex_searches,
|
|
|
"wxindex": {
|
|
|
"keyword": selected_word,
|
|
|
+ "keywords": selected_words,
|
|
|
"start_ymd": start_ymd,
|
|
|
"end_ymd": end_ymd,
|
|
|
"total_score_7d": series,
|
|
|
"latest_total_score": latest_score,
|
|
|
"threshold": threshold,
|
|
|
- "latest_gt_threshold": (
|
|
|
- False
|
|
|
- if latest_score is None
|
|
|
- else latest_score >= threshold
|
|
|
- ),
|
|
|
- "trend": calc_wxindex_trend(series),
|
|
|
+ "latest_gt_threshold": latest_score >= threshold,
|
|
|
+ "trend": best["trend"],
|
|
|
},
|
|
|
}
|
|
|
|
|
|
- def llm_pick_best_word(
|
|
|
+ def llm_extract_wxindex_words(
|
|
|
self,
|
|
|
*,
|
|
|
channel_content_id: str,
|
|
|
article_title: str,
|
|
|
body_text: str,
|
|
|
- candidate_words: list[str],
|
|
|
- ) -> dict[str, str]:
|
|
|
+ contribution_words: list[str],
|
|
|
+ matched_demands: list[str],
|
|
|
+ ) -> dict[str, Any]:
|
|
|
system_prompt = """
|
|
|
#角色
|
|
|
- 你是一个专业的语义分析专家,擅长精准概括整篇文章。
|
|
|
+ 你是一个专业的语义分析专家,擅长从文章中提取简洁、精准的热搜检索词。
|
|
|
# 任务
|
|
|
- 我会提供一篇文章的标题、正文和候选词列表,请你选择一个最能代表文章内容的词。
|
|
|
+ 我会提供文章标题、正文,以及两类备选词来源:
|
|
|
+ 1. 高贡献词:文章贡献度较高的关键词
|
|
|
+ 2. 已匹配需求:已与需求库匹配上的需求名
|
|
|
+ 请结合标题、正文与上述备选词,提取用于「微信指数」热度检索的词。
|
|
|
+ 需自行从标题中识别可检索的关键词;词应简洁(2-4 字)、概括、精准覆盖事件。
|
|
|
+ 若文章涉及多个子事件,可分别提取多个词,每个词覆盖部分事件。
|
|
|
# 输出规则
|
|
|
1. 严格输出 JSON 对象,禁止输出 JSON 之外的任何内容。
|
|
|
"""
|
|
|
@@ -564,15 +620,21 @@ class ContributionPostprocessService:
|
|
|
"source": channel_content_id,
|
|
|
"article_title": article_title,
|
|
|
"article_body_text": body_text,
|
|
|
- "candidate_words": candidate_words,
|
|
|
+ "contribution_words": contribution_words,
|
|
|
+ "matched_demands": matched_demands,
|
|
|
"output_schema": {
|
|
|
"source": "string",
|
|
|
- "selected_word": "string, must be selected from candidate_words",
|
|
|
+ "selected_words": [
|
|
|
+ "string, concise keyword for wxindex search, one or more"
|
|
|
+ ],
|
|
|
"reason": "string",
|
|
|
},
|
|
|
"constraints": [
|
|
|
- "selected_word 必须来自 candidate_words",
|
|
|
- "reason 简洁说明,不超过40字",
|
|
|
+ "selected_words 为数组,至少 1 个词,可多个",
|
|
|
+ "每个词简洁(2-4 字),适合微信指数检索",
|
|
|
+ "结合标题、高贡献词、已匹配需求提炼,可合并改写,不必逐字照搬",
|
|
|
+ "多个词应分别覆盖不同事件或角度,避免语义重复",
|
|
|
+ "reason 简洁说明,不超过60字",
|
|
|
"仅输出 JSON 对象,不要 markdown 代码块",
|
|
|
],
|
|
|
}
|
|
|
@@ -593,20 +655,35 @@ class ContributionPostprocessService:
|
|
|
max_tokens=max(self.config.wxindex_llm_max_tokens, 1),
|
|
|
)
|
|
|
parsed = _extract_json_object(str(resp.get("content") or ""))
|
|
|
- selected_word = str(parsed.get("selected_word") or "").strip()
|
|
|
+ raw_words = parsed.get("selected_words")
|
|
|
+ if isinstance(raw_words, str):
|
|
|
+ raw_words = [raw_words]
|
|
|
+ if not isinstance(raw_words, list):
|
|
|
+ legacy_word = str(parsed.get("selected_word") or "").strip()
|
|
|
+ raw_words = [legacy_word] if legacy_word else []
|
|
|
+
|
|
|
+ selected_words: list[str] = []
|
|
|
+ seen: set[str] = set()
|
|
|
+ for item in raw_words:
|
|
|
+ word = str(item or "").strip()
|
|
|
+ if word and word not in seen:
|
|
|
+ seen.add(word)
|
|
|
+ selected_words.append(word)
|
|
|
+
|
|
|
reason = str(parsed.get("reason") or "").strip()
|
|
|
- if selected_word not in candidate_words:
|
|
|
+ if not selected_words:
|
|
|
raise WxindexSelectionSkipped(
|
|
|
- f"selected_word not in candidates for {channel_content_id}: "
|
|
|
- f"{selected_word}"
|
|
|
+ f"selected_words empty for {channel_content_id}"
|
|
|
)
|
|
|
- return {"selected_word": selected_word, "reason": reason}
|
|
|
+ return {"selected_words": selected_words, "reason": reason}
|
|
|
+ except WxindexSelectionSkipped:
|
|
|
+ raise
|
|
|
except (OpenRouterCallError, HotContentFlowError) as exc:
|
|
|
last_error = exc
|
|
|
if attempt < max(self.config.wxindex_llm_max_attempts, 1):
|
|
|
continue
|
|
|
raise HotContentFlowError(
|
|
|
- f"llm pick word failed for {channel_content_id}: {last_error}"
|
|
|
+ f"llm extract wxindex words failed for {channel_content_id}: {last_error}"
|
|
|
) from last_error
|
|
|
|
|
|
@staticmethod
|