| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744 |
- """热点内容 MySQL 仓储。"""
- from __future__ import annotations
- import hashlib
- import json
- from datetime import datetime, timedelta
- from typing import Any
- try:
- import pymysql
- from pymysql.cursors import DictCursor
- except ImportError: # pragma: no cover - runtime dependency check
- pymysql = None
- DictCursor = None
- from app.hot_content.exceptions import HotContentFlowError
- from app.hot_content.status import ExecutionStatus, PostprocessStatus
- from app.hot_content.timezone import SHANGHAI_TZ
- from app.hot_content.types import MysqlConfig
- def _json_dumps(data: Any) -> str:
- return json.dumps(data, ensure_ascii=False, separators=(",", ":"))
- def _json_loads(value: Any) -> Any:
- if value is None:
- return None
- if isinstance(value, (dict, list)):
- return value
- if isinstance(value, (bytes, bytearray)):
- value = value.decode("utf-8")
- if isinstance(value, str):
- return json.loads(value)
- return value
- def _normalize_demand_names(demand_name_set: list[str]) -> list[str]:
- names: list[str] = []
- seen: set[str] = set()
- for item in demand_name_set:
- name = str(item).strip()
- if not name or name in seen:
- continue
- seen.add(name)
- names.append(name)
- return names
- def unique_title_key(source: str, title: str) -> str:
- return hashlib.sha256(f"{source}\n{title}".encode("utf-8")).hexdigest()
- class HotContentRepository:
- def __init__(self, config: MysqlConfig):
- if pymysql is None or DictCursor is None:
- raise HotContentFlowError("missing dependency: pip install pymysql")
- self.conn = pymysql.connect(
- host=config.host,
- port=config.port,
- user=config.user,
- password=config.password,
- database=config.database,
- charset=config.charset,
- autocommit=True,
- cursorclass=DictCursor,
- )
- def close(self) -> None:
- self.conn.close()
- def upsert_record(self, *, source: str, title: str, rank: int | None) -> dict[str, Any]:
- key = unique_title_key(source, title)
- sql = """
- INSERT INTO hot_content_records (
- unique_key, source, title, hot_rank, execution_status, created_at, updated_at
- )
- VALUES (%s, %s, %s, %s, %s, NOW(), NOW())
- ON DUPLICATE KEY UPDATE
- id=LAST_INSERT_ID(id),
- hot_rank=VALUES(hot_rank),
- updated_at=NOW()
- """
- with self.conn.cursor() as cursor:
- cursor.execute(
- sql,
- (
- key,
- source,
- title,
- rank,
- ExecutionStatus.HOT_SAVED,
- ),
- )
- record_id = int(cursor.lastrowid)
- cursor.execute(
- """
- SELECT
- id,
- unique_key,
- execution_status,
- article_title,
- article_body,
- article_url,
- decode_request_result IS NOT NULL AS has_decode_request,
- contribution_points_json IS NOT NULL AS has_contribution_points
- FROM hot_content_records
- WHERE id = %s
- """,
- (record_id,),
- )
- row = cursor.fetchone()
- if not row:
- raise HotContentFlowError(f"missing hot_content_records id={record_id}")
- return {
- "id": int(row["id"]),
- "unique_key": str(row["unique_key"]),
- "execution_status": int(row["execution_status"]),
- "article_title": row.get("article_title"),
- "article_body": row.get("article_body"),
- "article_url": row.get("article_url"),
- "has_decode_request": bool(row.get("has_decode_request")),
- "has_contribution_points": bool(row.get("has_contribution_points")),
- }
- def update_status(
- self,
- *,
- record_id: int,
- status: int,
- error_message: str | None = None,
- ) -> None:
- sql = """
- UPDATE hot_content_records
- SET execution_status=%s, error_reason=%s, updated_at=NOW()
- WHERE id=%s
- """
- with self.conn.cursor() as cursor:
- cursor.execute(sql, (status, error_message, record_id))
- def update_article(
- self,
- *,
- record_id: int,
- article_title: str,
- article_body: str,
- url: str,
- ) -> None:
- sql = """
- UPDATE hot_content_records
- SET article_title=%s,
- article_body=%s,
- article_url=%s,
- execution_status=%s,
- error_reason=NULL,
- updated_at=NOW()
- WHERE id=%s
- """
- with self.conn.cursor() as cursor:
- cursor.execute(
- sql,
- (
- article_title,
- article_body,
- url,
- ExecutionStatus.CONTENT_OK,
- record_id,
- ),
- )
- def update_decode_result(
- self,
- *,
- record_id: int,
- status: int,
- request_json: dict[str, Any],
- response_json: dict[str, Any] | None,
- error_message: str | None = None,
- ) -> None:
- decode_request_result = {
- "request": request_json,
- "response": response_json,
- }
- sql = """
- UPDATE hot_content_records
- SET decode_request_result=%s,
- execution_status=%s,
- error_reason=%s,
- updated_at=NOW()
- WHERE id=%s
- """
- with self.conn.cursor() as cursor:
- cursor.execute(
- sql,
- (
- _json_dumps(decode_request_result),
- status,
- error_message,
- record_id,
- ),
- )
- def list_decode_result_candidates(self, *, limit: int) -> list[dict[str, Any]]:
- sql = """
- SELECT id, unique_key
- FROM hot_content_records
- WHERE execution_status IN (%s, %s, %s)
- AND contribution_points_json IS NULL
- ORDER BY updated_at ASC, id ASC
- LIMIT %s
- """
- with self.conn.cursor() as cursor:
- cursor.execute(
- sql,
- (
- ExecutionStatus.DECODE_SUBMITTED,
- ExecutionStatus.DECODE_SUCCESS,
- ExecutionStatus.DECODE_PENDING,
- limit,
- ),
- )
- rows = cursor.fetchall()
- return [
- {
- "id": int(row["id"]),
- "unique_key": str(row["unique_key"]),
- }
- for row in rows
- ]
- def save_decode_result_export(
- self,
- *,
- record_id: int,
- decode_result_json: dict[str, Any],
- contribution_points_json: dict[str, Any],
- ) -> None:
- sql = """
- UPDATE hot_content_records
- SET decode_result_json=%s,
- contribution_points_json=%s,
- execution_status=%s,
- error_reason=NULL,
- updated_at=NOW()
- WHERE id=%s
- """
- with self.conn.cursor() as cursor:
- cursor.execute(
- sql,
- (
- _json_dumps(decode_result_json),
- _json_dumps(contribution_points_json),
- ExecutionStatus.CONTRIBUTION_EXTRACTED,
- record_id,
- ),
- )
- def get_demand_cache_by_hour(self, *, cache_hour: datetime) -> dict[str, Any] | None:
- sql = """
- SELECT
- id,
- cache_hour,
- source_table,
- partition_dt,
- demand_name_set_json,
- item_count,
- updated_at
- FROM demand_pool_hourly_cache
- WHERE cache_hour=%s
- LIMIT 1
- """
- with self.conn.cursor() as cursor:
- cursor.execute(sql, (cache_hour,))
- row = cursor.fetchone()
- if not row:
- return None
- demand_name_set = _json_loads(row.get("demand_name_set_json")) or []
- if not isinstance(demand_name_set, list):
- demand_name_set = []
- return {
- "id": int(row["id"]),
- "cache_hour": row.get("cache_hour"),
- "source_table": str(row["source_table"]),
- "partition_dt": row.get("partition_dt"),
- "demand_name_set": [
- str(name).strip()
- for name in demand_name_set
- if str(name).strip()
- ],
- "item_count": int(row.get("item_count") or 0),
- "updated_at": row.get("updated_at"),
- }
- def save_demand_cache_set(
- self,
- *,
- cache_hour: datetime,
- source_table: str,
- partition_dt: str | None,
- excluded_strategy: str,
- top_n: int,
- demand_name_set: list[str],
- ) -> int:
- sql = """
- INSERT INTO demand_pool_hourly_cache (
- cache_hour,
- source_table,
- partition_dt,
- excluded_strategy,
- top_n,
- demand_name_set_json,
- item_count,
- created_at,
- updated_at
- )
- VALUES (%s, %s, %s, %s, %s, %s, %s, NOW(), NOW())
- ON DUPLICATE KEY UPDATE
- id=LAST_INSERT_ID(id),
- source_table=VALUES(source_table),
- partition_dt=VALUES(partition_dt),
- excluded_strategy=VALUES(excluded_strategy),
- top_n=VALUES(top_n),
- demand_name_set_json=VALUES(demand_name_set_json),
- item_count=VALUES(item_count),
- updated_at=NOW()
- """
- normalized_names = _normalize_demand_names(demand_name_set)
- with self.conn.cursor() as cursor:
- cursor.execute(
- sql,
- (
- cache_hour,
- source_table,
- partition_dt,
- excluded_strategy,
- top_n,
- _json_dumps(normalized_names),
- len(normalized_names),
- ),
- )
- return int(cursor.lastrowid)
- def list_postprocess_candidates(self, *, limit: int) -> list[dict[str, Any]]:
- sql = """
- SELECT
- id,
- unique_key,
- source,
- title,
- article_title,
- article_body,
- demand_cache_run_id,
- decode_result_json,
- contribution_points_json,
- contribution_demand_match_json
- FROM hot_content_records
- WHERE contribution_points_json IS NOT NULL
- AND postprocess_status IN (%s, %s, %s)
- ORDER BY updated_at ASC, id ASC
- LIMIT %s
- """
- with self.conn.cursor() as cursor:
- cursor.execute(
- sql,
- (
- PostprocessStatus.PENDING,
- PostprocessStatus.DEMAND_MATCHED,
- PostprocessStatus.FAILED,
- limit,
- ),
- )
- rows = cursor.fetchall()
- return [
- {
- "id": int(row["id"]),
- "unique_key": str(row["unique_key"]),
- "source": str(row.get("source") or ""),
- "title": str(row.get("title") or ""),
- "article_title": row.get("article_title"),
- "article_body": row.get("article_body"),
- "demand_cache_run_id": row.get("demand_cache_run_id"),
- "decode_result_json": _json_loads(row.get("decode_result_json")),
- "contribution_points_json": _json_loads(row.get("contribution_points_json")),
- "contribution_demand_match_json": _json_loads(
- row.get("contribution_demand_match_json")
- ),
- }
- for row in rows
- ]
- def save_contribution_demand_match(
- self,
- *,
- record_id: int,
- demand_cache_run_id: int,
- match_json: dict[str, Any],
- ) -> None:
- sql = """
- UPDATE hot_content_records
- SET demand_cache_run_id=%s,
- contribution_demand_match_json=%s,
- postprocess_status=%s,
- postprocess_error_reason=NULL,
- updated_at=NOW()
- WHERE id=%s
- """
- with self.conn.cursor() as cursor:
- cursor.execute(
- sql,
- (
- demand_cache_run_id,
- _json_dumps(match_json),
- PostprocessStatus.DEMAND_MATCHED,
- record_id,
- ),
- )
- def save_wxindex_trend(
- self,
- *,
- record_id: int,
- trend_json: dict[str, Any],
- ) -> None:
- sql = """
- UPDATE hot_content_records
- SET wxindex_trend_json=%s,
- postprocess_status=%s,
- postprocess_error_reason=NULL,
- updated_at=NOW()
- WHERE id=%s
- """
- with self.conn.cursor() as cursor:
- cursor.execute(
- sql,
- (
- _json_dumps(trend_json),
- PostprocessStatus.WXINDEX_DONE,
- record_id,
- ),
- )
- def update_postprocess_status(
- self,
- *,
- record_id: int,
- status: int,
- error_message: str | None = None,
- ) -> None:
- sql = """
- UPDATE hot_content_records
- SET postprocess_status=%s,
- postprocess_error_reason=%s,
- updated_at=NOW()
- WHERE id=%s
- """
- with self.conn.cursor() as cursor:
- cursor.execute(sql, (status, error_message, record_id))
- def replace_demand_export_rows(
- self,
- *,
- record_id: int,
- source: str,
- hot_title: str,
- article_title: str,
- rows: list[dict[str, Any]],
- ) -> None:
- self._ensure_demand_export_table()
- delete_sql = "DELETE FROM hot_content_demand_exports WHERE record_id=%s"
- insert_sql = """
- INSERT INTO hot_content_demand_exports (
- record_id,
- source,
- hot_title,
- article_title,
- item_type,
- item_text,
- point_category,
- matched_demand,
- contribution_score,
- wxindex_keyword,
- all_hot_keywords,
- wxindex_latest_score,
- wxindex_trend,
- created_at,
- updated_at
- )
- VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW(), NOW())
- """
- with self.conn.cursor() as cursor:
- cursor.execute(delete_sql, (record_id,))
- insert_rows = [
- (
- record_id,
- source,
- hot_title,
- article_title,
- str(item.get("item_type") or ""),
- str(item.get("item_text") or ""),
- str(item.get("point_category") or ""),
- str(item.get("matched_demand") or ""),
- item.get("contribution_score"),
- str(item.get("wxindex_keyword") or ""),
- str(item.get("all_hot_keywords") or ""),
- float(item.get("wxindex_latest_score") or 0),
- str(item.get("wxindex_trend") or ""),
- )
- for item in rows
- if str(item.get("item_type") or "").strip()
- and str(item.get("item_text") or "").strip()
- ]
- if insert_rows:
- cursor.executemany(insert_sql, insert_rows)
- def list_demand_export_groups(self) -> list[dict[str, Any]]:
- """读取主表当天创建的 record 对应导出分组,仅供 ODPS 当天分区同步(不跨天)。"""
- self._ensure_demand_export_table()
- today_start = datetime.now(SHANGHAI_TZ).replace(
- hour=0,
- minute=0,
- second=0,
- microsecond=0,
- tzinfo=None,
- )
- today_end = today_start + timedelta(days=1)
- sql = """
- SELECT
- e.record_id,
- e.item_type,
- e.item_text,
- e.point_category,
- e.matched_demand,
- e.wxindex_latest_score
- FROM hot_content_demand_exports e
- INNER JOIN hot_content_records r ON r.id = e.record_id
- WHERE r.created_at >= %s
- AND r.created_at < %s
- ORDER BY e.record_id ASC, e.id ASC
- """
- with self.conn.cursor() as cursor:
- cursor.execute(sql, (today_start, today_end))
- rows = cursor.fetchall()
- grouped: dict[int, list[dict[str, Any]]] = {}
- for row in rows:
- record_id = int(row["record_id"])
- grouped.setdefault(record_id, []).append(
- {
- "item_type": str(row.get("item_type") or ""),
- "item_text": str(row.get("item_text") or ""),
- "point_category": str(row.get("point_category") or ""),
- "matched_demand": str(row.get("matched_demand") or ""),
- "wxindex_latest_score": float(row.get("wxindex_latest_score") or 0),
- }
- )
- return [
- {"record_id": record_id, "export_rows": export_rows}
- for record_id, export_rows in grouped.items()
- ]
- def _ensure_demand_export_table(self) -> None:
- sql = """
- CREATE TABLE IF NOT EXISTS hot_content_demand_exports (
- id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
- record_id BIGINT UNSIGNED NOT NULL,
- source VARCHAR(64) NOT NULL DEFAULT '',
- hot_title VARCHAR(1024) NOT NULL DEFAULT '',
- article_title VARCHAR(1024) NOT NULL DEFAULT '',
- item_type VARCHAR(32) NOT NULL COMMENT '元素/短语',
- item_text VARCHAR(1024) NOT NULL,
- point_category VARCHAR(32) NOT NULL DEFAULT '' COMMENT '点类型:灵感点/目的点/关键点',
- matched_demand VARCHAR(1024) NOT NULL DEFAULT '' COMMENT '匹配到的需求',
- contribution_score DOUBLE NULL COMMENT '贡献分,仅元素有值',
- wxindex_keyword VARCHAR(1024) NOT NULL DEFAULT '' COMMENT '获取微信指数的元素',
- all_hot_keywords VARCHAR(1024) NOT NULL DEFAULT '' COMMENT '全部热点词',
- wxindex_latest_score DOUBLE NOT NULL DEFAULT 0,
- wxindex_trend VARCHAR(32) NOT NULL DEFAULT '' COMMENT '微信指数趋势',
- created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
- updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
- PRIMARY KEY (id),
- KEY idx_record_id (record_id),
- KEY idx_source_type (source, item_type)
- ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
- """
- with self.conn.cursor() as cursor:
- cursor.execute(sql)
- self._ensure_demand_export_column(
- cursor,
- "matched_demand",
- """
- ALTER TABLE hot_content_demand_exports
- ADD COLUMN matched_demand VARCHAR(1024) NOT NULL DEFAULT ''
- COMMENT '匹配到的需求'
- AFTER item_text
- """,
- )
- self._ensure_demand_export_column(
- cursor,
- "point_category",
- """
- ALTER TABLE hot_content_demand_exports
- ADD COLUMN point_category VARCHAR(32) NOT NULL DEFAULT ''
- COMMENT '点类型:灵感点/目的点/关键点'
- AFTER item_text
- """,
- )
- self._ensure_demand_export_column(
- cursor,
- "contribution_score",
- """
- ALTER TABLE hot_content_demand_exports
- ADD COLUMN contribution_score DOUBLE NULL
- COMMENT '贡献分,仅词有值'
- AFTER matched_demand
- """,
- )
- self._ensure_demand_export_column(
- cursor,
- "wxindex_trend",
- """
- ALTER TABLE hot_content_demand_exports
- ADD COLUMN wxindex_trend VARCHAR(32) NOT NULL DEFAULT ''
- COMMENT '微信指数趋势'
- AFTER wxindex_latest_score
- """,
- )
- self._ensure_demand_export_column(
- cursor,
- "wxindex_keyword",
- """
- ALTER TABLE hot_content_demand_exports
- ADD COLUMN wxindex_keyword VARCHAR(1024) NOT NULL DEFAULT ''
- COMMENT '获取微信指数的词'
- AFTER contribution_score
- """,
- )
- self._ensure_demand_export_column(
- cursor,
- "all_hot_keywords",
- """
- ALTER TABLE hot_content_demand_exports
- ADD COLUMN all_hot_keywords VARCHAR(1024) NOT NULL DEFAULT ''
- COMMENT '全部热点词'
- AFTER wxindex_keyword
- """,
- )
- def _ensure_demand_export_column(
- self,
- cursor: Any,
- column_name: str,
- alter_sql: str,
- ) -> None:
- cursor.execute(
- """
- SELECT COUNT(*) AS cnt
- FROM information_schema.COLUMNS
- WHERE TABLE_SCHEMA = DATABASE()
- AND TABLE_NAME = 'hot_content_demand_exports'
- AND COLUMN_NAME = %s
- """,
- (column_name,),
- )
- if int((cursor.fetchone() or {}).get("cnt") or 0) == 0:
- cursor.execute(alter_sql)
- def list_synced_odps_demand_ids(
- self,
- *,
- partition_dt: str,
- strategy: str,
- ) -> set[str]:
- self._ensure_odps_sync_log_table()
- sql = """
- SELECT demand_id
- FROM hot_content_odps_sync_log
- WHERE partition_dt = %s
- AND strategy = %s
- """
- with self.conn.cursor() as cursor:
- cursor.execute(sql, (partition_dt, strategy))
- rows = cursor.fetchall()
- return {
- str(row.get("demand_id") or "").strip()
- for row in rows
- if str(row.get("demand_id") or "").strip()
- }
- def save_odps_sync_logs(self, rows: list[dict[str, Any]]) -> int:
- if not rows:
- return 0
- self._ensure_odps_sync_log_table()
- sql = """
- INSERT INTO hot_content_odps_sync_log (
- partition_dt,
- strategy,
- demand_id,
- demand_name,
- demand_type,
- record_id
- )
- VALUES (%s, %s, %s, %s, %s, %s)
- ON DUPLICATE KEY UPDATE
- demand_name = VALUES(demand_name),
- demand_type = VALUES(demand_type),
- record_id = VALUES(record_id),
- synced_at = CURRENT_TIMESTAMP
- """
- insert_rows = [
- (
- str(item.get("partition_dt") or ""),
- str(item.get("strategy") or ""),
- str(item.get("demand_id") or ""),
- str(item.get("demand_name") or ""),
- str(item.get("demand_type") or ""),
- int(item.get("record_id") or 0),
- )
- for item in rows
- if str(item.get("demand_id") or "").strip()
- ]
- with self.conn.cursor() as cursor:
- cursor.executemany(sql, insert_rows)
- return len(insert_rows)
- def _ensure_odps_sync_log_table(self) -> None:
- sql = """
- CREATE TABLE IF NOT EXISTS hot_content_odps_sync_log (
- id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
- partition_dt VARCHAR(8) NOT NULL COMMENT 'ODPS 分区 dt',
- strategy VARCHAR(128) NOT NULL COMMENT '需求 strategy',
- demand_id CHAR(32) NOT NULL COMMENT 'ODPS demand_id',
- demand_name VARCHAR(1024) NOT NULL DEFAULT '' COMMENT '需求名',
- demand_type VARCHAR(32) NOT NULL DEFAULT '' COMMENT '特征点/短语',
- record_id BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '来源 hot_content_records.id',
- synced_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '写入 ODPS 时间',
- PRIMARY KEY (id),
- UNIQUE KEY uk_odps_sync (partition_dt, strategy, demand_id),
- KEY idx_record_partition (record_id, partition_dt),
- KEY idx_partition_strategy (partition_dt, strategy)
- ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
- """
- with self.conn.cursor() as cursor:
- cursor.execute(sql)
|