config.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315
  1. """热点内容流程配置加载。"""
  2. from __future__ import annotations
  3. import json
  4. import os
  5. from pathlib import Path
  6. from typing import Any
  7. from app.core.config import PROJECT_ROOT, settings
  8. from app.hot_content.category_filter import DEFAULT_ELDERLY_CATEGORY_LIST
  9. from app.hot_content.exceptions import HotContentFlowError
  10. from app.hot_content.types import FlowConfig, HotSourceConfig, MysqlConfig
  11. def _get_env(name: str, default: str = "") -> str:
  12. value = os.getenv(name)
  13. if value is None or value == "":
  14. return default
  15. return value
  16. def _get_env_int(name: str, default: int) -> int:
  17. raw = os.getenv(name)
  18. if raw is None or raw == "":
  19. return default
  20. try:
  21. return int(raw)
  22. except ValueError as exc:
  23. raise HotContentFlowError(f"invalid integer env {name}={raw!r}") from exc
  24. def _get_env_float(name: str, default: float) -> float:
  25. raw = os.getenv(name)
  26. if raw is None or raw == "":
  27. return default
  28. try:
  29. return float(raw)
  30. except ValueError as exc:
  31. raise HotContentFlowError(f"invalid float env {name}={raw!r}") from exc
  32. def _get_env_bool(name: str, default: bool) -> bool:
  33. raw = os.getenv(name)
  34. if raw is None or raw == "":
  35. return default
  36. return raw.strip().lower() in {"1", "true", "yes", "y", "on"}
  37. def _load_json_from_env_or_file(env_name: str, file_env_name: str) -> Any | None:
  38. file_path = os.getenv(file_env_name)
  39. if file_path:
  40. path = Path(file_path).expanduser()
  41. if not path.is_absolute():
  42. path = PROJECT_ROOT / path
  43. try:
  44. return json.loads(path.read_text(encoding="utf-8"))
  45. except json.JSONDecodeError as exc:
  46. raise HotContentFlowError(f"invalid json file {path}") from exc
  47. raw = os.getenv(env_name)
  48. if not raw:
  49. return None
  50. try:
  51. return json.loads(raw)
  52. except json.JSONDecodeError as exc:
  53. raise HotContentFlowError(f"invalid json env {env_name}") from exc
  54. def _normalize_source_config(item: Any) -> HotSourceConfig:
  55. if isinstance(item, str):
  56. source = item.strip()
  57. if not source:
  58. raise HotContentFlowError("hot source cannot be empty")
  59. return HotSourceConfig(source=source)
  60. if not isinstance(item, dict):
  61. raise HotContentFlowError(f"invalid hot source config: {item!r}")
  62. source = str(item.get("source") or item.get("source_name") or "").strip()
  63. if not source:
  64. raise HotContentFlowError(f"hot source missing source: {item!r}")
  65. return HotSourceConfig(
  66. source=source,
  67. count=int(item.get("count") or item.get("limit") or item.get("rank_limit") or 10),
  68. )
  69. def _load_sources() -> list[HotSourceConfig]:
  70. raw_sources = _load_json_from_env_or_file("HOT_FLOW_SOURCES_JSON", "HOT_FLOW_SOURCES_FILE")
  71. if raw_sources is None:
  72. raw_sources = settings.hot_flow_sources
  73. if not isinstance(raw_sources, list):
  74. raise HotContentFlowError("HOT_FLOW_SOURCES_JSON/HOT_FLOW_SOURCES_FILE must be a list")
  75. sources = [_normalize_source_config(item) for item in raw_sources]
  76. if not sources:
  77. raise HotContentFlowError("hot sources cannot be empty")
  78. return sources
  79. def _parse_cron_hours(value: str) -> str:
  80. hours = [item.strip() for item in value.split(",") if item.strip()]
  81. if not hours:
  82. raise HotContentFlowError("hot flow cron hours cannot be empty")
  83. normalized: list[str] = []
  84. for hour in hours:
  85. try:
  86. hour_num = int(hour)
  87. except ValueError as exc:
  88. raise HotContentFlowError(f"invalid hot flow cron hour: {hour!r}") from exc
  89. if not 0 <= hour_num <= 23:
  90. raise HotContentFlowError(f"hot flow cron hour out of range: {hour_num}")
  91. normalized.append(str(hour_num))
  92. return ",".join(normalized)
  93. def _load_category_filter_categories() -> list[str]:
  94. raw = _load_json_from_env_or_file(
  95. "CATEGORY_FILTER_CATEGORIES_JSON",
  96. "CATEGORY_FILTER_CATEGORIES_FILE",
  97. )
  98. if raw is None:
  99. return list(DEFAULT_ELDERLY_CATEGORY_LIST)
  100. if not isinstance(raw, list):
  101. raise HotContentFlowError(
  102. "CATEGORY_FILTER_CATEGORIES_JSON/CATEGORY_FILTER_CATEGORIES_FILE must be a list"
  103. )
  104. categories = [str(item).strip() for item in raw if str(item).strip()]
  105. if not categories:
  106. raise HotContentFlowError("category filter categories cannot be empty")
  107. return categories
  108. def load_flow_config(interval_override: int | None = None) -> FlowConfig:
  109. crawapi_base_url = _get_env("CRAWAPI_BASE_URL", settings.crawapi_base_url).rstrip("/")
  110. hot_rank_path = _get_env(
  111. "CRAWAPI_HOT_CONTENT_RANK_PATH",
  112. settings.crawapi_hot_content_rank_path,
  113. )
  114. if not crawapi_base_url:
  115. raise HotContentFlowError("missing CRAWAPI_BASE_URL or settings.crawapi_base_url")
  116. if not hot_rank_path:
  117. raise HotContentFlowError(
  118. "missing CRAWAPI_HOT_CONTENT_RANK_PATH or settings.crawapi_hot_content_rank_path"
  119. )
  120. interval_seconds = (
  121. interval_override
  122. if interval_override is not None
  123. else _get_env_int("HOT_FLOW_INTERVAL_SECONDS", settings.hot_flow_interval_seconds)
  124. )
  125. return FlowConfig(
  126. crawapi_base_url=crawapi_base_url,
  127. hot_rank_path=hot_rank_path,
  128. keyword_search_path=_get_env(
  129. "CRAWAPI_KEYWORD_SEARCH_PATH",
  130. settings.crawapi_keyword_search_path,
  131. ),
  132. decode_api_url=_get_env("DECODE_API_URL", settings.decode_api_url),
  133. decode_result_api_url=_get_env(
  134. "DECODE_RESULT_API_URL",
  135. settings.decode_result_api_url,
  136. ),
  137. decode_config_id=_get_env_int("DECODE_CONFIG_ID", settings.decode_config_id),
  138. request_timeout_seconds=_get_env_int(
  139. "REQUEST_TIMEOUT_SECONDS",
  140. settings.request_timeout_seconds,
  141. ),
  142. https_verify_ssl=_get_env_bool("HTTPS_VERIFY_SSL", settings.https_verify_ssl),
  143. hot_flow_cron_hours=_parse_cron_hours(
  144. _get_env("HOT_FLOW_CRON_HOURS", settings.hot_flow_cron_hours)
  145. ),
  146. hot_flow_cron_minute=_get_env_int(
  147. "HOT_FLOW_CRON_MINUTE",
  148. settings.hot_flow_cron_minute,
  149. ),
  150. schedule_interval_seconds=interval_seconds,
  151. decode_result_interval_seconds=_get_env_int(
  152. "DECODE_RESULT_FLOW_INTERVAL_SECONDS",
  153. settings.decode_result_flow_interval_seconds,
  154. ),
  155. decode_result_batch_size=_get_env_int(
  156. "DECODE_RESULT_BATCH_SIZE",
  157. settings.decode_result_batch_size,
  158. ),
  159. contribution_score_threshold=float(
  160. _get_env(
  161. "CONTRIBUTION_SCORE_THRESHOLD",
  162. str(settings.contribution_score_threshold),
  163. )
  164. ),
  165. demand_pool_source_table=_get_env(
  166. "DEMAND_POOL_SOURCE_TABLE",
  167. settings.demand_pool_source_table,
  168. ),
  169. demand_pool_excluded_strategy=_get_env(
  170. "DEMAND_POOL_EXCLUDED_STRATEGY",
  171. settings.demand_pool_excluded_strategy,
  172. ),
  173. demand_pool_top_n=_get_env_int(
  174. "DEMAND_POOL_TOP_N",
  175. settings.demand_pool_top_n,
  176. ),
  177. hot_demand_pool_strategy=_get_env(
  178. "HOT_DEMAND_POOL_STRATEGY",
  179. settings.hot_demand_pool_strategy,
  180. ),
  181. wxindex_score_threshold=_get_env_float(
  182. "WXINDEX_SCORE_THRESHOLD",
  183. _get_env_float(
  184. "HOT_DEMAND_POOL_WXINDEX_THRESHOLD",
  185. _get_env_float(
  186. "WXINDEX_LATEST_SCORE_THRESHOLD",
  187. settings.wxindex_score_threshold,
  188. ),
  189. ),
  190. ),
  191. odps_daily_write_limit=_get_env_int(
  192. "ODPS_DAILY_WRITE_LIMIT",
  193. settings.odps_daily_write_limit,
  194. ),
  195. postprocess_batch_size=_get_env_int(
  196. "POSTPROCESS_BATCH_SIZE",
  197. settings.postprocess_batch_size,
  198. ),
  199. contribution_match_llm_model=_get_env(
  200. "CONTRIBUTION_MATCH_LLM_MODEL",
  201. settings.contribution_match_llm_model,
  202. ),
  203. contribution_match_llm_max_attempts=_get_env_int(
  204. "CONTRIBUTION_MATCH_LLM_MAX_ATTEMPTS",
  205. settings.contribution_match_llm_max_attempts,
  206. ),
  207. contribution_match_llm_retry_sleep_seconds=_get_env_float(
  208. "CONTRIBUTION_MATCH_LLM_RETRY_SLEEP_SECONDS",
  209. settings.contribution_match_llm_retry_sleep_seconds,
  210. ),
  211. contribution_match_llm_max_tokens=_get_env_int(
  212. "CONTRIBUTION_MATCH_LLM_MAX_TOKENS",
  213. settings.contribution_match_llm_max_tokens,
  214. ),
  215. wxindex_llm_model=_get_env("WXINDEX_LLM_MODEL", settings.wxindex_llm_model),
  216. wxindex_llm_max_attempts=_get_env_int(
  217. "WXINDEX_LLM_MAX_ATTEMPTS",
  218. settings.wxindex_llm_max_attempts,
  219. ),
  220. wxindex_llm_max_tokens=_get_env_int(
  221. "WXINDEX_LLM_MAX_TOKENS",
  222. settings.wxindex_llm_max_tokens,
  223. ),
  224. wxindex_api_url=_get_env("WXINDEX_API_URL", settings.wxindex_api_url),
  225. wxindex_lookback_days=_get_env_int(
  226. "WXINDEX_LOOKBACK_DAYS",
  227. settings.wxindex_lookback_days,
  228. ),
  229. demand_event_sense_threshold=_get_env_float(
  230. "DEMAND_EVENT_SENSE_THRESHOLD",
  231. settings.demand_event_sense_threshold,
  232. ),
  233. demand_senior_fit_threshold=_get_env_float(
  234. "DEMAND_SENIOR_FIT_THRESHOLD",
  235. settings.demand_senior_fit_threshold,
  236. ),
  237. demand_quality_llm_model=_get_env(
  238. "DEMAND_QUALITY_LLM_MODEL",
  239. settings.demand_quality_llm_model,
  240. ),
  241. demand_quality_llm_max_attempts=_get_env_int(
  242. "DEMAND_QUALITY_LLM_MAX_ATTEMPTS",
  243. settings.demand_quality_llm_max_attempts,
  244. ),
  245. demand_quality_llm_retry_sleep_seconds=_get_env_float(
  246. "DEMAND_QUALITY_LLM_RETRY_SLEEP_SECONDS",
  247. settings.demand_quality_llm_retry_sleep_seconds,
  248. ),
  249. demand_quality_llm_max_tokens=_get_env_int(
  250. "DEMAND_QUALITY_LLM_MAX_TOKENS",
  251. settings.demand_quality_llm_max_tokens,
  252. ),
  253. category_filter_llm_model=_get_env(
  254. "CATEGORY_FILTER_LLM_MODEL",
  255. settings.category_filter_llm_model,
  256. ),
  257. category_filter_llm_max_attempts=_get_env_int(
  258. "CATEGORY_FILTER_LLM_MAX_ATTEMPTS",
  259. settings.category_filter_llm_max_attempts,
  260. ),
  261. category_filter_llm_retry_sleep_seconds=_get_env_float(
  262. "CATEGORY_FILTER_LLM_RETRY_SLEEP_SECONDS",
  263. settings.category_filter_llm_retry_sleep_seconds,
  264. ),
  265. category_filter_llm_max_tokens=_get_env_int(
  266. "CATEGORY_FILTER_LLM_MAX_TOKENS",
  267. settings.category_filter_llm_max_tokens,
  268. ),
  269. category_filter_body_max_chars=_get_env_int(
  270. "CATEGORY_FILTER_BODY_MAX_CHARS",
  271. settings.category_filter_body_max_chars,
  272. ),
  273. category_filter_item_sleep_seconds=_get_env_float(
  274. "CATEGORY_FILTER_ITEM_SLEEP_SECONDS",
  275. settings.category_filter_item_sleep_seconds,
  276. ),
  277. category_filter_categories=_load_category_filter_categories(),
  278. sources=_load_sources(),
  279. mysql=MysqlConfig(
  280. host=_get_env("MYSQL_HOST", settings.mysql_host),
  281. port=_get_env_int("MYSQL_PORT", settings.mysql_port),
  282. user=_get_env("MYSQL_USER", settings.mysql_user),
  283. password=_get_env("MYSQL_PASSWORD", settings.mysql_password),
  284. database=_get_env("MYSQL_DATABASE", settings.mysql_database),
  285. charset=_get_env("MYSQL_CHARSET", settings.mysql_charset),
  286. ),
  287. )