config.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. """热点内容流程配置加载。"""
  2. from __future__ import annotations
  3. import json
  4. import os
  5. from pathlib import Path
  6. from typing import Any
  7. from app.core.config import PROJECT_ROOT, settings
  8. from app.hot_content.category_filter import DEFAULT_ELDERLY_CATEGORY_LIST
  9. from app.hot_content.exceptions import HotContentFlowError
  10. from app.hot_content.types import FlowConfig, HotSourceConfig, MysqlConfig
  11. def _get_env(name: str, default: str = "") -> str:
  12. value = os.getenv(name)
  13. if value is None or value == "":
  14. return default
  15. return value
  16. def _get_env_int(name: str, default: int) -> int:
  17. raw = os.getenv(name)
  18. if raw is None or raw == "":
  19. return default
  20. try:
  21. return int(raw)
  22. except ValueError as exc:
  23. raise HotContentFlowError(f"invalid integer env {name}={raw!r}") from exc
  24. def _get_env_float(name: str, default: float) -> float:
  25. raw = os.getenv(name)
  26. if raw is None or raw == "":
  27. return default
  28. try:
  29. return float(raw)
  30. except ValueError as exc:
  31. raise HotContentFlowError(f"invalid float env {name}={raw!r}") from exc
  32. def _get_env_bool(name: str, default: bool) -> bool:
  33. raw = os.getenv(name)
  34. if raw is None or raw == "":
  35. return default
  36. return raw.strip().lower() in {"1", "true", "yes", "y", "on"}
  37. def _load_json_from_env_or_file(env_name: str, file_env_name: str) -> Any | None:
  38. file_path = os.getenv(file_env_name)
  39. if file_path:
  40. path = Path(file_path).expanduser()
  41. if not path.is_absolute():
  42. path = PROJECT_ROOT / path
  43. try:
  44. return json.loads(path.read_text(encoding="utf-8"))
  45. except json.JSONDecodeError as exc:
  46. raise HotContentFlowError(f"invalid json file {path}") from exc
  47. raw = os.getenv(env_name)
  48. if not raw:
  49. return None
  50. try:
  51. return json.loads(raw)
  52. except json.JSONDecodeError as exc:
  53. raise HotContentFlowError(f"invalid json env {env_name}") from exc
  54. def _normalize_source_config(item: Any) -> HotSourceConfig:
  55. if isinstance(item, str):
  56. source = item.strip()
  57. if not source:
  58. raise HotContentFlowError("hot source cannot be empty")
  59. return HotSourceConfig(source=source)
  60. if not isinstance(item, dict):
  61. raise HotContentFlowError(f"invalid hot source config: {item!r}")
  62. source = str(item.get("source") or item.get("source_name") or "").strip()
  63. if not source:
  64. raise HotContentFlowError(f"hot source missing source: {item!r}")
  65. hot_rank_base_url = str(item.get("hot_rank_base_url") or "").strip().rstrip("/") or None
  66. hot_rank_path = str(item.get("hot_rank_path") or "").strip() or None
  67. hot_rank_payload = item.get("hot_rank_payload")
  68. if hot_rank_payload is not None and not isinstance(hot_rank_payload, dict):
  69. raise HotContentFlowError(
  70. f"hot source hot_rank_payload must be a dict: {item!r}"
  71. )
  72. return HotSourceConfig(
  73. source=source,
  74. hot_rank_base_url=hot_rank_base_url,
  75. hot_rank_path=hot_rank_path,
  76. hot_rank_payload=hot_rank_payload,
  77. )
  78. def _load_sources() -> list[HotSourceConfig]:
  79. raw_sources = _load_json_from_env_or_file("HOT_FLOW_SOURCES_JSON", "HOT_FLOW_SOURCES_FILE")
  80. if raw_sources is None:
  81. raw_sources = settings.hot_flow_sources
  82. if not isinstance(raw_sources, list):
  83. raise HotContentFlowError("HOT_FLOW_SOURCES_JSON/HOT_FLOW_SOURCES_FILE must be a list")
  84. sources = [_normalize_source_config(item) for item in raw_sources]
  85. if not sources:
  86. raise HotContentFlowError("hot sources cannot be empty")
  87. return sources
  88. def _parse_cron_hours(value: str) -> str:
  89. hours = [item.strip() for item in value.split(",") if item.strip()]
  90. if not hours:
  91. raise HotContentFlowError("hot flow cron hours cannot be empty")
  92. normalized: list[str] = []
  93. for hour in hours:
  94. try:
  95. hour_num = int(hour)
  96. except ValueError as exc:
  97. raise HotContentFlowError(f"invalid hot flow cron hour: {hour!r}") from exc
  98. if not 0 <= hour_num <= 23:
  99. raise HotContentFlowError(f"hot flow cron hour out of range: {hour_num}")
  100. normalized.append(str(hour_num))
  101. return ",".join(normalized)
  102. def _load_category_filter_categories() -> list[str]:
  103. raw = _load_json_from_env_or_file(
  104. "CATEGORY_FILTER_CATEGORIES_JSON",
  105. "CATEGORY_FILTER_CATEGORIES_FILE",
  106. )
  107. if raw is None:
  108. return list(DEFAULT_ELDERLY_CATEGORY_LIST)
  109. if not isinstance(raw, list):
  110. raise HotContentFlowError(
  111. "CATEGORY_FILTER_CATEGORIES_JSON/CATEGORY_FILTER_CATEGORIES_FILE must be a list"
  112. )
  113. categories = [str(item).strip() for item in raw if str(item).strip()]
  114. if not categories:
  115. raise HotContentFlowError("category filter categories cannot be empty")
  116. return categories
  117. def load_flow_config(interval_override: int | None = None) -> FlowConfig:
  118. crawapi_base_url = _get_env("CRAWAPI_BASE_URL", settings.crawapi_base_url).rstrip("/")
  119. hot_rank_path = _get_env(
  120. "CRAWAPI_HOT_CONTENT_RANK_PATH",
  121. settings.crawapi_hot_content_rank_path,
  122. )
  123. if not crawapi_base_url:
  124. raise HotContentFlowError("missing CRAWAPI_BASE_URL or settings.crawapi_base_url")
  125. if not hot_rank_path:
  126. raise HotContentFlowError(
  127. "missing CRAWAPI_HOT_CONTENT_RANK_PATH or settings.crawapi_hot_content_rank_path"
  128. )
  129. interval_seconds = (
  130. interval_override
  131. if interval_override is not None
  132. else _get_env_int("HOT_FLOW_INTERVAL_SECONDS", settings.hot_flow_interval_seconds)
  133. )
  134. return FlowConfig(
  135. crawapi_base_url=crawapi_base_url,
  136. hot_rank_path=hot_rank_path,
  137. keyword_search_path=_get_env(
  138. "CRAWAPI_KEYWORD_SEARCH_PATH",
  139. settings.crawapi_keyword_search_path,
  140. ),
  141. decode_api_url=_get_env("DECODE_API_URL", settings.decode_api_url),
  142. decode_result_api_url=_get_env(
  143. "DECODE_RESULT_API_URL",
  144. settings.decode_result_api_url,
  145. ),
  146. decode_config_id=_get_env_int("DECODE_CONFIG_ID", settings.decode_config_id),
  147. request_timeout_seconds=_get_env_int(
  148. "REQUEST_TIMEOUT_SECONDS",
  149. settings.request_timeout_seconds,
  150. ),
  151. https_verify_ssl=_get_env_bool("HTTPS_VERIFY_SSL", settings.https_verify_ssl),
  152. hot_flow_cron_hours=_parse_cron_hours(
  153. _get_env("HOT_FLOW_CRON_HOURS", settings.hot_flow_cron_hours)
  154. ),
  155. hot_flow_cron_minute=_get_env_int(
  156. "HOT_FLOW_CRON_MINUTE",
  157. settings.hot_flow_cron_minute,
  158. ),
  159. schedule_interval_seconds=interval_seconds,
  160. decode_result_interval_seconds=_get_env_int(
  161. "DECODE_RESULT_FLOW_INTERVAL_SECONDS",
  162. settings.decode_result_flow_interval_seconds,
  163. ),
  164. decode_result_batch_size=_get_env_int(
  165. "DECODE_RESULT_BATCH_SIZE",
  166. settings.decode_result_batch_size,
  167. ),
  168. contribution_score_threshold=float(
  169. _get_env(
  170. "CONTRIBUTION_SCORE_THRESHOLD",
  171. str(settings.contribution_score_threshold),
  172. )
  173. ),
  174. demand_pool_source_table=_get_env(
  175. "DEMAND_POOL_SOURCE_TABLE",
  176. settings.demand_pool_source_table,
  177. ),
  178. demand_pool_excluded_strategy=_get_env(
  179. "DEMAND_POOL_EXCLUDED_STRATEGY",
  180. settings.demand_pool_excluded_strategy,
  181. ),
  182. demand_pool_top_n=_get_env_int(
  183. "DEMAND_POOL_TOP_N",
  184. settings.demand_pool_top_n,
  185. ),
  186. hot_demand_pool_strategy=_get_env(
  187. "HOT_DEMAND_POOL_STRATEGY",
  188. settings.hot_demand_pool_strategy,
  189. ),
  190. wxindex_score_threshold=_get_env_float(
  191. "WXINDEX_SCORE_THRESHOLD",
  192. _get_env_float(
  193. "HOT_DEMAND_POOL_WXINDEX_THRESHOLD",
  194. _get_env_float(
  195. "WXINDEX_LATEST_SCORE_THRESHOLD",
  196. settings.wxindex_score_threshold,
  197. ),
  198. ),
  199. ),
  200. odps_daily_write_limit=_get_env_int(
  201. "ODPS_DAILY_WRITE_LIMIT",
  202. settings.odps_daily_write_limit,
  203. ),
  204. postprocess_batch_size=_get_env_int(
  205. "POSTPROCESS_BATCH_SIZE",
  206. settings.postprocess_batch_size,
  207. ),
  208. contribution_match_llm_model=_get_env(
  209. "CONTRIBUTION_MATCH_LLM_MODEL",
  210. settings.contribution_match_llm_model,
  211. ),
  212. contribution_match_llm_max_attempts=_get_env_int(
  213. "CONTRIBUTION_MATCH_LLM_MAX_ATTEMPTS",
  214. settings.contribution_match_llm_max_attempts,
  215. ),
  216. contribution_match_llm_retry_sleep_seconds=_get_env_float(
  217. "CONTRIBUTION_MATCH_LLM_RETRY_SLEEP_SECONDS",
  218. settings.contribution_match_llm_retry_sleep_seconds,
  219. ),
  220. contribution_match_llm_max_tokens=_get_env_int(
  221. "CONTRIBUTION_MATCH_LLM_MAX_TOKENS",
  222. settings.contribution_match_llm_max_tokens,
  223. ),
  224. wxindex_llm_model=_get_env("WXINDEX_LLM_MODEL", settings.wxindex_llm_model),
  225. wxindex_llm_max_attempts=_get_env_int(
  226. "WXINDEX_LLM_MAX_ATTEMPTS",
  227. settings.wxindex_llm_max_attempts,
  228. ),
  229. wxindex_llm_max_tokens=_get_env_int(
  230. "WXINDEX_LLM_MAX_TOKENS",
  231. settings.wxindex_llm_max_tokens,
  232. ),
  233. wxindex_api_url=_get_env("WXINDEX_API_URL", settings.wxindex_api_url),
  234. wxindex_lookback_days=_get_env_int(
  235. "WXINDEX_LOOKBACK_DAYS",
  236. settings.wxindex_lookback_days,
  237. ),
  238. wxindex_words_cron_hour=_get_env_int(
  239. "WXINDEX_WORDS_CRON_HOUR",
  240. settings.wxindex_words_cron_hour,
  241. ),
  242. wxindex_words_cron_minute=_get_env_int(
  243. "WXINDEX_WORDS_CRON_MINUTE",
  244. settings.wxindex_words_cron_minute,
  245. ),
  246. wxindex_heat_pattern_cron_hour=_get_env_int(
  247. "WXINDEX_HEAT_PATTERN_CRON_HOUR",
  248. settings.wxindex_heat_pattern_cron_hour,
  249. ),
  250. wxindex_heat_pattern_cron_minute=_get_env_int(
  251. "WXINDEX_HEAT_PATTERN_CRON_MINUTE",
  252. settings.wxindex_heat_pattern_cron_minute,
  253. ),
  254. demand_event_sense_threshold=_get_env_float(
  255. "DEMAND_EVENT_SENSE_THRESHOLD",
  256. settings.demand_event_sense_threshold,
  257. ),
  258. demand_senior_fit_threshold=_get_env_float(
  259. "DEMAND_SENIOR_FIT_THRESHOLD",
  260. settings.demand_senior_fit_threshold,
  261. ),
  262. demand_quality_llm_model=_get_env(
  263. "DEMAND_QUALITY_LLM_MODEL",
  264. settings.demand_quality_llm_model,
  265. ),
  266. demand_quality_llm_max_attempts=_get_env_int(
  267. "DEMAND_QUALITY_LLM_MAX_ATTEMPTS",
  268. settings.demand_quality_llm_max_attempts,
  269. ),
  270. demand_quality_llm_retry_sleep_seconds=_get_env_float(
  271. "DEMAND_QUALITY_LLM_RETRY_SLEEP_SECONDS",
  272. settings.demand_quality_llm_retry_sleep_seconds,
  273. ),
  274. demand_quality_llm_max_tokens=_get_env_int(
  275. "DEMAND_QUALITY_LLM_MAX_TOKENS",
  276. settings.demand_quality_llm_max_tokens,
  277. ),
  278. category_filter_llm_model=_get_env(
  279. "CATEGORY_FILTER_LLM_MODEL",
  280. settings.category_filter_llm_model,
  281. ),
  282. category_filter_llm_max_attempts=_get_env_int(
  283. "CATEGORY_FILTER_LLM_MAX_ATTEMPTS",
  284. settings.category_filter_llm_max_attempts,
  285. ),
  286. category_filter_llm_retry_sleep_seconds=_get_env_float(
  287. "CATEGORY_FILTER_LLM_RETRY_SLEEP_SECONDS",
  288. settings.category_filter_llm_retry_sleep_seconds,
  289. ),
  290. category_filter_llm_max_tokens=_get_env_int(
  291. "CATEGORY_FILTER_LLM_MAX_TOKENS",
  292. settings.category_filter_llm_max_tokens,
  293. ),
  294. category_filter_body_max_chars=_get_env_int(
  295. "CATEGORY_FILTER_BODY_MAX_CHARS",
  296. settings.category_filter_body_max_chars,
  297. ),
  298. category_filter_item_sleep_seconds=_get_env_float(
  299. "CATEGORY_FILTER_ITEM_SLEEP_SECONDS",
  300. settings.category_filter_item_sleep_seconds,
  301. ),
  302. category_filter_categories=_load_category_filter_categories(),
  303. sources=_load_sources(),
  304. mysql=MysqlConfig(
  305. host=_get_env("MYSQL_HOST", settings.mysql_host),
  306. port=_get_env_int("MYSQL_PORT", settings.mysql_port),
  307. user=_get_env("MYSQL_USER", settings.mysql_user),
  308. password=_get_env("MYSQL_PASSWORD", settings.mysql_password),
  309. database=_get_env("MYSQL_DATABASE", settings.mysql_database),
  310. charset=_get_env("MYSQL_CHARSET", settings.mysql_charset),
  311. ),
  312. )