config.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. """热点内容流程配置加载。"""
  2. from __future__ import annotations
  3. import json
  4. import os
  5. from pathlib import Path
  6. from typing import Any
  7. from app.core.config import PROJECT_ROOT, settings
  8. from app.hot_content.exceptions import HotContentFlowError
  9. from app.hot_content.types import FlowConfig, HotSourceConfig, MysqlConfig
  10. def _get_env(name: str, default: str = "") -> str:
  11. value = os.getenv(name)
  12. if value is None or value == "":
  13. return default
  14. return value
  15. def _get_env_int(name: str, default: int) -> int:
  16. raw = os.getenv(name)
  17. if raw is None or raw == "":
  18. return default
  19. try:
  20. return int(raw)
  21. except ValueError as exc:
  22. raise HotContentFlowError(f"invalid integer env {name}={raw!r}") from exc
  23. def _get_env_float(name: str, default: float) -> float:
  24. raw = os.getenv(name)
  25. if raw is None or raw == "":
  26. return default
  27. try:
  28. return float(raw)
  29. except ValueError as exc:
  30. raise HotContentFlowError(f"invalid float env {name}={raw!r}") from exc
  31. def _get_env_bool(name: str, default: bool) -> bool:
  32. raw = os.getenv(name)
  33. if raw is None or raw == "":
  34. return default
  35. return raw.strip().lower() in {"1", "true", "yes", "y", "on"}
  36. def _load_json_from_env_or_file(env_name: str, file_env_name: str) -> Any | None:
  37. file_path = os.getenv(file_env_name)
  38. if file_path:
  39. path = Path(file_path).expanduser()
  40. if not path.is_absolute():
  41. path = PROJECT_ROOT / path
  42. try:
  43. return json.loads(path.read_text(encoding="utf-8"))
  44. except json.JSONDecodeError as exc:
  45. raise HotContentFlowError(f"invalid json file {path}") from exc
  46. raw = os.getenv(env_name)
  47. if not raw:
  48. return None
  49. try:
  50. return json.loads(raw)
  51. except json.JSONDecodeError as exc:
  52. raise HotContentFlowError(f"invalid json env {env_name}") from exc
  53. def _normalize_source_config(item: Any) -> HotSourceConfig:
  54. if isinstance(item, str):
  55. source = item.strip()
  56. if not source:
  57. raise HotContentFlowError("hot source cannot be empty")
  58. return HotSourceConfig(source=source)
  59. if not isinstance(item, dict):
  60. raise HotContentFlowError(f"invalid hot source config: {item!r}")
  61. source = str(item.get("source") or item.get("source_name") or "").strip()
  62. if not source:
  63. raise HotContentFlowError(f"hot source missing source: {item!r}")
  64. return HotSourceConfig(
  65. source=source,
  66. count=int(item.get("count") or item.get("limit") or item.get("rank_limit") or 10),
  67. )
  68. def _load_sources() -> list[HotSourceConfig]:
  69. raw_sources = _load_json_from_env_or_file("HOT_FLOW_SOURCES_JSON", "HOT_FLOW_SOURCES_FILE")
  70. if raw_sources is None:
  71. raw_sources = settings.hot_flow_sources
  72. if not isinstance(raw_sources, list):
  73. raise HotContentFlowError("HOT_FLOW_SOURCES_JSON/HOT_FLOW_SOURCES_FILE must be a list")
  74. sources = [_normalize_source_config(item) for item in raw_sources]
  75. if not sources:
  76. raise HotContentFlowError("hot sources cannot be empty")
  77. return sources
  78. def _parse_cron_hours(value: str) -> str:
  79. hours = [item.strip() for item in value.split(",") if item.strip()]
  80. if not hours:
  81. raise HotContentFlowError("hot flow cron hours cannot be empty")
  82. normalized: list[str] = []
  83. for hour in hours:
  84. try:
  85. hour_num = int(hour)
  86. except ValueError as exc:
  87. raise HotContentFlowError(f"invalid hot flow cron hour: {hour!r}") from exc
  88. if not 0 <= hour_num <= 23:
  89. raise HotContentFlowError(f"hot flow cron hour out of range: {hour_num}")
  90. normalized.append(str(hour_num))
  91. return ",".join(normalized)
  92. def load_flow_config(interval_override: int | None = None) -> FlowConfig:
  93. crawapi_base_url = _get_env("CRAWAPI_BASE_URL", settings.crawapi_base_url).rstrip("/")
  94. hot_rank_path = _get_env(
  95. "CRAWAPI_HOT_CONTENT_RANK_PATH",
  96. settings.crawapi_hot_content_rank_path,
  97. )
  98. if not crawapi_base_url:
  99. raise HotContentFlowError("missing CRAWAPI_BASE_URL or settings.crawapi_base_url")
  100. if not hot_rank_path:
  101. raise HotContentFlowError(
  102. "missing CRAWAPI_HOT_CONTENT_RANK_PATH or settings.crawapi_hot_content_rank_path"
  103. )
  104. interval_seconds = (
  105. interval_override
  106. if interval_override is not None
  107. else _get_env_int("HOT_FLOW_INTERVAL_SECONDS", settings.hot_flow_interval_seconds)
  108. )
  109. return FlowConfig(
  110. crawapi_base_url=crawapi_base_url,
  111. hot_rank_path=hot_rank_path,
  112. keyword_search_path=_get_env(
  113. "CRAWAPI_KEYWORD_SEARCH_PATH",
  114. settings.crawapi_keyword_search_path,
  115. ),
  116. decode_api_url=_get_env("DECODE_API_URL", settings.decode_api_url),
  117. decode_result_api_url=_get_env(
  118. "DECODE_RESULT_API_URL",
  119. settings.decode_result_api_url,
  120. ),
  121. decode_config_id=_get_env_int("DECODE_CONFIG_ID", settings.decode_config_id),
  122. request_timeout_seconds=_get_env_int(
  123. "REQUEST_TIMEOUT_SECONDS",
  124. settings.request_timeout_seconds,
  125. ),
  126. https_verify_ssl=_get_env_bool("HTTPS_VERIFY_SSL", settings.https_verify_ssl),
  127. hot_flow_cron_hours=_parse_cron_hours(
  128. _get_env("HOT_FLOW_CRON_HOURS", settings.hot_flow_cron_hours)
  129. ),
  130. hot_flow_cron_minute=_get_env_int(
  131. "HOT_FLOW_CRON_MINUTE",
  132. settings.hot_flow_cron_minute,
  133. ),
  134. schedule_interval_seconds=interval_seconds,
  135. decode_result_interval_seconds=_get_env_int(
  136. "DECODE_RESULT_FLOW_INTERVAL_SECONDS",
  137. settings.decode_result_flow_interval_seconds,
  138. ),
  139. decode_result_batch_size=_get_env_int(
  140. "DECODE_RESULT_BATCH_SIZE",
  141. settings.decode_result_batch_size,
  142. ),
  143. contribution_score_threshold=float(
  144. _get_env(
  145. "CONTRIBUTION_SCORE_THRESHOLD",
  146. str(settings.contribution_score_threshold),
  147. )
  148. ),
  149. demand_pool_source_table=_get_env(
  150. "DEMAND_POOL_SOURCE_TABLE",
  151. settings.demand_pool_source_table,
  152. ),
  153. demand_pool_excluded_strategy=_get_env(
  154. "DEMAND_POOL_EXCLUDED_STRATEGY",
  155. settings.demand_pool_excluded_strategy,
  156. ),
  157. demand_pool_top_n=_get_env_int(
  158. "DEMAND_POOL_TOP_N",
  159. settings.demand_pool_top_n,
  160. ),
  161. hot_demand_pool_strategy=_get_env(
  162. "HOT_DEMAND_POOL_STRATEGY",
  163. settings.hot_demand_pool_strategy,
  164. ),
  165. wxindex_score_threshold=_get_env_float(
  166. "WXINDEX_SCORE_THRESHOLD",
  167. _get_env_float(
  168. "HOT_DEMAND_POOL_WXINDEX_THRESHOLD",
  169. _get_env_float(
  170. "WXINDEX_LATEST_SCORE_THRESHOLD",
  171. settings.wxindex_score_threshold,
  172. ),
  173. ),
  174. ),
  175. odps_daily_write_limit=_get_env_int(
  176. "ODPS_DAILY_WRITE_LIMIT",
  177. settings.odps_daily_write_limit,
  178. ),
  179. postprocess_batch_size=_get_env_int(
  180. "POSTPROCESS_BATCH_SIZE",
  181. settings.postprocess_batch_size,
  182. ),
  183. contribution_match_llm_model=_get_env(
  184. "CONTRIBUTION_MATCH_LLM_MODEL",
  185. settings.contribution_match_llm_model,
  186. ),
  187. contribution_match_llm_max_attempts=_get_env_int(
  188. "CONTRIBUTION_MATCH_LLM_MAX_ATTEMPTS",
  189. settings.contribution_match_llm_max_attempts,
  190. ),
  191. contribution_match_llm_retry_sleep_seconds=_get_env_float(
  192. "CONTRIBUTION_MATCH_LLM_RETRY_SLEEP_SECONDS",
  193. settings.contribution_match_llm_retry_sleep_seconds,
  194. ),
  195. contribution_match_llm_max_tokens=_get_env_int(
  196. "CONTRIBUTION_MATCH_LLM_MAX_TOKENS",
  197. settings.contribution_match_llm_max_tokens,
  198. ),
  199. wxindex_llm_model=_get_env("WXINDEX_LLM_MODEL", settings.wxindex_llm_model),
  200. wxindex_llm_max_attempts=_get_env_int(
  201. "WXINDEX_LLM_MAX_ATTEMPTS",
  202. settings.wxindex_llm_max_attempts,
  203. ),
  204. wxindex_llm_max_tokens=_get_env_int(
  205. "WXINDEX_LLM_MAX_TOKENS",
  206. settings.wxindex_llm_max_tokens,
  207. ),
  208. wxindex_api_url=_get_env("WXINDEX_API_URL", settings.wxindex_api_url),
  209. wxindex_lookback_days=_get_env_int(
  210. "WXINDEX_LOOKBACK_DAYS",
  211. settings.wxindex_lookback_days,
  212. ),
  213. demand_event_sense_threshold=_get_env_float(
  214. "DEMAND_EVENT_SENSE_THRESHOLD",
  215. settings.demand_event_sense_threshold,
  216. ),
  217. demand_senior_fit_threshold=_get_env_float(
  218. "DEMAND_SENIOR_FIT_THRESHOLD",
  219. settings.demand_senior_fit_threshold,
  220. ),
  221. demand_quality_llm_model=_get_env(
  222. "DEMAND_QUALITY_LLM_MODEL",
  223. settings.demand_quality_llm_model,
  224. ),
  225. demand_quality_llm_max_attempts=_get_env_int(
  226. "DEMAND_QUALITY_LLM_MAX_ATTEMPTS",
  227. settings.demand_quality_llm_max_attempts,
  228. ),
  229. demand_quality_llm_retry_sleep_seconds=_get_env_float(
  230. "DEMAND_QUALITY_LLM_RETRY_SLEEP_SECONDS",
  231. settings.demand_quality_llm_retry_sleep_seconds,
  232. ),
  233. demand_quality_llm_max_tokens=_get_env_int(
  234. "DEMAND_QUALITY_LLM_MAX_TOKENS",
  235. settings.demand_quality_llm_max_tokens,
  236. ),
  237. sources=_load_sources(),
  238. mysql=MysqlConfig(
  239. host=_get_env("MYSQL_HOST", settings.mysql_host),
  240. port=_get_env_int("MYSQL_PORT", settings.mysql_port),
  241. user=_get_env("MYSQL_USER", settings.mysql_user),
  242. password=_get_env("MYSQL_PASSWORD", settings.mysql_password),
  243. database=_get_env("MYSQL_DATABASE", settings.mysql_database),
  244. charset=_get_env("MYSQL_CHARSET", settings.mysql_charset),
  245. ),
  246. )