config.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
  1. """应用配置。
  2. 配置优先级:环境变量 > 这里的默认值。
  3. """
  4. from __future__ import annotations
  5. import json
  6. import os
  7. from dataclasses import dataclass, field
  8. from pathlib import Path
  9. from typing import Any
  # Third ancestor of this file's resolved path (parents[2]). _load_json_file
  # joins relative JSON file paths onto this directory.
  PROJECT_ROOT = Path(__file__).resolve().parents[2]
  11. def _env(name: str, default: str) -> str:
  12. value = os.getenv(name)
  13. if value is None or value == "":
  14. return default
  15. return value
  16. def _env_int(name: str, default: int) -> int:
  17. value = os.getenv(name)
  18. if value is None or value == "":
  19. return default
  20. return int(value)
  21. def _env_int_optional(name: str, default: int | None = None) -> int | None:
  22. value = os.getenv(name)
  23. if value is None or value == "":
  24. return default
  25. return int(value)
  26. def _env_float(name: str, default: float) -> float:
  27. value = os.getenv(name)
  28. if value is None or value == "":
  29. return default
  30. return float(value)
  31. def _env_float_optional(name: str, default: float | None = None) -> float | None:
  32. value = os.getenv(name)
  33. if value is None or value == "":
  34. return default
  35. return float(value)
  36. def _env_first(names: tuple[str, ...], default: str) -> str:
  37. for name in names:
  38. value = os.getenv(name)
  39. if value is not None and value != "":
  40. return value
  41. return default
  42. def _env_bool(name: str, default: bool) -> bool:
  43. value = os.getenv(name)
  44. if value is None or value == "":
  45. return default
  46. return value.strip().lower() in {"1", "true", "yes", "y", "on"}
  47. def _load_json_file(path_value: str) -> Any:
  48. path = Path(path_value).expanduser()
  49. if not path.is_absolute():
  50. path = PROJECT_ROOT / path
  51. return json.loads(path.read_text(encoding="utf-8"))
  52. def _env_json(name: str, default: Any, file_env_name: str | None = None) -> Any:
  53. if file_env_name:
  54. file_path = os.getenv(file_env_name)
  55. if file_path:
  56. return _load_json_file(file_path)
  57. value = os.getenv(name)
  58. if value is None or value == "":
  59. return default
  60. return json.loads(value)
  @dataclass(frozen=True)
  class Settings:
      """Immutable application settings.

      Every field has a built-in default. ``from_env`` builds an instance in
      which any field whose environment variable is set and non-empty takes
      the environment value instead of the default.
      """

      # --- crawler / decode HTTP endpoints ---
      crawapi_base_url: str = "http://crawapi.piaoquantv.com"
      crawapi_hot_content_rank_path: str = "/crawler/jin_ri_re_bang/content_rank"
      crawapi_keyword_search_path: str = "/crawler/bai_du/keyword"
      decode_api_url: str = "https://aigc-api.aiddit.com/aigc/api/task/decode"
      decode_result_api_url: str = "https://aigc-api.aiddit.com/aigc/api/task/decode/result"
      decode_config_id: int = 70
      request_timeout_seconds: int = 180
      # NOTE(review): defaults to False; presumably this turns off TLS
      # certificate verification for outbound HTTPS calls - confirm with the
      # callers and consider defaulting to True.
      https_verify_ssl: bool = False

      # --- flow scheduling ---
      hot_flow_cron_hours: str = "6,12,18"
      hot_flow_cron_minute: int = 0
      hot_flow_interval_seconds: int = 1800
      decode_result_flow_interval_seconds: int = 1800
      decode_result_batch_size: int = 50
      contribution_score_threshold: float = 0.6
      # A default_factory is required because the default is a mutable list;
      # each instance built from defaults gets its own copy.
      hot_flow_sources: list[dict[str, Any]] = field(
          default_factory=lambda: [
              {"source": "百度"},
              {"source": "微博"},
              {"source": "微信"},
              {
                  "source": "快手",
                  "hot_rank_base_url": "http://crawler.aiddit.com",
                  "hot_rank_path": "/crawler/kuai_shou/hot_rank",
                  "hot_rank_payload": {},
              },
              {
                  "source": "抖音",
                  "hot_rank_base_url": "http://8.217.190.241:8888",
                  "hot_rank_path": "/crawler/dou_yin/hot_rank",
                  "hot_rank_payload": {"tab_name": "热点榜"},
              },
          ]
      )

      # --- MySQL ---
      mysql_host: str = "rm-t4nh1xx6o2a6vj8qu3o.mysql.singapore.rds.aliyuncs.com"
      mysql_port: int = 3306
      mysql_user: str = "content_rw"
      # SECURITY(review): credential hard-coded in source. Rotate it and supply
      # it through MYSQL_PASSWORD; the literal is kept only so existing
      # deployments that rely on the default keep working.
      mysql_password: str = "bC1aH4bA1lB0"
      mysql_database: str = "external_demand"
      mysql_charset: str = "utf8mb4"

      # --- OpenRouter ---
      # SECURITY(review): API key hard-coded in source. Rotate it and supply it
      # through OPEN_ROUTER_API_KEY / OPENROUTER_API_KEY.
      open_router_api_key: str = "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"
      open_router_default_model: str = "anthropic/claude-haiku-4-5"
      open_router_timeout_seconds: int = 60
      open_router_http_referer: str = ""
      open_router_app_title: str = "external_demand"
      open_router_base_url: str = "https://openrouter.ai/api/v1"
      open_router_temperature: float | None = 0.7
      open_router_max_tokens: int | None = 20000

      # --- ODPS ---
      # SECURITY(review): access id/key hard-coded in source. Rotate them and
      # supply them through ODPS_ACCESS_ID / ODPS_ACCESS_KEY.
      odps_access_id: str = "LTAI9EBa0bd5PrDa"
      odps_access_key: str = "vAalxds7YxhfOA2yVv8GziCg3Y87v5"
      odps_project: str = "loghubods"
      odps_endpoint: str = "http://service.odps.aliyun.com/api"
      odps_tunnel_endpoint: str = ""

      # --- demand pool ---
      demand_pool_source_table: str = "dwd_multi_demand_pool_di"
      demand_pool_excluded_strategy: str = "当下供需gap-分词"
      demand_pool_top_n: int = 200
      hot_demand_pool_strategy: str = "新热事件"
      wxindex_score_threshold: float = 100_000.0
      odps_daily_write_limit: int = 500
      postprocess_batch_size: int = 20

      # --- contribution-match LLM ---
      contribution_match_llm_model: str = ""
      contribution_match_llm_max_attempts: int = 3
      contribution_match_llm_retry_sleep_seconds: float = 1.0
      contribution_match_llm_max_tokens: int = 4000

      # --- wxindex ---
      wxindex_llm_model: str = "anthropic/claude-haiku-4-5"
      wxindex_llm_max_attempts: int = 3
      wxindex_llm_max_tokens: int = 4000
      wxindex_api_url: str = "http://crawapi.piaoquantv.com/crawler/wei_xin/wxindex"
      wxindex_lookback_days: int = 7
      wxindex_words_cron_hours: str = "10,14"
      wxindex_words_cron_minute: int = 0
      wxindex_heat_pattern_cron_hours: str = "11,15"
      wxindex_heat_pattern_cron_minute: int = 0

      # --- demand quality ---
      demand_event_sense_threshold: float = 6.0
      demand_senior_fit_threshold: float = 6.0
      demand_quality_llm_model: str = "anthropic/claude-haiku-4-5"
      demand_quality_llm_max_attempts: int = 3
      demand_quality_llm_retry_sleep_seconds: float = 1.0
      demand_quality_llm_max_tokens: int = 4000

      # --- category filter ---
      category_filter_llm_model: str = "anthropic/claude-haiku-4-5"
      category_filter_llm_max_attempts: int = 3
      category_filter_llm_retry_sleep_seconds: float = 1.0
      category_filter_llm_max_tokens: int = 1024
      category_filter_body_max_chars: int = 2000
      category_filter_item_sleep_seconds: float = 0.0

      @classmethod
      def from_env(cls) -> "Settings":
          """Build settings from environment variables, falling back to defaults.

          Each field is read from the environment variable named in the
          corresponding call below; an unset or empty variable leaves the
          field at its class default.

          Returns:
              A new ``Settings`` instance.

          Raises:
              ValueError: If a numeric variable holds text that cannot be
                  parsed, or a JSON variable/file holds invalid JSON.
              OSError: If HOT_FLOW_SOURCES_FILE points at an unreadable file.
          """
          defaults = cls()
          return cls(
              crawapi_base_url=_env("CRAWAPI_BASE_URL", defaults.crawapi_base_url),
              crawapi_hot_content_rank_path=_env(
                  "CRAWAPI_HOT_CONTENT_RANK_PATH",
                  defaults.crawapi_hot_content_rank_path,
              ),
              crawapi_keyword_search_path=_env(
                  "CRAWAPI_KEYWORD_SEARCH_PATH",
                  defaults.crawapi_keyword_search_path,
              ),
              decode_api_url=_env("DECODE_API_URL", defaults.decode_api_url),
              decode_result_api_url=_env(
                  "DECODE_RESULT_API_URL",
                  defaults.decode_result_api_url,
              ),
              decode_config_id=_env_int("DECODE_CONFIG_ID", defaults.decode_config_id),
              request_timeout_seconds=_env_int(
                  "REQUEST_TIMEOUT_SECONDS",
                  defaults.request_timeout_seconds,
              ),
              https_verify_ssl=_env_bool("HTTPS_VERIFY_SSL", defaults.https_verify_ssl),
              hot_flow_cron_hours=_env(
                  "HOT_FLOW_CRON_HOURS",
                  defaults.hot_flow_cron_hours,
              ),
              hot_flow_cron_minute=_env_int(
                  "HOT_FLOW_CRON_MINUTE",
                  defaults.hot_flow_cron_minute,
              ),
              hot_flow_interval_seconds=_env_int(
                  "HOT_FLOW_INTERVAL_SECONDS",
                  defaults.hot_flow_interval_seconds,
              ),
              decode_result_flow_interval_seconds=_env_int(
                  "DECODE_RESULT_FLOW_INTERVAL_SECONDS",
                  defaults.decode_result_flow_interval_seconds,
              ),
              decode_result_batch_size=_env_int(
                  "DECODE_RESULT_BATCH_SIZE",
                  defaults.decode_result_batch_size,
              ),
              # Uses _env_float like every other float field (previously a
              # hand-rolled float(_env(...)) round-trip through str).
              contribution_score_threshold=_env_float(
                  "CONTRIBUTION_SCORE_THRESHOLD",
                  defaults.contribution_score_threshold,
              ),
              # HOT_FLOW_SOURCES_FILE (a JSON file path) wins over the inline
              # HOT_FLOW_SOURCES_JSON value.
              hot_flow_sources=_env_json(
                  "HOT_FLOW_SOURCES_JSON",
                  defaults.hot_flow_sources,
                  "HOT_FLOW_SOURCES_FILE",
              ),
              mysql_host=_env("MYSQL_HOST", defaults.mysql_host),
              mysql_port=_env_int("MYSQL_PORT", defaults.mysql_port),
              mysql_user=_env("MYSQL_USER", defaults.mysql_user),
              mysql_password=_env("MYSQL_PASSWORD", defaults.mysql_password),
              mysql_database=_env("MYSQL_DATABASE", defaults.mysql_database),
              mysql_charset=_env("MYSQL_CHARSET", defaults.mysql_charset),
              open_router_api_key=_env_first(
                  ("OPEN_ROUTER_API_KEY", "OPENROUTER_API_KEY"),
                  defaults.open_router_api_key,
              ),
              open_router_default_model=_env(
                  "OPEN_ROUTER_DEFAULT_MODEL",
                  defaults.open_router_default_model,
              ),
              open_router_timeout_seconds=_env_int(
                  "OPEN_ROUTER_TIMEOUT_SECONDS",
                  defaults.open_router_timeout_seconds,
              ),
              open_router_http_referer=_env_first(
                  ("OPEN_ROUTER_HTTP_REFERER", "OPENROUTER_HTTP_REFERER"),
                  defaults.open_router_http_referer,
              ),
              open_router_app_title=_env_first(
                  ("OPEN_ROUTER_APP_TITLE", "OPENROUTER_X_OPEN_ROUTER_TITLE"),
                  defaults.open_router_app_title,
              ),
              open_router_base_url=_env(
                  "OPEN_ROUTER_BASE_URL",
                  defaults.open_router_base_url,
              ),
              open_router_temperature=_env_float_optional(
                  "OPEN_ROUTER_TEMPERATURE",
                  defaults.open_router_temperature,
              ),
              open_router_max_tokens=_env_int_optional(
                  "OPEN_ROUTER_MAX_TOKENS",
                  defaults.open_router_max_tokens,
              ),
              odps_access_id=_env("ODPS_ACCESS_ID", defaults.odps_access_id),
              odps_access_key=_env("ODPS_ACCESS_KEY", defaults.odps_access_key),
              odps_project=_env("ODPS_PROJECT", defaults.odps_project),
              odps_endpoint=_env("ODPS_ENDPOINT", defaults.odps_endpoint),
              odps_tunnel_endpoint=_env(
                  "ODPS_TUNNEL_ENDPOINT",
                  defaults.odps_tunnel_endpoint,
              ),
              demand_pool_source_table=_env(
                  "DEMAND_POOL_SOURCE_TABLE",
                  defaults.demand_pool_source_table,
              ),
              demand_pool_excluded_strategy=_env(
                  "DEMAND_POOL_EXCLUDED_STRATEGY",
                  defaults.demand_pool_excluded_strategy,
              ),
              demand_pool_top_n=_env_int(
                  "DEMAND_POOL_TOP_N",
                  defaults.demand_pool_top_n,
              ),
              hot_demand_pool_strategy=_env(
                  "HOT_DEMAND_POOL_STRATEGY",
                  defaults.hot_demand_pool_strategy,
              ),
              # Three variable names are accepted, highest priority first.
              # Only the first non-empty one is parsed, so a malformed
              # lower-priority variable no longer aborts startup when a
              # higher-priority one is set (the previous nested _env_float
              # calls parsed all three eagerly).
              wxindex_score_threshold=float(
                  _env_first(
                      (
                          "WXINDEX_SCORE_THRESHOLD",
                          "HOT_DEMAND_POOL_WXINDEX_THRESHOLD",
                          "WXINDEX_LATEST_SCORE_THRESHOLD",
                      ),
                      str(defaults.wxindex_score_threshold),
                  )
              ),
              odps_daily_write_limit=_env_int(
                  "ODPS_DAILY_WRITE_LIMIT",
                  defaults.odps_daily_write_limit,
              ),
              postprocess_batch_size=_env_int(
                  "POSTPROCESS_BATCH_SIZE",
                  defaults.postprocess_batch_size,
              ),
              contribution_match_llm_model=_env(
                  "CONTRIBUTION_MATCH_LLM_MODEL",
                  defaults.contribution_match_llm_model,
              ),
              contribution_match_llm_max_attempts=_env_int(
                  "CONTRIBUTION_MATCH_LLM_MAX_ATTEMPTS",
                  defaults.contribution_match_llm_max_attempts,
              ),
              contribution_match_llm_retry_sleep_seconds=_env_float(
                  "CONTRIBUTION_MATCH_LLM_RETRY_SLEEP_SECONDS",
                  defaults.contribution_match_llm_retry_sleep_seconds,
              ),
              contribution_match_llm_max_tokens=_env_int(
                  "CONTRIBUTION_MATCH_LLM_MAX_TOKENS",
                  defaults.contribution_match_llm_max_tokens,
              ),
              wxindex_llm_model=_env(
                  "WXINDEX_LLM_MODEL",
                  defaults.wxindex_llm_model,
              ),
              wxindex_llm_max_attempts=_env_int(
                  "WXINDEX_LLM_MAX_ATTEMPTS",
                  defaults.wxindex_llm_max_attempts,
              ),
              wxindex_llm_max_tokens=_env_int(
                  "WXINDEX_LLM_MAX_TOKENS",
                  defaults.wxindex_llm_max_tokens,
              ),
              wxindex_api_url=_env("WXINDEX_API_URL", defaults.wxindex_api_url),
              wxindex_lookback_days=_env_int(
                  "WXINDEX_LOOKBACK_DAYS",
                  defaults.wxindex_lookback_days,
              ),
              wxindex_words_cron_hours=_env(
                  "WXINDEX_WORDS_CRON_HOURS",
                  defaults.wxindex_words_cron_hours,
              ),
              wxindex_words_cron_minute=_env_int(
                  "WXINDEX_WORDS_CRON_MINUTE",
                  defaults.wxindex_words_cron_minute,
              ),
              wxindex_heat_pattern_cron_hours=_env(
                  "WXINDEX_HEAT_PATTERN_CRON_HOURS",
                  defaults.wxindex_heat_pattern_cron_hours,
              ),
              wxindex_heat_pattern_cron_minute=_env_int(
                  "WXINDEX_HEAT_PATTERN_CRON_MINUTE",
                  defaults.wxindex_heat_pattern_cron_minute,
              ),
              demand_event_sense_threshold=_env_float(
                  "DEMAND_EVENT_SENSE_THRESHOLD",
                  defaults.demand_event_sense_threshold,
              ),
              demand_senior_fit_threshold=_env_float(
                  "DEMAND_SENIOR_FIT_THRESHOLD",
                  defaults.demand_senior_fit_threshold,
              ),
              demand_quality_llm_model=_env(
                  "DEMAND_QUALITY_LLM_MODEL",
                  defaults.demand_quality_llm_model,
              ),
              demand_quality_llm_max_attempts=_env_int(
                  "DEMAND_QUALITY_LLM_MAX_ATTEMPTS",
                  defaults.demand_quality_llm_max_attempts,
              ),
              demand_quality_llm_retry_sleep_seconds=_env_float(
                  "DEMAND_QUALITY_LLM_RETRY_SLEEP_SECONDS",
                  defaults.demand_quality_llm_retry_sleep_seconds,
              ),
              demand_quality_llm_max_tokens=_env_int(
                  "DEMAND_QUALITY_LLM_MAX_TOKENS",
                  defaults.demand_quality_llm_max_tokens,
              ),
              category_filter_llm_model=_env(
                  "CATEGORY_FILTER_LLM_MODEL",
                  defaults.category_filter_llm_model,
              ),
              category_filter_llm_max_attempts=_env_int(
                  "CATEGORY_FILTER_LLM_MAX_ATTEMPTS",
                  defaults.category_filter_llm_max_attempts,
              ),
              category_filter_llm_retry_sleep_seconds=_env_float(
                  "CATEGORY_FILTER_LLM_RETRY_SLEEP_SECONDS",
                  defaults.category_filter_llm_retry_sleep_seconds,
              ),
              category_filter_llm_max_tokens=_env_int(
                  "CATEGORY_FILTER_LLM_MAX_TOKENS",
                  defaults.category_filter_llm_max_tokens,
              ),
              category_filter_body_max_chars=_env_int(
                  "CATEGORY_FILTER_BODY_MAX_CHARS",
                  defaults.category_filter_body_max_chars,
              ),
              category_filter_item_sleep_seconds=_env_float(
                  "CATEGORY_FILTER_ITEM_SLEEP_SECONDS",
                  defaults.category_filter_item_sleep_seconds,
              ),
          )
  # Module-level Settings instance, built from the process environment when
  # this module is imported.
  settings = Settings.from_env()