__init__.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505
  1. """
  2. @author: luojunhui
  3. 任务常量配置文件
  4. """
  5. class ColdStartTaskConst:
  6. """
  7. 冷启动任务常量配置
  8. """
  9. PUBLISHED_STATUS = 2 # 文章已发布状态
  10. INIT_STATUS = 1 # 文章初始状态
  11. BAD_STATUS = 0 # 低质量文章状态
  12. # 常量
  13. ACCOUNT_GOOD_STATUS = 1
  14. # 账号是否每日抓取
  15. ACCOUNT_DAILY_SCRAPE = 1
  16. ACCOUNT_NOT_DAILY_SCRAPE = 0
  17. # 默认值
  18. DEFAULT_VIEW_COUNT = 0
  19. DEFAULT_LIKE_COUNT = 0
  20. DEFAULT_ARTICLE_STATUS = 1
  21. DEFAULT_TIMESTAMP = 1717171200
  22. # 标题sensitivity
  23. TITLE_SENSITIVE = 1
  24. TITLE_NOT_SENSITIVE = 0
  25. # 文章联想深度
  26. ARTICLE_ASSOCIATION_MAX_DEPTH = 4
  27. # 相关分百分位阈值
  28. PERCENT_THRESHOLD = 95
  29. # 相关性分阈值
  30. CORRELATION_THRESHOLD = 0.5
  31. # 阅读量阈值
  32. READ_COUNT_THRESHOLD = 1000
  33. # 阅读均值倍数阈值
  34. READ_AVG_THRESHOLD = 1.3
  35. # 群发类型
  36. BULK_PUBLISH_TYPE = 9
  37. # 种子文章数量
  38. SEED_ARTICLE_LIMIT_NUM = 60
  39. class updatePublishedMsgTaskConst:
  40. """
  41. 更新已发布文章消息常量配置
  42. """
  43. # 爬虫详情接口返回code
  44. ARTICLE_ILLEGAL_CODE = 25012
  45. ARTICLE_DELETE_CODE = 25005
  46. ARTICLE_SUCCESS_CODE = 0
  47. ARTICLE_UNKNOWN_CODE = 10000
  48. # 请求爬虫详情接口状态码
  49. # 记录默认状态
  50. DEFAULT_STATUS = 0
  51. # 请求接口失败状态
  52. REQUEST_FAIL_STATUS = -1
  53. # 文章被删除状态
  54. DELETE_STATUS = -2
  55. # 未知原因无信息返回状态
  56. UNKNOWN_STATUS = -3
  57. # 文章违规状态
  58. ILLEGAL_STATUS = -4
  59. # 公众号类型(订阅号 or 服务号)
  60. # 订阅号
  61. SUBSCRIBE_TYPE_SET = {0, 1}
  62. # 服务号
  63. SERVICE_TYPE = 2
  64. # 监测周期(秒)
  65. MONITOR_PERIOD = 60 * 60 * 24 * 3
  66. # 新号抓文章周期
  67. NEW_ACCOUNT_CRAWL_PERIOD = 60 * 60 * 24 * 30
  68. # 订阅号,抓取失败失败率报警阈值
  69. SUBSCRIBE_FAIL_RATE_THRESHOLD = 0.3
  70. class UpdateAccountReadRateTaskConst:
  71. """
  72. 更新账号阅读率常量配置
  73. """
  74. # 阅读率统计周期(秒)
  75. STATISTICS_PERIOD = 31 * 24 * 60 * 60
  76. # 一天的秒数
  77. ONE_DAY_IN_SECONDS = 60 * 60 * 24
  78. # 相对变化率阈值
  79. RELATIVE_VALUE_THRESHOLD = 0.1
  80. # 发文类型
  81. UNLIMITED_PUBLISH_TYPE = 10002
  82. BULK_PUBLISH_TYPE = 9
  83. # 文章位置
  84. ARTICLE_INDEX_LIST = [1, 2, 3, 4, 5, 6, 7, 8]
  85. # 默认粉丝
  86. DEFAULT_FANS = 0
  87. # 最低粉丝量
  88. MIN_FANS = 1000
  89. GROUP_ACCOUNT_SET = {'gh_9cf3b7ff486b', 'gh_ecb21c0453af', 'gh_45beb952dc74', 'gh_84e744b16b3a', 'gh_b3ffc1ca3a04', 'gh_b8baac4296cb', 'gh_efaf7da157f5', 'gh_5855bed97938', 'gh_b32125c73861', 'gh_761976bb98a6', 'gh_5e543853d8f0', 'gh_61a72b720de3'}
  90. class UpdateAccountReadAvgTaskConst:
  91. """
  92. 更新账号阅读均值常量配置
  93. """
  94. # 投流账号
  95. TOULIU_ACCOUNTS = {
  96. "gh_93e00e187787",
  97. "gh_ac43e43b253b",
  98. "gh_68e7fdc09fe4",
  99. "gh_77f36c109fb1",
  100. "gh_b181786a6c8c",
  101. "gh_1ee2e1b39ccf",
  102. "gh_d3f039c9db2b",
  103. }
  104. # 发文模式
  105. ARTICLES_DAILY = 1
  106. TOULIU = 2
  107. # 默认粉丝
  108. DEFAULT_FANS = 0
  109. # index list
  110. ARTICLE_INDEX_LIST = [1, 2, 3, 4, 5, 6, 7, 8]
  111. # 默认点赞
  112. DEFAULT_LIKE = 0
  113. # 状态
  114. USING_STATUS = 1
  115. NOT_USING_STATUS = 0
  116. # 统计周期
  117. STAT_PERIOD = 30
  118. # default upper_quantile, confidence = 0.95
  119. DEFAULT_UPPER_QUANTILE = 0.975
  120. ACCOUNT_READ_RATE_TABLE = "long_articles_read_rate"
  121. ACCOUNT_READ_AVG_TABLE = "account_avg_info_v3"
  122. class WeixinVideoCrawlerConst:
  123. """
  124. 微信视频抓取常量配置
  125. """
  126. # 账号抓取状态
  127. ACCOUNT_CRAWL_STATUS = 1
  128. ACCOUNT_DO_NOT_CRAWL_STATUS = 0
  129. # 默认最早抓取时间戳(2024-01-01)
  130. DEFAULT_TIMESTAMP = 1704038400
  131. # 搜索爬虫最大页数
  132. MAX_SEARCH_PAGE_NUM = 10
  133. # 抓取每一页的等待时间
  134. SLEEP_SECONDS = 5
  135. # 种子标题最低阅读均值倍数
  136. READ_AVG_MULTIPLE = 1.3
  137. # 种子标题最低阅读量
  138. MIN_READ_COUNT = 2000
  139. # 获取种子标题的统计周期
  140. STAT_PERIOD = 7 * 24 * 60 * 60
  141. # 接口请求成功code
  142. REQUEST_SUCCESS = 0
  143. PUBLISHED_ILLEGAL_TITLE_CODE = 1015
  144. # 是否需要扫描查询源账号
  145. NEED_SCAN_SOURCE_ACCOUNT = 1
  146. DO_NOT_NEED_SOURCE_ACCOUNT = 0
  147. # 视频审核状态长文库
  148. VIDEO_AUDIT_INIT_STATUS = 0
  149. VIDEO_AUDIT_SUCCESS_STATUS = 1
  150. VIDEO_AUDIT_FAIL_STATUS = 2
  151. VIDEO_TITLE_GENERATE_FAIL_STATUS = 4
  152. VIDEO_AUDIT_PROCESSING_STATUS = -1
  153. # 票圈视频审核状态, 1 审核中,2 不通过 3 待修改,4 自己可见 5 通过
  154. PQ_AUDIT_PROCESSING_STATUS = 1
  155. PQ_AUDIT_FAIL_STATUS = 2
  156. PQ_AUDIT_WAIT_STATUS = 3
  157. PQ_AUDIT_SELF_VISIBLE_STATUS = 4
  158. PQ_AUDIT_SUCCESS_STATUS = 5
  159. # 默认账号
  160. DEFAULT_ACCOUNT_UID = 76862180
  161. # 每天发送的审核视频数量
  162. MAX_VIDEO_NUM = 1000
  163. # 单次发布视频审核量
  164. MAX_VIDEO_NUM_PER_PUBLISH = 350
  165. # 标题状态
  166. TITLE_DEFAULT_STATUS = 0
  167. TITLE_EXIT_STATUS = 1
  168. TITLE_FESTIVAL_STATUS = 2
  169. TITLE_SHORT_STATUS = 3
  170. # 标题最短长度
  171. TITLE_MIN_LENGTH = 15
  172. # safe score
  173. TITLE_SAFE_SCORE_THRESHOLD = 7
  174. # Task Status
  175. INIT_STATUS = 0
  176. PROCESSING_STATUS = 1
  177. SUCCESS_STATUS = 2
  178. FAIL_STATUS = 99
  179. class UpdateMiniProgramDetailConst(updatePublishedMsgTaskConst):
  180. """
  181. 更新小程序详情常量配置
  182. """
  183. # 账号联想
  184. class AccountAssociationTaskConst:
  185. """
  186. 账号联想任务常量配置
  187. """
  188. # 获取种子标题的统计周期
  189. STAT_PERIOD = 7 * 24 * 60 * 60
  190. # 阅读均值阈值
  191. READ_AVG_MULTIPLE = 1.3
  192. # 最小阅读量
  193. MIN_READ_COUNT = 2000
  194. # 种子数量限制
  195. SEED_TITLE_LIMIT = 100
  196. # 从aigc获取文章
  197. class ArticleCollectorConst:
  198. """
  199. 文章采集任务常量配置
  200. """
  201. # 发送方式
  202. # 手动推送
  203. MANUAL_PUSH = 1
  204. # 自动群发
  205. BULK_AUTO_PUSH = 2
  206. # 无限流推送
  207. UNLIMITED_PUSH = 3
  208. # 文章状态
  209. # 初始状态
  210. INIT_STATUS = 0
  211. # 成功状态
  212. SUCCESS_STATUS = 1
  213. # 失败状态
  214. FAIL_STATUS = -1
  215. # 发布状态
  216. PUBLISHED_STATUS = 2
  217. # 爬虫接口
  218. ARTICLE_ILLEGAL_CODE = 25012
  219. ARTICLE_DELETE_CODE = 25005
  220. ARTICLE_SUCCESS_CODE = 0
  221. ARTICLE_UNKNOWN_CODE = 10000
  222. class BaiduVideoCrawlerConst:
  223. """
  224. const for baidu video crawler
  225. """
  226. # account status
  227. BAIDU_ACCOUNT_GOOD_STATUS = 1
  228. BAIDU_ACCOUNT_BAD_STATUS = 0
  229. # earliest cursor, 2024-01-01 00:00:00
  230. DEFAULT_CURSOR = 17040384000000
  231. # no source account
  232. NO_SOURCE_ACCOUNT_STATUS = 0
  233. # timestamp To Cursor
  234. TIMESTAMP_TO_CURSOR = 10000
  235. # local path dir
  236. LOCAL_PATH_DIR = "static"
  237. class TitleRewriteTaskConst:
  238. """
  239. title rewrite task const
  240. """
  241. # title rewrite status
  242. TITLE_REWRITE_INIT_STATUS = 0
  243. TITLE_REWRITE_SUCCESS_STATUS = 1
  244. TITLE_REWRITE_FAIL_STATUS = 99
  245. TITLE_REWRITE_LOCK_STATUS = 101
  246. # article status
  247. ARTICLE_AUDIT_PASSED_STATUS = 1
  248. ARTICLE_POSITIVE_STATUS = 0
  249. # title useful status
  250. TITLE_USEFUL_STATUS = 1
  251. # prompt version
  252. PROMPT_VERSION = "xx_250228" # 信欣2025-02-28提供
  253. # block expire time 1h
  254. TITLE_REWRITE_LOCK_TIME = 60 * 60
  255. class ChannelVideoCrawlerConst:
  256. """
  257. const for baidu video crawler
  258. """
  259. # account status
  260. CHANNEL_ACCOUNT_GOOD_STATUS = 1
  261. CHANNEL_ACCOUNT_BAD_STATUS = 0
  262. # earliest cursor, 2024-01-01 00:00:00
  263. DEFAULT_CURSOR = 1704038400
  264. # no source account
  265. NO_SOURCE_ACCOUNT_STATUS = 0
  266. # local path dir
  267. LOCAL_PATH_DIR = "static"
  268. # title length min
  269. MIN_TITLE_LENGTH = 10
  270. # max video length(second)
  271. MAX_VIDEO_LENGTH = 600
  272. # sleep second
  273. SLEEP_SECOND = 2
  274. class ToutiaoVideoCrawlerConst:
  275. """
  276. const for toutiao video crawler
  277. """
  278. # platform
  279. PLATFORM = "toutiao"
  280. # account status
  281. TOUTIAO_ACCOUNT_GOOD_STATUS = 1
  282. TOUTIAO_ACCOUNT_BAD_STATUS = 0
  283. # earliest cursor, 2021-01-01 00:00:00
  284. DEFAULT_CURSOR = 1609430400
  285. # no source account
  286. NO_SOURCE_ACCOUNT_STATUS = 0
  287. # title length min
  288. MIN_TITLE_LENGTH = 10
  289. # max video length(second)
  290. MAX_VIDEO_LENGTH = 600
  291. # sleep second
  292. SLEEP_SECOND = 3
  293. class SohuVideoCrawlerConst:
  294. """
  295. const for sohu video crawler
  296. """
  297. # platform
  298. PLATFORM = "sohu"
  299. # account status
  300. GET_RECOMMEND_INIT_STATUS = 0
  301. GET_RECOMMEND_SUCCESS_STATUS = 1
  302. GET_RECOMMEND_FAIL_STATUS = 99
  303. # title length min
  304. MIN_TITLE_LENGTH = 10
  305. # max video length(second)
  306. MAX_VIDEO_LENGTH = 600
  307. # sleep second
  308. SLEEP_SECOND = 3
  309. # 获取推荐的最低相关性分
  310. GET_RECOMMEND_THRESHOLD_SCORE = 0.6
  311. # 审核状态
  312. AUDIT_SUCCESS_STATUS = 1
  313. # 视频状态
  314. VIDEO_NOT_BAD_STATUS = 0
  315. # PAGE_LIST
  316. PAGE_LIST = [i for i in range(1, 8)]
  317. class SingleVideoPoolPublishTaskConst:
  318. """
  319. const for single video pool publish task
  320. """
  321. TRANSFORM_INIT_STATUS = 0
  322. TRANSFORM_SUCCESS_STATUS = 1
  323. TRANSFORM_FAIL_STATUS = 99
  324. SUCCESS_STATUS = 2
  325. class GoogleVideoUnderstandTaskConst:
  326. # task batch size
  327. BATCH_SIZE = 100
  328. # task status
  329. INIT_STATUS = 0
  330. PROCESSING_STATUS = 1
  331. SUCCESS_STATUS = 2
  332. FAIL_STATUS = 99
  333. # sleep seconds
  334. SLEEP_SECONDS = 60
  335. # max processing time
  336. MAX_PROCESSING_TIME = 3600
  337. # task info
  338. TABLE_NAME = "long_articles_new_video_cover"
  339. TASK_NAME = "extract_video_best_frame_as_cover"
  340. DIR_NAME = "static"
  341. POOL_SIZE = 15
  342. class CategoryGenerationTaskConst:
  343. """
  344. const for category generation task
  345. """
  346. # MAX THREAD
  347. MAX_WORKERS = 10
  348. # task batch size
  349. BATCH_SIZE = 20
  350. # min batch
  351. MIN_BATCH_SIZE = 1
  352. # article status
  353. ARTICLE_GOOD_STATUS = 0
  354. # task status
  355. INIT_STATUS = 0
  356. PROCESSING_STATUS = 1
  357. SUCCESS_STATUS = 2
  358. FAIL_STATUS = 99
  359. # max processing time
  360. MAX_PROCESSING_TIME = 3600
  361. # task info
  362. VIDEO_TABLE_NAME = "publish_single_video_source"
  363. ARTICLE_TABLE_NAME = "crawler_meta_article"
  364. TASK_NAME = "generate_category_with_title"
  365. # article_status
  366. ARTICLE_INIT_STATUS = 1
  367. ARTICLE_PUBLISHED_STATUS = 2
  368. ARTICLE_BAD_STATUS = 0
  369. # limit score
  370. LIMIT_SCORE = 0.4