updatePublishedMsgDaily.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559
  1. """
  2. @author: luojunhui
  3. @description: update daily information into official articles v2
  4. """
  5. import time
  6. import json
  7. import traceback
  8. from tqdm import tqdm
  9. from datetime import datetime
  10. from applications import PQMySQL, WeixinSpider, Functions, log, bot, aiditApi
# Table that stores one row per published article; interpolated into the SQL below.
ARTICLE_TABLE = "official_articles_v2"

# Values compared against response['code'] in update_publish_timestamp.
ARTICLE_DELETE_CODE = 25005  # mapped to DELETE_STATUS
ARTICLE_SUCCESS_CODE = 0  # response carries a usable publish_timestamp

# Sentinel values stored in the publish_timestamp column instead of a real timestamp.
# get_articles() selects rows whose publish_timestamp is DEFAULT_STATUS or
# REQUEST_FAIL_STATUS, so only those two states are (re)processed.
DEFAULT_STATUS = 0  # presumably the column default for not-yet-processed rows -- TODO confirm
REQUEST_FAIL_STATUS = -1  # the detail request raised an exception
DELETE_STATUS = -2  # response code was ARTICLE_DELETE_CODE
UNKNOWN_STATUS = -3  # response code was neither success nor delete
  18. def get_accounts_v1():
  19. """
  20. 获取账号信息
  21. :return: [{}, {},...], [{}, {}, {}...]
  22. """
  23. with open("config/accountInfoV0914.json", encoding="utf-8") as f:
  24. account_list = json.loads(f.read())
  25. subscription_account = [i for i in account_list if i['type'] == '订阅号']
  26. server_account = [i for i in account_list if i['type'] == '服务号']
  27. return subscription_account, server_account
  28. def get_account_using_status():
  29. """
  30. 获取正在 using 的 ghid
  31. :return:
  32. """
  33. sql = "SELECT gh_id FROM long_articles_publishing_accounts WHERE is_using = 1;"
  34. gh_id_tuple = PQMySQL().select(sql)
  35. gh_id_list = [
  36. i[0] for i in gh_id_tuple
  37. ]
  38. return set(gh_id_list)
  39. def get_accounts():
  40. """
  41. 从 aigc 数据库中获取目前处于发布状态的账号
  42. :return:
  43. "name": line[0],
  44. "ghId": line[1],
  45. "follower_count": line[2],
  46. "account_init_time": int(line[3] / 1000),
  47. "account_type": line[4],
  48. "account_auth": line[5]
  49. """
  50. using_account_set = get_account_using_status()
  51. account_list_with_out_using_status = aiditApi.get_publish_account_from_aigc()
  52. account_list = []
  53. for item in account_list_with_out_using_status:
  54. if item['ghId'] in using_account_set:
  55. item['using_status'] = 1
  56. else:
  57. item['using_status'] = 0
  58. account_list.append(item)
  59. subscription_account = [i for i in account_list if i['account_type'] in {0, 1}]
  60. server_account = [i for i in account_list if i['account_type'] == 2]
  61. return subscription_account, server_account
  62. def insert_each_msg(db_client, account_info, account_name, msg_list):
  63. """
  64. 把消息数据更新到数据库中
  65. :param account_info:
  66. :param db_client:
  67. :param account_name:
  68. :param msg_list:
  69. :return:
  70. """
  71. gh_id = account_info['ghId']
  72. for info in msg_list:
  73. baseInfo = info.get("BaseInfo", {})
  74. appMsgId = info.get("AppMsg", {}).get("BaseInfo", {}).get("AppMsgId", None)
  75. createTime = info.get("AppMsg", {}).get("BaseInfo", {}).get("CreateTime", None)
  76. updateTime = info.get("AppMsg", {}).get("BaseInfo", {}).get("UpdateTime", None)
  77. Type = info.get("AppMsg", {}).get("BaseInfo", {}).get("Type", None)
  78. detail_article_list = info.get("AppMsg", {}).get("DetailInfo", [])
  79. if detail_article_list:
  80. for article in detail_article_list:
  81. title = article.get("Title", None)
  82. Digest = article.get("Digest", None)
  83. ItemIndex = article.get("ItemIndex", None)
  84. ContentUrl = article.get("ContentUrl", None)
  85. SourceUrl = article.get("SourceUrl", None)
  86. CoverImgUrl = article.get("CoverImgUrl", None)
  87. CoverImgUrl_1_1 = article.get("CoverImgUrl_1_1", None)
  88. CoverImgUrl_235_1 = article.get("CoverImgUrl_235_1", None)
  89. ItemShowType = article.get("ItemShowType", None)
  90. IsOriginal = article.get("IsOriginal", None)
  91. ShowDesc = article.get("ShowDesc", None)
  92. show_stat = Functions().show_desc_to_sta(ShowDesc)
  93. ori_content = article.get("ori_content", None)
  94. show_view_count = show_stat.get("show_view_count", 0)
  95. show_like_count = show_stat.get("show_like_count", 0)
  96. show_zs_count = show_stat.get("show_zs_count", 0)
  97. show_pay_count = show_stat.get("show_pay_count", 0)
  98. wx_sn = ContentUrl.split("&sn=")[1].split("&")[0] if ContentUrl else None
  99. status = account_info['using_status']
  100. info_tuple = (
  101. gh_id,
  102. account_name,
  103. appMsgId,
  104. title,
  105. Type,
  106. createTime,
  107. updateTime,
  108. Digest,
  109. ItemIndex,
  110. ContentUrl,
  111. SourceUrl,
  112. CoverImgUrl,
  113. CoverImgUrl_1_1,
  114. CoverImgUrl_235_1,
  115. ItemShowType,
  116. IsOriginal,
  117. ShowDesc,
  118. ori_content,
  119. show_view_count,
  120. show_like_count,
  121. show_zs_count,
  122. show_pay_count,
  123. wx_sn,
  124. json.dumps(baseInfo, ensure_ascii=False),
  125. Functions().str_to_md5(title),
  126. status
  127. )
  128. try:
  129. insert_sql = f"""
  130. INSERT INTO {ARTICLE_TABLE}
  131. (ghId, accountName, appMsgId, title, Type, createTime, updateTime, Digest, ItemIndex, ContentUrl, SourceUrl, CoverImgUrl, CoverImgUrl_1_1, CoverImgUrl_255_1, ItemShowType, IsOriginal, ShowDesc, ori_content, show_view_count, show_like_count, show_zs_count, show_pay_count, wx_sn, baseInfo, title_md5, status)
  132. values
  133. (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
  134. """
  135. db_client.update(sql=insert_sql, params=info_tuple)
  136. log(
  137. task="updatePublishedMsgDaily",
  138. function="insert_each_msg",
  139. message="插入文章数据成功",
  140. data={
  141. "info": info_tuple
  142. }
  143. )
  144. except Exception as e:
  145. try:
  146. update_sql = f"""
  147. UPDATE {ARTICLE_TABLE}
  148. SET show_view_count = %s, show_like_count=%s
  149. WHERE wx_sn = %s;
  150. """
  151. db_client.update(sql=update_sql,
  152. params=(show_view_count, show_like_count, wx_sn))
  153. log(
  154. task="updatePublishedMsgDaily",
  155. function="insert_each_msg",
  156. message="更新文章数据成功",
  157. data={
  158. "wxSn": wx_sn,
  159. "likeCount": show_like_count,
  160. "viewCount": show_view_count
  161. }
  162. )
  163. except Exception as e:
  164. log(
  165. task="updatePublishedMsgDaily",
  166. function="insert_each_msg",
  167. message="更新文章失败, 报错原因是: {}".format(e),
  168. status="fail"
  169. )
  170. continue
  171. def update_each_account(db_client, account_info, account_name, latest_update_time, cursor=None):
  172. """
  173. 更新每一个账号信息
  174. :param account_info:
  175. :param account_name:
  176. :param cursor:
  177. :param latest_update_time: 最新更新时间
  178. :param db_client: 数据库连接信息
  179. :return: None
  180. """
  181. gh_id = account_info['ghId']
  182. response = WeixinSpider().update_msg_list(ghId=gh_id, index=cursor)
  183. msg_list = response.get("data", {}).get("data", {})
  184. if msg_list:
  185. # do
  186. last_article_in_this_msg = msg_list[-1]
  187. last_time_stamp_in_this_msg = last_article_in_this_msg['AppMsg']['BaseInfo']['UpdateTime']
  188. last_url = last_article_in_this_msg['AppMsg']['DetailInfo'][0]['ContentUrl']
  189. resdata = WeixinSpider().get_account_by_url(last_url)
  190. check_id = resdata['data'].get('data', {}).get('wx_gh')
  191. if check_id == gh_id:
  192. insert_each_msg(
  193. db_client=db_client,
  194. account_info=account_info,
  195. account_name=account_name,
  196. msg_list=msg_list
  197. )
  198. if last_time_stamp_in_this_msg > latest_update_time:
  199. next_cursor = response['data']['next_cursor']
  200. return update_each_account(
  201. db_client=db_client,
  202. account_info=account_info,
  203. account_name=account_name,
  204. latest_update_time=latest_update_time,
  205. cursor=next_cursor
  206. )
  207. log(
  208. task="updatePublishedMsgDaily",
  209. function="update_each_account",
  210. message="账号文章更新成功",
  211. data=response
  212. )
  213. else:
  214. log(
  215. task="updatePublishedMsgDaily",
  216. function="update_each_account",
  217. message="账号文章更新失败",
  218. status="fail",
  219. data=response
  220. )
  221. return
  222. def check_account_info(db_client, gh_id, account_name):
  223. """
  224. 通过 gh_id查询视频信息
  225. :param account_name:
  226. :param db_client:
  227. :param gh_id:
  228. :return:
  229. """
  230. sql = f"""
  231. SELECT accountName, updateTime
  232. FROM {ARTICLE_TABLE}
  233. WHERE ghId = '{gh_id}'
  234. ORDER BY updateTime DESC LIMIT 1;
  235. """
  236. result = db_client.select(sql)
  237. if result:
  238. old_account_name, update_time = result[0]
  239. return {
  240. "account_name": old_account_name,
  241. "update_time": update_time,
  242. "account_type": "history"
  243. }
  244. else:
  245. return {
  246. "account_name": account_name,
  247. "update_time": int(time.time()) - 30 * 24 * 60 * 60,
  248. "account_type": "new"
  249. }
  250. def update_single_account(db_client, account_info):
  251. """
  252. :param account_info:
  253. :param db_client:
  254. :return:
  255. """
  256. gh_id = account_info['ghId']
  257. account_name = account_info['name']
  258. account_detail = check_account_info(db_client, gh_id, account_name)
  259. account_name = account_detail['account_name']
  260. update_time = account_detail['update_time']
  261. update_each_account(
  262. db_client=db_client,
  263. account_info=account_info,
  264. account_name=account_name,
  265. latest_update_time=update_time
  266. )
  267. def check_single_account(db_client, account_item):
  268. """
  269. 校验每个账号是否更新
  270. :param db_client:
  271. :param account_item:
  272. :return: True / False
  273. """
  274. gh_id = account_item['ghId']
  275. account_type = account_item['account_type']
  276. today_str = datetime.today().strftime("%Y-%m-%d")
  277. today_date_time = datetime.strptime(today_str, "%Y-%m-%d")
  278. today_timestamp = today_date_time.timestamp()
  279. sql = f"""
  280. SELECT updateTime
  281. FROM {ARTICLE_TABLE}
  282. WHERE ghId = '{gh_id}'
  283. ORDER BY updateTime
  284. DESC
  285. LIMIT 1;
  286. """
  287. try:
  288. latest_update_time = db_client.select(sql)[0][0]
  289. # 判断该账号当天发布的文章是否被收集
  290. if account_type in {0, 1}:
  291. if int(latest_update_time) > int(today_timestamp):
  292. return True
  293. else:
  294. return False
  295. else:
  296. if int(latest_update_time) > int(today_timestamp) - 7 * 24 * 3600:
  297. return True
  298. else:
  299. return False
  300. except Exception as e:
  301. print("updateTime Error -- {}".format(e))
  302. return False
  303. def update_job():
  304. """
  305. 更新任务
  306. :return:
  307. """
  308. try:
  309. db_client = PQMySQL()
  310. except Exception as e:
  311. error_msg = traceback.format_exc()
  312. bot(
  313. title="更新文章任务连接数据库失败",
  314. detail={
  315. "error": e,
  316. "msg": error_msg
  317. }
  318. )
  319. return
  320. sub_accounts, server_accounts = get_accounts()
  321. s_count = 0
  322. f_count = 0
  323. for sub_item in tqdm(sub_accounts):
  324. try:
  325. update_single_account(db_client, sub_item)
  326. s_count += 1
  327. time.sleep(5)
  328. except Exception as e:
  329. f_count += 1
  330. log(
  331. task="updatePublishedMsgDaily",
  332. function="update_job",
  333. message="单个账号文章更新失败, 报错信息是: {}".format(e),
  334. status="fail",
  335. )
  336. log(
  337. task="updatePublishedMsgDaily",
  338. function="update_job",
  339. message="订阅号更新完成",
  340. data={
  341. "success": s_count,
  342. "fail": f_count
  343. }
  344. )
  345. if f_count / (s_count + f_count) > 0.3:
  346. bot(
  347. title="订阅号超过 30% 的账号更新失败",
  348. detail={
  349. "success": s_count,
  350. "fail": f_count,
  351. "failRate": f_count / (s_count + f_count)
  352. }
  353. )
  354. bot(
  355. title="更新每日发布文章任务完成通知",
  356. detail={
  357. "msg": "订阅号更新完成",
  358. "finish_time": datetime.today().__str__()
  359. },
  360. mention=False
  361. )
  362. for sub_item in tqdm(server_accounts):
  363. try:
  364. update_single_account(db_client, sub_item)
  365. time.sleep(5)
  366. except Exception as e:
  367. print(e)
  368. bot(
  369. title="更新每日发布文章任务完成通知",
  370. detail={
  371. "msg": "服务号更新完成",
  372. "finish_time": datetime.today().__str__()
  373. },
  374. mention=False
  375. )
  376. def check_job():
  377. """
  378. 校验任务
  379. :return:
  380. """
  381. try:
  382. db_client = PQMySQL()
  383. except Exception as e:
  384. error_msg = traceback.format_exc()
  385. bot(
  386. title="校验更新文章任务连接数据库失败",
  387. detail={
  388. "job": "check_job",
  389. "error": e,
  390. "msg": error_msg
  391. }
  392. )
  393. return
  394. sub_accounts, server_accounts = get_accounts()
  395. fail_list = []
  396. # account_list = sub_accounts + server_accounts
  397. account_list = sub_accounts
  398. # check and rework if fail
  399. for sub_item in tqdm(account_list):
  400. res = check_single_account(db_client, sub_item)
  401. if not res:
  402. update_single_account(db_client, sub_item)
  403. # check whether success and bot if fails
  404. for sub_item in tqdm(account_list):
  405. res = check_single_account(db_client, sub_item)
  406. if not res:
  407. fail_list.append(sub_item)
  408. if fail_list:
  409. try:
  410. bot(
  411. title="日常报警, 存在账号更新失败",
  412. detail=fail_list
  413. )
  414. except Exception as e:
  415. print("Timeout Error: {}".format(e))
  416. else:
  417. bot(
  418. title="校验完成通知",
  419. mention=False,
  420. detail={
  421. "msg": "校验任务完成",
  422. "finish_time": datetime.today().__str__()
  423. }
  424. )
  425. def get_articles(db_client):
  426. """
  427. :return:
  428. """
  429. sql = f"""
  430. SELECT ContentUrl, wx_sn
  431. FROM official_articles_v2
  432. WHERE publish_timestamp in {(DEFAULT_STATUS, REQUEST_FAIL_STATUS)};"""
  433. response = db_client.select(sql)
  434. return response
  435. def update_publish_timestamp(db_client, row):
  436. """
  437. 更新发布时间戳 && minigram 信息
  438. :param db_client:
  439. :param row:
  440. :return:
  441. """
  442. url = row[0]
  443. wx_sn = row[1]
  444. try:
  445. response = WeixinSpider().get_article_text(url)
  446. response_code = response['code']
  447. if response_code == ARTICLE_DELETE_CODE:
  448. publish_timestamp_s = DELETE_STATUS
  449. root_source_id_list = []
  450. elif response_code == ARTICLE_SUCCESS_CODE:
  451. data = response['data']['data']
  452. publish_timestamp_ms = data['publish_timestamp']
  453. publish_timestamp_s = int(publish_timestamp_ms / 1000)
  454. mini_program = data.get('mini_program', [])
  455. if mini_program:
  456. root_source_id_list = [
  457. i['path'].split("%26rootSourceId%3D")[-1]
  458. for i in mini_program
  459. ]
  460. else:
  461. root_source_id_list = []
  462. else:
  463. publish_timestamp_s = UNKNOWN_STATUS
  464. root_source_id_list = []
  465. except Exception as e:
  466. publish_timestamp_s = REQUEST_FAIL_STATUS
  467. root_source_id_list = []
  468. error_msg = traceback.format_exc()
  469. print(e, error_msg)
  470. update_sql = f"""
  471. UPDATE official_articles_v2
  472. SET publish_timestamp = %s, root_source_id_list = %s
  473. WHERE wx_sn = %s;
  474. """
  475. db_client.update(
  476. sql=update_sql,
  477. params=(
  478. publish_timestamp_s,
  479. json.dumps(root_source_id_list, ensure_ascii=False),
  480. wx_sn
  481. ))
  482. def get_article_detail_job():
  483. """
  484. 获取发布文章详情
  485. :return:
  486. """
  487. try:
  488. db_client = PQMySQL()
  489. except Exception as e:
  490. error_msg = traceback.format_exc()
  491. bot(
  492. title="获取文章详情任务连接数据库失败",
  493. detail={
  494. "job": "get_article_detail_job",
  495. "error": e,
  496. "msg": error_msg
  497. }
  498. )
  499. return
  500. article_tuple = get_articles(db_client)
  501. for article in tqdm(article_tuple):
  502. try:
  503. update_publish_timestamp(db_client=db_client, row=article)
  504. except Exception as e:
  505. print(e)
  506. error_msg = traceback.format_exc()
  507. print(error_msg)
  508. def main():
  509. """
  510. main
  511. :return:
  512. """
  513. update_job()
  514. check_job()
  515. get_article_detail_job()
  516. if __name__ == '__main__':
  517. main()