published_articles_monitor.py

  1. """
  2. 监测已发布文章
  3. """
  4. from datetime import datetime
  5. from argparse import ArgumentParser
  6. from concurrent.futures import ThreadPoolExecutor
  7. from tqdm import tqdm
  8. from applications import bot
  9. from applications import aiditApi
  10. from applications.db import DatabaseConnector
  11. from applications.const import updatePublishedMsgTaskConst
  12. from applications import WeixinSpider
  13. from config import piaoquan_crawler_config, long_articles_config
  14. const = updatePublishedMsgTaskConst()
  15. spider = WeixinSpider()
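
# WeixinSpider.get_article_text() is assumed (its implementation is not shown
# in this file) to return a dict shaped roughly like {"code": <int>, "msg": <str>, ...},
# where const.ARTICLE_ILLEGAL_CODE is the status code reported for an article
# flagged for a violation.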


def monitor_article(article):
    """
    Check whether a single published article has been flagged as illegal.
    """
    gh_id, account_name, title, url, wx_sn, publish_date = article
    try:
        response = spider.get_article_text(url, is_cache=False)
        response_code = response["code"]
        if response_code == const.ARTICLE_ILLEGAL_CODE:
            error_detail = response.get("msg")
            insert_sql = """
                INSERT IGNORE INTO illegal_articles
                    (gh_id, account_name, title, wx_sn, publish_date, illegal_reason)
                VALUES
                    (%s, %s, %s, %s, %s, %s);
            """
            affected_rows = long_articles_db_client.save(
                query=insert_sql,
                params=(gh_id, account_name, title, wx_sn, publish_date, error_detail),
            )
            # INSERT IGNORE reports 0 affected rows for an already-recorded
            # article (assuming a unique key on the table), so each violation
            # triggers the alert and the takedown only once.
            if affected_rows:
                bot(
                    title="文章违规告警",  # "article violation alert"
                    detail={
                        "account_name": account_name,
                        "gh_id": gh_id,
                        "title": title,
                        "wx_sn": wx_sn.decode("utf-8"),
                        "publish_date": str(publish_date),
                        "error_detail": error_detail,
                    },
                    mention=False,
                )
                aiditApi.delete_articles(
                    gh_id=gh_id,
                    title=title,
                )
    except Exception as e:
        print(f"monitor_article failed for {url}: {e}")
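

# A minimal usage sketch with hypothetical values; real rows come from
# get_article_list() below, and wx_sn is stored as bytes in the database:
#
#     sample_row = (
#         "gh_0123456789abcdef",             # gh_id (hypothetical)
#         "SampleAccount",                   # account_name (hypothetical)
#         "Some article title",              # title (hypothetical)
#         "http://mp.weixin.qq.com/s/xxxx",  # ContentUrl (hypothetical)
#         b"wx_sn_bytes",                    # wx_sn (hypothetical)
#         datetime(2024, 1, 1),              # publish_date (hypothetical)
#     )
#     monitor_article(sample_row)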


def get_article_list(run_date):
    """
    Monitoring task: look back over a 7-day window, check whether articles
    published in that window have been flagged as illegal, and alert on any
    violation that is found.
    :param run_date: date string in %Y-%m-%d format; defaults to today.
    :return: rows of articles published within the monitoring window.
    """
    if not run_date:
        run_date = datetime.today().strftime("%Y-%m-%d")
    monitor_start_timestamp = (
        int(datetime.strptime(run_date, "%Y-%m-%d").timestamp()) - const.MONITOR_PERIOD
    )
    select_sql = f"""
        SELECT ghId, accountName, title, ContentUrl, wx_sn,
               from_unixtime(publish_timestamp) AS publish_timestamp
        FROM official_articles_v2
        WHERE publish_timestamp >= {monitor_start_timestamp}
        ORDER BY publish_timestamp DESC;
    """
    article_list = piaoquan_crawler_db_client.fetch(select_sql)
    return article_list
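
# Assuming const.MONITOR_PERIOD is the 7-day window from the docstring
# expressed in seconds (7 * 24 * 3600 = 604800), the query covers everything
# published between midnight (local time) of run_date minus seven days and now.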


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument(
        "--run_date",
        # argparse %-formats help strings, so literal percent signs are doubled.
        help="run date in %%Y-%%m-%%d format",
    )
    args = parser.parse_args()
    if args.run_date:
        run_date = args.run_date
    else:
        run_date = None

    piaoquan_crawler_db_client = DatabaseConnector(db_config=piaoquan_crawler_config)
    piaoquan_crawler_db_client.connect()
    long_articles_db_client = DatabaseConnector(db_config=long_articles_config)
    long_articles_db_client.connect()

    # Number of concurrent threads
    MAX_WORKERS = 4

    article_list = get_article_list(run_date=run_date)
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        list(
            tqdm(
                executor.map(monitor_article, article_list),
                total=len(article_list),
                desc="Monitor Article List",
            )
        )
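
# ThreadPoolExecutor.map() submits every article up front and yields results
# lazily; draining the iterator through tqdm() with list() blocks until all
# tasks finish and drives the progress bar. Exceptions are swallowed inside
# monitor_article, so a single failing article never aborts the batch.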