@@ -0,0 +1,90 @@
+from datetime import datetime
+from argparse import ArgumentParser
+from concurrent.futures import ThreadPoolExecutor
+
+from tqdm import tqdm
+from applications.db import DatabaseConnector
+from applications.const import updatePublishedMsgTaskConst
+from applications import WeixinSpider
+from config import piaoquan_crawler_config, long_articles_config
+
+const = updatePublishedMsgTaskConst()
+spider = WeixinSpider()
+
+
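+# Worker run by the ThreadPoolExecutor below; it relies on the module-level
+# spider/const objects and the long_articles_db_client created in __main__.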
+def monitor_article(article):
+    """
+    Check whether a single published article has been flagged as illegal;
+    if so, record it in the illegal_articles table.
+    """
+    gh_id, account_name, title, url, wx_sn, publish_date = article
+    try:
+        # Skip the cache so the check reflects the article's current status.
+        response = spider.get_article_text(url, is_cache=False)
+        response_code = response["code"]
+        if response_code == const.ARTICLE_ILLEGAL_CODE:
+            error_detail = response.get("msg")
+            # INSERT IGNORE keeps re-runs from duplicating a record already
+            # written for the same article (assuming a unique key on wx_sn).
+            insert_sql = """
+                INSERT IGNORE INTO illegal_articles
+                (gh_id, account_name, title, wx_sn, publish_date, illegal_reason)
+                VALUES
+                (%s, %s, %s, %s, %s, %s);
+            """
+            long_articles_db_client.save(
+                query=insert_sql,
+                params=(gh_id, account_name, title, wx_sn, publish_date, error_detail),
+            )
+    except Exception as e:
+        # Swallow per-article failures so one bad URL does not abort the batch.
+        print(f"monitor_article failed for {url}: {e}")
+
+
+def get_article_list(run_date):
+    """
+    Monitoring task: over a 7-day window, check whether published articles
+    have been flagged as illegal; any violation found is recorded for alerting.
+    :return: rows of (ghId, accountName, title, ContentUrl, wx_sn, publish_date)
+    """
+    if not run_date:
+        run_date = datetime.today().strftime("%Y-%m-%d")
+
+    # MONITOR_PERIOD is the look-back window in seconds (the 7-day period above).
+    monitor_start_timestamp = (
+        int(datetime.strptime(run_date, "%Y-%m-%d").timestamp()) - const.MONITOR_PERIOD
+    )
+    # monitor_start_timestamp is a locally computed int, so interpolating it
+    # into the query is safe, though a bound parameter would be preferable.
+    select_sql = f"""
+        SELECT ghId, accountName, title, ContentUrl, wx_sn, from_unixtime(publish_timestamp) AS publish_date
+        FROM official_articles_v2
+        WHERE publish_timestamp >= {monitor_start_timestamp}
+        ORDER BY publish_timestamp DESC;
+    """
+    return piaoquan_crawler_db_client.fetch(select_sql)
+
+
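+# Entry point: parse the run date, connect both databases, fetch the last
+# seven days of published articles, and scan them concurrently.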
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--run_date",
+        help="run date in %%Y-%%m-%%d format; defaults to today",
+    )
+    args = parser.parse_args()
+    # argparse leaves --run_date at None when omitted, which makes
+    # get_article_list fall back to today's date.
+    run_date = args.run_date
+
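+    # The two clients are created in the main block but are module-level
+    # globals, so the monitor_article worker threads can reach them; the
+    # DatabaseConnector is assumed here to tolerate concurrent use.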
+    piaoquan_crawler_db_client = DatabaseConnector(db_config=piaoquan_crawler_config)
+    piaoquan_crawler_db_client.connect()
+    long_articles_db_client = DatabaseConnector(db_config=long_articles_config)
+    long_articles_db_client.connect()
+
+    # Number of concurrent threads
+    MAX_WORKERS = 4
+
+    article_list = get_article_list(run_date=run_date)
+
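+    # executor.map returns a lazy iterator; wrapping it in list(tqdm(...))
+    # drives it to completion while rendering a progress bar.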
+    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        list(
+            tqdm(
+                executor.map(monitor_article, article_list),
+                total=len(article_list),
+                desc="Monitor Article List",
+            )
+        )