- """
- 监测已发布文章
- """
- from datetime import datetime
- from argparse import ArgumentParser
- from concurrent.futures import ThreadPoolExecutor
- from tqdm import tqdm
- from applications import bot
- from applications import aiditApi
- from applications.db import DatabaseConnector
- from applications.const import updatePublishedMsgTaskConst
- from applications import WeixinSpider
- from config import piaoquan_crawler_config, long_articles_config
- const = updatePublishedMsgTaskConst()
- spider = WeixinSpider()


def monitor_article(article):
    """
    Check whether a single published article has been flagged as illegal;
    if so, record it, raise an alert, and take the article down.
    """
    gh_id, account_name, title, url, wx_sn, publish_date = article
    try:
        response = spider.get_article_text(url, is_cache=False)
        response_code = response["code"]
        if response_code == const.ARTICLE_ILLEGAL_CODE:
            error_detail = response.get("msg")
            insert_sql = """
                INSERT IGNORE INTO illegal_articles
                    (gh_id, account_name, title, wx_sn, publish_date, illegal_reason)
                VALUES
                    (%s, %s, %s, %s, %s, %s);
            """
            affected_rows = long_articles_db_client.save(
                query=insert_sql,
                params=(gh_id, account_name, title, wx_sn, publish_date, error_detail),
            )
            if affected_rows:
                # INSERT IGNORE yields 0 affected rows for an article that was
                # already recorded, so the alert fires only on the first hit.
                bot(
                    title="文章违规告警",
                    detail={
                        "account_name": account_name,
                        "gh_id": gh_id,
                        "title": title,
                        "wx_sn": wx_sn.decode("utf-8"),
                        "publish_date": str(publish_date),
                        "error_detail": error_detail,
                    },
                    mention=False,
                )
                aiditApi.delete_articles(
                    gh_id=gh_id,
                    title=title,
                )
    except Exception as e:
        print(f"monitor_article failed for '{title}': {e}")
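
# A minimal sketch (hypothetical values) of the row shape monitor_article
# expects, mirroring the columns selected in get_article_list below:
#
#   monitor_article((
#       "gh_0123456789ab",                 # gh_id
#       "Example Account",                 # account_name
#       "Example Title",                   # title
#       "https://mp.weixin.qq.com/s/...",  # ContentUrl
#       b"sn-bytes",                       # wx_sn (bytes; decoded for the alert)
#       datetime(2025, 1, 1),              # publish_timestamp (from_unixtime)
#   ))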


def get_article_list(run_date):
    """
    Monitoring task: the monitoring window is 7 days. Fetch the articles
    published within that window ending at run_date, so each can be checked
    for violations and an alert raised if one is found.
    :return: list of article rows
    """
    if not run_date:
        run_date = datetime.today().strftime("%Y-%m-%d")
    monitor_start_timestamp = (
        int(datetime.strptime(run_date, "%Y-%m-%d").timestamp()) - const.MONITOR_PERIOD
    )
    select_sql = f"""
        SELECT ghId, accountName, title, ContentUrl, wx_sn, from_unixtime(publish_timestamp) AS publish_timestamp
        FROM official_articles_v2
        WHERE publish_timestamp >= {monitor_start_timestamp}
        ORDER BY publish_timestamp DESC;
    """
    article_list = piaoquan_crawler_db_client.fetch(select_sql)
    return article_list
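
# Note: monitor_start_timestamp is interpolated into the SQL string directly.
# If DatabaseConnector.fetch accepts bound parameters the way save() does
# (an assumption, not verified here), a parameterized query would be safer:
#
#   piaoquan_crawler_db_client.fetch(
#       "SELECT ... WHERE publish_timestamp >= %s ...",
#       (monitor_start_timestamp,),
#   )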


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument(
        "--run_date",
        help="--run_date %Y-%m-%d",
    )
    args = parser.parse_args()
    # argparse defaults a missing optional argument to None.
    run_date = args.run_date

    piaoquan_crawler_db_client = DatabaseConnector(db_config=piaoquan_crawler_config)
    piaoquan_crawler_db_client.connect()
    long_articles_db_client = DatabaseConnector(db_config=long_articles_config)
    long_articles_db_client.connect()
    # Number of concurrent threads
    MAX_WORKERS = 4

    article_list = get_article_list(run_date=run_date)
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # executor.map is lazy; wrapping it in list() drives the iterator
        # to completion while tqdm renders progress.
        list(
            tqdm(
                executor.map(monitor_article, article_list),
                total=len(article_list),
                desc="Monitor Article List",
            )
        )
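
# Example invocations (the script name is illustrative):
#
#   python monitor_published_articles.py                        # window ends today
#   python monitor_published_articles.py --run_date 2025-01-01  # window ends on this date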