|
@@ -10,7 +10,20 @@ from cold_start.crawler.wechat import get_article_list_from_account
|
|
|
from config import long_articles_config, denet_config
|
|
|
|
|
|
|
|
|
-class OutsideGzhArticlesManager:
|
|
|
+class Const:
|
|
|
+ # Article violation status (values of the illegal_status column)
|
|
|
+ ILLEGAL_STATUS = 1
|
|
|
+ INIT_STATUS = 0
|
|
|
+
|
|
|
+ # Monitoring cycle, in seconds (5 days)
|
|
|
+ MONITOR_CYCLE = 5 * 60 * 60 * 24
|
|
|
+
|
|
|
+ # Response codes returned by get_article_detail
|
|
|
+ ILLEGAL_CODE = 25012
|
|
|
+ SUCCESS_CODE = 0
|
|
|
+
|
|
|
+
|
|
|
+class OutsideGzhArticlesManager(Const):
|
|
|
|
|
|
def __init__(self):
|
|
|
self.long_articles_client = DatabaseConnector(long_articles_config)
|
|
@@ -19,7 +32,9 @@ class OutsideGzhArticlesManager:
|
|
|
self.denet_client.connect()
|
|
|
self.feishu_bot_api = FeishuBotApi()
|
|
|
|
|
|
- def update_article_illegal_status(self, article_id: int, illegal_reason: str) -> None:
|
|
|
+ def update_article_illegal_status(
|
|
|
+ self, article_id: int, illegal_reason: str
|
|
|
+ ) -> None:
|
|
|
update_query = f"""
|
|
|
update outside_gzh_account_monitor
|
|
|
set illegal_status = %s, illegal_reason = %s
|
|
@@ -27,7 +42,7 @@ class OutsideGzhArticlesManager:
|
|
|
"""
|
|
|
self.long_articles_client.save(
|
|
|
query=update_query,
|
|
|
- params=(1, illegal_reason, article_id, 0)
|
|
|
+ params=(self.ILLEGAL_STATUS, illegal_reason, article_id, self.INIT_STATUS),
|
|
|
)
|
|
|
|
|
|
def whether_published_in_a_week(self, gh_id: str) -> bool:
|
|
@@ -40,13 +55,15 @@ class OutsideGzhArticlesManager:
|
|
|
order by publish_timestamp desc
|
|
|
limit 1;
|
|
|
"""
|
|
|
- fetch_response = self.long_articles_client.fetch(query=fetch_query, cursor_type=DictCursor)
|
|
|
+ fetch_response = self.long_articles_client.fetch(
|
|
|
+ query=fetch_query, cursor_type=DictCursor
|
|
|
+ )
|
|
|
if fetch_response:
|
|
|
- publish_timestamp = fetch_response[0]['publish_timestamp']
|
|
|
- if publish_timestamp is None:
|
|
|
- return False
|
|
|
- else:
|
|
|
- return int(time.time()) - publish_timestamp <= 5 * 24 * 3600
|
|
|
+ publish_timestamp = fetch_response[0]["publish_timestamp"]
|
|
|
+ if publish_timestamp is None:
|
|
|
+ return False
|
|
|
+ else:
|
|
|
+ return int(time.time()) - publish_timestamp <= self.MONITOR_CYCLE
|
|
|
else:
|
|
|
return False
|
|
|
|
|
@@ -78,7 +95,9 @@ class OutsideGzhArticlesCollector(OutsideGzhArticlesManager):
|
|
|
try:
|
|
|
msg_list = fetch_response.get("data", {}).get("data", [])
|
|
|
if msg_list:
|
|
|
- for msg in tqdm(msg_list, desc=f"insert account {account['account_name']}"):
|
|
|
+ for msg in tqdm(
|
|
|
+ msg_list, desc=f"insert account {account['account_name']}"
|
|
|
+ ):
|
|
|
self.save_each_msg_to_db(msg, account)
|
|
|
|
|
|
else:
|
|
@@ -101,22 +120,22 @@ class OutsideGzhArticlesCollector(OutsideGzhArticlesManager):
|
|
|
link = article["ContentUrl"]
|
|
|
article_detail = get_article_detail(link)
|
|
|
response_code = article_detail["code"]
|
|
|
- if response_code == 25012:
|
|
|
+ if response_code == self.ILLEGAL_CODE:
|
|
|
illegal_reason = article_detail.get("msg")
|
|
|
# bot and return
|
|
|
self.feishu_bot_api.bot(
|
|
|
title="文章违规告警",
|
|
|
detail={
|
|
|
"account_name": article["account_name"],
|
|
|
- "title": article['title'],
|
|
|
+ "title": article["title"],
|
|
|
"reason": illegal_reason,
|
|
|
"publish_timestamp": create_timestamp,
|
|
|
- "account_source": article["account_source"]
|
|
|
+ "account_source": article["account_source"],
|
|
|
},
|
|
|
- env="outside_gzh_monitor"
|
|
|
+ env="outside_gzh_monitor",
|
|
|
)
|
|
|
|
|
|
- elif response_code == 0:
|
|
|
+ elif response_code == self.SUCCESS_CODE:
|
|
|
insert_query = f"""
|
|
|
insert ignore into outside_gzh_account_monitor
|
|
|
(account_name, gh_id, account_source, account_type, app_msg_id, publish_type, position, title, link,
|
|
@@ -150,20 +169,18 @@ class OutsideGzhArticlesCollector(OutsideGzhArticlesManager):
|
|
|
try:
|
|
|
self.fetch_each_account(account)
|
|
|
except Exception as e:
|
|
|
- print(
|
|
|
- f"crawler failed: {account['account_name']}, error: {e}"
|
|
|
- )
|
|
|
+ print(f"crawler failed: {account['account_name']}, error: {e}")
|
|
|
|
|
|
|
|
|
class OutsideGzhArticlesMonitor(OutsideGzhArticlesManager):
|
|
|
|
|
|
def fetch_article_list_to_check(self):
|
|
|
- publish_timestamp_threshold = int(time.time()) - 7 * 24 * 3600
|
|
|
+ publish_timestamp_threshold = int(time.time()) - self.MONITOR_CYCLE
|
|
|
fetch_query = f"""
|
|
|
select id, account_name, gh_id, account_source, account_type,
|
|
|
title, link, from_unixtime(publish_timestamp) as publish_date
|
|
|
from outside_gzh_account_monitor
|
|
|
- where illegal_status = 0 and publish_timestamp > {publish_timestamp_threshold};
|
|
|
+ where illegal_status = {self.INIT_STATUS} and publish_timestamp > {publish_timestamp_threshold};
|
|
|
"""
|
|
|
return self.long_articles_client.fetch(
|
|
|
query=fetch_query, cursor_type=DictCursor
|
|
@@ -176,18 +193,18 @@ class OutsideGzhArticlesMonitor(OutsideGzhArticlesManager):
|
|
|
link = article["link"]
|
|
|
article_detail = get_article_detail(link)
|
|
|
response_code = article_detail["code"]
|
|
|
- if response_code == 25012:
|
|
|
+ if response_code == self.ILLEGAL_CODE:
|
|
|
illegal_reason = article_detail.get("msg")
|
|
|
self.feishu_bot_api.bot(
|
|
|
title="文章违规告警",
|
|
|
detail={
|
|
|
"account_name": article["account_name"],
|
|
|
- "title": article['title'],
|
|
|
+ "title": article["title"],
|
|
|
"reason": illegal_reason,
|
|
|
"publish_date": article["publish_date"],
|
|
|
- "account_source": article["account_source"]
|
|
|
+ "account_source": article["account_source"],
|
|
|
},
|
|
|
- env="outside_gzh_monitor"
|
|
|
+ env="outside_gzh_monitor",
|
|
|
)
|
|
|
article_id = article["id"]
|
|
|
self.update_article_illegal_status(article_id, illegal_reason)
|
|
@@ -205,4 +222,4 @@ class OutsideGzhArticlesMonitor(OutsideGzhArticlesManager):
|
|
|
f"link: {article['link']}\n"
|
|
|
f"title: {article['title']}\n"
|
|
|
f"error: {e}\n"
|
|
|
- )
|
|
|
+ )
|