|
@@ -19,7 +19,7 @@ class OutsideGzhArticlesManager:
|
|
|
self.denet_client.connect()
|
|
|
self.feishu_bot_api = FeishuBotApi()
|
|
|
|
|
|
- def update_article_illegal_status(self, article_id, illegal_reason):
|
|
|
+ def update_article_illegal_status(self, article_id: int, illegal_reason: str) -> None:
|
|
|
update_query = f"""
|
|
|
update outside_gzh_account_monitor
|
|
|
set illegal_status = %s, illegal_reason = %s
|
|
@@ -30,6 +30,26 @@ class OutsideGzhArticlesManager:
|
|
|
params=(1, illegal_reason, article_id, 0)
|
|
|
)
|
|
|
|
|
|
def whether_published_in_a_week(self, gh_id: str) -> bool:
    """
    Report whether the account's newest stored article is recent enough
    that the account does not need to be crawled again.

    Looks up the row with the largest ``publish_timestamp`` for ``gh_id``
    in ``outside_gzh_account_monitor`` and compares it with the current time.

    :param gh_id: account identifier matched against the ``gh_id`` column.
    :return: True when the newest article falls inside the recency window;
        False when there is no row, the timestamp is NULL, or it is older.
    """
    # NOTE(review): the method name says "a week", but the window actually
    # applied is 5 days. The value is kept as-is; confirm which is intended.
    recent_window_seconds = 5 * 24 * 3600

    # gh_id is bound as a query parameter instead of being interpolated into
    # the SQL text, so quotes or crafted input cannot alter the statement.
    # Assumes fetch() forwards ``params`` to the driver the same way the
    # update call in this class does; verify against the client.
    fetch_query = """
        select id, publish_timestamp from outside_gzh_account_monitor
        where gh_id = %s
        order by publish_timestamp desc
        limit 1;
    """
    fetch_response = self.long_articles_client.fetch(
        query=fetch_query, cursor_type=DictCursor, params=(gh_id,)
    )
    if not fetch_response:
        # No article stored for this account yet.
        return False

    publish_timestamp = fetch_response[0]['publish_timestamp']
    if publish_timestamp is None:
        return False

    return int(time.time()) - publish_timestamp <= recent_window_seconds
|
|
|
+
|
|
|
|
|
|
class OutsideGzhArticlesCollector(OutsideGzhArticlesManager):
|
|
|
|
|
@@ -50,14 +70,24 @@ class OutsideGzhArticlesCollector(OutsideGzhArticlesManager):
|
|
|
|
|
|
def fetch_each_account(self, account: dict) -> None:
    """
    Crawl the article list of one account and store every message.

    The account is skipped when ``whether_published_in_a_week`` reports a
    recent article. Otherwise the article list is fetched and each message
    is passed to ``save_each_msg_to_db``. Failures while reading or saving
    the response are printed and not re-raised.

    :param account: account record; ``gh_id`` and ``account_name`` are read.
    """
    gh_id = account["gh_id"]
    # Skip accounts that already have a recently published article stored.
    if self.whether_published_in_a_week(gh_id):
        return

    fetch_response = get_article_list_from_account(gh_id)
    try:
        msg_list = fetch_response.get("data", {}).get("data", [])
        if msg_list:
            for msg in tqdm(msg_list, desc=f"insert account {account['account_name']}"):
                self.save_each_msg_to_db(msg, account)
        else:
            # An empty or missing list is reported as a failed crawl.
            print(f"crawler failed: {account['account_name']}")
    except Exception as e:
        # Best-effort: one malformed response or failed insert must not
        # stop the caller from moving on to the next account.
        print(
            f"crawler failed: account_name: {account['account_name']}\n"
            f"error: {e}\n"
        )
|
|
|
|
|
|
def save_each_msg_to_db(self, msg: dict, account: dict):
|
|
|
base_info = msg["AppMsg"]["BaseInfo"]
|
|
@@ -83,7 +113,7 @@ class OutsideGzhArticlesCollector(OutsideGzhArticlesManager):
|
|
|
"publish_timestamp": create_timestamp,
|
|
|
"account_source": article["account_source"]
|
|
|
},
|
|
|
- env="dev"
|
|
|
+ env="outside_gzh_monitor"
|
|
|
)
|
|
|
|
|
|
elif response_code == 0:
|
|
@@ -120,7 +150,9 @@ class OutsideGzhArticlesCollector(OutsideGzhArticlesManager):
|
|
|
try:
|
|
|
self.fetch_each_account(account)
|
|
|
except Exception as e:
|
|
|
- print(e)
|
|
|
+ print(
|
|
|
+ f"crawler failed: {account['account_name']}, error: {e}"
|
|
|
+ )
|
|
|
|
|
|
|
|
|
class OutsideGzhArticlesMonitor(OutsideGzhArticlesManager):
|
|
@@ -155,7 +187,7 @@ class OutsideGzhArticlesMonitor(OutsideGzhArticlesManager):
|
|
|
"publish_date": article["publish_date"],
|
|
|
"account_source": article["account_source"]
|
|
|
},
|
|
|
- env="dev"
|
|
|
+ env="outside_gzh_monitor"
|
|
|
)
|
|
|
article_id = article["id"]
|
|
|
self.update_article_illegal_status(article_id, illegal_reason)
|
|
@@ -168,5 +200,9 @@ class OutsideGzhArticlesMonitor(OutsideGzhArticlesManager):
|
|
|
try:
|
|
|
self.check_each_article(article)
|
|
|
except Exception as e:
|
|
|
- print(e)
|
|
|
- continue
|
|
|
+ print(
|
|
|
+ f"crawler failed: account_name: {article['account_name']}\n"
|
|
|
+ f"link: {article['link']}\n"
|
|
|
+ f"title: {article['title']}\n"
|
|
|
+ f"error: {e}\n"
|
|
|
+ )
|