|
@@ -55,6 +55,7 @@ class weixinCategory(object):
|
|
将数据更新到数据库
|
|
将数据更新到数据库
|
|
:return:
|
|
:return:
|
|
"""
|
|
"""
|
|
|
|
+ success_records = []
|
|
for article_obj in article_list:
|
|
for article_obj in article_list:
|
|
detail_article_list = article_obj["AppMsg"]["DetailInfo"]
|
|
detail_article_list = article_obj["AppMsg"]["DetailInfo"]
|
|
for obj in detail_article_list:
|
|
for obj in detail_article_list:
|
|
@@ -62,6 +63,7 @@ class weixinCategory(object):
|
|
show_stat = self.function.show_desc_to_sta(obj["ShowDesc"])
|
|
show_stat = self.function.show_desc_to_sta(obj["ShowDesc"])
|
|
show_view_count = show_stat.get("show_view_count", DEFAULT_VIEW_COUNT)
|
|
show_view_count = show_stat.get("show_view_count", DEFAULT_VIEW_COUNT)
|
|
show_like_count = show_stat.get("show_like_count", DEFAULT_LIKE_COUNT)
|
|
show_like_count = show_stat.get("show_like_count", DEFAULT_LIKE_COUNT)
|
|
|
|
+ unique_idx = self.function.generateGzhId(obj["ContentUrl"])
|
|
insert_sql = f"""
|
|
insert_sql = f"""
|
|
insert into crawler_meta_article
|
|
insert into crawler_meta_article
|
|
(
|
|
(
|
|
@@ -87,12 +89,28 @@ class weixinCategory(object):
|
|
obj["send_time"],
|
|
obj["send_time"],
|
|
int(time.time()),
|
|
int(time.time()),
|
|
DEFAULT_ARTICLE_STATUS,
|
|
DEFAULT_ARTICLE_STATUS,
|
|
- self.function.generateGzhId(obj["ContentUrl"]),
|
|
|
|
|
|
+ unique_idx,
|
|
obj.get("llm_sensitivity", -1)
|
|
obj.get("llm_sensitivity", -1)
|
|
),
|
|
),
|
|
)
|
|
)
|
|
|
|
+ success_records.append({
|
|
|
|
+ 'unique_index': unique_idx, 'title': obj['Title']
|
|
|
|
+ })
|
|
except Exception as e:
|
|
except Exception as e:
|
|
print(e)
|
|
print(e)
|
|
|
|
+ return success_records
|
|
|
|
+
|
|
|
|
+ def update_article_sensitive_status(self, category, unique_index, status):
|
|
|
|
+ """
|
|
|
|
+ 更新文章敏感状态
|
|
|
|
+ :return:
|
|
|
|
+ """
|
|
|
|
+ update_sql = f"""
|
|
|
|
+ update crawler_meta_article
|
|
|
|
+ set llm_sensitivity = %s
|
|
|
|
+ where category = %s and unique_index = %s;
|
|
|
|
+ """
|
|
|
|
+ self.db_client_lam.update(sql=update_sql, params=(status, category, unique_index))
|
|
|
|
|
|
def update_latest_account_timestamp(self, gh_id):
|
|
def update_latest_account_timestamp(self, gh_id):
|
|
"""
|
|
"""
|
|
@@ -124,25 +142,13 @@ class weixinCategory(object):
|
|
msg_list = response.get("data", {}).get("data")
|
|
msg_list = response.get("data", {}).get("data")
|
|
if msg_list:
|
|
if msg_list:
|
|
last_article_in_this_msg = msg_list[-1]
|
|
last_article_in_this_msg = msg_list[-1]
|
|
-
|
|
|
|
- article_titles = []
|
|
|
|
- for msg in msg_list:
|
|
|
|
- for article in msg['AppMsg']['DetailInfo']:
|
|
|
|
- article_titles.append(article['Title'])
|
|
|
|
- sensitive_results = llm_sensitivity.check_titles(article_titles, True)
|
|
|
|
- for msg in msg_list:
|
|
|
|
- for article in msg['AppMsg']['DetailInfo']:
|
|
|
|
- sensitive_hit = sensitive_results.get(article['Title'], None)
|
|
|
|
- if sensitive_hit:
|
|
|
|
- article['llm_sensitivity'] = sensitive_hit['hit_rule']
|
|
|
|
-
|
|
|
|
- self.insert_data_into_db(
|
|
|
|
|
|
+ success_records = self.insert_data_into_db(
|
|
gh_id=gh_id, category=category, article_list=msg_list
|
|
gh_id=gh_id, category=category, article_list=msg_list
|
|
)
|
|
)
|
|
last_time_stamp_in_this_msg = last_article_in_this_msg["AppMsg"]["BaseInfo"]["UpdateTime"]
|
|
last_time_stamp_in_this_msg = last_article_in_this_msg["AppMsg"]["BaseInfo"]["UpdateTime"]
|
|
if latest_time_stamp < last_time_stamp_in_this_msg:
|
|
if latest_time_stamp < last_time_stamp_in_this_msg:
|
|
next_cursor = response["data"]["next_cursor"]
|
|
next_cursor = response["data"]["next_cursor"]
|
|
- return self.update_each_account(
|
|
|
|
|
|
+ return success_records + self.update_each_account(
|
|
gh_id=gh_id,
|
|
gh_id=gh_id,
|
|
latest_time_stamp=latest_time_stamp,
|
|
latest_time_stamp=latest_time_stamp,
|
|
category=category,
|
|
category=category,
|
|
@@ -152,8 +158,10 @@ class weixinCategory(object):
|
|
# 更新最近抓取时间
|
|
# 更新最近抓取时间
|
|
self.update_latest_account_timestamp(gh_id=gh_id)
|
|
self.update_latest_account_timestamp(gh_id=gh_id)
|
|
print("账号时间更新成功")
|
|
print("账号时间更新成功")
|
|
|
|
+ return success_records
|
|
else:
|
|
else:
|
|
print("No more data")
|
|
print("No more data")
|
|
|
|
+ return []
|
|
|
|
|
|
def deal(self, category_list):
|
|
def deal(self, category_list):
|
|
"""
|
|
"""
|
|
@@ -171,11 +179,20 @@ class weixinCategory(object):
|
|
timestamp = int(account['latest_timestamp'].timestamp())
|
|
timestamp = int(account['latest_timestamp'].timestamp())
|
|
except Exception as e:
|
|
except Exception as e:
|
|
timestamp = DEFAULT_TIMESTAMP
|
|
timestamp = DEFAULT_TIMESTAMP
|
|
- self.update_each_account(
|
|
|
|
|
|
+ success_records = self.update_each_account(
|
|
gh_id=gh_id,
|
|
gh_id=gh_id,
|
|
category=category,
|
|
category=category,
|
|
latest_time_stamp=timestamp
|
|
latest_time_stamp=timestamp
|
|
)
|
|
)
|
|
|
|
+ success_titles = [x['title'] for x in success_records]
|
|
|
|
+ if success_titles:
|
|
|
|
+ sensitive_results = llm_sensitivity.check_titles(success_titles)
|
|
|
|
+ for record, sensitive_result in zip(success_records, sensitive_results):
|
|
|
|
+ self.update_article_sensitive_status(
|
|
|
|
+ category=category,
|
|
|
|
+ unique_index=record['unique_index'],
|
|
|
|
+ status=sensitive_result['hit_rule']
|
|
|
|
+ )
|
|
print("success")
|
|
print("success")
|
|
except Exception as e:
|
|
except Exception as e:
|
|
print("fail because of {}".format(e))
|
|
print("fail because of {}".format(e))
|