|
@@ -2,7 +2,7 @@
|
|
|
@author: luojunhui
|
|
|
抓取全局品类文章
|
|
|
"""
|
|
|
-
|
|
|
+import json
|
|
|
import time
|
|
|
|
|
|
from tqdm import tqdm
|
|
@@ -10,6 +10,7 @@ from pymysql.cursors import DictCursor
|
|
|
|
|
|
from applications import WeixinSpider, Functions, llm_sensitivity, log
|
|
|
from coldStartTasks.filter import article_crawler_duplicate_filter
|
|
|
+from config import apolloConfig
|
|
|
|
|
|
# 常量
|
|
|
ACCOUNT_GOOD_STATUS = 1
|
|
@@ -24,6 +25,24 @@ DEFAULT_LIKE_COUNT = 0
|
|
|
DEFAULT_ARTICLE_STATUS = 1
|
|
|
DEFAULT_TIMESTAMP = 1717171200
|
|
|
|
|
|
+# Title sensitivity flags (1 = title contains a sensitive word, 0 = it does not)
|
|
|
+TITLE_SENSITIVE = 1
|
|
|
+TITLE_NOT_SENSITIVE = 0
|
|
|
+
|
|
|
+config = apolloConfig()
|
|
|
+sensitive_word_list = json.loads(config.getConfigValue("sensitive_word_list"))
|
|
|
+
|
|
|
+
|
|
|
+def whether_title_sensitive(title: str) -> bool:
|
|
|
+ """
|
|
|
+ Return True if any entry of sensitive_word_list is contained in title, else False.
|
|
|
+ :param title: title text to check for sensitive words
|
|
|
+ """
|
|
|
+ for word in sensitive_word_list:
|
|
|
+ if word in title:
|
|
|
+ return True
|
|
|
+ return False
|
|
|
+
|
|
|
|
|
|
class weixinCategory(object):
|
|
|
"""
|
|
@@ -77,6 +96,7 @@ class weixinCategory(object):
|
|
|
"""
|
|
|
将数据更新到数据库
|
|
|
:return:
|
|
|
+
|
|
|
"""
|
|
|
success_records = []
|
|
|
for article_obj in article_list:
|
|
@@ -85,7 +105,7 @@ class weixinCategory(object):
|
|
|
try:
|
|
|
# 判断文章是否存在相同的标题
|
|
|
if article_crawler_duplicate_filter(
|
|
|
- new_article_title=obj["Title"], db_client=self.db_client_lam
|
|
|
+ new_article_title=obj["Title"], db_client=self.db_client_lam
|
|
|
):
|
|
|
log(
|
|
|
function="weixinCategory",
|
|
@@ -94,6 +114,9 @@ class weixinCategory(object):
|
|
|
data={"title": obj["Title"]}
|
|
|
)
|
|
|
continue
|
|
|
+
|
|
|
+ # 判断标题是否包含敏感词
|
|
|
+ title_sensitivity = TITLE_SENSITIVE if whether_title_sensitive(obj["Title"]) else TITLE_NOT_SENSITIVE
|
|
|
show_stat = self.function.show_desc_to_sta(obj["ShowDesc"])
|
|
|
show_view_count = show_stat.get("show_view_count", DEFAULT_VIEW_COUNT)
|
|
|
show_like_count = show_stat.get("show_like_count", DEFAULT_LIKE_COUNT)
|
|
@@ -102,10 +125,10 @@ class weixinCategory(object):
|
|
|
insert into crawler_meta_article
|
|
|
(
|
|
|
platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt,
|
|
|
- description, publish_time, crawler_time, status, unique_index, llm_sensitivity
|
|
|
+ description, publish_time, crawler_time, status, unique_index, llm_sensitivity, title_sensitivity
|
|
|
)
|
|
|
VALUES
|
|
|
- (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
|
|
|
+ (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
|
|
|
"""
|
|
|
self.db_client_lam.update(
|
|
|
sql=insert_sql,
|
|
@@ -124,7 +147,8 @@ class weixinCategory(object):
|
|
|
int(time.time()),
|
|
|
DEFAULT_ARTICLE_STATUS,
|
|
|
unique_idx,
|
|
|
- obj.get("llm_sensitivity", -1)
|
|
|
+ obj.get("llm_sensitivity", -1),
|
|
|
+ title_sensitivity
|
|
|
),
|
|
|
)
|
|
|
success_records.append({
|
|
@@ -275,5 +299,3 @@ class weixinCategory(object):
|
|
|
)
|
|
|
except Exception as e:
|
|
|
print(e)
|
|
|
-
|
|
|
-
|