|
@@ -2,19 +2,46 @@
|
|
@author: luojunhui
|
|
@author: luojunhui
|
|
抓取全局品类文章
|
|
抓取全局品类文章
|
|
"""
|
|
"""
|
|
-
|
|
|
|
|
|
+import json
|
|
import time
|
|
import time
|
|
|
|
|
|
from tqdm import tqdm
|
|
from tqdm import tqdm
|
|
|
|
+from pymysql.cursors import DictCursor
|
|
|
|
+
|
|
from applications import WeixinSpider, Functions, llm_sensitivity, log
|
|
from applications import WeixinSpider, Functions, llm_sensitivity, log
|
|
from coldStartTasks.filter import article_crawler_duplicate_filter
|
|
from coldStartTasks.filter import article_crawler_duplicate_filter
|
|
|
|
+from config import apolloConfig
|
|
|
|
|
|
# Constants
# Value of `is_using` that marks an account as usable.
ACCOUNT_GOOD_STATUS = 1

# Whether an account is scraped every day (values of the `daily_scrape` column)
ACCOUNT_DAILY_SCRAPE = 1
ACCOUNT_NOT_DAILY_SCRAPE = 0

# Default values
DEFAULT_VIEW_COUNT = 0
DEFAULT_LIKE_COUNT = 0
DEFAULT_ARTICLE_STATUS = 1
# Fallback crawl start used when an account has no usable last-update time.
# 1717171200 is 2024-05-31T16:00:00Z (2024-06-01 00:00:00 at UTC+8).
DEFAULT_TIMESTAMP = 1717171200

# Title sensitivity flags stored in `title_sensitivity`
TITLE_SENSITIVE = 1
TITLE_NOT_SENSITIVE = 0

# Loaded once at import time; the config value is parsed as JSON and is
# iterated as a collection of words by whether_title_sensitive.
config = apolloConfig()
sensitive_word_list = json.loads(config.getConfigValue("sensitive_word_list"))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def whether_title_sensitive(title: str) -> bool:
    """
    Check whether a title contains any configured sensitive word.

    :param title: title text to scan
    :return: True if at least one entry of the module-level
        ``sensitive_word_list`` occurs as a substring of ``title``,
        otherwise False
    """
    return any(sensitive_word in title for sensitive_word in sensitive_word_list)
|
|
|
|
|
|
|
|
|
|
class weixinCategory(object):
|
|
class weixinCategory(object):
|
|
@@ -36,7 +63,7 @@ class weixinCategory(object):
|
|
sql = f"""
|
|
sql = f"""
|
|
select gh_id, account_source, account_name, account_category, latest_update_time
|
|
select gh_id, account_source, account_name, account_category, latest_update_time
|
|
from long_articles_accounts
|
|
from long_articles_accounts
|
|
- where account_category = '{account_category}' and is_using = {ACCOUNT_GOOD_STATUS};
|
|
|
|
|
|
+ where account_category = '{account_category}' and is_using = {ACCOUNT_GOOD_STATUS} and daily_scrape = {ACCOUNT_DAILY_SCRAPE};
|
|
"""
|
|
"""
|
|
account_tuple = self.db_client_lam.select(sql)
|
|
account_tuple = self.db_client_lam.select(sql)
|
|
result = [
|
|
result = [
|
|
@@ -51,10 +78,25 @@ class weixinCategory(object):
|
|
]
|
|
]
|
|
return result
|
|
return result
|
|
|
|
|
|
|
|
def get_association_account_list(self, date_str):
    """
    Fetch the account-association accounts that belong to today's polling group.

    Selects rows with ``account_category = 'account_association'`` that are
    in use and not flagged for daily scraping, then keeps only those whose
    ``account_id`` ends with the same character as ``date_str``.

    :param date_str: date string; only its last character is used, as the
        group id (presumably the last digit of the day -- confirm with caller)
    :return: list of rows (dicts, via DictCursor) with the selected columns
        account_id, gh_id, account_name, latest_update_time
    """
    group_id = date_str[-1]
    # `is_using` is compared against ACCOUNT_GOOD_STATUS, the constant the
    # category query uses for this column, rather than ACCOUNT_DAILY_SCRAPE,
    # which only matched because both constants equal 1.
    sql = f"""
        select account_id, gh_id, account_name, latest_update_time
        from long_articles_accounts
        where account_category = 'account_association' and is_using = {ACCOUNT_GOOD_STATUS} and daily_scrape = {ACCOUNT_NOT_DAILY_SCRAPE};
    """
    account_list = self.db_client_lam.select(sql, cursor_type=DictCursor)
    # Rotate through the accounts: only the group whose id suffix matches
    # the date suffix is crawled on a given day.
    return [
        account for account in account_list
        if str(account['account_id'])[-1] == group_id
    ]
|
|
|
|
+
|
|
def insert_data_into_db(self, gh_id, category, article_list):
|
|
def insert_data_into_db(self, gh_id, category, article_list):
|
|
"""
|
|
"""
|
|
将数据更新到数据库
|
|
将数据更新到数据库
|
|
:return:
|
|
:return:
|
|
|
|
+
|
|
"""
|
|
"""
|
|
success_records = []
|
|
success_records = []
|
|
for article_obj in article_list:
|
|
for article_obj in article_list:
|
|
@@ -63,7 +105,7 @@ class weixinCategory(object):
|
|
try:
|
|
try:
|
|
# 判断文章是否存在相同的标题
|
|
# 判断文章是否存在相同的标题
|
|
if article_crawler_duplicate_filter(
|
|
if article_crawler_duplicate_filter(
|
|
- new_article_title=obj["Title"], db_client=self.db_client_lam
|
|
|
|
|
|
+ new_article_title=obj["Title"], db_client=self.db_client_lam
|
|
):
|
|
):
|
|
log(
|
|
log(
|
|
function="weixinCategory",
|
|
function="weixinCategory",
|
|
@@ -72,6 +114,9 @@ class weixinCategory(object):
|
|
data={"title": obj["Title"]}
|
|
data={"title": obj["Title"]}
|
|
)
|
|
)
|
|
continue
|
|
continue
|
|
|
|
+
|
|
|
|
+ # 判断标题是否包含敏感词
|
|
|
|
+ title_sensitivity = TITLE_SENSITIVE if whether_title_sensitive(obj["Title"]) else TITLE_NOT_SENSITIVE
|
|
show_stat = self.function.show_desc_to_sta(obj["ShowDesc"])
|
|
show_stat = self.function.show_desc_to_sta(obj["ShowDesc"])
|
|
show_view_count = show_stat.get("show_view_count", DEFAULT_VIEW_COUNT)
|
|
show_view_count = show_stat.get("show_view_count", DEFAULT_VIEW_COUNT)
|
|
show_like_count = show_stat.get("show_like_count", DEFAULT_LIKE_COUNT)
|
|
show_like_count = show_stat.get("show_like_count", DEFAULT_LIKE_COUNT)
|
|
@@ -80,10 +125,10 @@ class weixinCategory(object):
|
|
insert into crawler_meta_article
|
|
insert into crawler_meta_article
|
|
(
|
|
(
|
|
platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt,
|
|
platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt,
|
|
- description, publish_time, crawler_time, status, unique_index, llm_sensitivity
|
|
|
|
|
|
+ description, publish_time, crawler_time, status, unique_index, llm_sensitivity, title_sensitivity
|
|
)
|
|
)
|
|
VALUES
|
|
VALUES
|
|
- (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
|
|
|
|
|
|
+ (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
|
|
"""
|
|
"""
|
|
self.db_client_lam.update(
|
|
self.db_client_lam.update(
|
|
sql=insert_sql,
|
|
sql=insert_sql,
|
|
@@ -102,7 +147,8 @@ class weixinCategory(object):
|
|
int(time.time()),
|
|
int(time.time()),
|
|
DEFAULT_ARTICLE_STATUS,
|
|
DEFAULT_ARTICLE_STATUS,
|
|
unique_idx,
|
|
unique_idx,
|
|
- obj.get("llm_sensitivity", -1)
|
|
|
|
|
|
+ obj.get("llm_sensitivity", -1),
|
|
|
|
+ title_sensitivity
|
|
),
|
|
),
|
|
)
|
|
)
|
|
success_records.append({
|
|
success_records.append({
|
|
@@ -175,48 +221,59 @@ class weixinCategory(object):
|
|
print("No more data")
|
|
print("No more data")
|
|
return []
|
|
return []
|
|
|
|
|
|
- def deal(self, category_list):
|
|
|
|
|
|
def crawler_each_category(self, account_list, category):
    """
    Crawl every account of one category, then run the LLM title check on
    the articles that were stored successfully.

    :param account_list: iterable of account dicts; each needs ``gh_id`` and
        may carry its last-update datetime under ``latest_timestamp`` or
        ``latest_update_time``
    :param category: category label passed to update_each_account and
        update_article_sensitive_status
    :return: None
    """
    success_records = []
    for account in tqdm(account_list, desc="crawler_each_category"):
        try:
            gh_id = account['gh_id']
            try:
                # Rows from get_association_account_list expose the column
                # as `latest_update_time`; reading only `latest_timestamp`
                # made those accounts always fall back to the default.
                latest_update = account.get('latest_timestamp') or account['latest_update_time']
                timestamp = int(latest_update.timestamp())
            except Exception:
                # Best effort: missing or non-datetime value -> crawl from
                # the default start time.
                timestamp = DEFAULT_TIMESTAMP
            success_records += self.update_each_account(
                gh_id=gh_id,
                category=category,
                latest_time_stamp=timestamp
            )
            print("success")
        except Exception as e:
            # One failing account must not stop the rest of the category.
            print("fail because of {}".format(e))

    success_titles = [x['title'] for x in success_records]
    if success_titles:
        try:
            sensitive_results = llm_sensitivity.check_titles(success_titles)
            # Results are paired with records by position.
            for record, sensitive_result in zip(success_records, sensitive_results):
                self.update_article_sensitive_status(
                    category=category,
                    unique_index=record['unique_index'],
                    status=sensitive_result['hit_rule']
                )
        except Exception as e:
            print("failed to update sensitive status: {}".format(e))
|
|
|
|
|
|
|
|
def deal(self, category_list, date_str):
    """
    Crawl the daily category accounts, then the rotating association accounts.

    :param category_list: categories whose accounts are crawled
    :param date_str: date in YYYY-MM-DD form, forwarded to
        get_association_account_list
    :return: None
    """
    # Daily crawl, one category at a time
    for category_name in category_list:
        daily_accounts = self.get_account_list(category_name)
        self.crawler_each_category(
            account_list=daily_accounts,
            category=category_name
        )

    # Rotating crawl of the account-association accounts
    self.crawler_each_category(
        account_list=self.get_association_account_list(date_str),
        category="association"
    )
|
|
|
|
|
|
def deal_accounts(self, account_list):
|
|
def deal_accounts(self, account_list):
|
|
"""
|
|
"""
|
|
input account list
|
|
input account list
|
|
- :param account_list:
|
|
|
|
|
|
+ :param account_list: 具体账号抓取,只抓一页
|
|
:return:
|
|
:return:
|
|
"""
|
|
"""
|
|
account_tuple = tuple(account_list)
|
|
account_tuple = tuple(account_list)
|
|
@@ -233,6 +290,7 @@ class weixinCategory(object):
|
|
try:
|
|
try:
|
|
latest_timestamp = account[3].timestamp()
|
|
latest_timestamp = account[3].timestamp()
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
|
+ print(e)
|
|
latest_timestamp = DEFAULT_TIMESTAMP
|
|
latest_timestamp = DEFAULT_TIMESTAMP
|
|
self.update_each_account(
|
|
self.update_each_account(
|
|
gh_id=gh_id,
|
|
gh_id=gh_id,
|
|
@@ -241,5 +299,3 @@ class weixinCategory(object):
|
|
)
|
|
)
|
|
except Exception as e:
|
|
except Exception as e:
|
|
print(e)
|
|
print(e)
|
|
-
|
|
|
|
-
|
|
|