|
@@ -18,10 +18,12 @@ from config import long_articles_config
|
|
|
|
|
|
# Module-level helper instance; its proxy() method supplies the proxies for outgoing requests.
functions = Functions()
|
|
|
|
|
|
+
|
|
|
class ToutiaoRecommendCrawler(object):
|
|
|
"""
|
|
|
今日头条推荐流
|
|
|
"""
|
|
|
+
|
|
|
def __init__(self) -> None:
|
|
|
self.db_client = None
|
|
|
|
|
@@ -42,18 +44,18 @@ class ToutiaoRecommendCrawler(object):
|
|
|
}
|
|
|
)
|
|
|
|
|
|
- def get_history_recommendation(self) -> Dict:
|
|
|
+ def get_request_params(self, category) -> Dict:
|
|
|
"""
|
|
|
- 获取历史推荐流文章
|
|
|
+ 获取请求参数
|
|
|
:return:
|
|
|
"""
|
|
|
select_sql = f"""
|
|
|
- SELECT request_method, request_url, request_headers, post_data
|
|
|
- FROM toutiao_request_params
|
|
|
- WHERE category = 'history' and expire_flag = 0
|
|
|
- ORDER BY id
|
|
|
- LIMIT 1;
|
|
|
- """
|
|
|
+ SELECT request_method, request_url, request_headers, post_data
|
|
|
+ FROM toutiao_request_params
|
|
|
+ WHERE category = '{category}' and expire_flag = 0
|
|
|
+ ORDER BY id
|
|
|
+ LIMIT 1;
|
|
|
+ """
|
|
|
result = self.db_client.fetch(
|
|
|
query=select_sql,
|
|
|
cursor_type=DictCursor
|
|
@@ -61,26 +63,32 @@ class ToutiaoRecommendCrawler(object):
|
|
|
if not result:
|
|
|
print("cookie没了报警")
|
|
|
return {}
|
|
|
- cookie_obj = result[0]
|
|
|
+ else:
|
|
|
+ return result[0]
|
|
|
+
|
|
|
def get_recommendation_article_list(self, category) -> Dict:
    """
    Fetch one page of the Toutiao recommendation feed for a category.

    The request (method, URL, headers) is replayed from the stored request
    parameters returned by ``self.get_request_params(category)``.

    :param category: feed category used to look up the stored request params
    :return: the decoded JSON response, or an empty dict when no stored
        request params exist or the response body is empty
    """
    cookie_obj = self.get_request_params(category)
    if not cookie_obj:
        return {}

    # NOTE(review): no timeout is passed, so a stalled proxy can block this
    # call indefinitely -- consider adding one.
    # NOTE(review): the stored row presumably also carries ``post_data``,
    # which is not sent here -- confirm whether POST feeds need it.
    response = requests.request(
        method=cookie_obj['request_method'],
        url=cookie_obj['request_url'],
        headers=json.loads(cookie_obj['request_headers']),
        proxies=functions.proxy()
    )

    # ``response.text`` is a str and is never None; an empty body is "".
    # Check for emptiness and bail out, otherwise ``response.json()`` would
    # raise on the empty payload right after the warning is printed.
    if not response.text:
        print("{}: cookie 失效".format(category))
        return {}

    return response.json()
|
|
|
|
|
|
- def get_tech_recommendation(self) -> Dict:
|
|
|
- """
|
|
|
- 获取科技推荐流文章
|
|
|
- :return:
|
|
|
- """
|
|
|
- return
|
|
|
-
|
|
|
- def insert_each_article(self, item: Dict) -> Dict:
|
|
|
+ def insert_each_article(self, category, item: Dict) -> None:
|
|
|
"""
|
|
|
提取文章信息
|
|
|
- :param article_info:
|
|
|
+ :param item
|
|
|
+ :param category
|
|
|
:return:
|
|
|
"""
|
|
|
item_id = item.get('item_id')
|
|
@@ -103,7 +111,7 @@ class ToutiaoRecommendCrawler(object):
|
|
|
params=(
|
|
|
"toutiao",
|
|
|
"recommend",
|
|
|
- "history",
|
|
|
+ category,
|
|
|
user_id,
|
|
|
title,
|
|
|
article_url,
|
|
@@ -117,10 +125,11 @@ class ToutiaoRecommendCrawler(object):
|
|
|
)
|
|
|
)
|
|
|
|
|
|
- def process_recommendation(self, recommendation) -> Dict:
|
|
|
+ def process_recommendation(self, category, recommendation) -> Dict:
|
|
|
"""
|
|
|
处理推荐流文章
|
|
|
- :param recommendation:
|
|
|
+ :param recommendation
|
|
|
+ :param category
|
|
|
:return:
|
|
|
"""
|
|
|
for item in tqdm(recommendation['data']):
|
|
@@ -128,13 +137,14 @@ class ToutiaoRecommendCrawler(object):
|
|
|
video_flag = item.get('has_video')
|
|
|
if not video_flag:
|
|
|
try:
|
|
|
- self.insert_each_article(item)
|
|
|
+ self.insert_each_article(category=category, item=item)
|
|
|
except Exception as e:
|
|
|
error_data = {
|
|
|
"error": str(e),
|
|
|
"error_stack": traceback.format_exc()
|
|
|
}
|
|
|
log(
|
|
|
+ function='toutiao_recommend_crawler',
|
|
|
task='toutiao_recommend',
|
|
|
message='头条推荐流文章插入失败',
|
|
|
data=error_data,
|
|
@@ -144,16 +154,16 @@ class ToutiaoRecommendCrawler(object):
|
|
|
print("视频文章跳过")
|
|
|
else:
|
|
|
print("无链接文章跳过")
|
|
|
-
|
|
|
- def run(self) -> None:
|
|
|
+
|
|
|
+ def run(self, category) -> None:
|
|
|
"""
|
|
|
主函数
|
|
|
:return:
|
|
|
"""
|
|
|
for i in range(10):
|
|
|
try:
|
|
|
- article_list = self.get_history_recommendation()
|
|
|
- self.process_recommendation(article_list)
|
|
|
+ article_list = self.get_recommendation_article_list(category=category)
|
|
|
+ self.process_recommendation(category=category, recommendation=article_list)
|
|
|
time.sleep(3)
|
|
|
except Exception as e:
|
|
|
error_data = {
|
|
@@ -161,3 +171,4 @@ class ToutiaoRecommendCrawler(object):
|
|
|
"error_stack": traceback.format_exc()
|
|
|
}
|
|
|
print(error_data)
|
|
|
+
|