|
@@ -1,6 +1,7 @@
|
|
|
"""
|
|
|
@author: luojunhui
|
|
|
"""
|
|
|
+
|
|
|
import json
|
|
|
import traceback
|
|
|
|
|
@@ -8,21 +9,16 @@ from datetime import datetime, timedelta
|
|
|
from typing import List, Dict
|
|
|
|
|
|
from tqdm import tqdm
|
|
|
-from urllib.parse import urlparse, parse_qs
|
|
|
from pymysql.cursors import DictCursor
|
|
|
|
|
|
-from applications import bot
|
|
|
-from applications import log
|
|
|
-from applications import Functions
|
|
|
-from applications import WeixinSpider
|
|
|
+from applications import bot, log
|
|
|
from applications.db import DatabaseConnector
|
|
|
-from applications.const import UpdateMiniProgramDetailConst
|
|
|
from applications.exception import SpiderError
|
|
|
+from applications.utils import extract_root_source_id
|
|
|
+
|
|
|
+from cold_start.crawler.wechat import get_article_detail
|
|
|
from config import long_articles_config, piaoquan_crawler_config
|
|
|
|
|
|
-const = UpdateMiniProgramDetailConst()
|
|
|
-spider = WeixinSpider()
|
|
|
-functions = Functions()
|
|
|
|
|
|
TASK_NAME = "updateMinigramInfoDaily"
|
|
|
ARTICLE_TABLE = "official_articles_v2"
|
|
@@ -31,26 +27,21 @@ EMPTY_LIST = []
|
|
|
EMPTY_DICT = {}
|
|
|
|
|
|
|
|
|
-def extract_path(path: str) -> Dict:
|
|
|
- """
|
|
|
- 提取path参数
|
|
|
- :param path:
|
|
|
- :return:
|
|
|
- """
|
|
|
- params = parse_qs(urlparse(path).query)
|
|
|
- jump_page = params.get('jumpPage', [None])[0]
|
|
|
- if jump_page:
|
|
|
- params2 = parse_qs(jump_page)
|
|
|
- res = {
|
|
|
- "video_id": params2['pages/user-videos?id'][0],
|
|
|
- "root_source_id": params2['rootSourceId'][0],
|
|
|
- }
|
|
|
- return res
|
|
|
- else:
|
|
|
- return EMPTY_DICT
|
|
|
-
|
|
|
-
|
|
|
-class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
+class Const:
|
|
|
+ ARTICLE_SUCCESS_CODE = 0
|
|
|
+ # 记录默认状态
|
|
|
+ DEFAULT_STATUS = 0
|
|
|
+ # 请求接口失败状态
|
|
|
+ REQUEST_FAIL_STATUS = -1
|
|
|
+ # 文章被删除状态
|
|
|
+ DELETE_STATUS = -2
|
|
|
+ # 未知原因无信息返回状态
|
|
|
+ UNKNOWN_STATUS = -3
|
|
|
+ # 文章违规状态
|
|
|
+ ILLEGAL_STATUS = -4
|
|
|
+
|
|
|
+
|
|
|
+class UpdatePublishedArticlesMinigramDetail(Const):
|
|
|
"""
|
|
|
更新已发布文章数据
|
|
|
"""
|
|
@@ -74,10 +65,7 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
error_msg = traceback.format_exc()
|
|
|
bot(
|
|
|
title="更新小程序裂变信息任务连接数据库失败",
|
|
|
- detail={
|
|
|
- "error": e,
|
|
|
- "msg": error_msg
|
|
|
- }
|
|
|
+ detail={"error": e, "msg": error_msg},
|
|
|
)
|
|
|
return
|
|
|
|
|
@@ -89,7 +77,7 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
sql = f"""
|
|
|
SELECT ContentUrl, wx_sn
|
|
|
FROM {ARTICLE_TABLE}
|
|
|
- WHERE publish_timestamp IN {(const.DEFAULT_STATUS, const.REQUEST_FAIL_STATUS)};
|
|
|
+ WHERE publish_timestamp IN {(self.DEFAULT_STATUS, self.REQUEST_FAIL_STATUS)};
|
|
|
"""
|
|
|
|
|
|
response = self.piaoquan_crawler_db_client.fetch(sql, cursor_type=DictCursor)
|
|
@@ -124,10 +112,23 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
WHERE FROM_UNIXTIME(publish_timestamp)
|
|
|
BETWEEN DATE_SUB('{biz_date}', INTERVAL 1 DAY) AND DATE_SUB('{biz_date}', INTERVAL 1 SECOND);
|
|
|
"""
|
|
|
- article_list = self.piaoquan_crawler_db_client.fetch(query=sql, cursor_type=DictCursor)
|
|
|
+ article_list = self.piaoquan_crawler_db_client.fetch(
|
|
|
+ query=sql, cursor_type=DictCursor
|
|
|
+ )
|
|
|
return article_list
|
|
|
|
|
|
- def insert_each_root_source_id(self, wx_sn, mini_title, mini_name, cover_url, video_index, root_source_id, video_id, publish_dt, recall_dt) -> int:
|
|
|
+ def insert_each_root_source_id(
|
|
|
+ self,
|
|
|
+ wx_sn,
|
|
|
+ mini_title,
|
|
|
+ mini_name,
|
|
|
+ cover_url,
|
|
|
+ video_index,
|
|
|
+ root_source_id,
|
|
|
+ video_id,
|
|
|
+ publish_dt,
|
|
|
+ recall_dt,
|
|
|
+ ) -> int:
|
|
|
"""
|
|
|
:param recall_dt: 召回日期
|
|
|
:param publish_dt: 文章发布日期
|
|
@@ -149,8 +150,16 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
affected_rows = self.piaoquan_crawler_db_client.save(
|
|
|
query=insert_sql,
|
|
|
params=(
|
|
|
- wx_sn, mini_title, mini_name, cover_url, video_index, root_source_id, video_id, publish_dt, recall_dt
|
|
|
- )
|
|
|
+ wx_sn,
|
|
|
+ mini_title,
|
|
|
+ mini_name,
|
|
|
+ cover_url,
|
|
|
+ video_index,
|
|
|
+ root_source_id,
|
|
|
+ video_id,
|
|
|
+ publish_dt,
|
|
|
+ recall_dt,
|
|
|
+ ),
|
|
|
)
|
|
|
return affected_rows
|
|
|
|
|
@@ -161,13 +170,19 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
:param article_info:
|
|
|
:return:
|
|
|
"""
|
|
|
- url = article_info['ContentUrl']
|
|
|
- publish_timestamp = article_info['publish_timestamp']
|
|
|
- wx_sn = article_info['wx_sn'].decode()
|
|
|
- root_source_id_list = json.loads(article_info['root_source_id_list'] if article_info['root_source_id_list'] else EMPTY_LIST)
|
|
|
+ url = article_info["ContentUrl"]
|
|
|
+ publish_timestamp = article_info["publish_timestamp"]
|
|
|
+ wx_sn = article_info["wx_sn"].decode()
|
|
|
+ root_source_id_list = json.loads(
|
|
|
+ article_info["root_source_id_list"]
|
|
|
+ if article_info["root_source_id_list"]
|
|
|
+ else EMPTY_LIST
|
|
|
+ )
|
|
|
|
|
|
try:
|
|
|
- article_mini_program_detail = self.get_article_mini_program_info(url, root_source_id_list)
|
|
|
+ article_mini_program_detail = self.get_article_mini_program_info(
|
|
|
+ url, root_source_id_list
|
|
|
+ )
|
|
|
except Exception as e:
|
|
|
return {}
|
|
|
|
|
@@ -180,30 +195,34 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
"ContentUrl": url,
|
|
|
"wxSn": wx_sn,
|
|
|
"publish_timestamp": publish_timestamp,
|
|
|
- "miniInfo": article_mini_program_detail
|
|
|
- }
|
|
|
+ "miniInfo": article_mini_program_detail,
|
|
|
+ },
|
|
|
)
|
|
|
try:
|
|
|
publish_date = datetime.fromtimestamp(publish_timestamp)
|
|
|
# generate T+0, T+1, T+2 date string
|
|
|
recall_dt_str_list = [
|
|
|
- (publish_date + timedelta(days=i)).strftime('%Y-%m-%d')
|
|
|
+ (publish_date + timedelta(days=i)).strftime("%Y-%m-%d")
|
|
|
for i in range(3)
|
|
|
]
|
|
|
|
|
|
for date_str in recall_dt_str_list:
|
|
|
- for video_index, mini_item in enumerate(article_mini_program_detail, 1):
|
|
|
- image_url = mini_item['image_url']
|
|
|
- nick_name = mini_item['nike_name']
|
|
|
+ for video_index, mini_item in enumerate(
|
|
|
+ article_mini_program_detail, 1
|
|
|
+ ):
|
|
|
+ image_url = mini_item["image_url"]
|
|
|
+ nick_name = mini_item["nike_name"]
|
|
|
# extract video id and root_source_id
|
|
|
- if mini_item.get("root_source_id") and mini_item.get("video_id"):
|
|
|
- root_source_id = mini_item['root_source_id']
|
|
|
- video_id = mini_item['video_id']
|
|
|
+ if mini_item.get("root_source_id") and mini_item.get(
|
|
|
+ "video_id"
|
|
|
+ ):
|
|
|
+ root_source_id = mini_item["root_source_id"]
|
|
|
+ video_id = mini_item["video_id"]
|
|
|
else:
|
|
|
- id_info = extract_path(mini_item['path'])
|
|
|
- root_source_id = id_info['root_source_id']
|
|
|
- video_id = id_info['video_id']
|
|
|
- kimi_title = mini_item['title']
|
|
|
+ id_info = extract_root_source_id(mini_item["path"])
|
|
|
+ root_source_id = id_info["root_source_id"]
|
|
|
+ video_id = id_info["video_id"]
|
|
|
+ kimi_title = mini_item["title"]
|
|
|
self.insert_each_root_source_id(
|
|
|
wx_sn=wx_sn,
|
|
|
mini_title=kimi_title,
|
|
@@ -212,8 +231,8 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
video_index=video_index,
|
|
|
root_source_id=root_source_id,
|
|
|
video_id=video_id,
|
|
|
- publish_dt=publish_date.strftime('%Y-%m-%d'),
|
|
|
- recall_dt=date_str
|
|
|
+ publish_dt=publish_date.strftime("%Y-%m-%d"),
|
|
|
+ recall_dt=date_str,
|
|
|
)
|
|
|
return EMPTY_DICT
|
|
|
except Exception as e:
|
|
@@ -222,14 +241,16 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
task=TASK_NAME,
|
|
|
function="record_each_article",
|
|
|
status="fail",
|
|
|
- message="插入数据失败, 失败原因是{}--{}".format(e, error_msg)
|
|
|
+ message="插入数据失败, 失败原因是{}--{}".format(e, error_msg),
|
|
|
)
|
|
|
return article_info
|
|
|
|
|
|
else:
|
|
|
return EMPTY_DICT
|
|
|
|
|
|
- def get_article_mini_program_info(self, content_url: str, root_source_id_list: list) -> List[Dict]:
|
|
|
+ def get_article_mini_program_info(
|
|
|
+ self, content_url: str, root_source_id_list: list
|
|
|
+ ) -> List[Dict]:
|
|
|
"""
|
|
|
获取文章的小程序信息
|
|
|
:return:
|
|
@@ -242,7 +263,7 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
fetch_response = self.long_articles_db_client.fetch(
|
|
|
query=fetch_sql,
|
|
|
params=(tuple(root_source_id_list),),
|
|
|
- cursor_type=DictCursor
|
|
|
+ cursor_type=DictCursor,
|
|
|
)
|
|
|
mini_info = []
|
|
|
if fetch_response:
|
|
@@ -254,23 +275,23 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
"avatar": "https://rescdn.yishihui.com/0temp/logo.png",
|
|
|
"image_url": "",
|
|
|
"nike_name": "票圈 l 3亿人喜欢的视频平台",
|
|
|
- "root_source_id": item['root_source_id'],
|
|
|
- "video_id": item['video_id'],
|
|
|
+ "root_source_id": item["root_source_id"],
|
|
|
+ "video_id": item["video_id"],
|
|
|
"service_type": "0",
|
|
|
"title": "",
|
|
|
- "type": "card"
|
|
|
+ "type": "card",
|
|
|
}
|
|
|
)
|
|
|
return mini_info
|
|
|
|
|
|
try:
|
|
|
- article_detail = spider.get_article_text(content_url)
|
|
|
+ article_detail = get_article_detail(content_url)
|
|
|
except Exception as e:
|
|
|
raise SpiderError(error=e, spider="detail", url=content_url)
|
|
|
|
|
|
- response_code = article_detail['code']
|
|
|
- if response_code == const.ARTICLE_SUCCESS_CODE:
|
|
|
- mini_info = article_detail['data']['data']['mini_program']
|
|
|
+ response_code = article_detail["code"]
|
|
|
+ if response_code == self.ARTICLE_SUCCESS_CODE:
|
|
|
+ mini_info = article_detail["data"]["data"]["mini_program"]
|
|
|
return mini_info
|
|
|
else:
|
|
|
return EMPTY_LIST
|
|
@@ -287,7 +308,9 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
WHERE publish_dt
|
|
|
BETWEEN DATE_SUB('{biz_date}', INTERVAL 3 DAY) AND DATE_SUB('{biz_date}', INTERVAL 1 SECOND);
|
|
|
"""
|
|
|
- article_list = self.piaoquan_crawler_db_client.fetch(query=sql, cursor_type=DictCursor)
|
|
|
+ article_list = self.piaoquan_crawler_db_client.fetch(
|
|
|
+ query=sql, cursor_type=DictCursor
|
|
|
+ )
|
|
|
return article_list
|
|
|
|
|
|
def update_each_root_source_id(self, recall_dt: str, root_source_id: str) -> None:
|
|
@@ -296,7 +319,9 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
:param root_source_id:
|
|
|
:return:
|
|
|
"""
|
|
|
- mini_program_detail = self.get_root_source_id_result(root_source_id=root_source_id, dt=recall_dt)
|
|
|
+ mini_program_detail = self.get_root_source_id_result(
|
|
|
+ root_source_id=root_source_id, dt=recall_dt
|
|
|
+ )
|
|
|
if mini_program_detail:
|
|
|
# do update job
|
|
|
update_sql = f"""
|
|
@@ -310,19 +335,19 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
self.piaoquan_crawler_db_client.save(
|
|
|
query=update_sql,
|
|
|
params=(
|
|
|
- mini_program_detail['first_uv'],
|
|
|
- mini_program_detail['split0'],
|
|
|
- mini_program_detail['split0_head'],
|
|
|
- mini_program_detail['split0_recommend'],
|
|
|
- mini_program_detail['split1'],
|
|
|
- mini_program_detail['split1_head'],
|
|
|
- mini_program_detail['split1_recommend'],
|
|
|
- mini_program_detail['split2'],
|
|
|
- mini_program_detail['split2_head'],
|
|
|
- mini_program_detail['split2_recommend'],
|
|
|
+ mini_program_detail["first_uv"],
|
|
|
+ mini_program_detail["split0"],
|
|
|
+ mini_program_detail["split0_head"],
|
|
|
+ mini_program_detail["split0_recommend"],
|
|
|
+ mini_program_detail["split1"],
|
|
|
+ mini_program_detail["split1_head"],
|
|
|
+ mini_program_detail["split1_recommend"],
|
|
|
+ mini_program_detail["split2"],
|
|
|
+ mini_program_detail["split2_head"],
|
|
|
+ mini_program_detail["split2_recommend"],
|
|
|
root_source_id,
|
|
|
- recall_dt
|
|
|
- )
|
|
|
+ recall_dt,
|
|
|
+ ),
|
|
|
)
|
|
|
else:
|
|
|
return
|
|
@@ -334,11 +359,13 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
:return:
|
|
|
"""
|
|
|
if not biz_date:
|
|
|
- biz_date = datetime.today().strftime('%Y-%m-%d')
|
|
|
+ biz_date = datetime.today().strftime("%Y-%m-%d")
|
|
|
|
|
|
published_article_list = self.get_articles_published_yesterday(biz_date)
|
|
|
failed_article_list = []
|
|
|
- for article_info in tqdm(published_article_list, desc="update_published_articles_job"):
|
|
|
+ for article_info in tqdm(
|
|
|
+ published_article_list, desc="update_published_articles_job"
|
|
|
+ ):
|
|
|
failed_article = self.record_each_article(article_info)
|
|
|
if failed_article:
|
|
|
failed_article_list.append(failed_article)
|
|
@@ -353,22 +380,20 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
|
|
|
bot(
|
|
|
title="更新文章任务完成",
|
|
|
- detail={
|
|
|
- "finish_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
|
- },
|
|
|
- mention=False
|
|
|
+ detail={"finish_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")},
|
|
|
+ mention=False,
|
|
|
)
|
|
|
if second_try_fail_article_list:
|
|
|
bot(
|
|
|
title="更新文章任务存在文章抓取失败",
|
|
|
detail=[
|
|
|
{
|
|
|
- "account": line['accountName'],
|
|
|
- "title": line['title'],
|
|
|
- "url": line['ContentUrl']
|
|
|
+ "account": line["accountName"],
|
|
|
+ "title": line["title"],
|
|
|
+ "url": line["ContentUrl"],
|
|
|
}
|
|
|
for line in second_try_fail_article_list
|
|
|
- ]
|
|
|
+ ],
|
|
|
)
|
|
|
|
|
|
def update_mini_program_detail_job(self, biz_date=None):
|
|
@@ -378,21 +403,24 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
:return:
|
|
|
"""
|
|
|
if not biz_date:
|
|
|
- biz_date = datetime.today().strftime('%Y-%m-%d')
|
|
|
+ biz_date = datetime.today().strftime("%Y-%m-%d")
|
|
|
|
|
|
# get root_source_id_list
|
|
|
root_source_id_obj_list = self.get_root_source_id_for_three_days(biz_date)
|
|
|
log(
|
|
|
task=TASK_NAME,
|
|
|
function="update_minigram_detail",
|
|
|
- message="获取前三天的 rootSourceId, 一共有 {} 条记录".format(len(root_source_id_obj_list))
|
|
|
+ message="获取前三天的 rootSourceId, 一共有 {} 条记录".format(
|
|
|
+ len(root_source_id_obj_list)
|
|
|
+ ),
|
|
|
)
|
|
|
fail_count = 0
|
|
|
- for item in tqdm(root_source_id_obj_list, desc="update_mini_program_detail_job"):
|
|
|
+ for item in tqdm(
|
|
|
+ root_source_id_obj_list, desc="update_mini_program_detail_job"
|
|
|
+ ):
|
|
|
try:
|
|
|
self.update_each_root_source_id(
|
|
|
- root_source_id=item['root_source_id'],
|
|
|
- recall_dt=item['recall_dt']
|
|
|
+ root_source_id=item["root_source_id"], recall_dt=item["recall_dt"]
|
|
|
)
|
|
|
except Exception as e:
|
|
|
log(
|
|
@@ -400,16 +428,12 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
function="update_minigram_detail",
|
|
|
status="fail",
|
|
|
message="更新单条数据失败, 报错信息是 {}".format(e),
|
|
|
- data={"error_msg": traceback.format_exc()}
|
|
|
+ data={"error_msg": traceback.format_exc()},
|
|
|
)
|
|
|
fail_count += 1
|
|
|
|
|
|
if fail_count:
|
|
|
bot(
|
|
|
title="{} fail because of lam db error".format(TASK_NAME),
|
|
|
- detail={
|
|
|
- "fail_count": fail_count
|
|
|
- }
|
|
|
+ detail={"fail_count": fail_count},
|
|
|
)
|
|
|
-
|
|
|
-
|