|
@@ -1,6 +1,7 @@
|
|
|
"""
|
|
|
@author: luojunhui
|
|
|
"""
|
|
|
+import json
|
|
|
import traceback
|
|
|
|
|
|
from datetime import datetime, timedelta
|
|
@@ -49,24 +50,6 @@ def extract_path(path: str) -> Dict:
|
|
|
return EMPTY_DICT
|
|
|
|
|
|
|
|
|
-def get_article_mini_program_info(content_url: str) -> List[Dict]:
|
|
|
- """
|
|
|
- 获取文章的小程序信息
|
|
|
- :return:
|
|
|
- """
|
|
|
- try:
|
|
|
- article_detail = spider.get_article_text(content_url)
|
|
|
- except Exception as e:
|
|
|
- raise SpiderError(error=e, spider="detail", url=content_url)
|
|
|
-
|
|
|
- response_code = article_detail['code']
|
|
|
- if response_code == const.ARTICLE_SUCCESS_CODE:
|
|
|
- mini_info = article_detail['data']['data']['mini_program']
|
|
|
- return mini_info
|
|
|
- else:
|
|
|
- return EMPTY_LIST
|
|
|
-
|
|
|
-
|
|
|
class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
"""
|
|
|
更新已发布文章数据
|
|
@@ -136,7 +119,7 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
:return:
|
|
|
"""
|
|
|
sql = f"""
|
|
|
- SELECT ContentUrl, wx_sn, publish_timestamp, accountName, title
|
|
|
+ SELECT ContentUrl, wx_sn, publish_timestamp, accountName, title, root_source_id_list
|
|
|
FROM official_articles_v2
|
|
|
WHERE FROM_UNIXTIME(publish_timestamp)
|
|
|
BETWEEN DATE_SUB('{biz_date}', INTERVAL 1 DAY) AND DATE_SUB('{biz_date}', INTERVAL 1 SECOND);
|
|
@@ -181,8 +164,9 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
url = article_info['ContentUrl']
|
|
|
publish_timestamp = article_info['publish_timestamp']
|
|
|
wx_sn = article_info['wx_sn'].decode()
|
|
|
+ root_source_id_list = json.loads(article_info['root_source_id_list'] if article_info['root_source_id_list'] else EMPTY_LIST)
|
|
|
|
|
|
- article_mini_program_detail = get_article_mini_program_info(url)
|
|
|
+ article_mini_program_detail = self.get_article_mini_program_info(url, root_source_id_list)
|
|
|
if article_mini_program_detail:
|
|
|
log(
|
|
|
task=TASK_NAME,
|
|
@@ -208,9 +192,13 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
image_url = mini_item['image_url']
|
|
|
nick_name = mini_item['nike_name']
|
|
|
# extract video id and root_source_id
|
|
|
- id_info = extract_path(mini_item['path'])
|
|
|
- root_source_id = id_info['root_source_id']
|
|
|
- video_id = id_info['video_id']
|
|
|
+ if mini_item.get("root_source_id") and mini_item.get("video_id"):
|
|
|
+ root_source_id = mini_item['root_source_id']
|
|
|
+ video_id = mini_item['video_id']
|
|
|
+ else:
|
|
|
+ id_info = extract_path(mini_item['path'])
|
|
|
+ root_source_id = id_info['root_source_id']
|
|
|
+ video_id = id_info['video_id']
|
|
|
kimi_title = mini_item['title']
|
|
|
self.insert_each_root_source_id(
|
|
|
wx_sn=wx_sn,
|
|
@@ -237,6 +225,52 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
else:
|
|
|
return EMPTY_DICT
|
|
|
|
|
|
def get_article_mini_program_info(self, content_url: str, root_source_id_list: list) -> List[Dict]:
    """
    Return the mini-program card records attached to a published article.

    Two sources are used, in this order:

    1. When ``root_source_id_list`` is non-empty, the ids are looked up in the
       ``long_articles_root_source_id`` table and one card dict is built per
       returned row. The article page is NOT crawled in this case, even when
       the lookup yields no rows (an empty list is returned then).
    2. Otherwise the article detail is fetched through ``spider`` and the
       ``mini_program`` list of the response is returned as-is.

    :param content_url: URL of the published article; only used by the
        crawler fallback.
    :param root_source_id_list: root_source_id values already stored for the
        article; may be empty.
    :return: a list of mini-program dicts. Cards built from the database carry
        ``root_source_id`` and ``video_id`` directly and leave ``image_url``
        and ``title`` empty; ``EMPTY_LIST`` is returned when the crawler
        response code is not ``const.ARTICLE_SUCCESS_CODE``.
    :raises SpiderError: when the crawler call raises any exception.
    """
    if root_source_id_list:
        # The root_source_ids are already known, so resolve their video ids
        # from the database instead of crawling the article.
        # Deliberately a plain (non-f) string: the only placeholder is the
        # driver-side %s, so the values are bound by the DB client rather
        # than interpolated into the SQL text.
        fetch_sql = """
            select video_id, root_source_id from long_articles_root_source_id where root_source_id in %s;
        """
        fetch_response = self.long_articles_db_client.fetch(
            query=fetch_sql,
            # A single parameter holding the whole tuple, bound to the one
            # `in %s` placeholder.
            params=(tuple(root_source_id_list),),
            cursor_type=DictCursor,
        )
        # Build records in the same shape the caller reads from crawled
        # cards. `or []` keeps the "no rows -> empty list" behaviour for a
        # falsy fetch result.
        return [
            {
                "app_id": "wx89e7eb06478361d7",
                "avatar": "https://rescdn.yishihui.com/0temp/logo.png",
                "image_url": "",
                # The key spelling "nike_name" is what the consumer reads;
                # do not "fix" it here alone.
                "nike_name": "票圈 l 3亿人喜欢的视频平台",
                "root_source_id": item['root_source_id'],
                "video_id": item['video_id'],
                "service_type": "0",
                "title": "",
                "type": "card",
            }
            for item in fetch_response or []
        ]

    # No stored root_source_ids: fall back to crawling the article detail.
    try:
        article_detail = spider.get_article_text(content_url)
    except Exception as e:
        # Wrap every crawler failure in the project error type, keeping the
        # original exception as the explicit cause.
        raise SpiderError(error=e, spider="detail", url=content_url) from e

    if article_detail['code'] != const.ARTICLE_SUCCESS_CODE:
        return EMPTY_LIST

    return article_detail['data']['data']['mini_program']
|
|
|
+
|
|
|
def get_root_source_id_for_three_days(self, biz_date: str) -> List[Dict]:
|
|
|
"""
|
|
|
获取publish_dt在 biz_date前三天的root_source_id
|
|
@@ -375,5 +409,3 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
)
|
|
|
|
|
|
|
|
|
-
|
|
|
-
|