|
@@ -25,29 +25,28 @@ functions = Functions()
|
|
|
|
|
|
TASK_NAME = "updateMinigramInfoDaily"
|
|
|
ARTICLE_TABLE = "official_articles_v2"
|
|
|
-DETAIL_TABLE = "long_articles_detail_info_dev"
|
|
|
+DETAIL_TABLE = "long_articles_detail_info"
|
|
|
EMPTY_LIST = []
|
|
|
+EMPTY_DICT = {}
|
|
|
|
|
|
|
|
|
-def get_root_source_id_list(mini_program: List[Dict]) -> List[str]:
|
|
|
def extract_path(path: str) -> Dict[str, str]:
    """
    Extract the video id and root source id from a mini program path.

    The query string of ``path`` is parsed for a ``jumpPage`` parameter.
    Its (URL-decoded) value is parsed as a second query string, from which
    the ``pages/user-videos?id`` and ``rootSourceId`` entries are read.

    :param path: mini program path containing a ``jumpPage`` query parameter
    :return: ``{"video_id": ..., "root_source_id": ...}`` when both values
        are present; otherwise a new empty dict
    """
    params = parse_qs(urlparse(path).query)
    jump_page = params.get('jumpPage', [None])[0]
    if not jump_page:
        # Return a fresh dict rather than a shared module-level constant so a
        # caller mutating the result cannot affect later "empty" results.
        return {}

    # parse_qs splits on '&' / '=' only, so the page route before the first
    # '=' becomes part of the first key: 'pages/user-videos?id'.
    jump_params = parse_qs(jump_page)
    video_id = jump_params.get('pages/user-videos?id', [None])[0]
    root_source_id = jump_params.get('rootSourceId', [None])[0]
    if not video_id or not root_source_id:
        # Same "nothing extracted" result as a missing jumpPage, instead of
        # raising KeyError for one failure mode and returning {} for the other.
        return {}

    return {
        "video_id": video_id,
        "root_source_id": root_source_id,
    }
|
|
|
|
|
|
|
|
|
def get_article_mini_program_info(content_url: str) -> List[Dict]:
|
|
@@ -105,9 +104,11 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
:return:
|
|
|
"""
|
|
|
sql = f"""
|
|
|
- SELECT ContentUrl, wx_sn
|
|
|
- FROM {ARTICLE_TABLE}
|
|
|
- WHERE publish_timestamp IN {(const.DEFAULT_STATUS, const.REQUEST_FAIL_STATUS)};"""
|
|
|
+ SELECT ContentUrl, wx_sn
|
|
|
+ FROM {ARTICLE_TABLE}
|
|
|
+ WHERE publish_timestamp IN {(const.DEFAULT_STATUS, const.REQUEST_FAIL_STATUS)};
|
|
|
+ """
|
|
|
+
|
|
|
response = self.piaoquan_crawler_db_client.fetch(sql, cursor_type=DictCursor)
|
|
|
return response
|
|
|
|
|
@@ -127,7 +128,7 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
if result:
|
|
|
return result[0]
|
|
|
else:
|
|
|
- return {}
|
|
|
+ return EMPTY_DICT
|
|
|
|
|
|
def get_articles_published_yesterday(self, biz_date: str) -> List[Dict]:
|
|
|
"""
|
|
@@ -206,8 +207,10 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
for video_index, mini_item in enumerate(article_mini_program_detail, 1):
|
|
|
image_url = mini_item['image_url']
|
|
|
nick_name = mini_item['nike_name']
|
|
|
- root_source_id = mini_item['path'].split("rootSourceId%3D")[-1]
|
|
|
- video_id = mini_item['path'].split("videos%3Fid%3D")[1].split("%26su%3D")[0]
|
|
|
+ # extract video id and root_source_id
|
|
|
+ id_info = extract_path(mini_item['path'])
|
|
|
+ root_source_id = id_info['root_source_id']
|
|
|
+ video_id = id_info['video_id']
|
|
|
kimi_title = mini_item['title']
|
|
|
self.insert_each_root_source_id(
|
|
|
wx_sn=wx_sn,
|
|
@@ -220,7 +223,7 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
publish_dt=publish_date.strftime('%Y-%m-%d'),
|
|
|
recall_dt=date_str
|
|
|
)
|
|
|
- return {}
|
|
|
+ return EMPTY_DICT
|
|
|
except Exception as e:
|
|
|
error_msg = traceback.format_exc()
|
|
|
log(
|
|
@@ -323,7 +326,7 @@ class UpdatePublishedArticlesMinigramDetail(object):
|
|
|
|
|
|
def update_mini_program_detail_job(self, biz_date=None):
|
|
|
"""
|
|
|
- 更新裂变信息
|
|
|
+ update mini program detail info
|
|
|
:param biz_date:
|
|
|
:return:
|
|
|
"""
|