@@ -4,6 +4,7 @@
 """
 import time
 import sys
+import traceback

 from tqdm import tqdm
 from datetime import datetime, timedelta
@@ -13,9 +14,14 @@ from argparse import ArgumentParser
 from applications import longArticlesMySQL, PQMySQL, WeixinSpider, Functions, log, bot

 TASK_NAME = "updateMinigramInfoDaily"
+SPIDER_SUCCESS_STATUS = 0


 def get_yesterday():
+    """
+    get yesterday date
+    :return:
+    """
     yesterday = datetime.today() - timedelta(1)
     return yesterday

@@ -39,9 +45,9 @@ class DailyDataManager(object):
         biz_date_ts = biz_date_midnight.timestamp()
         biz_date_end_ts = biz_date_ts + 24 * 60 * 60 - 1
         sql2 = f"""
-            select ContentUrl, wx_sn, updateTime
+            select ContentUrl, wx_sn, publish_timestamp, accountName, title
             from official_articles_v2
-            where updateTime between {biz_date_ts} and {biz_date_end_ts};
+            where publish_timestamp between {biz_date_ts} and {biz_date_end_ts};
             -- and accountName in (
             --     select distinct account_name from account_avg_info_v2
             -- );
@@ -60,56 +66,83 @@ class DailyDataManager(object):
         update info into mysql
         :return:
         """
-        try:
-            wx_sn, mini_info, update_time = cls.get_root_source_ids(line)
-            dt_object = datetime.fromtimestamp(update_time)
-            publish_dt = dt_object.strftime('%Y-%m-%d')
-            one_day = timedelta(days=1)
-            two_day = timedelta(days=2)
-            next_day = dt_object + one_day
-            next_next_day = dt_object + two_day
-            recall_dt_list = [dt_object, next_day, next_next_day]
-            recall_dt_str_list = [i.strftime('%Y-%m-%d') for i in recall_dt_list]
-            for dt_str in recall_dt_str_list:
-                for index, item in enumerate(mini_info, 1):
-                    image_url = item['image_url']
-                    nick_name = item['nike_name']
-                    root_source_id = item['path'].split("rootSourceId%3D")[-1]
-                    video_id = item['path'].split("videos%3Fid%3D")[1].split("%26su%3D")[0]
-                    kimi_title = item['title']
-                    # print(image_url, nick_name, root_source_id, video_id, kimi_title)
-                    insert_sql = f"""
-                        INSERT INTO long_articles_detail_info
-                        (wx_sn, mini_title, mini_name, cover_url, video_index, root_source_id, video_id, publish_dt, recall_dt)
-                        values
-                        (%s, %s, %s, %s, %s, %s, %s, %s, %s);
-                    """
-                    cls.pq_db.update(
-                        sql=insert_sql,
-                        params=(
-                            wx_sn,
-                            kimi_title,
-                            nick_name,
-                            image_url,
-                            index,
-                            root_source_id,
-                            video_id,
-                            publish_dt,
-                            dt_str
-                        )
-                    )
+        url = line[0]
+        update_time = line[2]
+        wx_sn = line[1].decode()
+        article_detail = cls.get_root_source_ids(line)
+        if article_detail:
+            response_code = article_detail['code']
+            if response_code == SPIDER_SUCCESS_STATUS:
+                mini_info = article_detail['data']['data']['mini_program']
+                if mini_info:
                     log(
                         task=TASK_NAME,
-                        function="update_article_info",
-                        message="插入数据成功, video_id 是: {}".format(video_id)
+                        function="get_root_source_ids",
+                        message="获取文章链接对应的 rootSourceId 成功",
+                        data={
+                            "ContentUrl": url,
+                            "wxSn": wx_sn,
+                            "updateTime": update_time,
+                            "miniInfo": mini_info
+                        }
                     )
-        except Exception as e:
-            log(
-                task=TASK_NAME,
-                function="update_article_info",
-                status="fail",
-                message="插入数据失败, 失败原因是".format(e)
-            )
+                    try:
+                        dt_object = datetime.fromtimestamp(update_time)
+                        publish_dt = dt_object.strftime('%Y-%m-%d')
+                        one_day = timedelta(days=1)
+                        two_day = timedelta(days=2)
+                        next_day = dt_object + one_day
+                        next_next_day = dt_object + two_day
+                        recall_dt_list = [dt_object, next_day, next_next_day]
+                        recall_dt_str_list = [i.strftime('%Y-%m-%d') for i in recall_dt_list]
+                        for dt_str in recall_dt_str_list:
+                            for index, item in enumerate(mini_info, 1):
+                                image_url = item['image_url']
+                                nick_name = item['nike_name']
+                                root_source_id = item['path'].split("rootSourceId%3D")[-1]
+                                video_id = item['path'].split("videos%3Fid%3D")[1].split("%26su%3D")[0]
+                                kimi_title = item['title']
+                                # print(image_url, nick_name, root_source_id, video_id, kimi_title)
+                                insert_sql = f"""
+                                    INSERT INTO long_articles_detail_info
+                                    (wx_sn, mini_title, mini_name, cover_url, video_index, root_source_id, video_id, publish_dt, recall_dt)
+                                    values
+                                    (%s, %s, %s, %s, %s, %s, %s, %s, %s);
+                                """
+                                cls.pq_db.update(
+                                    sql=insert_sql,
+                                    params=(
+                                        wx_sn,
+                                        kimi_title,
+                                        nick_name,
+                                        image_url,
+                                        index,
+                                        root_source_id,
+                                        video_id,
+                                        publish_dt,
+                                        dt_str
+                                    )
+                                )
+                                log(
+                                    task=TASK_NAME,
+                                    function="update_article_info",
+                                    message="插入数据成功, video_id 是: {}".format(video_id)
+                                )
+                    except Exception as e:
+                        error_msg = traceback.format_exc()
+                        log(
+                            task=TASK_NAME,
+                            function="update_article_info",
+                            status="fail",
+                            message="插入数据失败, 失败原因是{}--{}".format(e, error_msg)
+                        )
+
+                    return None
+            else:
+                return line
+
+        else:
+            return line

     @classmethod
     def get_root_source_ids(cls, data_info):
@@ -120,19 +153,7 @@ class DailyDataManager(object):
         url = data_info[0]
         try:
             article_detail = cls.wx_spider.get_article_text(url)
-            mini_info = article_detail['data']['data']['mini_program']
-            log(
-                task=TASK_NAME,
-                function="get_root_source_ids",
-                message="获取文章链接对应的 rootSourceId 成功",
-                data={
-                    "ContentUrl": url,
-                    "wxSn": data_info[1].decode(),
-                    "createTime": data_info[2],
-                    "miniInfo": mini_info
-                }
-            )
-            return data_info[1].decode(), mini_info, data_info[2]
+            return article_detail
         except Exception as e:
             log(
                 task=TASK_NAME,
@@ -143,7 +164,7 @@ class DailyDataManager(object):
                     "ContentUrl": url
                 }
             )
-        return
+        return False

     @classmethod
     def get_minigram_info(cls, rootSourceId):
@@ -302,8 +323,20 @@ def updateArticlesJob(biz_date=None):
         biz_date = get_yesterday()
     data_manager = DailyDataManager()
     article_list = data_manager.get_published_articles(biz_date)
+    failed_article_list = []
     for article in tqdm(article_list):
-        data_manager.update_article_info(article)
+        failed_article = data_manager.update_article_info(article)
+        if failed_article:
+            failed_article_list.append(failed_article)
+
+    # 重试
+    second_try_fail_article_list = []
+    if failed_article_list:
+        for article in tqdm(failed_article_list):
+            second_failed_article = data_manager.update_article_info(article)
+            if second_failed_article:
+                second_try_fail_article_list.append(second_failed_article)
+
     log(
         task=TASK_NAME,
         function="updateArticlesJob",
@@ -312,10 +345,22 @@ def updateArticlesJob(biz_date=None):
     bot(
         title="更新文章任务完成",
         detail={
-            "finish_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            "finish_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         },
         mention=False
     )
+    if second_try_fail_article_list:
+        bot(
+            title="更新文章任务存在文章抓取失败",
+            detail=[
+                {
+                    "account": line[3],
+                    "title": line[4],
+                    "url": line[0]
+                }
+                for line in second_try_fail_article_list
+            ]
+        )


 def updateMinigramInfoJob(biz_date=None):
@@ -343,13 +388,17 @@ def updateMinigramInfoJob(biz_date=None):
     bot(
         title="更新小程序信息任务完成",
         detail={
-            "finish_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            "finish_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         },
         mention=False
     )


 def main():
+    """
+    main function
+    :return:
+    """
     parser = ArgumentParser()
     parser.add_argument("--run-date",
                         help="Run only once for date in format of %Y%m%d. \
@@ -369,11 +418,6 @@ def main():
         while True:
             schedule.run_pending()
             time.sleep(1)
-            # log(
-            #     task=TASK_NAME,
-            #     function="main",
-            #     message="更新文章小程序信息任务正常执行"
-            # )


 if __name__ == '__main__':