|
|
@@ -5,7 +5,14 @@ from datetime import datetime, timedelta
|
|
|
from tqdm.asyncio import tqdm
|
|
|
|
|
|
from applications.crawler.wechat import get_article_detail
|
|
|
-from applications.utils import get_beijing_date, handle_spider_exception, transform_to_beijing_date, extract_root_source_id
|
|
|
+from applications.pipeline import insert_into_mini_program_detail_pool
|
|
|
+from applications.utils import (
|
|
|
+ get_beijing_date,
|
|
|
+ handle_spider_exception,
|
|
|
+ transform_to_beijing_date,
|
|
|
+ extract_root_source_id,
|
|
|
+)
|
|
|
+
|
|
|
|
|
|
class Const:
|
|
|
ARTICLE_SUCCESS_CODE = 0
|
|
|
@@ -52,7 +59,9 @@ class RecycleRootSourceIdDetail(Const):
|
|
|
from long_articles_root_source_id
|
|
|
where root_source_id in %s;
|
|
|
"""
|
|
|
- return await self.pool.async_fetch(query=query, params=(tuple(root_source_id_list),))
|
|
|
+ return await self.pool.async_fetch(
|
|
|
+ query=query, params=(tuple(root_source_id_list),)
|
|
|
+ )
|
|
|
|
|
|
async def get_article_mini_program_detail(self, url, root_source_id_list):
|
|
|
if not root_source_id_list:
|
|
|
@@ -70,11 +79,13 @@ class RecycleRootSourceIdDetail(Const):
|
|
|
error=e,
|
|
|
traceback=traceback.format_exc(),
|
|
|
trace_id=self.trace_id,
|
|
|
- task_name=self.__class__.__name__
|
|
|
+ task_name=self.__class__.__name__,
|
|
|
)
|
|
|
return []
|
|
|
else:
|
|
|
- mini_program_info = await self.get_mini_program_info_by_root_source_id(root_source_id_list)
|
|
|
+ mini_program_info = await self.get_mini_program_info_by_root_source_id(
|
|
|
+ root_source_id_list
|
|
|
+ )
|
|
|
if mini_program_info:
|
|
|
return [
|
|
|
{
|
|
|
@@ -87,7 +98,8 @@ class RecycleRootSourceIdDetail(Const):
|
|
|
"service_type": "0",
|
|
|
"title": "",
|
|
|
"type": "card",
|
|
|
- } for item in mini_program_info
|
|
|
+ }
|
|
|
+ for item in mini_program_info
|
|
|
]
|
|
|
else:
|
|
|
return []
|
|
|
@@ -96,10 +108,16 @@ class RecycleRootSourceIdDetail(Const):
|
|
|
url = article["ContentUrl"]
|
|
|
wx_sn = article["wx_sn"].decode("utf-8")
|
|
|
publish_timestamp = article["publish_timestamp"]
|
|
|
- root_source_id_list = json.loads(article["root_source_id_list"]) if article["root_source_id_list"] else []
|
|
|
+ root_source_id_list = (
|
|
|
+ json.loads(article["root_source_id_list"])
|
|
|
+ if article["root_source_id_list"]
|
|
|
+ else []
|
|
|
+ )
|
|
|
|
|
|
# get article mini program info
|
|
|
- article_mini_program_detail = await self.get_article_mini_program_detail(url, root_source_id_list)
|
|
|
+ article_mini_program_detail = await self.get_article_mini_program_detail(
|
|
|
+ url, root_source_id_list
|
|
|
+ )
|
|
|
if not article_mini_program_detail:
|
|
|
return {}
|
|
|
else:
|
|
|
@@ -110,7 +128,6 @@ class RecycleRootSourceIdDetail(Const):
|
|
|
(publish_date + timedelta(days=i)).strftime("%Y-%m-%d")
|
|
|
for i in range(3)
|
|
|
]
|
|
|
-
|
|
|
for date_str in recall_dt_str_list:
|
|
|
for video_index, mini_item in enumerate(
|
|
|
article_mini_program_detail, 1
|
|
|
@@ -128,24 +145,27 @@ class RecycleRootSourceIdDetail(Const):
|
|
|
root_source_id = id_info["root_source_id"]
|
|
|
video_id = id_info["video_id"]
|
|
|
kimi_title = mini_item["title"]
|
|
|
- # self.insert_each_root_source_id(
|
|
|
- # wx_sn=wx_sn,
|
|
|
- # mini_title=kimi_title,
|
|
|
- # mini_name=nick_name,
|
|
|
- # cover_url=image_url,
|
|
|
- # video_index=video_index,
|
|
|
- # root_source_id=root_source_id,
|
|
|
- # video_id=video_id,
|
|
|
- # publish_dt=publish_date.strftime("%Y-%m-%d"),
|
|
|
- # recall_dt=date_str,
|
|
|
- # )
|
|
|
+ await insert_into_mini_program_detail_pool(
|
|
|
+ self.pool,
|
|
|
+ raw={
|
|
|
+ "wx_sn": wx_sn,
|
|
|
+ "mini_title": kimi_title,
|
|
|
+ "root_source_id": root_source_id,
|
|
|
+ "video_id": video_id,
|
|
|
+ "mini_name": nick_name,
|
|
|
+ "cover_url": image_url,
|
|
|
+ "publish_dt": publish_date.strftime("%Y-%m-%d"),
|
|
|
+ "recall_dt": date_str,
|
|
|
+ "video_index": video_index,
|
|
|
+ },
|
|
|
+ )
|
|
|
+
|
|
|
return {}
|
|
|
except Exception as e:
|
|
|
print(e)
|
|
|
error_msg = traceback.format_exc()
|
|
|
return article
|
|
|
|
|
|
-
|
|
|
async def deal(self):
|
|
|
"""deal function"""
|
|
|
# step 1, record articles to detail table
|
|
|
@@ -154,7 +174,3 @@ class RecycleRootSourceIdDetail(Const):
|
|
|
await self.record_single_article(article)
|
|
|
|
|
|
# step2, update root_source_id detail info
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|