@@ -3,7 +3,6 @@
 @description Update Minigram Info Daily
 """
 import time
-import sys
 import traceback

 from tqdm import tqdm
@@ -11,7 +10,9 @@ from datetime import datetime, timedelta
 import schedule
 from argparse import ArgumentParser

-from applications import longArticlesMySQL, PQMySQL, WeixinSpider, Functions, log, bot
+from applications import WeixinSpider, Functions, log, bot
+from applications.db import DatabaseConnector
+from config import long_articles_config, piaoquan_crawler_config

 TASK_NAME = "updateMinigramInfoDaily"
 SPIDER_SUCCESS_STATUS = 0
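The patch swaps the ad-hoc longArticlesMySQL and PQMySQL helpers for a single
DatabaseConnector driven by per-database config objects. The connector's
implementation is not part of this diff; judging only by its call sites below
(connect(), fetch(sql), save(query=..., params=...)), a minimal sketch might
look like the following, assuming the config objects are pymysql-style
connection kwargs:

    import pymysql

    class DatabaseConnector:
        """Sketch of the connector this patch adopts (assumed, not shown in the diff)."""

        def __init__(self, config):
            self.config = config  # e.g. {"host": ..., "user": ..., "password": ..., "db": ...}
            self.connection = None

        def connect(self) -> None:
            # open the underlying MySQL connection
            self.connection = pymysql.connect(**self.config)

        def fetch(self, query):
            # read path: run a query and return all rows
            with self.connection.cursor() as cursor:
                cursor.execute(query)
                return cursor.fetchall()

        def save(self, query, params):
            # write path: run a parameterized statement and commit
            with self.connection.cursor() as cursor:
                cursor.execute(query, params)
            self.connection.commit()

Centralizing connection handling this way also lets the script connect lazily
(see init_database below) instead of at import time, as the removed class
attributes did.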
@@ -30,13 +31,35 @@ class DailyDataManager(object):
     """
     daily data update job
     """
-    long_articles_db = longArticlesMySQL()
-    pq_db = PQMySQL()
-    wx_spider = WeixinSpider()
-    functions = Functions()

-    @classmethod
-    def get_published_articles(cls, biz_date):
+    def __init__(self):
+        self.piaoquan_crawler_db_client = None
+        self.long_articles_db_client = None
+        self.spider = WeixinSpider()
+
+    def init_database(self) -> None:
+        """
+        init database connector
+        :return:
+        """
+        # initialize the database connections
+        try:
+            self.piaoquan_crawler_db_client = DatabaseConnector(piaoquan_crawler_config)
+            self.piaoquan_crawler_db_client.connect()
+            self.long_articles_db_client = DatabaseConnector(long_articles_config)
+            self.long_articles_db_client.connect()
+        except Exception as e:
+            error_msg = traceback.format_exc()
+            bot(
+                title="更新小程序裂变信息任务连接数据库失败",
+                detail={
+                    "error": e,
+                    "msg": error_msg
+                }
+            )
+            return
+
+    def get_published_articles(self, biz_date):
         """
         fetch published articles; updateTime is the previous day's 00:00, converted to a timestamp
         :return:
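With connections now created per instance, callers get a two-step lifecycle:
construct the manager, then call init_database() before any query method (a
hypothetical call site, not shown in this patch):

    manager = DailyDataManager()
    manager.init_database()  # connects both clients, or sends a bot alert on failure

Note that init_database() swallows the connection error after alerting: it
returns None either way and leaves both clients as None, so a later fetch()
or save() will fail with AttributeError rather than a clear "not connected"
error.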
@@ -48,11 +71,8 @@ class DailyDataManager(object):
             select ContentUrl, wx_sn, publish_timestamp, accountName, title
             from official_articles_v2
             where publish_timestamp between {biz_date_ts} and {biz_date_end_ts};
-            -- and accountName in (
-            --     select distinct account_name from account_avg_info_v2
-            -- );
         """
-        result_list = cls.pq_db.select(sql2)
+        result_list = self.piaoquan_crawler_db_client.fetch(sql2)
         log(
             task=TASK_NAME,
             function="get_published_articles",
@@ -60,8 +80,7 @@ class DailyDataManager(object):
         )
         return result_list

-    @classmethod
-    def update_article_info(cls, line):
+    def update_article_info(self, line):
         """
         update info into mysql
         :return:
@@ -69,7 +88,7 @@ class DailyDataManager(object):
         url = line[0]
         update_time = line[2]
         wx_sn = line[1].decode()
-        article_detail = cls.get_root_source_ids(line)
+        article_detail = self.get_root_source_ids(line)
         if article_detail:
             response_code = article_detail['code']
             if response_code == SPIDER_SUCCESS_STATUS:
@@ -109,8 +128,8 @@ class DailyDataManager(object):
                     values
                     (%s, %s, %s, %s, %s, %s, %s, %s, %s);
                 """
-                cls.pq_db.update(
-                    sql=insert_sql,
+                self.piaoquan_crawler_db_client.save(
+                    query=insert_sql,
                     params=(
                         wx_sn,
                         kimi_title,
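The write path changes name and keyword along with it: PQMySQL.update(sql=...)
becomes DatabaseConnector.save(query=...). The %s placeholders are unchanged,
so values are still bound by the driver rather than interpolated into the SQL
string. For illustration (hypothetical table and columns; db_client stands in
for either connector instance):

    query = "insert into demo_table (col_a, col_b) values (%s, %s);"
    db_client.save(query=query, params=("value-a", 42))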
@@ -144,15 +163,14 @@ class DailyDataManager(object):
         else:
             return line

-    @classmethod
-    def get_root_source_ids(cls, data_info):
+    def get_root_source_ids(self, data_info):
         """
         fetch data_info via the spider interface
         :return:
         """
         url = data_info[0]
         try:
-            article_detail = cls.wx_spider.get_article_text(url)
+            article_detail = self.spider.get_article_text(url)
             return article_detail
         except Exception as e:
             log(
@@ -166,8 +184,7 @@ class DailyDataManager(object):
             )
             return False

-    @classmethod
-    def get_minigram_info(cls, rootSourceId):
+    def get_minigram_info(self, rootSourceId):
         """

         :param rootSourceId:
@@ -178,7 +195,7 @@ class DailyDataManager(object):
             from changwen_data_base_v2
             where rootsourceid = '{rootSourceId}';
         """
-        result_list = cls.long_articles_db.select(sql)
+        result_list = self.long_articles_db_client.fetch(sql)

         def summarize(values):
             """
@@ -255,8 +272,7 @@ class DailyDataManager(object):
             )
             return None

-    @classmethod
-    def update_minigram_detail(cls, biz_date):
+    def update_minigram_detail(self, biz_date):
         """
         :return:
         """
@@ -269,7 +285,7 @@ class DailyDataManager(object):
             from long_articles_detail_info
             where publish_dt between '{datestr_begin}' and '{datestr_end}';
         """
-        source_id_list = cls.pq_db.select(sql)
+        source_id_list = self.piaoquan_crawler_db_client.fetch(sql)
         log(
             task=TASK_NAME,
             function="update_minigram_detail",
@@ -279,7 +295,7 @@ class DailyDataManager(object):
         for item in tqdm(source_id_list):
             s_id = item[0]
             try:
-                result = cls.get_minigram_info(s_id)
+                result = self.get_minigram_info(s_id)
                 for key in result:
                     recall_dt = key
                     first_level = result[key][0]
@@ -293,8 +309,8 @@ class DailyDataManager(object):
                     where root_source_id = %s and recall_dt = %s;
                     """
                     try:
-                        cls.pq_db.update(
-                            sql=update_sql,
+                        self.piaoquan_crawler_db_client.save(
+                            query=update_sql,
                             params=(
                                 first_level, fission_0, fission_1, fission_2, s_id, recall_dt
                             )
@@ -319,7 +335,7 @@ class DailyDataManager(object):
         bot(
             title="{} fail because of lam db error".format(TASK_NAME),
             detail={
-                    "fail_count": fail_count
+                "fail_count": fail_count
             }
         )
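The entry point is untouched by this patch, but the schedule and time imports
suggest the surrounding main wires the manager up roughly as follows (a sketch
under that assumption; the driver name and run time are hypothetical):

    def update_daily(biz_date=None):
        # assumed driver: connect once, then run both update passes
        if biz_date is None:
            biz_date = datetime.today()
        manager = DailyDataManager()
        manager.init_database()
        for line in tqdm(manager.get_published_articles(biz_date)):
            manager.update_article_info(line)
        manager.update_minigram_detail(biz_date)

    if __name__ == "__main__":
        schedule.every().day.at("03:00").do(update_daily)
        while True:
            schedule.run_pending()
            time.sleep(1)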