|
@@ -10,6 +10,7 @@ from typing import List, Dict
|
|
from tqdm import tqdm
|
|
from tqdm import tqdm
|
|
from pymysql.cursors import DictCursor
|
|
from pymysql.cursors import DictCursor
|
|
|
|
|
|
|
|
+from config import apolloConfig
|
|
from applications import bot
|
|
from applications import bot
|
|
from applications import log
|
|
from applications import log
|
|
from applications import Functions
|
|
from applications import Functions
|
|
@@ -18,8 +19,9 @@ from applications import longArticlesMySQL
|
|
from applications.const import WeixinVideoCrawlerConst
|
|
from applications.const import WeixinVideoCrawlerConst
|
|
|
|
|
|
spider = WeixinSpider()
|
|
spider = WeixinSpider()
|
|
-const = WeixinVideoCrawlerConst()
|
|
|
|
functions = Functions()
|
|
functions = Functions()
|
|
|
|
+config = apolloConfig(env="prod")
|
|
|
|
+const = WeixinVideoCrawlerConst()
|
|
|
|
|
|
|
|
|
|
class WeixinVideoCrawler(object):
|
|
class WeixinVideoCrawler(object):
|
|
@@ -29,6 +31,31 @@ class WeixinVideoCrawler(object):
|
|
|
|
|
|
def __init__(self):
|
|
def __init__(self):
|
|
self.db_client = longArticlesMySQL()
|
|
self.db_client = longArticlesMySQL()
|
|
|
|
+ self.festival_list = json.loads(config.getConfigValue("festival"))
|
|
|
|
+
|
|
|
|
+ def is_festival(self, title: str) -> bool:
|
|
|
|
+ """
|
|
|
|
+ 判断是否为节假日
|
|
|
|
+ :param title:
|
|
|
|
+ :return:
|
|
|
|
+ """
|
|
|
|
+ for festival in self.festival_list:
|
|
|
|
+ if festival in title:
|
|
|
|
+ return True
|
|
|
|
+ return False
|
|
|
|
+
|
|
|
|
+ def get_title_status(self, title: str) -> int:
|
|
|
|
+ """
|
|
|
|
+ 通过标题获取文章状态
|
|
|
|
+ :param title:
|
|
|
|
+ :return:
|
|
|
|
+ """
|
|
|
|
+ if self.is_festival(title):
|
|
|
|
+ return const.TITLE_FESTIVAL_STATUS
|
|
|
|
+ elif len(title) < const.TITLE_MIN_LENGTH:
|
|
|
|
+ return const.TITLE_SHORT_STATUS
|
|
|
|
+ else:
|
|
|
|
+ return const.TITLE_DEFAULT_STATUS
|
|
|
|
|
|
def update_account_latest_crawler_timestamp(self, gh_id: str) -> int:
|
|
def update_account_latest_crawler_timestamp(self, gh_id: str) -> int:
|
|
"""
|
|
"""
|
|
@@ -142,11 +169,12 @@ class WeixinVideoCrawler(object):
|
|
show_stat = functions.show_desc_to_sta(show_desc)
|
|
show_stat = functions.show_desc_to_sta(show_desc)
|
|
read_cnt = show_stat.get("show_view_count", 0)
|
|
read_cnt = show_stat.get("show_view_count", 0)
|
|
like_cnt = show_stat.get("show_like_count", 0)
|
|
like_cnt = show_stat.get("show_like_count", 0)
|
|
|
|
+ title_status = self.get_title_status(title)
|
|
insert_sql = f"""
|
|
insert_sql = f"""
|
|
INSERT INTO publish_single_video_source
|
|
INSERT INTO publish_single_video_source
|
|
- (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_index, article_publish_type, article_url, cover_url, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5)
|
|
|
|
|
|
+ (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_index, article_publish_type, article_url, cover_url, video_oss_path, bad_status, publish_timestamp, crawler_timestamp, url_unique_md5)
|
|
values
|
|
values
|
|
- (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
|
|
|
|
|
|
+ (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
|
|
"""
|
|
"""
|
|
try:
|
|
try:
|
|
self.db_client.update(
|
|
self.db_client.update(
|
|
@@ -163,6 +191,7 @@ class WeixinVideoCrawler(object):
|
|
article_url,
|
|
article_url,
|
|
cover_url,
|
|
cover_url,
|
|
oss_path,
|
|
oss_path,
|
|
|
|
+ title_status,
|
|
create_time,
|
|
create_time,
|
|
int(time.time()),
|
|
int(time.time()),
|
|
url_unique
|
|
url_unique
|