|
@@ -10,15 +10,16 @@ import time
|
|
import requests
|
|
import requests
|
|
from hashlib import md5
|
|
from hashlib import md5
|
|
|
|
|
|
-from common.public import get_user_from_mysql
|
|
|
|
from douyin.douyin_recommend import get_xb
|
|
from douyin.douyin_recommend import get_xb
|
|
|
|
|
|
sys.path.append(os.getcwd())
|
|
sys.path.append(os.getcwd())
|
|
|
|
+from common.common import Common
|
|
from common.db import MysqlHelper
|
|
from common.db import MysqlHelper
|
|
from common.feishu import Feishu
|
|
from common.feishu import Feishu
|
|
from common.publish import Publish
|
|
from common.publish import Publish
|
|
|
|
+from common.public import random_title
|
|
from common.userAgent import get_random_user_agent
|
|
from common.userAgent import get_random_user_agent
|
|
-from common.common import Common
|
|
+from common.public import get_user_from_mysql, get_config_from_mysql
|
|
|
|
|
|
|
|
|
|
class DyFollow(object):
|
|
class DyFollow(object):
|
|
@@ -89,7 +90,7 @@ class DyFollow(object):
|
|
Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
|
|
Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
|
|
|
|
|
|
@classmethod
|
|
@classmethod
|
|
- def video_title(cls, log_type, crawler, title):
|
|
+ def video_title(cls, log_type, env, crawler, title):
|
|
title_split1 = title.split(" #")
|
|
title_split1 = title.split(" #")
|
|
if title_split1[0] != "":
|
|
if title_split1[0] != "":
|
|
title1 = title_split1[0]
|
|
title1 = title_split1[0]
|
|
@@ -116,29 +117,10 @@ class DyFollow(object):
|
|
.replace("?", "").replace('"', "").replace("<", "") \
|
|
.replace("?", "").replace('"', "").replace("<", "") \
|
|
.replace(">", "").replace("|", "").replace("@", "").replace('"', '').replace("'", '')[:40]
|
|
.replace(">", "").replace("|", "").replace("@", "").replace('"', '').replace("'", '')[:40]
|
|
if video_title.replace(" ", "") == "" or video_title == "。。。" or video_title == "...":
|
|
if video_title.replace(" ", "") == "" or video_title == "。。。" or video_title == "...":
|
|
- return cls.random_title(log_type, crawler)
|
|
+ return random_title(log_type, crawler, env, text='title')
|
|
else:
|
|
else:
|
|
return video_title
|
|
return video_title
|
|
|
|
|
|
- @classmethod
|
|
|
|
- def random_title(cls, log_type, crawler):
|
|
|
|
- try:
|
|
|
|
- while True:
|
|
|
|
- random_title_sheet = Feishu.get_values_batch(log_type, crawler, 'sPK2oY')
|
|
|
|
- if random_title_sheet is None:
|
|
|
|
- Common.logger(log_type, crawler).warning(f"filter_words_sheet:{random_title_sheet} 10秒钟后重试")
|
|
|
|
- continue
|
|
|
|
- random_title_list = []
|
|
|
|
- for x in random_title_sheet:
|
|
|
|
- for y in x:
|
|
|
|
- if y is None:
|
|
|
|
- pass
|
|
|
|
- else:
|
|
|
|
- random_title_list.append(y)
|
|
|
|
- return random.choice(random_title_list)
|
|
|
|
- except Exception as e:
|
|
|
|
- Common.logger(log_type, crawler).error(f'random_title:{e}\n')
|
|
|
|
-
|
|
|
|
@classmethod
|
|
@classmethod
|
|
def get_videoList(cls, log_type, crawler, strategy, our_uid, out_uid, oss_endpoint, env, machine, rule_dict):
|
|
def get_videoList(cls, log_type, crawler, strategy, our_uid, out_uid, oss_endpoint, env, machine, rule_dict):
|
|
|
|
|
|
@@ -175,13 +157,15 @@ class DyFollow(object):
|
|
for info in aweme_list:
|
|
for info in aweme_list:
|
|
if info.get('is_ads'):
|
|
if info.get('is_ads'):
|
|
continue
|
|
continue
|
|
- publish_time = info['create_time']
|
|
+ publish_time = info.get('create_time')
|
|
|
|
+ if not publish_time:
|
|
|
|
+ continue
|
|
publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
|
|
publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
|
|
publish_day = int((int(time.time()) - publish_time) / (3600 * 24))
|
|
publish_day = int((int(time.time()) - publish_time) / (3600 * 24))
|
|
|
|
|
|
- video_title = cls.video_title(log_type, crawler, info['desc'])
|
|
+ video_title = cls.video_title(log_type, env, crawler, info['desc'])
|
|
if not video_title:
|
|
if not video_title:
|
|
- video_title = cls.random_title(log_type, crawler)
|
|
+ video_title = random_title(log_type, crawler, env, text='title')
|
|
|
|
|
|
video_dict = {'video_title': video_title,
|
|
video_dict = {'video_title': video_title,
|
|
'video_id': info['aweme_id'],
|
|
'video_id': info['aweme_id'],
|
|
@@ -226,11 +210,13 @@ class DyFollow(object):
|
|
@classmethod
|
|
@classmethod
|
|
def download_publish(cls, log_type, crawler, strategy, video_dict, rule_dict, our_uid, oss_endpoint, env, machine):
|
|
def download_publish(cls, log_type, crawler, strategy, video_dict, rule_dict, our_uid, oss_endpoint, env, machine):
|
|
try:
|
|
try:
|
|
|
|
+ filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
|
|
|
|
+ for filter_word in filter_words:
|
|
|
|
+ if filter_word in video_dict['video_title']:
|
|
|
|
+ Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
|
|
|
|
+ return
|
|
if cls.download_rule(video_dict, rule_dict) is False:
|
|
if cls.download_rule(video_dict, rule_dict) is False:
|
|
Common.logger(log_type, crawler).info('不满足抓取规则\n')
|
|
Common.logger(log_type, crawler).info('不满足抓取规则\n')
|
|
- elif any(word if word in video_dict['video_title'] else False for word in
|
|
|
|
- cls.filter_words(log_type, crawler)) is True:
|
|
|
|
- Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
|
|
|
|
elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
|
|
elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
|
|
Common.logger(log_type, crawler).info('视频已下载\n')
|
|
Common.logger(log_type, crawler).info('视频已下载\n')
|
|
else:
|
|
else:
|
|
@@ -349,6 +335,7 @@ class DyFollow(object):
|
|
def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
|
|
def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
|
|
user_list = get_user_from_mysql(log_type, crawler, crawler, env)
|
|
user_list = get_user_from_mysql(log_type, crawler, crawler, env)
|
|
rule_dict = cls.get_rule(log_type, crawler)
|
|
rule_dict = cls.get_rule(log_type, crawler)
|
|
|
|
+
|
|
for user in user_list:
|
|
for user in user_list:
|
|
spider_link = user["spider_link"]
|
|
spider_link = user["spider_link"]
|
|
out_uid = spider_link
|
|
out_uid = spider_link
|