@@ -1,7 +1,11 @@
 # -*- coding: utf-8 -*-
-# @Author: lierqiang
-# @Time: 2023/3/15
-
+# @Author: wangkun
+# @Time: 2023/2/3
+"""
+YouTube targeted list
+    1. Publish time <= 1 month
+    2. 10 minutes >= duration >= 1 minute
+"""
 import os
 import re
 import shutil
@@ -12,12 +16,17 @@ import requests

 sys.path.append(os.getcwd())
 from common.common import Common
+# from common.db import MysqlHelper
 from common.scheduling_db import MysqlHelper
 from common.feishu import Feishu
 from common.getuser import getUser
 from common.publish import Publish
 from common.translate import Translate
-from common.userAgent import get_random_header, get_random_user_agent
+from common.public import get_user_from_mysql, get_config_from_mysql
+
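+# Default request headers: a fixed desktop Chrome UA, shared by the profile-page requests below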
+headers = {
+    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
+}


 def format_nums(data):
@@ -35,11 +44,14 @@ def format_nums(data):
     return count


-class YoutubeFollowSchedule:
+class YoutubeAuthorScheduling:
     # Pagination parameter
     continuation = ''
     # Crawl platform
     platform = 'youtube'
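+    # Class-level copy of the default UA headers; get_first_page sends these via cls.headers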
+    headers = {
+        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
+    }

     @classmethod
     def get_out_user_info(cls, log_type, crawler, browse_id, out_user_id):
@@ -57,12 +69,16 @@ class YoutubeFollowSchedule:
         """
         try:
             url = f'https://www.youtube.com/{out_user_id}/about'
-            res = requests.get(url=url, headers=get_random_header('pc'))
+            res = requests.get(url=url, headers=headers)
             info = re.findall(r'var ytInitialData = (.*?);</script>', res.text, re.S)[0]
             data = json.loads(info)
             header = data['header']['c4TabbedHeaderRenderer']
             tabs = data['contents']['twoColumnBrowseResultsRenderer']['tabs']
-            subsimpleText = header['subscriberCountText']['simpleText'].replace('位订阅者', '')
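+            # Channels may hide their subscriber count; fall back to 0 fans in that case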
+            try:
+                subsimpleText = header['subscriberCountText']['simpleText'].replace('位订阅者', '')
+                out_fans = format_nums(subsimpleText)
+            except Exception as e:
+                out_fans = 0
             for tab in tabs:
                 if 'tabRenderer' not in tab or 'content' not in tab['tabRenderer']:
                     continue
@@ -76,7 +92,7 @@ class YoutubeFollowSchedule:
             out_user_dict = {
                 'out_user_name': header['title'],
                 'out_avatar_url': header['avatar']['thumbnails'][-1]['url'],
-                'out_fans': format_nums(subsimpleText),
+                'out_fans': out_fans,
                 'out_play_cnt': int(
                     viewCountText.replace('收看次數:', '').replace('次', '').replace(',', '')) if viewCountText else 0,
                 'out_create_time': out_create_time.replace('年', '-').replace('月', '-').replace('日', ''),
@@ -108,13 +124,17 @@ class YoutubeFollowSchedule:
             uer_url = user_sheet[i][4]
             if out_uid is not None and user_name is not None:
                 Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
+
                 if our_uid is None:
                     sql = f""" select * from crawler_user where platform="{cls.platform}" and out_user_id="{out_uid}" """
                     our_user_info = MysqlHelper.get_values(log_type, crawler, sql, env)
                     # If the DB returns no rows for (youtube + out_user_id): create an internal account UID, write it to the targeted-accounts Feishu sheet, and insert it into the crawler accounts DB together with the external user info
-                    if our_user_info is None or len(our_user_info) == 0:
+                    if not our_user_info:
                         # Fetch the external account info and write it to the DB
-                        out_user_dict = cls.get_out_user_info(log_type, crawler, browse_id, out_uid)
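+                        # Skip this user if the About page cannot be fetched or parsed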
+                        try:
+                            out_user_dict = cls.get_out_user_info(log_type, crawler, browse_id, out_uid)
+                        except Exception as e:
+                            continue
                         out_avatar_url = out_user_dict['out_avatar_url']
                         out_create_time = out_user_dict['out_create_time']
                         out_play_cnt = out_user_dict['out_play_cnt']
@@ -127,7 +147,7 @@ class YoutubeFollowSchedule:
                             'avatarUrl': out_avatar_url,
                             'tagName': tag,
                         }
-                        our_uid = getUsercreate_uid(log_type, crawler, create_user_dict, env)
+                        our_uid = getUser.create_uid(log_type, crawler, create_user_dict, env)
                         Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}')
                         if env == 'prod':
                             our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
@@ -138,22 +158,22 @@ class YoutubeFollowSchedule:
                                                  [[our_uid, our_user_link]])
                             Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!')

-                        sql = f""" insert into crawler_user(user_id,
-                        out_user_id,
-                        out_user_name,
-                        out_avatar_url,
-                        out_create_time,
-                        out_play_cnt,
-                        out_fans,
-                        platform,
+                        sql = f""" insert into crawler_user(user_id,
+                                                            out_user_id,
+                                                            out_user_name,
+                                                            out_avatar_url,
+                                                            out_create_time,
+                                                            out_play_cnt,
+                                                            out_fans,
+                                                            platform,
                         tag)
-                        values({our_uid},
-                        "{out_uid}",
-                        "{user_name}",
-                        "{out_avatar_url}",
-                        "{out_create_time}",
-                        {out_play_cnt},
-                        {out_fans},
+                        values({our_uid},
+                               "{out_uid}",
+                               "{user_name}",
+                               "{out_avatar_url}",
+                               "{out_create_time}",
+                               {out_play_cnt},
+                               {out_fans},
                         "{cls.platform}",
                         "{tag}") """
                         Common.logger(log_type, crawler).info(f'sql:{sql}')
@@ -352,7 +372,7 @@ class YoutubeFollowSchedule:
             'sec-fetch-dest': 'empty',
             'sec-fetch-mode': 'same-origin',
             'sec-fetch-site': 'same-origin',
-            'user-agent': get_random_user_agent('pc'),
+            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
             'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSi6mIOfBg%3D%3D',
             'x-youtube-bootstrap-logged-in': 'false',
             'x-youtube-client-name': '1',
@@ -369,7 +389,6 @@ class YoutubeFollowSchedule:
             elif 'continuationContents' in response.json():
                 # Common.logger(log_type, crawler).info("'continuationContents' in response.json()\n")
                 if 'richGridContinuation' not in response.json()['continuationContents']:
-                    # Common.logger(log_type, crawler).warning(f"'richGridContinuation' not in response.json()['continuationContents']\n")
                     Common.logger(log_type, crawler).warning(
                         f'get_feeds_response:{response.json()["continuationContents"]}\n')
                 elif 'contents' not in response.json()['continuationContents']['richGridContinuation']:
@@ -407,7 +426,7 @@ class YoutubeFollowSchedule:
     @classmethod
     def get_first_page(cls, user_url):
         try:
-            res = requests.get(url=user_url, headers=get_random_header('pc'))
+            res = requests.get(url=user_url, headers=cls.headers)
             info = re.findall(r'var ytInitialData = (.*?);', res.text, re.S)[0]
             ytInitialData = json.loads(info)
             video_list = \
@@ -418,19 +437,17 @@ class YoutubeFollowSchedule:
         return video_list

     @classmethod
-    def get_next_page(cls, log_type, crawler, task, strategy, oss_endpoint, env, our_uid,
-                      out_uid, out_user_url, continuation):
-        min_publish_day = task['min_publish_day']
+    def get_next_page(cls, log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid, out_user_url, continuation):
         post_url = "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
         payload = json.dumps({
             "context": {
                 "client": {
-                    "userAgent": get_random_user_agent('pc'),
+                    "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36,gzip(gfe)",
                     "clientName": "WEB",
                     "clientVersion": "2.20230221.06.00",
                     "osName": "Macintosh",
                     "osVersion": "10_15_7",
-                    "originalUrl": "https://www.youtube.com/{}/videos".format(out_uid),
+                    "originalUrl": "https://www.youtube.com/@wongkim728/videos",
                     "screenPixelDensity": 2,
                     "platform": "DESKTOP",
                     "clientFormFactor": "UNKNOWN_FORM_FACTOR",
@@ -480,60 +497,47 @@ class YoutubeFollowSchedule:
             'cookie': 'VISITOR_INFO1_LIVE=HABZsLFdU40; DEVICE_INFO=ChxOekl3TWpVek9UQXpPVE13TnpJd056a3pNZz09EJie0p8GGJie0p8G; PREF=f4=4000000&tz=Asia.Shanghai; HSID=AxFp7ylWWebUZYqrl; SSID=ANHuSQMqvVcV0vVNn; APISID=AkwZgjPvFZ6LZCrE/Aiv0K-2rEUzY1bH1u; SAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; __Secure-1PAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; __Secure-3PAPISID=8yRrBMHYXAhqkybH/AEFGJvzZ3tPalnTy0; SID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4Koo9aQoNQfX1AiGFWeD7WA.; __Secure-1PSID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4bs4qvvXffLLTXq_VYw0XLw.; __Secure-3PSID=TwjWkM4mrKb4o8pRKbyQVqELjNU43ZL0bF8QB2hdTI9z05T4cNwzpudzvCglfQ5A1FJnog.; LOGIN_INFO=AFmmF2swRAIgO4TvR9xxWoHPgrGoGAEVo-P8Slqem__vIdF_oajjRiECIFiq4YtbL_IQGCbkjrHsWkWH6OpzKd8RlgdS6qNurR0Q:QUQ3MjNmejV5WkRVUmZXVlFjbjY0dW1aVGpoZkZQdmxYamIzV01zc0lmT3JiQl9ldVYwc0t4dlNkbWpoVEdJMHVaWjZXVEt3ZERQeUppU3AyNmR6ckFucWltZU5LNmZjQ3lHUEtKTDBzSlo5WXpJQzF3UlNCVlp2Q1ZKVmxtRk05OHRuWFFiWGphcFpPblFOUURWTlVxVGtBazVjcmVtS2pR; YSC=CtX0f3NennA; SIDCC=AFvIBn9aXC4vNCbg5jPzjbC8LMYCBVx_dy8uJO20b-768rmRfP9f5BqQ_xXspPemecVq29qZ7A; __Secure-1PSIDCC=AFvIBn-4TD_lPaKgbmYAGO6hZluLgSgbWgb7XAcaeNG6982LIIpS_Gb9vkqHTBMyCGvb4x7m6jk; __Secure-3PSIDCC=AFvIBn9ypvGX15qq4CsnsuhWTaXa9yMTxWMWbIDXtr6L3XZD81XBUQ0IMUv9ZKh9mf8NEbSvOy0; SIDCC=AFvIBn_DwLbohF2llhq4EQjFDFA3n9-_AK_7ITJsTZtCeYwy43J8KCYUPfY7ghqX9s-Qq5dOIQ; __Secure-1PSIDCC=AFvIBn-7x_HhxbmDkOzXew-sXAEWVuUGpglr8rypU623IyO8Y9OungcqMkuxBZQ2vr6G7x9UcxM; __Secure-3PSIDCC=AFvIBn-7aSYRxZkCKZp7-Mdn9PwbW4CUtXD0ok0nCvPIZXfkFrN9VqN1BHkI1fUaoIo_8YCjwRs',
             'origin': 'https://www.youtube.com',
             'referer': out_user_url,
-            'user-agent': get_random_user_agent('pc'),
+            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',

         }
         try:
             res = requests.request("POST", post_url, headers=headers, data=payload).json()
             video_infos = res['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']
-
             for data in video_infos:
                 if 'richItemRenderer' in data:
                     video_id = data["richItemRenderer"]["content"]['videoRenderer']['videoId']
                     video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id)
-                    # video_dict = cls.parse_video(video_dict, log_type, crawler, out_uid, video_id, machine)
                     # Publish time <= 7 days
                     publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
-                    if int(time.time()) - publish_time <= 3600 * 24 * min_publish_day:
+                    if int(time.time()) - publish_time <= 3600 * 24 * 7:
                         cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint)
                     else:
                         Common.logger(log_type, crawler).info('发布时间超过7天\n')
                         return
                 else:
                     continuation = cls.get_continuation(data)
-                    cls.get_next_page(log_type, crawler, task, strategy, oss_endpoint, env, our_uid, out_uid,
-                                      out_user_url, continuation)
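+                    # Recurse with the next continuation token; paging stops once a video older than 7 days is seen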
+                    cls.get_next_page(log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid, out_user_url, continuation)
         except:
             return

     @classmethod
-    def get_videos(cls, log_type, crawler, task, oss_endpoint, env, our_uid, out_uid, out_user_url):
+    def get_videos(cls, log_type, crawler, strategy, task, oss_endpoint, env, out_uid, our_uid, out_user_url):
         try:
-            # Changed
-            strategy = task['user_tag']
-            min_publish_day = int(task['min_publish_day'])
             feeds = cls.get_first_page(out_user_url)
             for data in feeds:
                 if 'richItemRenderer' in data:
                     video_id = data["richItemRenderer"]["content"]['videoRenderer']['videoId']
                     video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id)
-                    # Check publish time
+                    # Publish time <= 7 days
                     publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
-                    if int(time.time()) - publish_time > 3600 * 24 * min_publish_day:
-                        Common.logger(log_type, crawler).info(f'发布时间超过{min_publish_day}天\n')
-                    elif video_dict['video_title'] == '' or video_dict['video_url'] == '':
-                        Common.logger(log_type, crawler).info('无效视频\n')
-                    elif video_dict['duration'] > task['duration_max'] or video_dict['duration'] < task['duration_min']:
-                        Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足规则\n")
-                    elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
-                        Common.logger(log_type, crawler).info('视频已下载\n')
-
-                    else:
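+                    # The feed is assumed newest-first, so the first video older than 7 days ends the crawl for this author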
+                    if int(time.time()) - publish_time <= 3600 * 24 * 7:
                         cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint)
+                    else:
+                        Common.logger(log_type, crawler).info('发布时间超过7天\n')
+                        return
                 else:
                     continuation = cls.get_continuation(data)
-                    cls.get_next_page(log_type, crawler, task, strategy, oss_endpoint, env, our_uid, out_uid,
-                                      out_user_url, continuation=continuation)
+                    cls.get_next_page(log_type, crawler, strategy, oss_endpoint, env, out_uid, our_uid, out_user_url, continuation=continuation)
         except Exception as e:
             Common.logger(log_type, crawler).error(f"get_videos异常:{e}\n")
@@ -622,6 +626,15 @@ class YoutubeFollowSchedule:
         else:
             cover_url = videoDetails['thumbnail']['thumbnails'][-1]['url']

+        # video_url
+        # if 'formats' not in streamingData:
+        #     video_url = ''
+        # elif len(streamingData['formats']) == 0:
+        #     video_url = ''
+        # elif 'url' not in streamingData['formats'][-1]:
+        #     video_url = ''
+        # else:
+        #     video_url = streamingData['formats'][-1]['url']
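+        # The watch-page URL is stored instead of a direct stream URL (the extraction above is disabled)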
         video_url = f"https://www.youtube.com/watch?v={video_id}"

         Common.logger(log_type, crawler).info(f'video_title:{video_title}')
@@ -661,7 +674,7 @@ class YoutubeFollowSchedule:
                     "deviceMake": "Apple",
                     "deviceModel": "",
                     "visitorData": "CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D",
-                    "userAgent": get_random_user_agent('pc'),
+                    "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
                     "clientName": "WEB",
                     "clientVersion": "2.20230201.01.00",
                     "osName": "Macintosh",
@@ -820,7 +833,7 @@ class YoutubeFollowSchedule:
             'sec-fetch-dest': 'empty',
             'sec-fetch-mode': 'same-origin',
             'sec-fetch-site': 'same-origin',
-            'user-agent': get_random_user_agent('pc'),
+            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
             'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D',
             'x-youtube-bootstrap-logged-in': 'false',
             'x-youtube-client-name': '1',
@@ -844,7 +857,7 @@ class YoutubeFollowSchedule:
             if 'title' not in videoDetails:
                 video_title = ''
             else:
-                video_title = videoDetails['title']
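+                # Strip quote characters so the title can be embedded in the raw SQL insert later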
+                video_title = videoDetails['title'].replace('"', '').replace("'", '')
             video_title = cls.filter_emoji(video_title)
             if not cls.is_contain_chinese(video_title):
                 video_title = Translate.google_translate(video_title) \
@@ -938,167 +951,162 @@ class YoutubeFollowSchedule:
     @classmethod
     def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint):
         try:
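+            # Pull the title filter words from the MySQL config; any match skips the video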
+            filter_words = get_config_from_mysql(log_type, crawler, env, text='filter', action='get_author_map')
+            for filter_word in filter_words:
+                if filter_word in video_dict['video_title']:
+                    Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
+                    return
+            if video_dict['video_title'] == '' or video_dict['video_url'] == '':
+                Common.logger(log_type, crawler).info('无效视频\n')
+            elif video_dict['duration'] > 1200 or video_dict['duration'] < 60:
+                Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足规则\n")
+            # elif repeat_video is not None and len(repeat_video) != 0:
+            elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
+                Common.logger(log_type, crawler).info('视频已下载\n')

-            # Download the video
-            Common.logger(log_type, crawler).info('开始下载视频...')
-            Common.download_method(log_type, crawler, 'youtube_video', video_dict['video_title'],
-                                   video_dict['video_url'])
+            else:
+                # Download the video
+                Common.logger(log_type, crawler).info('开始下载视频...')
+                # Common.download_method(log_type, crawler, 'video', video_dict['video_title'], video_dict['video_url'])
+                Common.download_method(log_type, crawler, 'youtube_video', video_dict['video_title'],
+                                       video_dict['video_url'])
+                # ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
+                # video_width = int(ffmpeg_dict['width'])
+                # video_height = int(ffmpeg_dict['height'])
+                # video_size = int(ffmpeg_dict['size'])
+                video_width = 1280
+                video_height = 720
+                duration = int(video_dict['duration'])

-            video_width = 1280
-            video_height = 720
-            duration = int(video_dict['duration'])
-            Common.logger(log_type, crawler).info(f'video_width:{video_width}')
-            Common.logger(log_type, crawler).info(f'video_height:{video_height}')
-            Common.logger(log_type, crawler).info(f'duration:{duration}')
+                Common.logger(log_type, crawler).info(f'video_width:{video_width}')
+                Common.logger(log_type, crawler).info(f'video_height:{video_height}')
+                Common.logger(log_type, crawler).info(f'duration:{duration}')
+                # Common.logger(log_type, crawler).info(f'video_size:{video_size}\n')

-            video_dict['video_width'] = video_width
-            video_dict['video_height'] = video_height
-            video_dict['duration'] = duration
-            video_dict['comment_cnt'] = 0
-            video_dict['like_cnt'] = 0
-            video_dict['share_cnt'] = 0
-            video_dict['avatar_url'] = video_dict['cover_url']
-            video_dict['session'] = f'youtube{int(time.time())}'
-            rule = '1,2'
-            # Download the cover
-            Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url'])
-            # Save the video metadata
-            Common.save_video_info(log_type, crawler, video_dict)
+                video_dict['video_width'] = video_width
+                video_dict['video_height'] = video_height
+                video_dict['duration'] = duration
+                video_dict['comment_cnt'] = 0
+                video_dict['like_cnt'] = 0
+                video_dict['share_cnt'] = 0
+                video_dict['avatar_url'] = video_dict['cover_url']
+                video_dict['session'] = f'youtube{int(time.time())}'
+                rule = '1,2'
+                # if duration < 60 or duration > 600:
+                #     # Delete the video folder
+                #     shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
+                #     Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足抓取规则,删除成功\n")
+                #     return
+                # if duration == 0 or duration is None:
+                #     # Delete the video folder
+                #     shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
+                #     Common.logger(log_type, crawler).info(f"视频下载出错,删除成功\n")
+                #     return
+                # else:
+                # Download the cover
+                Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url'])
+                # Save the video metadata
+                Common.save_video_info(log_type, crawler, video_dict)

-            # Upload the video
-            Common.logger(log_type, crawler).info(f"开始上传视频")
-            if env == 'dev':
-                our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
-                our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
-            else:
-                our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
-                our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
-            Common.logger(log_type, crawler).info("视频上传完成")
+                # Upload the video
+                Common.logger(log_type, crawler).info(f"开始上传视频")
+                if env == 'dev':
+                    our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
+                    our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
+                else:
+                    our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
+                    our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
+                Common.logger(log_type, crawler).info("视频上传完成")

-            if our_video_id is None:
-                # Delete the video folder
-                shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
-                return
+                if our_video_id is None:
+                    # Delete the video folder
+                    shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
+                    return

-            # Save video info to Feishu
-            Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2)
-            # Video ID worksheet: write into the first row
-            upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
-            values = [[upload_time,
-                       "定向榜",
-                       video_dict['video_id'],
-                       video_dict['video_title'],
-                       our_video_link,
-                       video_dict['play_cnt'],
-                       video_dict['duration'],
-                       f'{video_width}*{video_height}',
-                       video_dict['publish_time'],
-                       video_dict['user_name'],
-                       video_dict['cover_url'],
-                       video_dict['video_url']
-                       ]]
-            # time.sleep(1)
-            Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values)
-            Common.logger(log_type, crawler).info('视频信息写入定向_已下载表成功\n')
+                # Save video info to Feishu
+                Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2)
+                # Video ID worksheet: write into the first row
+                upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
+                values = [[upload_time,
+                           "定向榜",
+                           video_dict['video_id'],
+                           video_dict['video_title'],
+                           our_video_link,
+                           video_dict['play_cnt'],
+                           video_dict['duration'],
+                           f'{video_width}*{video_height}',
+                           video_dict['publish_time'],
+                           video_dict['user_name'],
+                           video_dict['cover_url'],
+                           video_dict['video_url']
+                           ]]
+                # time.sleep(1)
+                Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values)
+                Common.logger(log_type, crawler).info('视频信息写入定向_已下载表成功\n')

-            # Save video info to the DB
-            sql = f""" insert into crawler_video(video_id,
-                                                 user_id,
-                                                 out_user_id,
-                                                 platform,
-                                                 strategy,
-                                                 out_video_id,
-                                                 video_title,
-                                                 cover_url,
-                                                 video_url,
-                                                 duration,
-                                                 publish_time,
-                                                 play_cnt,
-                                                 crawler_rule,
-                                                 width,
-                                                 height)
-                                                 values({our_video_id},
-                                                 "{our_uid}",
-                                                 "{video_dict['out_uid']}",
-                                                 "{cls.platform}",
-                                                 "定向爬虫策略",
-                                                 "{video_dict['video_id']}",
-                                                 "{video_dict['video_title']}",
-                                                 "{video_dict['cover_url']}",
-                                                 "{video_dict['video_url']}",
-                                                 {int(duration)},
-                                                 "{video_dict['publish_time']}",
-                                                 {int(video_dict['play_cnt'])},
-                                                 "{rule}",
-                                                 {int(video_width)},
-                                                 {int(video_height)}) """
-            MysqlHelper.update_values(log_type, crawler, sql, env)
-            Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
+                # Save video info to the DB
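+                # Values are interpolated directly into the SQL text; the title had its quotes stripped earlier to keep the statement well-formed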
+                sql = f""" insert into crawler_video(video_id,
+                                                     user_id,
+                                                     out_user_id,
+                                                     platform,
+                                                     strategy,
+                                                     out_video_id,
+                                                     video_title,
+                                                     cover_url,
+                                                     video_url,
+                                                     duration,
+                                                     publish_time,
+                                                     play_cnt,
+                                                     crawler_rule,
+                                                     width,
+                                                     height)
+                                                     values({our_video_id},
+                                                     "{our_uid}",
+                                                     "{video_dict['out_uid']}",
+                                                     "{cls.platform}",
+                                                     "定向爬虫策略",
+                                                     "{video_dict['video_id']}",
+                                                     "{video_dict['video_title']}",
+                                                     "{video_dict['cover_url']}",
+                                                     "{video_dict['video_url']}",
+                                                     {int(duration)},
+                                                     "{video_dict['publish_time']}",
+                                                     {int(video_dict['play_cnt'])},
+                                                     "{rule}",
+                                                     {int(video_width)},
+                                                     {int(video_height)}) """
+                MysqlHelper.update_values(log_type, crawler, sql, env)
+                Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
         except Exception as e:
             Common.logger(log_type, crawler).info(f"download_publish异常:{e}\n")

-    @classmethod
-    def get_users(cls, log_type, crawler, task, env):
-        link_list = task['spider_link']
-        user_list = []
-        for link in link_list:
-            out_uid = link.split("/")[3]
-            sql = f""" select * from crawler_author_map where spider_link="{link}" """
-            our_user_info = MysqlHelper.get_values(log_type=log_type, crawler=crawler, sql=sql, env=env, action='get_author_map')
-
-            if len(our_user_info) == 0:
-                our_uid = 0
-                Common.logger(log_type, crawler).info(f"没有站内虚拟账号: {link}\n")
-            else:
-                our_uid = our_user_info[0]["media_id"]
-            user_dict = {
-                "out_uid": out_uid,
-                "out_user_url": link,
-                "our_uid": our_uid
-            }
-            user_list.append(user_dict)
-        Common.logger(log_type, crawler).info(f"user_list:{user_list}")
-        return user_list
-
     @classmethod
     def get_follow_videos(cls, log_type, crawler, task, oss_endpoint, env):
         try:
-            user_list = cls.get_users(log_type, crawler, task, env)
-            if len(user_list) == 0:
-                Common.logger(log_type, crawler).warning('用户列表为空\n')
-            else:
-                for user_dict in user_list:
-                    out_user_url = user_dict['out_user_url']
-                    our_uid = user_dict['our_uid']
-                    out_uid = user_dict['out_uid']
-                    Common.logger(log_type, crawler).info(f'获取 {out_uid} 主页视频\n')
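+            # Each MySQL row carries the author's page link, nick_name, and internal uid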
+            user_list = get_user_from_mysql(log_type, crawler, crawler, env, action='get_author_map')
+            strategy = '定向抓取策略'

-                    cls.get_videos(log_type=log_type,
-                                   crawler=crawler,
-                                   task=task,
-                                   our_uid=our_uid,
-                                   oss_endpoint=oss_endpoint,
-                                   env=env,
-                                   out_uid=out_uid,
-                                   out_user_url=out_user_url
-                                   )
-                    # Common.logger(log_type, crawler).info('休眠 10 秒')
-                    # time.sleep(random.randint(1, 2))
-                    cls.continuation = ''
+            for user_dict in user_list:
+                out_user_url = user_dict['link']
+                out_uid = out_user_url.split('/')[3]
+                user_name = user_dict['nick_name']
+                our_uid = user_dict['uid']
+                Common.logger(log_type, crawler).info(f'获取 {user_name} 主页视频\n')
+                cls.get_videos(
+                    log_type=log_type,
+                    crawler=crawler,
+                    strategy=strategy,
+                    task=task,
+                    oss_endpoint=oss_endpoint,
+                    env=env,
+                    our_uid=our_uid,
+                    out_uid=out_uid,
+                    out_user_url=out_user_url
+                )
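+                # Reset the class-level pagination token before moving to the next author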
+                cls.continuation = ''
         except Exception as e:
             Common.logger(log_type, crawler).error(f"get_follow_videos异常:{e}\n")


 if __name__ == "__main__":
-    # print(Follow.get_browse_id('follow', 'youtube', '@chinatravel5971', "local"))
-    # print(Follow.get_user_from_feishu('follow', 'youtube', 'c467d7', 'dev', 'local'))
-    # print(Follow.get_user_from_feishu('follow', 'youtube', 'c467d7', 'prod', 'prod'))
-    # Follow.get_out_user_info('follow', 'youtube', 'UC08jgxf119fzynp2uHCvZIg', '@weitravel')
-    # Follow.get_video_info('follow', 'youtube', 'OGVK0IXBIhI')
-    # Follow.get_follow_videos('follow', 'youtube', 'youtube_follow', 'out', 'dev', 'local')
-    # print(Follow.filter_emoji("姐妹倆一唱一和,完美配合,終於把大慶降服了😅😅#萌娃搞笑日常"))
-    # Follow.repeat_video('follow', 'youtube', 4, "dev", "local")
-    # title = "'西部巡游220丨两人一车环游中国半年,需要花费多少钱? 2万公里吃住行费用总结'"
-    # title = "'Insanely Crowded Shanghai Yu Garden Lantern Festival Walk Tour 2023 人气爆棚的上海豫园元宵节漫步之行 4K'"
-    # print(title.strip().replace("\\", "").replace(" ", "").replace("\n", "").replace("/", "").replace("\r", "").replace("&NBSP", "").replace("&", ""))
-    pass
+    YoutubeAuthorScheduling.get_follow_videos('author', 'youtube', '', 'outer', 'dev')