|
@@ -380,91 +380,94 @@ class Follow:
|
|
|
:param machine: 部署机器,阿里云填写 aliyun,aliyun_hk ,线下分别填写 macpro,macair,local
|
|
|
:return: user_list
|
|
|
"""
|
|
|
- user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
|
|
|
- user_list = []
|
|
|
- for i in range(1, len(user_sheet)):
|
|
|
- out_uid = user_sheet[i][2]
|
|
|
- user_name = user_sheet[i][3]
|
|
|
- browse_id = user_sheet[i][5]
|
|
|
- our_uid = user_sheet[i][6]
|
|
|
- Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
|
|
|
- # 获取站外browse_id,并写入飞书
|
|
|
- if browse_id is None:
|
|
|
- browse_id = cls.get_browse_id(log_type, crawler, out_uid, machine)
|
|
|
+ try:
|
|
|
+ user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
|
|
|
+ user_list = []
|
|
|
+ for i in range(1, len(user_sheet)):
|
|
|
+ out_uid = user_sheet[i][2]
|
|
|
+ user_name = user_sheet[i][3]
|
|
|
+ browse_id = user_sheet[i][5]
|
|
|
+ our_uid = user_sheet[i][6]
|
|
|
+ Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
|
|
|
+ # 获取站外browse_id,并写入飞书
|
|
|
if browse_id is None:
|
|
|
- Common.logger(log_type, crawler).warning('browse_id is None !')
|
|
|
- else:
|
|
|
- Feishu.update_values(log_type, crawler, sheetid, f'F{i+1}:F{i+1}', [[browse_id]])
|
|
|
- Common.logger(log_type, crawler).info(f'browse_id写入成功:{browse_id}')
|
|
|
- # 站内 UID 为空,且数据库中(youtube+out_user_id)返回数量 == 0,则创建新的站内账号
|
|
|
- if our_uid is None:
|
|
|
- sql = f""" select * from crawler_user where platform="{cls.platform}" and out_user_id="{out_uid}" """
|
|
|
- our_user_info = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
|
|
|
- # 数据库中(youtube + out_user_id)返回数量 == 0,则创建站内账号UID,并写入定向账号飞书表。并结合站外用户信息,一并写入爬虫账号数据库
|
|
|
- if our_user_info is None or len(our_user_info) == 0:
|
|
|
- # 获取站外账号信息,写入数据库
|
|
|
- out_user_dict = cls.get_out_user_info(log_type, crawler, browse_id, out_uid)
|
|
|
- out_avatar_url = out_user_dict['out_avatar_url']
|
|
|
- out_create_time = out_user_dict['out_create_time']
|
|
|
- out_play_cnt = out_user_dict['out_play_cnt']
|
|
|
- out_fans = out_user_dict['out_fans']
|
|
|
- tag = 'youtube爬虫,定向爬虫策略'
|
|
|
-
|
|
|
- # 创建站内账号
|
|
|
- create_user_dict = {
|
|
|
- 'nickName': user_name,
|
|
|
- 'avatarUrl': out_avatar_url,
|
|
|
- 'tagName': tag,
|
|
|
- }
|
|
|
- our_uid = Users.create_user(log_type, crawler, create_user_dict, env)
|
|
|
- Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}')
|
|
|
- if env == 'prod':
|
|
|
- our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
|
|
|
+ browse_id = cls.get_browse_id(log_type, crawler, out_uid, machine)
|
|
|
+ if browse_id is None:
|
|
|
+ Common.logger(log_type, crawler).warning('browse_id is None !')
|
|
|
else:
|
|
|
- our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
|
|
|
- Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
|
|
|
- Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}', [[our_uid, our_user_link]])
|
|
|
- Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!')
|
|
|
+ Feishu.update_values(log_type, crawler, sheetid, f'F{i+1}:F{i+1}', [[browse_id]])
|
|
|
+ Common.logger(log_type, crawler).info(f'browse_id写入成功:{browse_id}')
|
|
|
+ # 站内 UID 为空,且数据库中(youtube+out_user_id)返回数量 == 0,则创建新的站内账号
|
|
|
+ if our_uid is None:
|
|
|
+ sql = f""" select * from crawler_user where platform="{cls.platform}" and out_user_id="{out_uid}" """
|
|
|
+ our_user_info = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
|
|
|
+ # 数据库中(youtube + out_user_id)返回数量 == 0,则创建站内账号UID,并写入定向账号飞书表。并结合站外用户信息,一并写入爬虫账号数据库
|
|
|
+ if our_user_info is None or len(our_user_info) == 0:
|
|
|
+ # 获取站外账号信息,写入数据库
|
|
|
+ out_user_dict = cls.get_out_user_info(log_type, crawler, browse_id, out_uid)
|
|
|
+ out_avatar_url = out_user_dict['out_avatar_url']
|
|
|
+ out_create_time = out_user_dict['out_create_time']
|
|
|
+ out_play_cnt = out_user_dict['out_play_cnt']
|
|
|
+ out_fans = out_user_dict['out_fans']
|
|
|
+ tag = 'youtube爬虫,定向爬虫策略'
|
|
|
|
|
|
- sql = f""" insert into crawler_user(user_id,
|
|
|
- out_user_id,
|
|
|
- out_user_name,
|
|
|
- out_avatar_url,
|
|
|
- out_create_time,
|
|
|
- out_play_cnt,
|
|
|
- out_fans,
|
|
|
- platform,
|
|
|
- tag)
|
|
|
- values({our_uid},
|
|
|
- "{out_uid}",
|
|
|
- "{user_name}",
|
|
|
- "{out_avatar_url}",
|
|
|
- "{out_create_time}",
|
|
|
- {out_play_cnt},
|
|
|
- {out_fans},
|
|
|
- "{cls.platform}",
|
|
|
- "{tag}") """
|
|
|
- MysqlHelper.update_values(log_type, crawler, sql, env, machine)
|
|
|
- Common.logger(log_type, crawler).info('用户信息插入数据库成功!\n')
|
|
|
- # 数据库中(youtube + out_user_id)返回数量 != 0,则直接把数据库中的站内 UID 写入飞书
|
|
|
- else:
|
|
|
- our_uid = our_user_info[0][1]
|
|
|
- if 'env' == 'prod':
|
|
|
- our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
|
|
|
+ # 创建站内账号
|
|
|
+ create_user_dict = {
|
|
|
+ 'nickName': user_name,
|
|
|
+ 'avatarUrl': out_avatar_url,
|
|
|
+ 'tagName': tag,
|
|
|
+ }
|
|
|
+ our_uid = Users.create_user(log_type, crawler, create_user_dict, env)
|
|
|
+ Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}')
|
|
|
+ if env == 'prod':
|
|
|
+ our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
|
|
|
+ else:
|
|
|
+ our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
|
|
|
+ Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
|
|
|
+ Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}', [[our_uid, our_user_link]])
|
|
|
+ Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!')
|
|
|
+
|
|
|
+ sql = f""" insert into crawler_user(user_id,
|
|
|
+ out_user_id,
|
|
|
+ out_user_name,
|
|
|
+ out_avatar_url,
|
|
|
+ out_create_time,
|
|
|
+ out_play_cnt,
|
|
|
+ out_fans,
|
|
|
+ platform,
|
|
|
+ tag)
|
|
|
+ values({our_uid},
|
|
|
+ "{out_uid}",
|
|
|
+ "{user_name}",
|
|
|
+ "{out_avatar_url}",
|
|
|
+ "{out_create_time}",
|
|
|
+ {out_play_cnt},
|
|
|
+ {out_fans},
|
|
|
+ "{cls.platform}",
|
|
|
+ "{tag}") """
|
|
|
+ MysqlHelper.update_values(log_type, crawler, sql, env, machine)
|
|
|
+ Common.logger(log_type, crawler).info('用户信息插入数据库成功!\n')
|
|
|
+ # 数据库中(youtube + out_user_id)返回数量 != 0,则直接把数据库中的站内 UID 写入飞书
|
|
|
else:
|
|
|
- our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
|
|
|
- Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
|
|
|
- Feishu.update_values(log_type, crawler, sheetid, f'G{i+1}:H{i+1}', [[our_uid, our_user_link]])
|
|
|
- Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
|
|
|
+ our_uid = our_user_info[0][1]
|
|
|
+ if 'env' == 'prod':
|
|
|
+ our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
|
|
|
+ else:
|
|
|
+ our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
|
|
|
+ Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
|
|
|
+ Feishu.update_values(log_type, crawler, sheetid, f'G{i+1}:H{i+1}', [[our_uid, our_user_link]])
|
|
|
+ Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
|
|
|
|
|
|
- user_dict = {
|
|
|
- 'out_user_id': out_uid,
|
|
|
- 'out_user_name': user_name,
|
|
|
- 'out_browse_id': browse_id,
|
|
|
- 'our_user_id': our_uid,
|
|
|
- }
|
|
|
- user_list.append(user_dict)
|
|
|
- return user_list
|
|
|
+ user_dict = {
|
|
|
+ 'out_user_id': out_uid,
|
|
|
+ 'out_user_name': user_name,
|
|
|
+ 'out_browse_id': browse_id,
|
|
|
+ 'our_user_id': our_uid,
|
|
|
+ }
|
|
|
+ user_list.append(user_dict)
|
|
|
+ return user_list
|
|
|
+ except Exception as e:
|
|
|
+ Common.logger(log_type, crawler).error(f"get_user_from_feishu异常:{e}\n")
|
|
|
|
|
|
@classmethod
|
|
|
def get_feeds(cls, log_type, crawler, browse_id, out_uid):
|
|
@@ -634,465 +637,477 @@ class Follow:
|
|
|
'x-youtube-client-name': '1',
|
|
|
'x-youtube-client-version': '2.20230201.01.00'
|
|
|
}
|
|
|
- # try:
|
|
|
- response = requests.post(url=url, headers=headers, data=payload)
|
|
|
- # Common.logger(log_type, crawler).info(f"get_feeds_response:{response.json()}\n")
|
|
|
- cls.continuation = response.json()['trackingParams']
|
|
|
- if response.status_code != 200:
|
|
|
- Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
|
|
|
- elif 'continuationContents' not in response.text and 'onResponseReceivedActions' not in response.text:
|
|
|
- Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
|
|
|
- elif 'continuationContents' in response.json():
|
|
|
- # Common.logger(log_type, crawler).info("'continuationContents' in response.json()\n")
|
|
|
- if 'richGridContinuation' not in response.json()['continuationContents']:
|
|
|
- # Common.logger(log_type, crawler).warning(f"'richGridContinuation' not in response.json()['continuationContents']\n")
|
|
|
- Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["continuationContents"]}\n')
|
|
|
- elif 'contents' not in response.json()['continuationContents']['richGridContinuation']:
|
|
|
- Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["continuationContents"]["richGridContinuation"]}\n')
|
|
|
- elif 'contents' in response.json()["continuationContents"]["richGridContinuation"]:
|
|
|
- feeds = response.json()["continuationContents"]["richGridContinuation"]['contents']
|
|
|
- return feeds
|
|
|
- elif 'onResponseReceivedActions' in response.json():
|
|
|
- Common.logger(log_type, crawler).info("'onResponseReceivedActions' in response.json()\n")
|
|
|
- if len(response.json()['onResponseReceivedActions']) == 0:
|
|
|
- Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"]}\n')
|
|
|
- elif 'appendContinuationItemsAction' not in response.json()['onResponseReceivedActions'][0]:
|
|
|
- Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]}\n')
|
|
|
- elif 'continuationItems' not in response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction']:
|
|
|
- Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]}\n')
|
|
|
- elif len(response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']) == 0:
|
|
|
- Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]}\n')
|
|
|
+ try:
|
|
|
+ response = requests.post(url=url, headers=headers, data=payload)
|
|
|
+ # Common.logger(log_type, crawler).info(f"get_feeds_response:{response.json()}\n")
|
|
|
+ cls.continuation = response.json()['trackingParams']
|
|
|
+ if response.status_code != 200:
|
|
|
+ Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
|
|
|
+ elif 'continuationContents' not in response.text and 'onResponseReceivedActions' not in response.text:
|
|
|
+ Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.text}\n')
|
|
|
+ elif 'continuationContents' in response.json():
|
|
|
+ # Common.logger(log_type, crawler).info("'continuationContents' in response.json()\n")
|
|
|
+ if 'richGridContinuation' not in response.json()['continuationContents']:
|
|
|
+ # Common.logger(log_type, crawler).warning(f"'richGridContinuation' not in response.json()['continuationContents']\n")
|
|
|
+ Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["continuationContents"]}\n')
|
|
|
+ elif 'contents' not in response.json()['continuationContents']['richGridContinuation']:
|
|
|
+ Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["continuationContents"]["richGridContinuation"]}\n')
|
|
|
+ elif 'contents' in response.json()["continuationContents"]["richGridContinuation"]:
|
|
|
+ feeds = response.json()["continuationContents"]["richGridContinuation"]['contents']
|
|
|
+ return feeds
|
|
|
+ elif 'onResponseReceivedActions' in response.json():
|
|
|
+ Common.logger(log_type, crawler).info("'onResponseReceivedActions' in response.json()\n")
|
|
|
+ if len(response.json()['onResponseReceivedActions']) == 0:
|
|
|
+ Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"]}\n')
|
|
|
+ elif 'appendContinuationItemsAction' not in response.json()['onResponseReceivedActions'][0]:
|
|
|
+ Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]}\n')
|
|
|
+ elif 'continuationItems' not in response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction']:
|
|
|
+ Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]}\n')
|
|
|
+ elif len(response.json()['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']) == 0:
|
|
|
+ Common.logger(log_type, crawler).warning(f'get_feeds_response:{response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]}\n')
|
|
|
+ else:
|
|
|
+ feeds = response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]
|
|
|
+ return feeds
|
|
|
else:
|
|
|
- feeds = response.json()["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"]
|
|
|
- return feeds
|
|
|
- else:
|
|
|
- Common.logger(log_type, crawler).info('feeds is None\n')
|
|
|
+ Common.logger(log_type, crawler).info('feeds is None\n')
|
|
|
|
|
|
- # except Exception as e:
|
|
|
- # Common.logger(log_type, crawler).error(f'get_feeds异常:{e}\n')
|
|
|
+ except Exception as e:
|
|
|
+ Common.logger(log_type, crawler).error(f'get_feeds异常:{e}\n')
|
|
|
|
|
|
@classmethod
|
|
|
def get_videos(cls, log_type, crawler, strategy, oss_endpoint, env, browse_id, out_uid, our_uid, machine):
|
|
|
- while True:
|
|
|
- feeds = cls.get_feeds(log_type, crawler, browse_id, out_uid)
|
|
|
- # Common.logger(log_type, crawler).info(f"feeds:{feeds}\n")
|
|
|
- for i in range(len(feeds)):
|
|
|
- if 'richItemRenderer' not in feeds[i]:
|
|
|
- Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]}\n')
|
|
|
- elif 'content' not in feeds[i]['richItemRenderer']:
|
|
|
- Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]["richItemRenderer"]}\n')
|
|
|
- elif 'videoRenderer' not in feeds[i]['richItemRenderer']['content']:
|
|
|
- Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]["richItemRenderer"]["content"]}\n')
|
|
|
- elif 'videoId' not in feeds[i]["richItemRenderer"]["content"]['videoRenderer']:
|
|
|
- Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]["richItemRenderer"]["content"]["videoRenderer"]}\n')
|
|
|
- else:
|
|
|
- video_id = feeds[i]["richItemRenderer"]["content"]['videoRenderer']['videoId']
|
|
|
- video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
|
|
|
- # 发布时间<=30天
|
|
|
- publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
|
|
|
- if int(time.time()) - publish_time <= 3600*24*30:
|
|
|
- cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine)
|
|
|
+ try:
|
|
|
+ while True:
|
|
|
+ feeds = cls.get_feeds(log_type, crawler, browse_id, out_uid)
|
|
|
+ # Common.logger(log_type, crawler).info(f"feeds:{feeds}\n")
|
|
|
+ for i in range(len(feeds)):
|
|
|
+ if 'richItemRenderer' not in feeds[i]:
|
|
|
+ Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]}\n')
|
|
|
+ elif 'content' not in feeds[i]['richItemRenderer']:
|
|
|
+ Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]["richItemRenderer"]}\n')
|
|
|
+ elif 'videoRenderer' not in feeds[i]['richItemRenderer']['content']:
|
|
|
+ Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]["richItemRenderer"]["content"]}\n')
|
|
|
+ elif 'videoId' not in feeds[i]["richItemRenderer"]["content"]['videoRenderer']:
|
|
|
+ Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]["richItemRenderer"]["content"]["videoRenderer"]}\n')
|
|
|
else:
|
|
|
- Common.logger(log_type, crawler).info('发布时间超过30天\n')
|
|
|
- return
|
|
|
+ video_id = feeds[i]["richItemRenderer"]["content"]['videoRenderer']['videoId']
|
|
|
+ video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
|
|
|
+ # 发布时间<=30天
|
|
|
+ publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
|
|
|
+ if int(time.time()) - publish_time <= 3600*24*30:
|
|
|
+ cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine)
|
|
|
+ else:
|
|
|
+ Common.logger(log_type, crawler).info('发布时间超过30天\n')
|
|
|
+ return
|
|
|
+ except Exception as e:
|
|
|
+ Common.logger(log_type, crawler).error(f"get_videos异常:{e}\n")
|
|
|
|
|
|
@classmethod
|
|
|
def get_video_info(cls, log_type, crawler, out_uid, video_id, machine):
|
|
|
- url = "https://www.youtube.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
|
|
|
- payload = json.dumps({
|
|
|
- "context": {
|
|
|
- "client": {
|
|
|
- "hl": "zh-CN",
|
|
|
- "gl": "US",
|
|
|
- "remoteHost": "38.93.247.21",
|
|
|
- "deviceMake": "Apple",
|
|
|
- "deviceModel": "",
|
|
|
- "visitorData": "CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D",
|
|
|
- "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
|
|
|
- "clientName": "WEB",
|
|
|
- "clientVersion": "2.20230201.01.00",
|
|
|
- "osName": "Macintosh",
|
|
|
- "osVersion": "10_15_7",
|
|
|
- "originalUrl": f"https://www.youtube.com/watch?v={video_id}",
|
|
|
- "platform": "DESKTOP",
|
|
|
- "clientFormFactor": "UNKNOWN_FORM_FACTOR",
|
|
|
- "configInfo": {
|
|
|
- "appInstallData": "COTOh58GEPuj_hIQ1-SuBRC4i64FEMzfrgUQgt2uBRCi7K4FEOLUrgUQzPWuBRCKgK8FEOSg_hIQtpz-EhDa6a4FEP7urgUQieiuBRDn964FELjUrgUQlPiuBRCH3a4FELfgrgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
|
|
|
+ try:
|
|
|
+ url = "https://www.youtube.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
|
|
|
+ payload = json.dumps({
|
|
|
+ "context": {
|
|
|
+ "client": {
|
|
|
+ "hl": "zh-CN",
|
|
|
+ "gl": "US",
|
|
|
+ "remoteHost": "38.93.247.21",
|
|
|
+ "deviceMake": "Apple",
|
|
|
+ "deviceModel": "",
|
|
|
+ "visitorData": "CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D",
|
|
|
+ "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36,gzip(gfe)",
|
|
|
+ "clientName": "WEB",
|
|
|
+ "clientVersion": "2.20230201.01.00",
|
|
|
+ "osName": "Macintosh",
|
|
|
+ "osVersion": "10_15_7",
|
|
|
+ "originalUrl": f"https://www.youtube.com/watch?v={video_id}",
|
|
|
+ "platform": "DESKTOP",
|
|
|
+ "clientFormFactor": "UNKNOWN_FORM_FACTOR",
|
|
|
+ "configInfo": {
|
|
|
+ "appInstallData": "COTOh58GEPuj_hIQ1-SuBRC4i64FEMzfrgUQgt2uBRCi7K4FEOLUrgUQzPWuBRCKgK8FEOSg_hIQtpz-EhDa6a4FEP7urgUQieiuBRDn964FELjUrgUQlPiuBRCH3a4FELfgrgUQ76P-EhDJya4FEJan_hIQkfj8Eg%3D%3D"
|
|
|
+ },
|
|
|
+ "timeZone": "Asia/Shanghai",
|
|
|
+ "browserName": "Chrome",
|
|
|
+ "browserVersion": "109.0.0.0",
|
|
|
+ "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
|
|
+ "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOTOh58GGOmU7Z4G",
|
|
|
+ "screenWidthPoints": 1037,
|
|
|
+ "screenHeightPoints": 969,
|
|
|
+ "screenPixelDensity": 1,
|
|
|
+ "screenDensityFloat": 1,
|
|
|
+ "utcOffsetMinutes": 480,
|
|
|
+ "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
|
|
|
+ "memoryTotalKbytes": "8000000",
|
|
|
+ "clientScreen": "WATCH",
|
|
|
+ "mainAppWebInfo": {
|
|
|
+ "graftUrl": f"/watch?v={video_id}",
|
|
|
+ "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
|
|
|
+ "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
|
|
|
+ "isWebNativeShareAvailable": True
|
|
|
+ }
|
|
|
},
|
|
|
- "timeZone": "Asia/Shanghai",
|
|
|
- "browserName": "Chrome",
|
|
|
- "browserVersion": "109.0.0.0",
|
|
|
- "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
|
|
- "deviceExperimentId": "ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOTOh58GGOmU7Z4G",
|
|
|
- "screenWidthPoints": 1037,
|
|
|
- "screenHeightPoints": 969,
|
|
|
- "screenPixelDensity": 1,
|
|
|
- "screenDensityFloat": 1,
|
|
|
- "utcOffsetMinutes": 480,
|
|
|
- "userInterfaceTheme": "USER_INTERFACE_THEME_LIGHT",
|
|
|
- "memoryTotalKbytes": "8000000",
|
|
|
- "clientScreen": "WATCH",
|
|
|
- "mainAppWebInfo": {
|
|
|
- "graftUrl": f"/watch?v={video_id}",
|
|
|
- "pwaInstallabilityStatus": "PWA_INSTALLABILITY_STATUS_CAN_BE_INSTALLED",
|
|
|
- "webDisplayMode": "WEB_DISPLAY_MODE_FULLSCREEN",
|
|
|
- "isWebNativeShareAvailable": True
|
|
|
+ "user": {
|
|
|
+ "lockedSafetyMode": False
|
|
|
+ },
|
|
|
+ "request": {
|
|
|
+ "useSsl": True,
|
|
|
+ "internalExperimentFlags": [],
|
|
|
+ "consistencyTokenJars": []
|
|
|
+ },
|
|
|
+ "clickTracking": {
|
|
|
+ "clickTrackingParams": "CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0="
|
|
|
+ },
|
|
|
+ "adSignalsInfo": {
|
|
|
+ "params": [
|
|
|
+ {
|
|
|
+ "key": "dt",
|
|
|
+ "value": "1675749222611"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "flash",
|
|
|
+ "value": "0"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "frm",
|
|
|
+ "value": "0"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "u_tz",
|
|
|
+ "value": "480"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "u_his",
|
|
|
+ "value": "3"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "u_h",
|
|
|
+ "value": "1080"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "u_w",
|
|
|
+ "value": "1920"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "u_ah",
|
|
|
+ "value": "1080"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "u_aw",
|
|
|
+ "value": "1920"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "u_cd",
|
|
|
+ "value": "24"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "bc",
|
|
|
+ "value": "31"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "bih",
|
|
|
+ "value": "969"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "biw",
|
|
|
+ "value": "1037"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "brdim",
|
|
|
+ "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,1037,969"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "vis",
|
|
|
+ "value": "1"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "wgl",
|
|
|
+ "value": "true"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "key": "ca_type",
|
|
|
+ "value": "image"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "bid": "ANyPxKop8SijebwUCq4ZfKbJwlSjVQa_RTdS6c6a6WPYpCKnxpWCJ33B1SzRuSXjSfH9O2MhURebAs0CngRg6B4nOjBpeJDKgA"
|
|
|
}
|
|
|
},
|
|
|
- "user": {
|
|
|
- "lockedSafetyMode": False
|
|
|
- },
|
|
|
- "request": {
|
|
|
- "useSsl": True,
|
|
|
- "internalExperimentFlags": [],
|
|
|
- "consistencyTokenJars": []
|
|
|
- },
|
|
|
- "clickTracking": {
|
|
|
- "clickTrackingParams": "CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0="
|
|
|
- },
|
|
|
- "adSignalsInfo": {
|
|
|
- "params": [
|
|
|
- {
|
|
|
- "key": "dt",
|
|
|
- "value": "1675749222611"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "flash",
|
|
|
- "value": "0"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "frm",
|
|
|
- "value": "0"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "u_tz",
|
|
|
- "value": "480"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "u_his",
|
|
|
- "value": "3"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "u_h",
|
|
|
- "value": "1080"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "u_w",
|
|
|
- "value": "1920"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "u_ah",
|
|
|
- "value": "1080"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "u_aw",
|
|
|
- "value": "1920"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "u_cd",
|
|
|
- "value": "24"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "bc",
|
|
|
- "value": "31"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "bih",
|
|
|
- "value": "969"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "biw",
|
|
|
- "value": "1037"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "brdim",
|
|
|
- "value": "-269,-1080,-269,-1080,1920,-1080,1920,1080,1037,969"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "vis",
|
|
|
- "value": "1"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "wgl",
|
|
|
- "value": "true"
|
|
|
- },
|
|
|
- {
|
|
|
- "key": "ca_type",
|
|
|
- "value": "image"
|
|
|
+ "videoId": str(video_id),
|
|
|
+ "playbackContext": {
|
|
|
+ "contentPlaybackContext": {
|
|
|
+ "currentUrl": f"/watch?v={video_id}",
|
|
|
+ "vis": 0,
|
|
|
+ "splay": False,
|
|
|
+ "autoCaptionsDefaultOn": False,
|
|
|
+ "autonavState": "STATE_NONE",
|
|
|
+ "html5Preference": "HTML5_PREF_WANTS",
|
|
|
+ "signatureTimestamp": 19394,
|
|
|
+ "referer": f"https://www.youtube.com/watch?v={video_id}",
|
|
|
+ "lactMilliseconds": "-1",
|
|
|
+ "watchAmbientModeContext": {
|
|
|
+ "watchAmbientModeEnabled": True
|
|
|
}
|
|
|
- ],
|
|
|
- "bid": "ANyPxKop8SijebwUCq4ZfKbJwlSjVQa_RTdS6c6a6WPYpCKnxpWCJ33B1SzRuSXjSfH9O2MhURebAs0CngRg6B4nOjBpeJDKgA"
|
|
|
- }
|
|
|
- },
|
|
|
- "videoId": str(video_id),
|
|
|
- "playbackContext": {
|
|
|
- "contentPlaybackContext": {
|
|
|
- "currentUrl": f"/watch?v={video_id}",
|
|
|
- "vis": 0,
|
|
|
- "splay": False,
|
|
|
- "autoCaptionsDefaultOn": False,
|
|
|
- "autonavState": "STATE_NONE",
|
|
|
- "html5Preference": "HTML5_PREF_WANTS",
|
|
|
- "signatureTimestamp": 19394,
|
|
|
- "referer": f"https://www.youtube.com/watch?v={video_id}",
|
|
|
- "lactMilliseconds": "-1",
|
|
|
- "watchAmbientModeContext": {
|
|
|
- "watchAmbientModeEnabled": True
|
|
|
}
|
|
|
- }
|
|
|
- },
|
|
|
- "racyCheckOk": False,
|
|
|
- "contentCheckOk": False
|
|
|
- })
|
|
|
- headers = {
|
|
|
- 'authority': 'www.youtube.com',
|
|
|
- 'accept': '*/*',
|
|
|
- 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
|
|
|
- 'cache-control': 'no-cache',
|
|
|
- 'content-type': 'application/json',
|
|
|
- 'cookie': f'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-180dxzo=itct=CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D&csn=MC41MTQ1NTQzMTE3NTA4MjY0&endpoint=%7B%22clickTrackingParams%22%3A%22CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2Fwatch%3Fv%3D{video_id}%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_WATCH%22%2C%22rootVe%22%3A3832%7D%7D%2C%22watchEndpoint%22%3A%7B%22videoId%22%3A%22{video_id}%22%2C%22nofollow%22%3Atrue%2C%22watchEndpointSupportedOnesieConfig%22%3A%7B%22html5PlaybackOnesieConfig%22%3A%7B%22commonConfig%22%3A%7B%22url%22%3A%22https%3A%2F%2Frr5---sn-nx5s7n76.googlevideo.com%2Finitplayback%3Fsource%3Dyoutube%26oeis%3D1%26c%3DWEB%26oad%3D3200%26ovd%3D3200%26oaad%3D11000%26oavd%3D11000%26ocs%3D700%26oewis%3D1%26oputc%3D1%26ofpcc%3D1%26msp%3D1%26odepv%3D1%26id%3D38654ad085c12212%26ip%3D38.93.247.21%26initcwndbps%3D11346250%26mt%3D1675748964%26oweuc%3D%26pxtags%3DCg4KAnR4EggyNDQ1MTI4OA%26rxtags%3DCg4KAnR4EggyNDQ1MTI4Ng%252CCg4KAnR4EggyNDQ1MTI4Nw%252CCg4KAnR4EggyNDQ1MTI4OA%252CCg4KAnR4EggyNDQ1MTI4OQ%22%7D%7D%7D%7D%7D',
|
|
|
- 'origin': 'https://www.youtube.com',
|
|
|
- 'pragma': 'no-cache',
|
|
|
- 'referer': f'https://www.youtube.com/watch?v={video_id}',
|
|
|
- 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
|
|
|
- 'sec-ch-ua-arch': '"arm"',
|
|
|
- 'sec-ch-ua-bitness': '"64"',
|
|
|
- 'sec-ch-ua-full-version': '"109.0.1518.52"',
|
|
|
- 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
|
|
|
- 'sec-ch-ua-mobile': '?0',
|
|
|
- 'sec-ch-ua-model': '',
|
|
|
- 'sec-ch-ua-platform': '"macOS"',
|
|
|
- 'sec-ch-ua-platform-version': '"12.4.0"',
|
|
|
- 'sec-ch-ua-wow64': '?0',
|
|
|
- 'sec-fetch-dest': 'empty',
|
|
|
- 'sec-fetch-mode': 'same-origin',
|
|
|
- 'sec-fetch-site': 'same-origin',
|
|
|
- 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
|
|
|
- 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D',
|
|
|
- 'x-youtube-bootstrap-logged-in': 'false',
|
|
|
- 'x-youtube-client-name': '1',
|
|
|
- 'x-youtube-client-version': '2.20230201.01.00'
|
|
|
- }
|
|
|
- response = requests.post(url=url, headers=headers, data=payload)
|
|
|
- # Common.logger(log_type, crawler).info(f"get_video_info_response:{response.json()}\n")
|
|
|
- if response.status_code != 200:
|
|
|
- Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.text}\n")
|
|
|
- elif 'streamingData' not in response.json():
|
|
|
- Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n")
|
|
|
- elif 'videoDetails' not in response.json():
|
|
|
- Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n")
|
|
|
- elif 'microformat' not in response.json():
|
|
|
- Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n")
|
|
|
- else:
|
|
|
- playerMicroformatRenderer = response.json()['microformat']['playerMicroformatRenderer']
|
|
|
- videoDetails = response.json()['videoDetails']
|
|
|
- streamingData = response.json()['streamingData']
|
|
|
-
|
|
|
- # video_title
|
|
|
- if 'title' not in videoDetails:
|
|
|
- video_title = ''
|
|
|
+ },
|
|
|
+ "racyCheckOk": False,
|
|
|
+ "contentCheckOk": False
|
|
|
+ })
|
|
|
+ headers = {
|
|
|
+ 'authority': 'www.youtube.com',
|
|
|
+ 'accept': '*/*',
|
|
|
+ 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
|
|
|
+ 'cache-control': 'no-cache',
|
|
|
+ 'content-type': 'application/json',
|
|
|
+ 'cookie': f'VISITOR_INFO1_LIVE=kh6_Vpx5wHY; YSC=UupqFrWvAR0; DEVICE_INFO=ChxOekU1TlRReU5qWTBOVFExTVRRNU5qRTBOdz09EOmU7Z4GGOmU7Z4G; PREF=tz=Asia.Shanghai; ST-180dxzo=itct=CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D&csn=MC41MTQ1NTQzMTE3NTA4MjY0&endpoint=%7B%22clickTrackingParams%22%3A%22CIwBEKQwGAYiEwipncqx3IL9AhXs4cQKHbKZDO4yB3JlbGF0ZWRInsS1qbGFtIlUmgEFCAEQ-B0%3D%22%2C%22commandMetadata%22%3A%7B%22webCommandMetadata%22%3A%7B%22url%22%3A%22%2Fwatch%3Fv%3D{video_id}%22%2C%22webPageType%22%3A%22WEB_PAGE_TYPE_WATCH%22%2C%22rootVe%22%3A3832%7D%7D%2C%22watchEndpoint%22%3A%7B%22videoId%22%3A%22{video_id}%22%2C%22nofollow%22%3Atrue%2C%22watchEndpointSupportedOnesieConfig%22%3A%7B%22html5PlaybackOnesieConfig%22%3A%7B%22commonConfig%22%3A%7B%22url%22%3A%22https%3A%2F%2Frr5---sn-nx5s7n76.googlevideo.com%2Finitplayback%3Fsource%3Dyoutube%26oeis%3D1%26c%3DWEB%26oad%3D3200%26ovd%3D3200%26oaad%3D11000%26oavd%3D11000%26ocs%3D700%26oewis%3D1%26oputc%3D1%26ofpcc%3D1%26msp%3D1%26odepv%3D1%26id%3D38654ad085c12212%26ip%3D38.93.247.21%26initcwndbps%3D11346250%26mt%3D1675748964%26oweuc%3D%26pxtags%3DCg4KAnR4EggyNDQ1MTI4OA%26rxtags%3DCg4KAnR4EggyNDQ1MTI4Ng%252CCg4KAnR4EggyNDQ1MTI4Nw%252CCg4KAnR4EggyNDQ1MTI4OA%252CCg4KAnR4EggyNDQ1MTI4OQ%22%7D%7D%7D%7D%7D',
|
|
|
+ 'origin': 'https://www.youtube.com',
|
|
|
+ 'pragma': 'no-cache',
|
|
|
+ 'referer': f'https://www.youtube.com/watch?v={video_id}',
|
|
|
+ 'sec-ch-ua': '"Not_A Brand";v="99", "Chromium";v="109", "Google Chrome";v="109.0.5414.87"',
|
|
|
+ 'sec-ch-ua-arch': '"arm"',
|
|
|
+ 'sec-ch-ua-bitness': '"64"',
|
|
|
+ 'sec-ch-ua-full-version': '"109.0.1518.52"',
|
|
|
+ 'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.52", "Chromium";v="109.0.5414.87"',
|
|
|
+ 'sec-ch-ua-mobile': '?0',
|
|
|
+ 'sec-ch-ua-model': '',
|
|
|
+ 'sec-ch-ua-platform': '"macOS"',
|
|
|
+ 'sec-ch-ua-platform-version': '"12.4.0"',
|
|
|
+ 'sec-ch-ua-wow64': '?0',
|
|
|
+ 'sec-fetch-dest': 'empty',
|
|
|
+ 'sec-fetch-mode': 'same-origin',
|
|
|
+ 'sec-fetch-site': 'same-origin',
|
|
|
+ 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
|
|
|
+ 'x-goog-visitor-id': 'CgtraDZfVnB4NXdIWSjkzoefBg%3D%3D',
|
|
|
+ 'x-youtube-bootstrap-logged-in': 'false',
|
|
|
+ 'x-youtube-client-name': '1',
|
|
|
+ 'x-youtube-client-version': '2.20230201.01.00'
|
|
|
+ }
|
|
|
+ response = requests.post(url=url, headers=headers, data=payload)
|
|
|
+ # Common.logger(log_type, crawler).info(f"get_video_info_response:{response.json()}\n")
|
|
|
+ if response.status_code != 200:
|
|
|
+ Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.text}\n")
|
|
|
+ elif 'streamingData' not in response.json():
|
|
|
+ Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n")
|
|
|
+ elif 'videoDetails' not in response.json():
|
|
|
+ Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n")
|
|
|
+ elif 'microformat' not in response.json():
|
|
|
+ Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.json()}\n")
|
|
|
else:
|
|
|
- video_title = videoDetails['title']
|
|
|
- if Translate.is_contains_chinese(video_title) is False:
|
|
|
- video_title = Translate.google_translate(video_title, machine) # 自动翻译标题为中文
|
|
|
+ playerMicroformatRenderer = response.json()['microformat']['playerMicroformatRenderer']
|
|
|
+ videoDetails = response.json()['videoDetails']
|
|
|
+ streamingData = response.json()['streamingData']
|
|
|
|
|
|
- if 'lengthSeconds' not in videoDetails:
|
|
|
- duration = 0
|
|
|
- else:
|
|
|
- duration = int(videoDetails['lengthSeconds'])
|
|
|
+ # video_title
|
|
|
+ if 'title' not in videoDetails:
|
|
|
+ video_title = ''
|
|
|
+ else:
|
|
|
+ video_title = videoDetails['title']
|
|
|
+ if Translate.is_contains_chinese(video_title) is False:
|
|
|
+ video_title = Translate.google_translate(video_title, machine) # 自动翻译标题为中文
|
|
|
|
|
|
- # play_cnt
|
|
|
- if 'viewCount' not in videoDetails:
|
|
|
- play_cnt = 0
|
|
|
- else:
|
|
|
- play_cnt = int(videoDetails['viewCount'])
|
|
|
+ if 'lengthSeconds' not in videoDetails:
|
|
|
+ duration = 0
|
|
|
+ else:
|
|
|
+ duration = int(videoDetails['lengthSeconds'])
|
|
|
|
|
|
- # publish_time
|
|
|
- if 'publishDate' not in playerMicroformatRenderer:
|
|
|
- publish_time = ''
|
|
|
- else:
|
|
|
- publish_time = playerMicroformatRenderer['publishDate']
|
|
|
+ # play_cnt
|
|
|
+ if 'viewCount' not in videoDetails:
|
|
|
+ play_cnt = 0
|
|
|
+ else:
|
|
|
+ play_cnt = int(videoDetails['viewCount'])
|
|
|
|
|
|
- # user_name
|
|
|
- if 'author' not in videoDetails:
|
|
|
- user_name = ''
|
|
|
- else:
|
|
|
- user_name = videoDetails['author']
|
|
|
+ # publish_time
|
|
|
+ if 'publishDate' not in playerMicroformatRenderer:
|
|
|
+ publish_time = ''
|
|
|
+ else:
|
|
|
+ publish_time = playerMicroformatRenderer['publishDate']
|
|
|
|
|
|
- # cover_url
|
|
|
- if 'thumbnail' not in videoDetails:
|
|
|
- cover_url = ''
|
|
|
- elif 'thumbnails' not in videoDetails['thumbnail']:
|
|
|
- cover_url = ''
|
|
|
- elif len(videoDetails['thumbnail']['thumbnails']) == 0:
|
|
|
- cover_url = ''
|
|
|
- elif 'url' not in videoDetails['thumbnail']['thumbnails'][-1]:
|
|
|
- cover_url = ''
|
|
|
- else:
|
|
|
- cover_url = videoDetails['thumbnail']['thumbnails'][-1]['url']
|
|
|
+ # user_name
|
|
|
+ if 'author' not in videoDetails:
|
|
|
+ user_name = ''
|
|
|
+ else:
|
|
|
+ user_name = videoDetails['author']
|
|
|
|
|
|
- # video_url
|
|
|
- if 'formats' not in streamingData:
|
|
|
- video_url = ''
|
|
|
- elif len(streamingData['formats']) == 0:
|
|
|
- video_url = ''
|
|
|
- elif 'url' not in streamingData['formats'][-1]:
|
|
|
- video_url = ''
|
|
|
- else:
|
|
|
- video_url = streamingData['formats'][-1]['url']
|
|
|
+ # cover_url
|
|
|
+ if 'thumbnail' not in videoDetails:
|
|
|
+ cover_url = ''
|
|
|
+ elif 'thumbnails' not in videoDetails['thumbnail']:
|
|
|
+ cover_url = ''
|
|
|
+ elif len(videoDetails['thumbnail']['thumbnails']) == 0:
|
|
|
+ cover_url = ''
|
|
|
+ elif 'url' not in videoDetails['thumbnail']['thumbnails'][-1]:
|
|
|
+ cover_url = ''
|
|
|
+ else:
|
|
|
+ cover_url = videoDetails['thumbnail']['thumbnails'][-1]['url']
|
|
|
+
|
|
|
+ # video_url
|
|
|
+ if 'formats' not in streamingData:
|
|
|
+ video_url = ''
|
|
|
+ elif len(streamingData['formats']) == 0:
|
|
|
+ video_url = ''
|
|
|
+ elif 'url' not in streamingData['formats'][-1]:
|
|
|
+ video_url = ''
|
|
|
+ else:
|
|
|
+ video_url = streamingData['formats'][-1]['url']
|
|
|
|
|
|
- Common.logger(log_type, crawler).info(f'video_title:{video_title}')
|
|
|
- Common.logger(log_type, crawler).info(f'video_id:{video_id}')
|
|
|
- Common.logger(log_type, crawler).info(f'play_cnt:{play_cnt}')
|
|
|
- Common.logger(log_type, crawler).info(f'publish_time:{publish_time}')
|
|
|
- Common.logger(log_type, crawler).info(f'user_name:{user_name}')
|
|
|
- Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
|
|
|
- Common.logger(log_type, crawler).info(f'video_url:{video_url}')
|
|
|
+ Common.logger(log_type, crawler).info(f'video_title:{video_title}')
|
|
|
+ Common.logger(log_type, crawler).info(f'video_id:{video_id}')
|
|
|
+ Common.logger(log_type, crawler).info(f'play_cnt:{play_cnt}')
|
|
|
+ Common.logger(log_type, crawler).info(f'publish_time:{publish_time}')
|
|
|
+ Common.logger(log_type, crawler).info(f'user_name:{user_name}')
|
|
|
+ Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
|
|
|
+ Common.logger(log_type, crawler).info(f'video_url:{video_url}')
|
|
|
|
|
|
- video_dict = {
|
|
|
- 'video_title': video_title,
|
|
|
- 'video_id': video_id,
|
|
|
- 'duration': duration,
|
|
|
- 'play_cnt': play_cnt,
|
|
|
- 'publish_time': publish_time,
|
|
|
- 'user_name': user_name,
|
|
|
- 'out_uid': out_uid,
|
|
|
- 'cover_url': cover_url,
|
|
|
- 'video_url': video_url,
|
|
|
- }
|
|
|
- return video_dict
|
|
|
+ video_dict = {
|
|
|
+ 'video_title': video_title,
|
|
|
+ 'video_id': video_id,
|
|
|
+ 'duration': duration,
|
|
|
+ 'play_cnt': play_cnt,
|
|
|
+ 'publish_time': publish_time,
|
|
|
+ 'user_name': user_name,
|
|
|
+ 'out_uid': out_uid,
|
|
|
+ 'cover_url': cover_url,
|
|
|
+ 'video_url': video_url,
|
|
|
+ }
|
|
|
+ return video_dict
|
|
|
+ except Exception as e:
|
|
|
+ Common.logger(log_type, crawler).error(f"get_video_info异常:{e}\n")
|
|
|
|
|
|
@classmethod
|
|
|
def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine):
|
|
|
- sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_dict['video_id']}" """
|
|
|
- repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
|
|
|
- if video_dict['video_title'] == '' or video_dict['video_url'] == '':
|
|
|
- Common.logger(log_type, crawler).info('无效视频\n')
|
|
|
- elif video_dict['duration'] > 600 or video_dict['duration'] < 60:
|
|
|
- Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足规则\n")
|
|
|
- elif repeat_video is not None and len(repeat_video) != 0:
|
|
|
- Common.logger(log_type, crawler).info('视频已下载\n')
|
|
|
- else:
|
|
|
- # 下载视频
|
|
|
- Common.logger(log_type, crawler).info('开始下载视频...')
|
|
|
- Common.download_method(log_type, crawler, 'video', video_dict['video_title'], video_dict['video_url'])
|
|
|
- ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/")
|
|
|
- video_width = ffmpeg_dict['width']
|
|
|
- video_height = ffmpeg_dict['height']
|
|
|
- duration = int(ffmpeg_dict['duration'])
|
|
|
- video_size = ffmpeg_dict['size']
|
|
|
-
|
|
|
- Common.logger(log_type, crawler).info(f'video_width:{video_width}')
|
|
|
- Common.logger(log_type, crawler).info(f'video_height:{video_height}')
|
|
|
- Common.logger(log_type, crawler).info(f'duration:{duration}')
|
|
|
- Common.logger(log_type, crawler).info(f'video_size:{video_size}\n')
|
|
|
-
|
|
|
- video_dict['video_width'] = video_width
|
|
|
- video_dict['video_height'] = video_height
|
|
|
- video_dict['duration'] = duration
|
|
|
- video_dict['comment_cnt'] = 0
|
|
|
- video_dict['like_cnt'] = 0
|
|
|
- video_dict['share_cnt'] = 0
|
|
|
- video_dict['avatar_url'] = video_dict['cover_url']
|
|
|
- video_dict['session'] = f'youtube{int(time.time())}'
|
|
|
- rule='1,2'
|
|
|
- if duration < 60 or duration > 600:
|
|
|
- # 删除视频文件夹
|
|
|
- shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
|
|
|
- Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足抓取规则,删除成功\n")
|
|
|
- return
|
|
|
- elif video_size == 0 or duration == 0 or video_size is None or duration is None:
|
|
|
- # 删除视频文件夹
|
|
|
- shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
|
|
|
- Common.logger(log_type, crawler).info(f"视频下载出错,删除成功\n")
|
|
|
- return
|
|
|
+ try:
|
|
|
+ sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_dict['video_id']}" """
|
|
|
+ repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
|
|
|
+ if video_dict['video_title'] == '' or video_dict['video_url'] == '':
|
|
|
+ Common.logger(log_type, crawler).info('无效视频\n')
|
|
|
+ elif video_dict['duration'] > 600 or video_dict['duration'] < 60:
|
|
|
+ Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足规则\n")
|
|
|
+ elif repeat_video is not None and len(repeat_video) != 0:
|
|
|
+ Common.logger(log_type, crawler).info('视频已下载\n')
|
|
|
else:
|
|
|
- # 下载封面
|
|
|
- Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url'])
|
|
|
- # 保存视频文本信息
|
|
|
- Common.save_video_info(log_type, crawler, video_dict)
|
|
|
+ # 下载视频
|
|
|
+ Common.logger(log_type, crawler).info('开始下载视频...')
|
|
|
+ Common.download_method(log_type, crawler, 'video', video_dict['video_title'], video_dict['video_url'])
|
|
|
+ ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/")
|
|
|
+ video_width = ffmpeg_dict['width']
|
|
|
+ video_height = ffmpeg_dict['height']
|
|
|
+ duration = int(ffmpeg_dict['duration'])
|
|
|
+ video_size = ffmpeg_dict['size']
|
|
|
+
|
|
|
+ Common.logger(log_type, crawler).info(f'video_width:{video_width}')
|
|
|
+ Common.logger(log_type, crawler).info(f'video_height:{video_height}')
|
|
|
+ Common.logger(log_type, crawler).info(f'duration:{duration}')
|
|
|
+ Common.logger(log_type, crawler).info(f'video_size:{video_size}\n')
|
|
|
|
|
|
- # 上传视频
|
|
|
- Common.logger(log_type, crawler).info(f"开始上传视频")
|
|
|
- if env == 'dev':
|
|
|
- our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
|
|
|
- our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
|
|
|
+ video_dict['video_width'] = video_width
|
|
|
+ video_dict['video_height'] = video_height
|
|
|
+ video_dict['duration'] = duration
|
|
|
+ video_dict['comment_cnt'] = 0
|
|
|
+ video_dict['like_cnt'] = 0
|
|
|
+ video_dict['share_cnt'] = 0
|
|
|
+ video_dict['avatar_url'] = video_dict['cover_url']
|
|
|
+ video_dict['session'] = f'youtube{int(time.time())}'
|
|
|
+ rule='1,2'
|
|
|
+ if duration < 60 or duration > 600:
|
|
|
+ # 删除视频文件夹
|
|
|
+ shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
|
|
|
+ Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足抓取规则,删除成功\n")
|
|
|
+ return
|
|
|
+ elif video_size == 0 or duration == 0 or video_size is None or duration is None:
|
|
|
+ # 删除视频文件夹
|
|
|
+ shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
|
|
|
+ Common.logger(log_type, crawler).info(f"视频下载出错,删除成功\n")
|
|
|
+ return
|
|
|
else:
|
|
|
- our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
|
|
|
- our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
|
|
|
- Common.logger(log_type, crawler).info("视频上传完成")
|
|
|
+ # 下载封面
|
|
|
+ Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url'])
|
|
|
+ # 保存视频文本信息
|
|
|
+ Common.save_video_info(log_type, crawler, video_dict)
|
|
|
+
|
|
|
+ # 上传视频
|
|
|
+ Common.logger(log_type, crawler).info(f"开始上传视频")
|
|
|
+ if env == 'dev':
|
|
|
+ our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
|
|
|
+ our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
|
|
|
+ else:
|
|
|
+ our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
|
|
|
+ our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
|
|
|
+ Common.logger(log_type, crawler).info("视频上传完成")
|
|
|
|
|
|
- # 视频信息保存至飞书
|
|
|
- Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2)
|
|
|
- # 视频ID工作表,首行写入数据
|
|
|
- upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
|
|
|
- values = [[upload_time,
|
|
|
- "定向榜",
|
|
|
- video_dict['video_id'],
|
|
|
- video_dict['video_title'],
|
|
|
- our_video_link,
|
|
|
- video_dict['play_cnt'],
|
|
|
- video_dict['duration'],
|
|
|
- f'{video_width}*{video_height}',
|
|
|
- video_dict['publish_time'],
|
|
|
- video_dict['user_name'],
|
|
|
- video_dict['cover_url'],
|
|
|
- video_dict['video_url']
|
|
|
- ]]
|
|
|
- time.sleep(1)
|
|
|
- Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values)
|
|
|
- Common.logger(log_type, crawler).info('视频信息写入定向_已下载表成功\n')
|
|
|
+ # 视频信息保存至飞书
|
|
|
+ Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2)
|
|
|
+ # 视频ID工作表,首行写入数据
|
|
|
+ upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
|
|
|
+ values = [[upload_time,
|
|
|
+ "定向榜",
|
|
|
+ video_dict['video_id'],
|
|
|
+ video_dict['video_title'],
|
|
|
+ our_video_link,
|
|
|
+ video_dict['play_cnt'],
|
|
|
+ video_dict['duration'],
|
|
|
+ f'{video_width}*{video_height}',
|
|
|
+ video_dict['publish_time'],
|
|
|
+ video_dict['user_name'],
|
|
|
+ video_dict['cover_url'],
|
|
|
+ video_dict['video_url']
|
|
|
+ ]]
|
|
|
+ time.sleep(1)
|
|
|
+ Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values)
|
|
|
+ Common.logger(log_type, crawler).info('视频信息写入定向_已下载表成功\n')
|
|
|
|
|
|
- # 视频信息保存数据库
|
|
|
- sql = f""" insert into crawler_video(video_id,
|
|
|
- user_id,
|
|
|
- out_user_id,
|
|
|
- platform,
|
|
|
- strategy,
|
|
|
- out_video_id,
|
|
|
- video_title,
|
|
|
- cover_url,
|
|
|
- video_url,
|
|
|
- duration,
|
|
|
- publish_time,
|
|
|
- play_cnt,
|
|
|
- crawler_rule,
|
|
|
- width,
|
|
|
- height)
|
|
|
- values({our_video_id},
|
|
|
- "{our_uid}",
|
|
|
- "{video_dict['out_uid']}",
|
|
|
- "{cls.platform}",
|
|
|
- "定向爬虫策略",
|
|
|
- "{video_dict['video_id']}",
|
|
|
- "{video_dict['video_title']}",
|
|
|
- "{video_dict['cover_url']}",
|
|
|
- "{video_dict['video_url']}",
|
|
|
- {int(duration)},
|
|
|
- "{video_dict['publish_time']}",
|
|
|
- {int(video_dict['play_cnt'])},
|
|
|
- "{rule}",
|
|
|
- {int(video_width)},
|
|
|
- {int(video_height)}) """
|
|
|
- MysqlHelper.update_values(log_type, crawler, sql, env, machine)
|
|
|
- Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
|
|
|
+ # 视频信息保存数据库
|
|
|
+ sql = f""" insert into crawler_video(video_id,
|
|
|
+ user_id,
|
|
|
+ out_user_id,
|
|
|
+ platform,
|
|
|
+ strategy,
|
|
|
+ out_video_id,
|
|
|
+ video_title,
|
|
|
+ cover_url,
|
|
|
+ video_url,
|
|
|
+ duration,
|
|
|
+ publish_time,
|
|
|
+ play_cnt,
|
|
|
+ crawler_rule,
|
|
|
+ width,
|
|
|
+ height)
|
|
|
+ values({our_video_id},
|
|
|
+ "{our_uid}",
|
|
|
+ "{video_dict['out_uid']}",
|
|
|
+ "{cls.platform}",
|
|
|
+ "定向爬虫策略",
|
|
|
+ "{video_dict['video_id']}",
|
|
|
+ "{video_dict['video_title']}",
|
|
|
+ "{video_dict['cover_url']}",
|
|
|
+ "{video_dict['video_url']}",
|
|
|
+ {int(duration)},
|
|
|
+ "{video_dict['publish_time']}",
|
|
|
+ {int(video_dict['play_cnt'])},
|
|
|
+ "{rule}",
|
|
|
+ {int(video_width)},
|
|
|
+ {int(video_height)}) """
|
|
|
+ MysqlHelper.update_values(log_type, crawler, sql, env, machine)
|
|
|
+ Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
|
|
|
+ except Exception as e:
|
|
|
+ Common.logger(log_type, crawler).info(f"download_publish异常:{e}\n")
|
|
|
|
|
|
@classmethod
|
|
|
def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
|
|
|
- user_list = cls.get_user_from_feishu(log_type, crawler, 'c467d7', env, machine)
|
|
|
- if len(user_list) == 0:
|
|
|
- Common.logger(log_type, crawler).warning('用户列表为空\n')
|
|
|
- else:
|
|
|
- for user_dict in user_list:
|
|
|
- out_uid = user_dict['out_user_id']
|
|
|
- user_name = user_dict['out_user_name']
|
|
|
- browse_id = user_dict['out_browse_id']
|
|
|
- our_uid = user_dict['our_user_id']
|
|
|
- Common.logger(log_type, crawler).info(f'获取 {user_name} 主页视频\n')
|
|
|
- cls.get_videos(log_type, crawler, strategy, oss_endpoint, env, browse_id, out_uid, our_uid, machine)
|
|
|
- Common.logger(log_type, crawler).info('休眠 10 秒')
|
|
|
- time.sleep(10)
|
|
|
- cls.continuation = ''
|
|
|
+ try:
|
|
|
+ user_list = cls.get_user_from_feishu(log_type, crawler, 'c467d7', env, machine)
|
|
|
+ if len(user_list) == 0:
|
|
|
+ Common.logger(log_type, crawler).warning('用户列表为空\n')
|
|
|
+ else:
|
|
|
+ for user_dict in user_list:
|
|
|
+ out_uid = user_dict['out_user_id']
|
|
|
+ user_name = user_dict['out_user_name']
|
|
|
+ browse_id = user_dict['out_browse_id']
|
|
|
+ our_uid = user_dict['our_user_id']
|
|
|
+ Common.logger(log_type, crawler).info(f'获取 {user_name} 主页视频\n')
|
|
|
+ cls.get_videos(log_type, crawler, strategy, oss_endpoint, env, browse_id, out_uid, our_uid, machine)
|
|
|
+ Common.logger(log_type, crawler).info('休眠 10 秒')
|
|
|
+ time.sleep(10)
|
|
|
+ cls.continuation = ''
|
|
|
+ except Exception as e:
|
|
|
+ Common.logger(log_type, crawler).error(f"get_follow_videos异常:{e}\n")
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|