@@ -4,10 +4,8 @@
 import json
 import os
 import random
-import shutil
 import sys
 import time
-from hashlib import md5
 import requests
 import urllib3
 from fake_useragent import FakeUserAgent
@@ -21,7 +19,9 @@ from common.publish import Publish
 from common.scheduling_db import MysqlHelper
 from common.public import get_config_from_mysql, download_rule
 
-proxies = {"http": None, "https": None}
+
+# proxies = {"http": None, "https": None}
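+# The module-level direct-connection proxies are retired; requests now go through Common.tunnel_proxies() in get_videoList below.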
 
 
 class XiaoNianGaoH5Scheduling:
@@ -36,12 +36,49 @@ class XiaoNianGaoH5Scheduling:
     @classmethod
     def get_videoList(cls, log_type, crawler, rule_dict, our_uid, env):
         mq = MQ(topic_name="topic_crawler_etl_" + env)
-        for page in range(1, 2):
+        for page in range(1, 101):
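+            # Crawl pages 1-100 instead of a single page; each page is fetched and parsed independently.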
             try:
                 Common.logger(log_type, crawler).info(f"Crawling page {page}")
-                # Common.logging(log_type, crawler, env, f"Crawling page {page}")
+                Common.logging(log_type, crawler, env, f"Crawling page {page}")
                 url = "https://kapi.xiaoniangao.cn/trends/get_recommend_trends"
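+                # Full H5 recommend-feed payload; token and uid are read from cls.uid_token_dict.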
-                payload = {"tag_id": 101}
+                payload = {
+                    "rec_ab_config": {
+                        "ban_ab": 1,
+                        "city_slot": 0,
+                        "multi_ab": 1,
+                        "region_ab": {
+                            "num": 4,
+                            "position": {
+                                "0": 1,
+                                "1": 2,
+                                "2": 3,
+                                "3": 4
+                            }
+                        }
+                    },
+                    "qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!750x500r/crop/750x500/interlace/1/format/jpg/quality/75",
+                    "h_qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!80x80r/crop/80x80/interlace/1/format/jpg/quality/75",
+                    "limit": 4,
+                    # "tag_id": 116,
+                    "share_height": 500,
+                    "share_width": 625,
+                    "log_params": {
+                        "proj": "in",
+                        "page": "discover_rec",
+                        "common": {
+                            "os": "OS X 10.15.7",
+                            "device": "",
+                            "weixinver": "6.8.0",
+                            "srcver": "5.71.11"
+                        }
+                    },
+                    "token": cls.uid_token_dict['token'],
+                    "code_ver": "5.71.11",
+                    "uid": cls.uid_token_dict['uid'],
+                    "proj": "in"
+                }
                 headers = {
                     "Host": "kapi.xiaoniangao.cn",
                     "accept": "application/json, text/plain, */*",
@@ -55,31 +92,33 @@ class XiaoNianGaoH5Scheduling:
                     "accept-language": "en",
                 }
                 urllib3.disable_warnings()
-                # r = requests.post(url=url, headers=headers, data=json.dumps(payload), proxies=proxies, verify=False)
-                r = requests.post(
-                    url=url, headers=headers, data=json.dumps(payload), verify=False
-                )
+                proxies = Common.tunnel_proxies()
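+                # Assumption: Common.tunnel_proxies() returns a requests-style proxies mapping, e.g. {"http": "...", "https": "..."}.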
+                r = requests.post(url=url, headers=headers, data=json.dumps(payload), proxies=proxies, verify=False)
+                # r = requests.post(
+                #     url=url, headers=headers, data=json.dumps(payload), verify=False
+                # )
                 if "data" not in r.text or r.status_code != 200:
                     Common.logger(log_type, crawler).warning(
                         f"get_videoList:{r.text}\n"
                     )
-                    # Common.logging(log_type, crawler, env, f"get_videoList:{r.text}\n")
+                    Common.logging(log_type, crawler, env, f"get_videoList:{r.text}\n")
                     return
                 elif "data" not in r.json():
                     Common.logger(log_type, crawler).info(f"get_videoList:{r.json()}\n")
-                    # Common.logging(log_type, crawler, env, f"get_videoList:{r.json()}\n")
+                    Common.logging(log_type, crawler, env, f"get_videoList:{r.json()}\n")
                     return
                 elif "list" not in r.json()["data"]:
                     Common.logger(log_type, crawler).warning(
                         f"get_videoList:{r.json()['data']}\n"
                     )
-                    # Common.logging(log_type, crawler, env, f"get_videoList:{r.json()['data']}\n")
+                    Common.logging(log_type, crawler, env, f"get_videoList:{r.json()['data']}\n")
                     return
                 elif len(r.json()["data"]["list"]) == 0:
                     Common.logger(log_type, crawler).warning(
                         f"get_videoList:{r.json()['data']['list']}\n"
                     )
-                    # Common.logging(log_type, crawler, env, f"get_videoList:{r.json()['data']['list']}\n")
+                    Common.logging(log_type, crawler, env, f"get_videoList:{r.json()['data']['list']}\n")
                     return
                 else:
                     # Video list data
@@ -163,53 +202,53 @@ class XiaoNianGaoH5Scheduling:
                             }
                             for k, v in video_dict.items():
                                 Common.logger(log_type, crawler).info(f"{k}:{v}")
-                            # Common.logging(log_type, crawler, env, f"{video_dict}")
+                            Common.logging(log_type, crawler, env, f"{video_dict}")
 
                             # Filter out invalid videos
                             if (
-                                video_title == ""
-                                or video_dict["video_id"] == ""
-                                or video_dict["video_url"] == ""
+                                    video_title == ""
+                                    or video_dict["video_id"] == ""
+                                    or video_dict["video_url"] == ""
                             ):
                                 Common.logger(log_type, crawler).warning("Invalid video\n")
-                                # Common.logging(log_type, crawler, env, "Invalid video\n")
+                                Common.logging(log_type, crawler, env, "Invalid video\n")
                             # Filter by basic crawl rules
                             elif (
-                                download_rule(
-                                    log_type=log_type,
-                                    crawler=crawler,
-                                    video_dict=video_dict,
-                                    rule_dict=rule_dict,
-                                )
-                                is False
+                                    download_rule(
+                                        log_type=log_type,
+                                        crawler=crawler,
+                                        video_dict=video_dict,
+                                        rule_dict=rule_dict,
+                                    )
+                                    is False
                             ):
                                 Common.logger(log_type, crawler).info("Does not meet crawl rules\n")
-                                # Common.logging(log_type, crawler, env, "Does not meet crawl rules\n")
+                                Common.logging(log_type, crawler, env, "Does not meet crawl rules\n")
                             elif (
-                                any(
-                                    str(word)
-                                    if str(word) in video_dict["video_title"]
-                                    else False
-                                    for word in get_config_from_mysql(
-                                        log_type=log_type,
-                                        source=crawler,
-                                        env=env,
-                                        text="filter",
-                                        action="",
+                                    any(
+                                        str(word)
+                                        if str(word) in video_dict["video_title"]
+                                        else False
+                                        for word in get_config_from_mysql(
+                                            log_type=log_type,
+                                            source=crawler,
+                                            env=env,
+                                            text="filter",
+                                            action="",
+                                        )
                                     )
-                                )
-                                is True
+                                    is True
                             ):
                                 Common.logger(log_type, crawler).info("Hit a filter word\n")
-                                # Common.logging(log_type, crawler, env, 'Hit a filter word\n')
+                                Common.logging(log_type, crawler, env, 'Hit a filter word\n')
                             elif (
-                                cls.repeat_video(
-                                    log_type, crawler, video_dict["video_id"], env
-                                )
-                                != 0
+                                    cls.repeat_video(
+                                        log_type, crawler, video_dict["video_id"], env
+                                    )
+                                    != 0
                             ):
                                 Common.logger(log_type, crawler).info("Video already downloaded\n")
-                                # Common.logging(log_type, crawler, env, 'Video already downloaded\n')
+                                Common.logging(log_type, crawler, env, 'Video already downloaded\n')
                             else:
                                 # cls.download_publish(log_type=log_type,
                                 #                      crawler=crawler,
@@ -234,10 +273,10 @@ class XiaoNianGaoH5Scheduling:
                                 # break
                         except Exception as e:
                             Common.logger(log_type, crawler).error(f"Exception while crawling a single video: {e}\n")
-                            # Common.logging(log_type, crawler, env, f"Exception while crawling a single video: {e}\n")
+                            Common.logging(log_type, crawler, env, f"Exception while crawling a single video: {e}\n")
             except Exception as e:
                 Common.logger(log_type, crawler).error(f"Exception while crawling page {page}: {e}\n")
-                # Common.logging(log_type, crawler, env, f"Exception while crawling page {page}: {e}\n")
+                Common.logging(log_type, crawler, env, f"Exception while crawling page {page}: {e}\n")
 
     @classmethod
     def repeat_video(cls, log_type, crawler, video_id, env):