Jelajahi Sumber

小年糕-h5-已配置好日志,拿推荐的数据

罗俊辉 1 tahun lalu
induk
komit
f2d45d4df4
1 berkas diubah dengan 82 penambahan dan 47 penghapusan
  1. 82 47
      xiaoniangao/xiaoniangao_xcx_rec/xiaoniangao_h5_schduling.py

+ 82 - 47
xiaoniangao/xiaoniangao_xcx_rec/xiaoniangao_h5_schduling.py

@@ -4,10 +4,8 @@
 import json
 import os
 import random
-import shutil
 import sys
 import time
-from hashlib import md5
 import requests
 import urllib3
 from fake_useragent import FakeUserAgent
@@ -21,7 +19,8 @@ from common.publish import Publish
 from common.scheduling_db import MysqlHelper
 from common.public import get_config_from_mysql, download_rule
 
-proxies = {"http": None, "https": None}
+
+# proxies = {"http": None, "https": None}
 
 
 class XiaoNianGaoH5Scheduling:
@@ -36,12 +35,47 @@ class XiaoNianGaoH5Scheduling:
     @classmethod
     def get_videoList(cls, log_type, crawler, rule_dict, our_uid, env):
         mq = MQ(topic_name="topic_crawler_etl_" + env)
-        for page in range(1, 2):
+        for page in range(1, 101):
             try:
                 Common.logger(log_type, crawler).info(f"正在抓取第{page}页")
-                # Common.logging(log_type, crawler, env, f"正在抓取第{page}页")
+                Common.logging(log_type, crawler, env, f"正在抓取第{page}页")
                 url = "https://kapi.xiaoniangao.cn/trends/get_recommend_trends"
-                payload = {"tag_id": 101}
+                payload = {
+                    "rec_ab_config": {
+                        "ban_ab": 1,
+                        "city_slot": 0,
+                        "multi_ab": 1,
+                        "region_ab": {
+                            "num": 4,
+                            "position": {
+                                "0": 1,
+                                "1": 2,
+                                "2": 3,
+                                "3": 4
+                            }
+                        }
+                    },
+                    "qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!750x500r/crop/750x500/interlace/1/format/jpg/quality/75",
+                    "h_qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!80x80r/crop/80x80/interlace/1/format/jpg/quality/75",
+                    "limit": 4,
+                    # "tag_id": 116,
+                    "share_height": 500,
+                    "share_width": 625,
+                    "log_params": {
+                        "proj": "in",
+                        "page": "discover_rec",
+                        "common": {
+                            "os": "OS X 10.15.7",
+                            "device": "",
+                            "weixinver": "6.8.0",
+                            "srcver": "5.71.11"
+                        }
+                    },
+                    "token": cls.uid_token_dict['token'],
+                    "code_ver": "5.71.11",
+                    "uid": cls.uid_token_dict['uid'],
+                    "proj": "in"
+                }
                 headers = {
                     "Host": "kapi.xiaoniangao.cn",
                     "accept": "application/json, text/plain, */*",
@@ -55,31 +89,32 @@ class XiaoNianGaoH5Scheduling:
                     "accept-language": "en",
                 }
                 urllib3.disable_warnings()
-                # r = requests.post(url=url, headers=headers, data=json.dumps(payload), proxies=proxies, verify=False)
-                r = requests.post(
-                    url=url, headers=headers, data=json.dumps(payload), verify=False
-                )
+                proxies = Common.tunnel_proxies()
+                r = requests.post(url=url, headers=headers, data=json.dumps(payload), proxies=proxies, verify=False)
+                # r = requests.post(
+                #     url=url, headers=headers, data=json.dumps(payload), verify=False
+                # )
                 if "data" not in r.text or r.status_code != 200:
                     Common.logger(log_type, crawler).warning(
                         f"get_videoList:{r.text}\n"
                     )
-                    # Common.logging(log_type, crawler, env, f"get_videoList:{r.text}\n")
+                    Common.logging(log_type, crawler, env, f"get_videoList:{r.text}\n")
                     return
                 elif "data" not in r.json():
                     Common.logger(log_type, crawler).info(f"get_videoList:{r.json()}\n")
-                    # Common.logging(log_type, crawler, env, f"get_videoList:{r.json()}\n")
+                    Common.logging(log_type, crawler, env, f"get_videoList:{r.json()}\n")
                     return
                 elif "list" not in r.json()["data"]:
                     Common.logger(log_type, crawler).warning(
                         f"get_videoList:{r.json()['data']}\n"
                     )
-                    # Common.logging(log_type, crawler, env, f"get_videoList:{r.json()['data']}\n")
+                    Common.logging(log_type, crawler, env, f"get_videoList:{r.json()['data']}\n")
                     return
                 elif len(r.json()["data"]["list"]) == 0:
                     Common.logger(log_type, crawler).warning(
                         f"get_videoList:{r.json()['data']['list']}\n"
                     )
-                    # Common.logging(log_type, crawler, env, f"get_videoList:{r.json()['data']['list']}\n")
+                    Common.logging(log_type, crawler, env, f"get_videoList:{r.json()['data']['list']}\n")
                     return
                 else:
                     # 视频列表数据
@@ -163,53 +198,53 @@ class XiaoNianGaoH5Scheduling:
                             }
                             for k, v in video_dict.items():
                                 Common.logger(log_type, crawler).info(f"{k}:{v}")
-                            # Common.logging(log_type, crawler, env, f"{video_dict}")
+                            Common.logging(log_type, crawler, env, f"{video_dict}")
 
                             # 过滤无效视频
                             if (
-                                video_title == ""
-                                or video_dict["video_id"] == ""
-                                or video_dict["video_url"] == ""
+                                    video_title == ""
+                                    or video_dict["video_id"] == ""
+                                    or video_dict["video_url"] == ""
                             ):
                                 Common.logger(log_type, crawler).warning("无效视频\n")
-                                # Common.logging(log_type, crawler, env, "无效视频\n")
+                                Common.logging(log_type, crawler, env, "无效视频\n")
                             # 抓取基础规则过滤
                             elif (
-                                download_rule(
-                                    log_type=log_type,
-                                    crawler=crawler,
-                                    video_dict=video_dict,
-                                    rule_dict=rule_dict,
-                                )
-                                is False
+                                    download_rule(
+                                        log_type=log_type,
+                                        crawler=crawler,
+                                        video_dict=video_dict,
+                                        rule_dict=rule_dict,
+                                    )
+                                    is False
                             ):
                                 Common.logger(log_type, crawler).info("不满足抓取规则\n")
-                                # Common.logging(log_type, crawler, env, "不满足抓取规则\n")
+                                Common.logging(log_type, crawler, env, "不满足抓取规则\n")
                             elif (
-                                any(
-                                    str(word)
-                                    if str(word) in video_dict["video_title"]
-                                    else False
-                                    for word in get_config_from_mysql(
-                                        log_type=log_type,
-                                        source=crawler,
-                                        env=env,
-                                        text="filter",
-                                        action="",
+                                    any(
+                                        str(word)
+                                        if str(word) in video_dict["video_title"]
+                                        else False
+                                        for word in get_config_from_mysql(
+                                            log_type=log_type,
+                                            source=crawler,
+                                            env=env,
+                                            text="filter",
+                                            action="",
+                                        )
                                     )
-                                )
-                                is True
+                                    is True
                             ):
                                 Common.logger(log_type, crawler).info("已中过滤词\n")
-                                # Common.logging(log_type, crawler, env, '已中过滤词\n')
+                                Common.logging(log_type, crawler, env, '已中过滤词\n')
                             elif (
-                                cls.repeat_video(
-                                    log_type, crawler, video_dict["video_id"], env
-                                )
-                                != 0
+                                    cls.repeat_video(
+                                        log_type, crawler, video_dict["video_id"], env
+                                    )
+                                    != 0
                             ):
                                 Common.logger(log_type, crawler).info("视频已下载\n")
-                                # Common.logging(log_type, crawler, env, '视频已下载\n')
+                                Common.logging(log_type, crawler, env, '视频已下载\n')
                             else:
                                 # cls.download_publish(log_type=log_type,
                                 #                      crawler=crawler,
@@ -234,10 +269,10 @@ class XiaoNianGaoH5Scheduling:
                                 # break
                         except Exception as e:
                             Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
-                            # Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
+                            Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
             except Exception as e:
                 Common.logger(log_type, crawler).error(f"抓取第{page}页时异常:{e}\n")
-                # Common.logging(log_type, crawler, env, f"抓取第{page}页时异常:{e}\n")
+                Common.logging(log_type, crawler, env, f"抓取第{page}页时异常:{e}\n")
 
     @classmethod
     def repeat_video(cls, log_type, crawler, video_id, env):