浏览代码

update 快手从飞书获取作者改为mysql

lierqiang 2 年之前
父节点
当前提交
7d0251452c
共有 3 个文件被更改,包括 40 次插入和 22 次删除
  1. common/public.py(+15 −2)
  2. kuaishou/kuaishou_follow/kuaishou_follow.py(+11 −6)
  3. kuaishou/kuaishou_main/run_kuaishou_follow.py(+14 −14)

+ 15 - 2
common/public.py

@@ -1,8 +1,11 @@
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Time: 2023/3/27
+import os, sys
+
+sys.path.append(os.getcwd())
+from common.common import Common
 from common.scheduling_db import MysqlHelper
-# from scheduling_db import MysqlHelper
 
 
 # 过滤词库
@@ -26,5 +29,15 @@ def filter_word(log_type, crawler, source, env):
     return word_list
 
 
+def get_user_from_mysql(log_type, crawler, source, env, machine):
+    sql = f"select * from crawler_author_map where source='{source}' and is_del=1"
+    results = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
+    if results:
+        return results
+    else:
+        Common.logger(log_type, crawler).warning(f"爬虫:{crawler},没有查到抓取名单")
+        return []
+
+
 if __name__ == "__main__":
-    print(filter_word('public', 'xiaoniangao', '小年糕', 'prod'))
+    print(filter_word('public', 'xiaoniangao', '小年糕', 'prod'))

+ 11 - 6
kuaishou/kuaishou_follow/kuaishou_follow.py

@@ -20,6 +20,7 @@ from common.feishu import Feishu
 from common.getuser import getUser
 from common.db import MysqlHelper
 from common.publish import Publish
+from common.public import get_user_from_mysql
 from common.userAgent import get_random_user_agent
 
 
@@ -386,11 +387,12 @@ class KuaiShouFollow:
                 'Accept': '*/*',
                 'Content-Type': 'application/json',
                 'Origin': 'https://www.kuaishou.com',
-                'Cookie': 'kpf=PC_WEB; clientid=3; did=web_c11041a45efb379fa3e11198d58d1dd1; kpn=KUAISHOU_VISION',
+                'Cookie': 'kpf=PC_WEB; clientid=3; did=web_3f264336f6a6c191cd36fb15e87ab708; kpn=KUAISHOU_VISION',
                 'Content-Length': '1244',
                 'Accept-Language': 'zh-CN,zh-Hans;q=0.9',
                 'Host': 'www.kuaishou.com',
-                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',#get_random_user_agent('pc'),
+                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
+                # get_random_user_agent('pc'),
                 'Referer': 'https://www.kuaishou.com/profile/{}'.format(out_uid),
                 'Accept-Encoding': 'gzip, deflate, br',
                 'Connection': 'keep-alive'
@@ -737,11 +739,14 @@ class KuaiShouFollow:
 
     @classmethod
     def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
-        user_list = cls.get_user_list(log_type=log_type, crawler=crawler, sheetid="bTSzxW", env=env, machine=machine)
+        # user_list = cls.get_user_list(log_type=log_type, crawler=crawler, sheetid="bTSzxW", env=env, machine=machine)
+
+        user_list = get_user_from_mysql(log_type, crawler, crawler, env, machine)
         for user in user_list:
-            out_uid = user["out_uid"]
-            user_name = user["user_name"]
-            our_uid = user["our_uid"]
+            spider_link = user["spider_link"]
+            out_uid = spider_link.split('/')[-1]
+            user_name = user["nick_name"]
+            our_uid = user["media_id"]
             Common.logger(log_type, crawler).info(f"开始抓取 {user_name} 用户主页视频\n")
             try:
                 cls.get_videoList(log_type=log_type,

+ 14 - 14
kuaishou/kuaishou_main/run_kuaishou_follow.py

@@ -4,6 +4,7 @@
 import argparse
 import os
 import sys
+
 # import time
 
 sys.path.append(os.getcwd())
@@ -16,27 +17,26 @@ def main(log_type, crawler, strategy, oss_endpoint, env, machine):
     try:
         Common.logger(log_type, crawler).info('开始抓取 快手 定向榜\n')
         KuaiShouFollow.get_follow_videos(log_type=log_type,
-                                 crawler=crawler,
-                                 strategy=strategy,
-                                 oss_endpoint=oss_endpoint,
-                                 env=env,
-                                 machine=machine)
+                                         crawler=crawler,
+                                         strategy=strategy,
+                                         oss_endpoint=oss_endpoint,
+                                         env=env,
+                                         machine=machine)
         Common.del_logs(log_type, crawler)
         Common.logger(log_type, crawler).info('抓取完一轮\n')
     except Exception as e:
         Common.logger(log_type, crawler).info(f"快手定向榜异常,触发报警:{e}\n")
-            # Feishu.bot(log_type, crawler, f"快手定向榜异常,触发报警:{e}")
+        # Feishu.bot(log_type, crawler, f"快手定向榜异常,触发报警:{e}")
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()  ## 新建参数解释器对象
-    parser.add_argument('--log_type', type=str)  ## 添加参数,注明参数类型
-    parser.add_argument('--crawler')  ## 添加参数
-    parser.add_argument('--strategy')  ## 添加参数
-    parser.add_argument('--our_uid')  ## 添加参数
-    parser.add_argument('--oss_endpoint')  ## 添加参数
-    parser.add_argument('--env')  ## 添加参数
-    parser.add_argument('--machine')  ## 添加参数
+    parser.add_argument('--log_type', default='follow', type=str)  ## 添加参数,注明参数类型
+    parser.add_argument('--crawler', default='kuaishou')  ## 添加参数
+    parser.add_argument('--strategy', default='定向抓取')  ## 添加参数
+    parser.add_argument('--oss_endpoint', default='inner')  ## 添加参数
+    parser.add_argument('--env', default='prod')  ## 添加参数
+    parser.add_argument('--machine', default='aliyun')  ## 添加参数
     args = parser.parse_args()  ### 参数赋值,也可以通过终端赋值
     # print(args)
     main(log_type=args.log_type,
@@ -44,4 +44,4 @@ if __name__ == "__main__":
          strategy=args.strategy,
          oss_endpoint=args.oss_endpoint,
          env=args.env,
-         machine=args.machine)
+         machine=args.machine)