
update: add fetching config info from MySQL

lierqiang 2 years ago
parent
commit
30a9bb9e11

+ 0 - 0
douyin/douyin_follow/__init__.py


+ 16 - 29
douyin/douyin_follow/follow_dy.py

@@ -10,15 +10,16 @@ import time
 import requests
 from hashlib import md5
 
-from common.public import get_user_from_mysql
 from douyin.douyin_recommend import get_xb
 
 sys.path.append(os.getcwd())
+from common.common import Common
 from common.db import MysqlHelper
 from common.feishu import Feishu
 from common.publish import Publish
+from common.public import random_title
 from common.userAgent import get_random_user_agent
-from common.common import Common
+from common.public import get_user_from_mysql, get_config_from_mysql
 
 
 class DyFollow(object):
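The import hunk above swaps the scattered Feishu-era helpers for two shared ones in common/public.py: get_user_from_mysql and the new get_config_from_mysql. The helper's body is not part of this diff; below is a minimal sketch of what it plausibly looks like, inferred from its call sites (text='filter' / text='title', an optional action argument seen in the YouTube spider) and from the eval(row['config']) probe in the new test script. The table and column names, the MysqlHelper signature, and the comma-separated storage format are all assumptions.

```python
# Hypothetical sketch of common/public.py:get_config_from_mysql -- not the
# committed code. Names and signature are inferred, not confirmed.
import ast
from common.db import MysqlHelper

def get_config_from_mysql(log_type, crawler, env, text, action=''):
    # crawler_config is assumed to hold one row per source, with a Python
    # dict literal in its config column (test.py reads it back via eval).
    sql = f"""select config from crawler_config where source="{crawler}" """
    rows = MysqlHelper.get_values(log_type, crawler, sql, env, action)
    config = ast.literal_eval(rows[0]['config'])
    # text selects which list to return, e.g. 'filter' words or the 'title'
    # pool; values are assumed to be stored as comma-separated strings.
    return [w for w in config.get(text, '').split(',') if w]
```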
@@ -89,7 +90,7 @@ class DyFollow(object):
             Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
 
     @classmethod
-    def video_title(cls, log_type, crawler, title):
+    def video_title(cls, log_type, env, crawler, title):
         title_split1 = title.split(" #")
         if title_split1[0] != "":
             title1 = title_split1[0]
@@ -116,29 +117,10 @@ class DyFollow(object):
                           .replace("?", "").replace('"', "").replace("<", "") \
                           .replace(">", "").replace("|", "").replace("@", "").replace('"', '').replace("'", '')[:40]
         if video_title.replace(" ", "") == "" or video_title == "。。。" or video_title == "...":
-            return cls.random_title(log_type, crawler)
+            return random_title(log_type, crawler, env, text='title')
         else:
             return video_title
 
-    @classmethod
-    def random_title(cls, log_type, crawler):
-        try:
-            while True:
-                random_title_sheet = Feishu.get_values_batch(log_type, crawler, 'sPK2oY')
-                if random_title_sheet is None:
-                    Common.logger(log_type, crawler).warning(f"filter_words_sheet:{random_title_sheet} 10秒钟后重试")
-                    continue
-                random_title_list = []
-                for x in random_title_sheet:
-                    for y in x:
-                        if y is None:
-                            pass
-                        else:
-                            random_title_list.append(y)
-                return random.choice(random_title_list)
-        except Exception as e:
-            Common.logger(log_type, crawler).error(f'random_title:{e}\n')
-
     @classmethod
     def get_videoList(cls, log_type, crawler, strategy, our_uid, out_uid, oss_endpoint, env, machine, rule_dict):
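Each spider also deletes its private random_title classmethod, which polled a Feishu sheet in an unbounded retry loop, in favor of a single common.public.random_title. Its implementation is likewise outside this diff; given the new env and text='title' arguments, it presumably draws from the same MySQL config. A sketch under that assumption, reusing the get_config_from_mysql sketch above:

```python
import random

def random_title(log_type, crawler, env, text='title'):
    # Assumed to replace the per-spider Feishu-sheet loop with one config read.
    title_list = get_config_from_mysql(log_type, crawler, env, text)
    return random.choice(title_list) if title_list else ""
```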
 
@@ -175,13 +157,15 @@ class DyFollow(object):
         for info in aweme_list:
             if info.get('is_ads'):
                 continue
-            publish_time = info['create_time']
+            publish_time = info.get('create_time')
+            if not publish_time:
+                continue
             publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
             publish_day = int((int(time.time()) - publish_time) / (3600 * 24))
 
-            video_title = cls.video_title(log_type, crawler, info['desc'])
+            video_title = cls.video_title(log_type, env, crawler, info['desc'])
             if not video_title:
-                video_title = cls.random_title(log_type, crawler)
+                video_title = random_title(log_type, crawler, env, text='title')
 
             video_dict = {'video_title': video_title,
                           'video_id': info['aweme_id'],
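The loop above also hardens timestamp handling: info['create_time'] raised KeyError on a malformed feed item, while info.get('create_time') plus an explicit continue simply skips it. The same logic, pulled out as a standalone helper for clarity (the helper name is ours, not the repo's):

```python
import time

def parse_publish(info):
    """Return (publish_time_str, publish_day), or None if create_time is missing."""
    publish_time = info.get('create_time')
    if not publish_time:
        return None  # malformed item: skip it instead of raising KeyError
    publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
    publish_day = int((int(time.time()) - publish_time) / (3600 * 24))
    return publish_time_str, publish_day
```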
@@ -226,11 +210,13 @@ class DyFollow(object):
     @classmethod
     def download_publish(cls, log_type, crawler, strategy, video_dict, rule_dict, our_uid, oss_endpoint, env, machine):
         try:
+            filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
+            for filter_word in filter_words:
+                if filter_word in video_dict['video_title']:
+                    Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
+                    return
             if cls.download_rule(video_dict, rule_dict) is False:
                 Common.logger(log_type, crawler).info('不满足抓取规则\n')
-            elif any(word if word in video_dict['video_title'] else False for word in
-                     cls.filter_words(log_type, crawler)) is True:
-                Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
             elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
                 Common.logger(log_type, crawler).info('视频已下载\n')
             else:
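download_publish now fetches filter words from MySQL at call time and returns early on a hit, replacing the old any(...) expression over the Feishu-backed cls.filter_words. The same four-line block is pasted into five spiders in this commit; a shared predicate would keep them in sync. A sketch of that refactor (our suggestion, not part of the commit):

```python
def title_hits_filter(log_type, crawler, env, title):
    """True when any MySQL-configured filter word occurs in the title."""
    filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
    return any(word in title for word in filter_words)
```

Each download_publish could then open with a single early return: if title_hits_filter(log_type, crawler, env, video_dict['video_title']): return.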
@@ -349,6 +335,7 @@ class DyFollow(object):
     def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
         user_list = get_user_from_mysql(log_type, crawler, crawler, env)
         rule_dict = cls.get_rule(log_type, crawler)
+
         for user in user_list:
             spider_link = user["spider_link"]
             out_uid = spider_link
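get_follow_videos drives the crawl from rows returned by get_user_from_mysql. The row shape is not shown in this diff; from the usage here and in kuaishou_follow.py, each row is a dict with at least the two keys below (all values are made-up placeholders):

```python
# Assumed shape of one get_user_from_mysql row, inferred from this diff's
# usage (user["spider_link"], user["media_id"]); values are placeholders.
user = {
    "spider_link": "MS4wLjABAAAAxxxx",  # out-of-site author id -> out_uid
    "media_id": 6267140,                # our-side uid used for publishing -> our_uid
}
```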

+ 17 - 26
douyin/douyin_recommend/recommend_dy.py

@@ -9,14 +9,17 @@ import sys
 import time
 import requests
 from hashlib import md5
+
 from douyin.douyin_recommend import get_xb
 
 sys.path.append(os.getcwd())
-from common.db import MysqlHelper
+from common.common import Common
 from common.feishu import Feishu
+from common.db import MysqlHelper
 from common.publish import Publish
+from common.public import get_config_from_mysql
+from common.public import random_title
 from common.userAgent import get_random_user_agent
-from common.common import Common
 
 
 class DyRecommend(object):
@@ -92,7 +95,7 @@ class DyRecommend(object):
             Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
 
     @classmethod
-    def video_title(cls, log_type, crawler, title):
+    def video_title(cls, log_type, crawler, env, title):
         title_split1 = title.split(" #")
         if title_split1[0] != "":
             title1 = title_split1[0]
@@ -119,29 +122,10 @@ class DyRecommend(object):
                           .replace("?", "").replace('"', "").replace("<", "") \
                           .replace(">", "").replace("|", "").replace("@", "").replace('"', '').replace("'", '')[:40]
         if video_title.replace(" ", "") == "" or video_title == "。。。" or video_title == "...":
-            return cls.random_title(log_type, crawler)
+            return random_title(log_type, crawler, env, text='title')
         else:
             return video_title
 
-    @classmethod
-    def random_title(cls, log_type, crawler):
-        try:
-            while True:
-                random_title_sheet = Feishu.get_values_batch(log_type, crawler, 'sPK2oY')
-                if random_title_sheet is None:
-                    Common.logger(log_type, crawler).warning(f"filter_words_sheet:{random_title_sheet} 10秒钟后重试")
-                    continue
-                random_title_list = []
-                for x in random_title_sheet:
-                    for y in x:
-                        if y is None:
-                            pass
-                        else:
-                            random_title_list.append(y)
-                return random.choice(random_title_list)
-        except Exception as e:
-            Common.logger(log_type, crawler).error(f'random_title:{e}\n')
-
     @classmethod
     def get_videolist(cls, log_type, crawler, strategy, our_id, oss_endpoint, env, machine):
         rule_dict = cls.get_rule(log_type, crawler)
@@ -171,13 +155,15 @@ class DyRecommend(object):
             for info in aweme_list:
                 if info.get('is_ads'):
                     continue
-                publish_time = info['create_time']
+                publish_time = info.get('create_time')
+                if not publish_time:
+                    continue
                 publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
                 publish_day = int((int(time.time()) - publish_time) / (3600 * 24))
                 if not info['desc']:
-                    video_title = cls.random_title(log_type, crawler)
+                    video_title = random_title(log_type, crawler, env, text='title')
                 else:
-                    video_title = cls.video_title(log_type, crawler, info['desc'])
+                    video_title = cls.video_title(log_type, crawler, env, info['desc'])
 
                 video_dict = {'video_title': video_title,
                               'video_id': info['aweme_id'],
@@ -222,6 +208,11 @@ class DyRecommend(object):
     @classmethod
     def download_publish(cls, log_type, crawler, strategy, video_dict, rule_dict, our_uid, oss_endpoint, env, machine):
         try:
+            filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
+            for filter_word in filter_words:
+                if filter_word in video_dict['video_title']:
+                    Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
+                    return
             if cls.download_rule(video_dict, rule_dict) is False:
                 Common.logger(log_type, crawler).info('不满足抓取规则\n')
             elif any(word if word in video_dict['video_title'] else False for word in

+ 19 - 37
kuaishou/kuaishou_follow/kuaishou_follow.py

@@ -20,6 +20,7 @@ from common.feishu import Feishu
 from common.getuser import getUser
 from common.db import MysqlHelper
 from common.publish import Publish
+from common.public import random_title, get_config_from_mysql
 from common.public import get_user_from_mysql
 from common.userAgent import get_random_user_agent
 
@@ -93,26 +94,6 @@ class KuaiShouFollow:
         except Exception as e:
             Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
 
-    # 万能标题
-    @classmethod
-    def random_title(cls, log_type, crawler):
-        try:
-            while True:
-                random_title_sheet = Feishu.get_values_batch(log_type, crawler, '0DiyXe')
-                if random_title_sheet is None:
-                    Common.logger(log_type, crawler).warning(f"filter_words_sheet:{random_title_sheet} 10秒钟后重试")
-                    continue
-                random_title_list = []
-                for x in random_title_sheet:
-                    for y in x:
-                        if y is None:
-                            pass
-                        else:
-                            random_title_list.append(y)
-                return random.choice(random_title_list)
-        except Exception as e:
-            Common.logger(log_type, crawler).error(f'random_title:{e}\n')
-
     # 获取站外用户信息
     @classmethod
     def get_out_user_info(cls, log_type, crawler, out_uid):
@@ -261,7 +242,7 @@ class KuaiShouFollow:
 
     # 处理视频标题
     @classmethod
-    def video_title(cls, log_type, crawler, title):
+    def video_title(cls, log_type, crawler, env, title):
         title_split1 = title.split(" #")
         if title_split1[0] != "":
             title1 = title_split1[0]
@@ -288,7 +269,7 @@ class KuaiShouFollow:
                           .replace("?", "").replace('"', "").replace("<", "") \
                           .replace(">", "").replace("|", "").replace("@", "").replace('"', '').replace("'", '')[:40]
         if video_title.replace(" ", "") == "" or video_title == "。。。" or video_title == "...":
-            return cls.random_title(log_type, crawler)
+            return random_title(log_type, crawler, env, text='title')
         else:
             return video_title
 
@@ -359,11 +340,11 @@ class KuaiShouFollow:
 
                 # video_title
                 if 'caption' not in feeds[i]['photo']:
-                    video_title = cls.random_title(log_type, crawler)
+                    video_title = random_title(log_type, crawler, env, text='title')
                 elif feeds[i]['photo']['caption'].strip() == "":
-                    video_title = cls.random_title(log_type, crawler)
+                    video_title = random_title(log_type, crawler, env, text='title')
                 else:
-                    video_title = cls.video_title(log_type, crawler, feeds[i]['photo']['caption'])
+                    video_title = cls.video_title(log_type, crawler, env, feeds[i]['photo']['caption'])
 
                 if 'videoResource' not in feeds[i]['photo'] \
                         and 'manifest' not in feeds[i]['photo'] \
@@ -546,15 +527,15 @@ class KuaiShouFollow:
     @classmethod
     def download_publish(cls, log_type, crawler, strategy, video_dict, rule_dict, our_uid, oss_endpoint, env, machine):
         try:
+            filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
+            for filter_word in filter_words:
+                if filter_word in video_dict['video_title']:
+                    Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
+                    return
             download_finished = False
             if cls.repeat_video(log_type, crawler, video_dict['video_id'], video_dict['video_title'],
                                 video_dict['publish_time_str'], env, machine) != 0:
                 Common.logger(log_type, crawler).info('视频已下载\n')
-            # elif video_dict['video_id'] in [x for y in Feishu.get_values_batch(log_type, crawler, "3cd128") for x in y]:
-            #     Common.logger(log_type, crawler).info('视频已下载\n')
-            elif any(word if word in video_dict['video_title'] else False for word in
-                     cls.filter_words(log_type, crawler)) is True:
-                Common.logger(log_type, crawler).info('标题已中过滤词\n')
             else:
                 # 下载视频
                 Common.download_method(log_type=log_type, crawler=crawler, text='video',
@@ -674,13 +655,14 @@ class KuaiShouFollow:
             our_uid = user["media_id"]
             Common.logger(log_type, crawler).info(f"开始抓取 {user_name} 用户主页视频\n")
             cls.get_videoList(log_type=log_type,
-                                  crawler=crawler,
-                                  strategy=strategy,
-                                  our_uid=our_uid,
-                                  out_uid=out_uid,
-                                  oss_endpoint=oss_endpoint,
-                                  env=env,
-                                  machine=machine)
+                              crawler=crawler,
+                              strategy=strategy,
+                              our_uid=our_uid,
+                              out_uid=out_uid,
+                              oss_endpoint=oss_endpoint,
+                              env=env,
+                              machine=machine)
+
 
 if __name__ == "__main__":
     KuaiShouFollow.get_videoList(log_type="follow",

+ 28 - 0
kuaishou/kuaishou_follow/test.py

@@ -0,0 +1,28 @@
+import pymysql
+connection = pymysql.connect(
+                host="rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com",# 数据库IP地址,内网地址
+                # host="rm-bp1k5853td1r25g3ndo.mysql.rds.aliyuncs.com",  # 数据库IP地址,外网地址
+                port=3306,  # 端口号
+                user="crawler",  # mysql用户名
+                passwd="crawler123456@",  # mysql用户登录密码
+                db="piaoquan-crawler",  # 数据库名
+                # 如果数据库里面的文本是utf8编码的,charset指定是utf8
+                charset="utf8mb4")
+
+m_con = connection.cursor(cursor=pymysql.cursors.DictCursor)
+
+sql = 'select * from crawler_config where id =6'
+
+m_con.execute(sql)
+data = m_con.fetchall()
+
+# 关闭数据库连接
+connection.close()
+n_data = data[0]
+emo = n_data['config']
+a = '😝'
+em = eval(emo)['emoji']
+if a in em:
+    print(11111)
+else:
+    print(222222)
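test.py above is a throwaway check that the emoji list survives the round trip through the crawler_config table. Two fragile spots: eval() executes whatever is in the database, and neither cursor nor connection is released on error. A tightened version of the same check, using ast.literal_eval (which assumes, as the original eval implies, that the config column stores a Python dict literal) and a parameterized query:

```python
import ast
import pymysql

# Same connection parameters as the original script (internal RDS address).
connection = pymysql.connect(
    host="rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com",
    port=3306,
    user="crawler",
    passwd="crawler123456@",
    db="piaoquan-crawler",
    charset="utf8mb4",
)
try:
    with connection.cursor(cursor=pymysql.cursors.DictCursor) as cursor:
        cursor.execute("select config from crawler_config where id = %s", (6,))
        row = cursor.fetchone()
finally:
    connection.close()

# literal_eval only accepts Python literals; eval would run arbitrary code.
config = ast.literal_eval(row["config"])
print("😝" in config["emoji"])
```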

+ 400 - 0
kuaishou/kuaishou_follow/videoid

@@ -0,0 +1,400 @@
+13938824
+13938809
+13938803
+13938798
+13938791
+13938726
+13938721
+13938713
+13938711
+13938708
+13938701
+13938697
+13938691
+13938686
+13938681
+13938664
+13938659
+13938657
+13938632
+13938628
+13938621
+13938607
+13938605
+13938602
+13938599
+13938597
+13938590
+13938551
+13938521
+13938514
+13938502
+13938497
+13938492
+13938411
+13938407
+13938403
+13938393
+13938382
+13938378
+13938091
+13938085
+13938076
+13938069
+13937967
+13937931
+13934789
+13934788
+13934787
+13934781
+13934778
+13934777
+13934774
+13934769
+13934768
+13934765
+13934764
+13934756
+13934753
+13934732
+13934729
+13934724
+13934719
+13934709
+13934705
+13934703
+13934701
+13934687
+13934682
+13934634
+13934618
+13934615
+13934606
+13934597
+13934593
+13934590
+13934560
+13934552
+13934551
+13934549
+13934544
+13934542
+13934539
+13934538
+13934536
+13934534
+13934532
+13934524
+13934503
+13934502
+13934500
+13934499
+13934498
+13934492
+13934491
+13934488
+13934484
+13934482
+13934481
+13934479
+13934476
+13934473
+13934472
+13934471
+13934467
+13934464
+13934462
+13934461
+13934457
+13934453
+13934451
+13934448
+13934446
+13934445
+13934441
+13934434
+13934431
+13934429
+13934426
+13934423
+13934421
+13934419
+13934417
+13934416
+13934415
+13934413
+13934411
+13934409
+13934405
+13934404
+13934399
+13934397
+13934394
+13934392
+13934379
+13934378
+13934364
+13934361
+13934359
+13934358
+13934355
+13934351
+13934350
+13934347
+13934343
+13934339
+13934338
+13934325
+13934321
+13934318
+13934315
+13934313
+13934309
+13934305
+13934303
+13934299
+13934296
+13934293
+13934290
+13934288
+13934284
+13934282
+13934278
+13934274
+13934270
+13934266
+13934259
+13934257
+13934245
+13934238
+13934232
+13934228
+13934224
+13934221
+13934217
+13934214
+13934213
+13934206
+13934204
+13934194
+13934189
+13934184
+13934183
+13934181
+13934178
+13934177
+13934153
+13934152
+13934151
+13934149
+13934146
+13934122
+13934120
+13934113
+13934112
+13934108
+13934103
+13934100
+13934093
+13934088
+13934086
+13934083
+13934080
+13934079
+13934077
+13934072
+13934071
+13934069
+13934065
+13934061
+13934060
+13934058
+13934052
+13934047
+13934046
+13934045
+13934044
+13934041
+13934039
+13934037
+13934029
+13934025
+13934024
+13934022
+13934019
+13934016
+13934014
+13934011
+13934010
+13934007
+13934005
+13934001
+13933995
+13933992
+13933991
+13933990
+13933984
+13933982
+13933962
+13933959
+13933958
+13933955
+13933951
+13933948
+13933943
+13933940
+13933937
+13933936
+13933933
+13933932
+13933931
+13933927
+13933924
+13933921
+13933919
+13933917
+13933915
+13933910
+13933903
+13933901
+13933900
+13933896
+13933895
+13933894
+13933890
+13933889
+13933884
+13933882
+13933876
+13933874
+13933871
+13933856
+13933852
+13933837
+13933836
+13933835
+13933833
+13933832
+13933827
+13933822
+13933821
+13933818
+13933816
+13933813
+13933812
+13933809
+13933806
+13933798
+13933796
+13933795
+13933794
+13933785
+13933779
+13933776
+13933774
+13933769
+13933767
+13933765
+13933762
+13933761
+13933758
+13933756
+13933755
+13933751
+13933750
+13933740
+13933739
+13933737
+13933734
+13933730
+13933727
+13933725
+13933723
+13933719
+13933712
+13933708
+13933706
+13933704
+13933700
+13933685
+13933684
+13933681
+13933678
+13933674
+13933668
+13933666
+13933664
+13933653
+13933649
+13933639
+13933620
+13933618
+13933612
+13933608
+13933604
+13933601
+13933600
+13933590
+13933583
+13933571
+13933564
+13933562
+13933559
+13933554
+13933553
+13933549
+13933546
+13933543
+13933539
+13933535
+13933533
+13933529
+13933519
+13933517
+13933515
+13933512
+13933509
+13933502
+13933498
+13933494
+13933487
+13933484
+13933482
+13933481
+13933477
+13933469
+13933464
+13933453
+13933448
+13933444
+13933426
+13933421
+13933418
+13933413
+13933410
+13933404
+13933403
+13933401
+13933399
+13933396
+13933392
+13933390
+13933386
+13933382
+13933381
+13933377
+13933371
+13933369
+13933366
+13933365
+13933363
+13933348
+13933345
+13933342
+13933341
+13933339
+13933337
+13933334
+13933332
+13933328
+13922716
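kuaishou/kuaishou_follow/videoid, added above, is a flat file of 400 numeric video ids with no loader checked in beside it. The commit does not say what it is for; a plain id-per-line file like this usually backs de-duplication, in which case it would be read into a set. A sketch under that assumption:

```python
from pathlib import Path

def load_video_ids(path="kuaishou/kuaishou_follow/videoid"):
    """Read one id per line into a set for O(1) already-downloaded checks."""
    return {line.strip() for line in Path(path).read_text().splitlines() if line.strip()}

seen = load_video_ids()
print("13938824" in seen)  # the first id in the file
```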

+ 0 - 0
kuaishou/kuaishou_recommend/__init__.py


Binary
kuaishou/kuaishou_recommend/kuaishou/videos/e20c6fbcda1169932c94bf0bd3683f70/image.jpg


Binary
kuaishou/kuaishou_recommend/kuaishou/videos/e20c6fbcda1169932c94bf0bd3683f70/video.mp4


Binary
kuaishou/kuaishou_recommend/kuaishou/videos/fe80d0368da2d1c26909d1a875d6e09b/image.jpg


+ 14 - 0
kuaishou/kuaishou_recommend/kuaishou/videos/fe80d0368da2d1c26909d1a875d6e09b/info.txt

@@ -0,0 +1,14 @@
+e4d3640ed1cdc03e
+不是很漂亮主打就是自信
+26
+128164
+0
+5539
+0
+720*1280
+1680277502
+不是圆
+https://p1-pro.a.yximgs.com/uhead/AB/2023/03/21/22/BMjAyMzAzMjEyMjQ1MjRfMzExNTcwODIzNV8xX2hkMTQyXzE=_s.jpg
+https://v2.kwaicdn.com/upic/2023/03/31/23/BMjAyMzAzMzEyMzQ0NDFfMzExNTcwODIzNV85OTQ2Mjg4NTg1NV8xXzM=_b_Babf4d3be147b2b708814052850880ffd.mp4?pkey=AAWWo2TmOyovU3wpcw-qxZEfCzeNLzDtHBE7tLd15z28cmpRxMuIRZjWmjStQwVKc3qjb0_vB8h1BHRng7r1rArjbSzb7ZVX4aio9ACty6MyhQMEMcUsiNRmvlQLVhTywBg&tag=1-1681130915-unknown-0-ml3z9lxblm-de2b236a3eab1bc2&clientCacheKey=3xuzaf2zdga4is9_b.mp4&di=73e2f18f&bp=14944&tt=b&ss=vp
+https://p1.a.yximgs.com/upic/2023/03/31/23/BMjAyMzAzMzEyMzQ0NDFfMzExNTcwODIzNV85OTQ2Mjg4NTg1NV8xXzM=_Bfe4e513ea9e56dd55ff3258b92b7945f.jpg?tag=1-1681130915-unknown-0-zgsrlzqiab-8448deb29683667c&clientCacheKey=3xuzaf2zdga4is9.jpg&di=73e2f18f&bp=14944
+kuaishou1681180829

+ 7 - 0
kuaishou/kuaishou_recommend/kuaishou/videos/fe80d0368da2d1c26909d1a875d6e09b/video.mp4

@@ -0,0 +1,7 @@
+<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
+<html>
+<head><title>403 Forbidden</title></head>
+<body>
+<h1>403 Forbidden</h1>
+<p>You don't have permission to access the URL on this server.<hr/>Powered by Tengine</body>
+</html>
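Note what actually got committed as .../fe80d0368da2d1c26909d1a875d6e09b/video.mp4 above: a Tengine 403 error page, i.e. the downloader wrote an HTTP error body to disk as if it were media. A cheap guard is to sniff the first bytes before saving, since an error page starts with <!DOCTYPE or <html> while a real MP4 carries an ftyp box near the start. A sketch of that guard (our suggested pattern, not code from this repo):

```python
import requests

def looks_like_html(first_bytes: bytes) -> bool:
    head = first_bytes.lstrip().lower()
    return head.startswith(b"<!doctype") or head.startswith(b"<html")

def download_video(url, path):
    resp = requests.get(url, stream=True, timeout=30)
    first_chunk = next(resp.iter_content(chunk_size=4096), b"")
    if resp.status_code != 200 or looks_like_html(first_chunk):
        raise RuntimeError(f"not a video (HTTP {resp.status_code}): {url}")
    with open(path, "wb") as f:
        f.write(first_chunk)                      # first sniffed chunk
        for chunk in resp.iter_content(1 << 20):  # then stream the rest
            f.write(chunk)
```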

+ 12 - 30
kuaishou/kuaishou_recommend/recommend_kuaishou.py

@@ -21,7 +21,7 @@ from common.feishu import Feishu
 from common.getuser import getUser
 from common.db import MysqlHelper
 from common.publish import Publish
-from common.public import get_user_from_mysql
+from common.public import get_user_from_mysql, random_title, get_config_from_mysql
 from common.userAgent import get_random_user_agent
 
 
@@ -77,26 +77,6 @@ class KuaiShouRecommend:
         except Exception as e:
             Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
 
-    # 万能标题
-    @classmethod
-    def random_title(cls, log_type, crawler):
-        try:
-            while True:
-                random_title_sheet = Feishu.get_values_batch(log_type, crawler, '0DiyXe')
-                if random_title_sheet is None:
-                    Common.logger(log_type, crawler).warning(f"filter_words_sheet:{random_title_sheet} 10秒钟后重试")
-                    continue
-                random_title_list = []
-                for x in random_title_sheet:
-                    for y in x:
-                        if y is None:
-                            pass
-                        else:
-                            random_title_list.append(y)
-                return random.choice(random_title_list)
-        except Exception as e:
-            Common.logger(log_type, crawler).error(f'random_title:{e}\n')
-
     # 获取用户信息列表
     @classmethod
     def get_user_list(cls, log_type, crawler, sheetid, env, machine):
@@ -155,7 +135,7 @@ class KuaiShouRecommend:
 
     # 处理视频标题
     @classmethod
-    def video_title(cls, log_type, crawler, title):
+    def video_title(cls, log_type, crawler, env, title):
         title_split1 = title.split(" #")
         if title_split1[0] != "":
             title1 = title_split1[0]
@@ -182,7 +162,7 @@ class KuaiShouRecommend:
                           .replace("?", "").replace('"', "").replace("<", "") \
                           .replace(">", "").replace("|", "").replace("@", "").replace('"', '').replace("'", '')[:40]
         if video_title.replace(" ", "") == "" or video_title == "。。。" or video_title == "...":
-            return cls.random_title(log_type, crawler)
+            return random_title(log_type, crawler, env, text='title')
         else:
             return video_title
 
@@ -255,7 +235,6 @@ class KuaiShouRecommend:
                 continue
             else:
                 feeds = response.json()['data']['visionNewRecoFeed']['feeds']
-                # pcursor = response.json()['data']['visionNewRecoFeed']['pcursor']
                 for i in range(len(feeds)):
                     if 'photo' not in feeds[i]:
                         Common.logger(log_type, crawler).warning(f"get_videoList:{feeds[i]}\n")
@@ -263,11 +242,12 @@ class KuaiShouRecommend:
 
                     # video_title
                     if 'caption' not in feeds[i]['photo']:
-                        video_title = cls.random_title(log_type, crawler)
+                        video_title = random_title(log_type, crawler, env, text='title')
+
                     elif feeds[i]['photo']['caption'].strip() == "":
-                        video_title = cls.random_title(log_type, crawler)
+                        video_title = random_title(log_type, crawler, env, text='title')
                     else:
-                        video_title = cls.video_title(log_type, crawler, feeds[i]['photo']['caption'])
+                        video_title = cls.video_title(log_type, crawler, env, feeds[i]['photo']['caption'])
 
                     if 'videoResource' not in feeds[i]['photo'] \
                             and 'manifest' not in feeds[i]['photo'] \
@@ -407,13 +387,15 @@ class KuaiShouRecommend:
     @classmethod
     def download_publish(cls, log_type, crawler, strategy, video_dict, rule_dict, our_uid, oss_endpoint, env, machine):
         try:
+            filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
+            for filter_word in filter_words:
+                if filter_word in video_dict['video_title']:
+                    Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
+                    return
             download_finished = False
             if cls.repeat_video(log_type, crawler, video_dict['video_id'], video_dict['video_title'],
                                 video_dict['publish_time_str'], env, machine) != 0:
                 Common.logger(log_type, crawler).info('视频已下载\n')
-            elif any(word if word in video_dict['video_title'] else False for word in
-                     cls.filter_words(log_type, crawler)) is True:
-                Common.logger(log_type, crawler).info('标题已中过滤词\n')
             else:
                 # 下载视频
                 Common.download_method(log_type=log_type, crawler=crawler, text='video',

+ 14 - 18
xigua/xigua_follow/xigua_follow.py

@@ -27,7 +27,7 @@ from common.getuser import getUser
 from common.common import Common
 from common.feishu import Feishu
 from common.publish import Publish
-from common.public import get_user_from_mysql
+from common.public import get_user_from_mysql, random_title, get_config_from_mysql
 
 
 class Follow:
@@ -413,11 +413,11 @@ class Follow:
                         video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list']) != 0:
 
                         video_url = \
-                        video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1][
-                            'backup_url_1']
+                            video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1][
+                                'backup_url_1']
                         audio_url = \
-                        video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list'][-1][
-                            'backup_url_1']
+                            video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list'][-1][
+                                'backup_url_1']
                         if len(video_url) % 3 == 1:
                             video_url += '=='
                         elif len(video_url) % 3 == 2:
@@ -429,9 +429,11 @@ class Follow:
                         video_url = base64.b64decode(video_url).decode('utf8')
                         audio_url = base64.b64decode(audio_url).decode('utf8')
                         video_width = \
-                        video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['vwidth']
+                            video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1][
+                                'vwidth']
                         video_height = \
-                        video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['vheight']
+                            video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1][
+                                'vheight']
                         video_url_dict["video_url"] = video_url
                         video_url_dict["audio_url"] = audio_url
                         video_url_dict["video_width"] = video_width
@@ -893,21 +895,15 @@ class Follow:
     @classmethod
     def download_publish(cls, log_type, crawler, strategy, video_dict, rule_dict, our_uid, oss_endpoint, env, machine):
         try:
+            filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
+            for filter_word in filter_words:
+                if filter_word in video_dict['video_title']:
+                    Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
+                    return
             if cls.download_rule(video_dict, rule_dict) is False:
                 Common.logger(log_type, crawler).info('不满足抓取规则\n')
-            elif any(word if word in video_dict['video_title'] else False for word in
-                     cls.filter_words(log_type, crawler)) is True:
-                Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
             elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
                 Common.logger(log_type, crawler).info('视频已下载\n')
-            # elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', 'e075e9') for x in y]:
-            #     Common.logger(log_type, crawler).info('视频已下载\n')
-            # elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', '3Ul6wZ') for x in y]:
-            #     Common.logger(log_type, crawler).info('视频已下载\n')
-            # elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', 'QOWqMo') for x in y]:
-            #     Common.logger(log_type, crawler).info('视频已下载\n')
-            # elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', 'wjhpDs') for x in y]:
-            #     Common.logger(log_type, crawler).info('视频已存在\n')
             else:
                 # 下载视频
                 Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video',

+ 14 - 5
xigua/xigua_search/xigua_search.py

@@ -16,12 +16,14 @@ import urllib3
 from urllib.parse import quote
 from requests.adapters import HTTPAdapter
 
+
 sys.path.append(os.getcwd())
 from common.db import MysqlHelper
 from common.getuser import getUser
 from common.common import Common
 from common.feishu import Feishu
 from common.publish import Publish
+from common.public import get_config_from_mysql
 from common.userAgent import get_random_user_agent, get_random_header
 
 
@@ -602,7 +604,7 @@ class XiguaSearch:
         item_counter = data['h5_extra']['itemCell']['itemCounter']
         user_info = data['user_info']
         detail_info = data['video_detail_info']
-        video_dict = {'video_title': data['title'].replace('"' ,'').replace("'", ''),
+        video_dict = {'video_title': data['title'].replace('"', '').replace("'", ''),
                       'video_id': detail_info['video_id'],
                       'gid': data['group_id'],
                       'play_cnt': item_counter['videoWatchCount'],
@@ -687,10 +689,17 @@ class XiguaSearch:
                         video_dict['video_url'] = video_url_dict["video_url"]
                         video_dict['session'] = signature
                     except Exception as e:
-                        Common.logger(log_type, crawler).error(f'关键词:{search_word},视频:{item_id},获取详情失败,原因:{e}')
+                        Common.logger(log_type, crawler).error(
+                            f'关键词:{search_word},视频:{item_id},获取详情失败,原因:{e}')
                         continue
+                    filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
+                    # flag the hit: 'continue' here would only advance the filter loop
+                    title_filtered = False
+                    for filter_word in filter_words:
+                        if filter_word in video_dict['video_title']:
+                            Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
+                            title_filtered = True
+                            break
+                    if title_filtered:
+                        continue
                     if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
-                        Common.logger(log_type, crawler).info(f'关键词:{search_word},gid:{video_dict["gid"]},视频已下载,无需重复下载\n')
+                        Common.logger(log_type, crawler).info(
+                            f'关键词:{search_word},gid:{video_dict["gid"]},视频已下载,无需重复下载\n')
                         continue
                     for k, v in video_dict.items():
                         Common.logger(log_type, crawler).info(f"{k}:{v}")
@@ -715,7 +724,8 @@ class XiguaSearch:
                         Common.logger(log_type, crawler).error(f'关键词:{search_word},视频:{item_id},下载失败,原因:{e}')
                         continue
                     total_count += 1
-                    Common.logger(log_type, crawler).info(f'search_word:{search_word},title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
+                    Common.logger(log_type, crawler).info(
+                        f'search_word:{search_word},title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
                     if total_count >= 30:
                         return
                 # elif v_type == 'pseries':
@@ -790,7 +800,6 @@ class XiguaSearch:
     @classmethod
     def download_publish(cls, log_type, crawler, search_word, strategy, video_dict, rule_dict, our_uid, oss_endpoint,
                          env, machine):
-
         Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video',
                                title=video_dict['video_title'], url=video_dict['video_url'])
         # 下载音频

+ 7 - 61
youtube/youtube_follow/youtube_follow_api.py

@@ -21,7 +21,7 @@ from common.feishu import Feishu
 from common.getuser import getUser
 from common.publish import Publish
 from common.translate import Translate
-from common.public import get_user_from_mysql
+from common.public import get_user_from_mysql, get_config_from_mysql
 
 headers = {
     'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
@@ -52,63 +52,6 @@ class YoutubeFollow:
         'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
     }
 
-    # @classmethod
-    # def get_browse_id(cls, log_type, crawler, out_user_id, machine):
-    #     """
-    #     获取每个用户的 browse_id
-    #     :param log_type: 日志
-    #     :param crawler: 哪款爬虫
-    #     :param out_user_id: 站外用户 UID
-    #     :param machine: 部署机器,阿里云填写 aliyun / aliyun_hk,线下分别填写 macpro,macair,local
-    #     :return: browse_id
-    #     """
-    #     try:
-    #         # 打印请求配置
-    #         ca = DesiredCapabilities.CHROME
-    #         ca["goog:loggingPrefs"] = {"performance": "ALL"}
-    #
-    #         # 不打开浏览器运行
-    #         chrome_options = webdriver.ChromeOptions()
-    #         chrome_options.add_argument("--headless")
-    #         chrome_options.add_argument(
-    #             '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
-    #         chrome_options.add_argument("--no-sandbox")
-    #
-    #         # driver初始化
-    #         if machine == 'aliyun' or machine == 'aliyun_hk':
-    #             driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
-    #         elif machine == 'macpro':
-    #             driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
-    #                                       service=Service('/Users/lieyunye/Downloads/chromedriver_v86/chromedriver'))
-    #         elif machine == 'macair':
-    #             driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
-    #                                       service=Service('/Users/piaoquan/Downloads/chromedriver'))
-    #         else:
-    #             driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
-    #                 '/Users/wangkun/Downloads/chromedriver/chromedriver_v110/chromedriver'))
-    #
-    #         driver.implicitly_wait(10)
-    #         url = f'https://www.youtube.com/{out_user_id}/videos'
-    #         driver.get(url)
-    #         # driver.save_screenshot("./1.png")
-    #         # 向上滑动 1000 个像素
-    #         # driver.execute_script('window.scrollBy(0, 2000)')
-    #         # driver.save_screenshot("./2.png")
-    #         time.sleep(3)
-    #         accept_btns = driver.find_elements(By.XPATH, '//span[text()="全部接受"]')
-    #         accept_btns_eng = driver.find_elements(By.XPATH, '//span[text()="Accept all"]')
-    #         if len(accept_btns) != 0:
-    #             accept_btns[0].click()
-    #             time.sleep(2)
-    #         elif len(accept_btns_eng) != 0:
-    #             accept_btns_eng[0].click()
-    #             time.sleep(2)
-    #         browse_id = driver.find_element(By.XPATH, '//meta[@itemprop="channelId"]').get_attribute('content')
-    #         driver.quit()
-    #         return browse_id
-    #     except Exception as e:
-    #         Common.logger(log_type, crawler).error(f'get_browse_id异常:{e}\n')
-
     @classmethod
     def get_out_user_info(cls, log_type, crawler, browse_id, out_user_id):
         """
@@ -922,7 +865,7 @@ class YoutubeFollow:
                 if 'title' not in videoDetails:
                     video_title = ''
                 else:
-                    video_title = videoDetails['title'].replace('"' ,'').replace("'", '')
+                    video_title = videoDetails['title'].replace('"', '').replace("'", '')
                 video_title = cls.filter_emoji(video_title)
                 if not cls.is_contain_chinese(video_title):
                     video_title = Translate.google_translate(video_title, machine) \
@@ -1016,8 +959,11 @@ class YoutubeFollow:
     @classmethod
     def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine):
         try:
-            # sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_dict['video_id']}" """
-            # repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
+            filter_words = get_config_from_mysql(log_type, crawler, env, text='filter', action='get_author_map')
+            for filter_word in filter_words:
+                if filter_word in video_dict['video_title']:
+                    Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
+                    return
             if video_dict['video_title'] == '' or video_dict['video_url'] == '':
                 Common.logger(log_type, crawler).info('无效视频\n')
             elif video_dict['duration'] > 1200 or video_dict['duration'] < 60: