Browse source

Merge remote-tracking branch 'origin/master'

piaoquan 9 months ago
parent
commit
c603cf4270

+ 2 - 1
common/__init__.py

@@ -1,4 +1,5 @@
 from .aliyun_log import AliyunLogger
 from .redirect_url import get_redirect_url
 from .pipeline import PiaoQuanPipeline
-from .proxy import tunnel_proxies
+from .proxy import tunnel_proxies
+from .redis_db import SyncRedisHelper
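
Editor's note: this hunk re-exports SyncRedisHelper from the common package, while the crawler files changed below import the dedupe helpers straight from the module. Both import paths work after this change; a minimal illustration (not part of the commit):

    from common import SyncRedisHelper                 # re-exported by this hunk
    from common.redis_db import get_data, store_data   # the path used by the crawler diffs below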

+ 1 - 0
common/mq.py

@@ -11,6 +11,7 @@ class MQ:
     def __init__(self, topic_name) -> None:
         self.mq_client = MQClient("http://1894469520484605.mqrest.cn-qingdao-public.aliyuncs.com", "LTAI4G7puhXtLyHzHQpD6H7A",
                                   "nEbq3xWNQd1qLpdy2u71qFweHkZjSG")
+        topic_name = topic_name+"_v2"
         self.producer = self.mq_client.get_producer(self.instance_id, topic_name)
 
     def send_msg(self, video_dict):
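
Editor's note: the added line appends "_v2" to every topic name before the producer is created, so all crawlers that construct MQ now publish to the v2 topics on the same Aliyun MQ instance. A sketch of the effect with a hypothetical topic name (not taken from the commit):

    mq = MQ("crawler_video")
    # mq.producer is now bound to the "crawler_video_v2" topic on the same instance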

+ 67 - 0
common/redis_db.py

@@ -0,0 +1,67 @@
+import redis
+from datetime import timedelta
+
+
+class SyncRedisHelper:
+    _pool: redis.ConnectionPool = None
+    _instance = None
+
+    def __init__(self):
+        if not self._instance:
+            self._pool = self._get_pool()
+            self._instance = self
+
+    def _get_pool(self) -> redis.ConnectionPool:
+        if self._pool is None:
+            self._pool = redis.ConnectionPool(
+                host="r-bp1mb0v08fqi4hjffu.redis.rds.aliyuncs.com",  # internal (intranet) address
+                # host="r-bp154bpw97gptefiqkpd.redis.rds.aliyuncs.com",  # external (public) address
+                port=6379,
+                db=2,
+                password="Wqsd@2019",
+                # password="Qingqu2019",
+
+            )
+        return self._pool
+
+    def get_client(self) -> redis.Redis:
+        pool = self._get_pool()
+        client = redis.Redis(connection_pool=pool)
+        return client
+
+    def close(self):
+        if self._pool:
+            self._pool.disconnect(inuse_connections=True)
+
+
+def store_data(platform, out_video_id, condition, day_time):
+    key = f"crawler:duplicate:{platform}:{out_video_id}"
+    value = 1
+    if condition:
+        timeout = timedelta(days=int(day_time))
+    else:
+        timeout = timedelta(hours=int(day_time))
+    helper = SyncRedisHelper()
+    client = helper.get_client()
+
+    client.set(key, value)
+    client.expire(key, timeout)
+
+
+def get_data(platform, out_video_id):
+    key = f"crawler:duplicate:{platform}:{out_video_id}"
+    helper = SyncRedisHelper()
+    client = helper.get_client()
+    value = client.exists(key)
+    return value
+
+
+# Example: store a record
+# store_data('xiaoniangao', '123457', True, 60)
+
+# Example: look up a record
+# value = get_data('xiaoniangao', '1234857')
+# if value is None:
+#     print("Value does not exist")
+# else:
+#     print(f"Retrieved value: {value}")
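
Editor's note: as written, each call to SyncRedisHelper() appears to build a fresh ConnectionPool, because _pool and _instance are assigned as instance attributes inside __init__ and the class-level cache is never populated; store_data also issues SET and EXPIRE as two round trips. A minimal sketch (not part of the commit) of the same helpers with one module-level pool and an atomic SET ... EX; connection details are deliberately elided, see the values above:

    import redis
    from datetime import timedelta

    _POOL = redis.ConnectionPool(host="<redis-host>", port=6379, db=2, password="<password>")

    def store_data(platform, out_video_id, condition, day_time):
        key = f"crawler:duplicate:{platform}:{out_video_id}"
        # condition=True means day_time is a number of days, otherwise a number of hours
        ttl = timedelta(days=int(day_time)) if condition else timedelta(hours=int(day_time))
        redis.Redis(connection_pool=_POOL).set(key, 1, ex=ttl)

    def get_data(platform, out_video_id):
        key = f"crawler:duplicate:{platform}:{out_video_id}"
        return redis.Redis(connection_pool=_POOL).exists(key)  # 1 if the key exists, else 0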

+ 41 - 0
douyin/douyin_author/douyin_author_scheduling_new.py

@@ -5,6 +5,7 @@ import random
 import sys
 import time
 
+import cv2
 import requests
 import json
 import urllib3
@@ -173,6 +174,36 @@ class DouyinauthorScheduling:
                         comment_count = int(data[i].get('statistics').get('comment_count'))  # comments
                         # collect_count = data[i].get('statistics').get('collect_count')  # favorites
                         share_count = int(data[i].get('statistics').get('share_count'))  # shares
+                        if share_count < 500:
+                            AliyunLogger.logging(
+                                code="2004",
+                                platform=crawler,
+                                mode=log_type,
+                                env=env,
+                                message=f'分享小于500\n'
+                            )
+                            continue
+                        video_percent = '%.2f' % (share_count / digg_count)
+                        special = float(0.25)
+                        if float(video_percent) < special:
+                            AliyunLogger.logging(
+                                code="2004",
+                                platform=crawler,
+                                mode=log_type,
+                                env=env,
+                                message=f'分享/点赞小于25%\n'
+                            )
+                            continue
+                        duration = cls.video_duration(video_url)
+                        if int(duration) < 45:
+                            AliyunLogger.logging(
+                                code="2004",
+                                platform=crawler,
+                                mode=log_type,
+                                env=env,
+                                message=f'视频时长小于45秒\n'
+                            )
+                            continue
                         # if special != 0:
                         #     if share_count != 0:
                         #         video_percent = '%.2f' % (share_count / digg_count)
@@ -328,6 +359,16 @@ class DouyinauthorScheduling:
                     message=f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n"
                 )
 
+    @classmethod
+    def video_duration(cls, filename):
+        cap = cv2.VideoCapture(filename)
+        if cap.isOpened():
+            rate = cap.get(5)
+            frame_num = cap.get(7)
+            duration = frame_num / rate
+            return duration
+        return 0
+
 
 if __name__ == "__main__":
     print(DouyinauthorScheduling.get_cookie("author", "douyin", "prod")["cookie"])
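
Editor's note: the new guards skip a video when shares < 500, when the share/like ratio is under 25%, or when the probed duration is under 45 seconds. Two fragilities worth flagging: share_count / digg_count raises ZeroDivisionError when digg_count is 0, and video_duration neither releases the capture nor guards against a stream that reports 0 FPS. A hardened sketch of the duration probe, assuming OpenCV is available; it is not part of the commit:

    import cv2

    def video_duration(url: str) -> float:
        """Duration in seconds, or 0.0 when the stream cannot be opened or reports no FPS."""
        cap = cv2.VideoCapture(url)
        try:
            if not cap.isOpened():
                return 0.0
            fps = cap.get(cv2.CAP_PROP_FPS)              # the committed code uses the bare index 5
            frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)   # the committed code uses the bare index 7
            return frames / fps if fps else 0.0
        finally:
            cap.release()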

+ 10 - 0
jingdianfuqiwang/jingdianfuqiwang_recommend/jingdianfuqiwang_recommend_scheduling.py

@@ -11,6 +11,7 @@ sys.path.append(os.getcwd())
 from common.video_item import VideoItem
 from common import PiaoQuanPipeline, AliyunLogger, tunnel_proxies
 from common.mq import MQ
+from common.redis_db import get_data, store_data
 
 
 class TFuQiWangRecommend(object):
@@ -122,6 +123,14 @@ class TFuQiWangRecommend(object):
         item.add_video_info("strategy", self.mode)
         item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
         mq_obj = item.produce_item()
+        value = get_data(self.platform, video_obj["nid"])
+        if int(value) == 1:
+            AliyunLogger.logging(
+                code="2004",
+                platform=self.platform,
+                mode=self.mode,
+                env=self.env,
+                message="redis重复视频")
         pipeline = PiaoQuanPipeline(
             platform=self.platform,
             mode=self.mode,
@@ -133,6 +142,7 @@ class TFuQiWangRecommend(object):
         if pipeline.process_item():
             self.download_cnt += 1
             self.mq.send_msg(mq_obj)
+            store_data(self.platform, video_obj["nid"], False, 12)
             time.sleep(60 * random.randint(1, 5))
             AliyunLogger.logging(
                 code="1002",

+ 31 - 0
kuaishou/kuaishou_author/kuaishou_author_scheduling_new.py

@@ -6,6 +6,8 @@ import random
 import sys
 import time
 from datetime import date, timedelta
+
+import cv2
 import requests
 import json
 import urllib3
@@ -244,6 +246,25 @@ class KuaishouauthorScheduling:
 
                     viewCount = int(feeds[i].get('photo', {}).get('viewCount', 0))
                     realLikeCount = int(feeds[i].get('photo', {}).get('realLikeCount', 0))
+                    if realLikeCount < 10000:
+                        AliyunLogger.logging(
+                            code="2004",
+                            platform=crawler,
+                            mode=log_type,
+                            env=env,
+                            message=f'点赞小于10000\n'
+                        )
+                        continue
+                    duration = cls.video_duration(feeds[i].get('photo', {}).get('photoUrl', ""))
+                    if int(duration) < 45:
+                        AliyunLogger.logging(
+                            code="2004",
+                            platform=crawler,
+                            mode=log_type,
+                            env=env,
+                            message=f'视频时长小于45秒\n'
+                        )
+                        continue
                     # video_percent = '%.2f' % (realLikeCount / viewCount)
                     # if special != 0:
                     #     special = float(special)
@@ -412,6 +433,16 @@ class KuaishouauthorScheduling:
                     message=f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n"
                 )
 
+    @classmethod
+    def video_duration(cls, filename):
+        cap = cv2.VideoCapture(filename)
+        if cap.isOpened():
+            rate = cap.get(5)
+            frame_num = cap.get(7)
+            duration = frame_num / rate
+            return duration
+        return 0
+
 
 if __name__ == "__main__":
     print(KuaishouauthorScheduling.get_cookie("author", "kuaishou", "prod")["cookie"])
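
Editor's note: the Kuaishou scheduler gains the same video_duration helper as the Douyin file plus two guards: realLikeCount must be at least 10000 and the probed duration at least 45 seconds. A compact predicate equivalent to the two added checks (hypothetical helper, not in the commit):

    def passes_kuaishou_filters(real_like_count: int, duration_seconds: float) -> bool:
        """True when the video clears both new thresholds: likes >= 10000 and duration >= 45 s."""
        return real_like_count >= 10000 and duration_seconds >= 45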

+ 8 - 43
shipinhao/shipinhao_author/shipinhao_author.py

@@ -241,8 +241,15 @@ class ShiPinHaoAuthor(object):
             divisor_cnt = 0
         else:
             divisor_cnt = int(share_cnt / fav_count)
+        if share_cnt < 500:
+            return True
+        video_percent = '%.2f' % (share_cnt / like_cnt)
+        special = float(0.25)
+        if float(video_percent) < special:
+            return True
         # Return when the video duration is under 45 seconds
-        if duration < 15:
+        if duration < 45:
             values = [[
                 obj['nickname'],
                 publish_time_str,
@@ -262,48 +269,6 @@ class ShiPinHaoAuthor(object):
             time.sleep(0.5)
             Feishu.update_values(self.platform, 'shipinhao', "Vq7NeH", "A2:Z2", values)
             return True
-        # # 分享小于1000 返回
-        # if share_cnt < 500:
-        #     values = [[
-        #         obj['nickname'],
-        #         publish_time_str,
-        #         formatted_time,
-        #         int(obj['fav_count']),
-        #         int(obj['comment_count']),
-        #         int(obj['like_count']),
-        #         int(obj['forward_count']),
-        #         divisor_cnt,
-        #         video_obj.get('title').split("\n")[0].split("#")[0],
-        #         duration,
-        #         '否',
-        #         '分享小于500',
-        #         video_obj.get('DownloadAddress')
-        #     ]]
-        #     Feishu.insert_columns(self.platform, 'shipinhao', "Vq7NeH", "ROWS", 1, 2)
-        #     time.sleep(0.5)
-        #     Feishu.update_values(self.platform, 'shipinhao', "Vq7NeH", "A2:Z2", values)
-        #     return True
-        # # 分享小于等于99999
-        # if share_cnt <= 99999 and divisor_cnt < 1:
-        #     values = [[
-        #         obj['nickname'],
-        #         publish_time_str,
-        #         formatted_time,
-        #         int(obj['fav_count']),
-        #         int(obj['comment_count']),
-        #         int(obj['like_count']),
-        #         int(obj['forward_count']),
-        #         divisor_cnt,
-        #         video_obj.get('title').split("\n")[0].split("#")[0],
-        #         duration,
-        #         '否',
-        #         f'分享小于100000,分享/大拇指:{divisor_cnt}',
-        #         video_obj.get('DownloadAddress')
-        #     ]]
-        #     Feishu.insert_columns(self.platform, 'shipinhao', "Vq7NeH", "ROWS", 1, 2)
-        #     time.sleep(0.5)
-        #     Feishu.update_values(self.platform, 'shipinhao', "Vq7NeH", "A2:Z2", values)
-        #     return True
         pipeline = PiaoQuanPipeline(
             platform=self.platform,
             mode=self.mode,
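
Editor's note: the rewritten filter now returns early on fewer than 500 shares or a share/like ratio under 25%, raises the duration cutoff from 15 to 45 seconds, and deletes the commented-out Feishu-report branches. As added, share_cnt / like_cnt raises ZeroDivisionError when like_cnt is 0; a sketch of the same checks with that guard (helper name is illustrative, not part of the commit):

    def should_skip(share_cnt: int, like_cnt: int, duration: float) -> bool:
        if share_cnt < 500:
            return True                                    # fewer than 500 shares
        ratio = share_cnt / like_cnt if like_cnt else 0.0  # avoid dividing by zero likes
        if ratio < 0.25:
            return True                                    # share/like ratio under 25%
        return duration < 45                               # cutoff raised from 15 to 45 seconds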

+ 18 - 1
xiaoniangao/xiaoniangao_author/xiaoniangao_author_v2.py

@@ -9,6 +9,7 @@ import requests
 from common.mq import MQ
 
 sys.path.append(os.getcwd())
+from common.redis_db import get_data, store_data
 
 from common.common import Common
 from common import AliyunLogger, PiaoQuanPipeline
@@ -32,7 +33,6 @@ def tunnel_proxies():
 
     return tunnel_proxies
 
-
 class XiaoNianGaoAuthor:
     def __init__(self, platform, mode, rule_dict, env, user_list):
         self.platform = platform
@@ -247,6 +247,16 @@ class XiaoNianGaoAuthor:
             "strategy": self.mode,
             "out_video_id": video_obj.get("vid", ""),
         }
+        value = get_data(self.platform, video_obj.get("vid", ""))
+        if int(value) == 1:
+            AliyunLogger.logging(
+                code="2004",
+                trace_id=trace_id,
+                platform=self.platform,
+                mode=self.mode,
+                env=self.env,
+                data=video_dict,
+                message="redis重复视频")
         pipeline = PiaoQuanPipeline(
             platform=self.platform,
             mode=self.mode,
@@ -300,6 +310,13 @@ class XiaoNianGaoAuthor:
             limit_flag = self.limiter.author_limitation(user_id=video_dict['user_id'])
             if limit_flag:
                 self.mq.send_msg(video_dict)
+                store_data(self.platform, video_obj.get("vid", ""), True, 60)
+                Common.logging(
+                    log_type=self.mode,
+                    crawler=self.platform,
+                    env=self.env,
+                    message="写入 redis 成功",
+                )
                 self.download_count += 1
                 AliyunLogger.logging(
                     code="1002",