瀏覽代碼

喜事多多上线——4 天去重

罗俊辉 1 年之前
父節點
當前提交
94ec4d64f3
共有 2 個文件被更改,包括 18 次插入和 2 次删除
  1. 17 1
      application/pipeline/pipeline.py
  2. 1 1
      spider/crawler_online/jiajiezhufuxishiduoduo.py

+ 17 - 1
application/pipeline/pipeline.py

@@ -131,8 +131,13 @@ class PiaoQuanPipeline(object):
         out_id = self.item["out_video_id"]
         sql = f""" select 1 from crawler_video where platform = "{self.platform}" and out_video_id="{out_id}"; """
         repeat_video = self.mysql.select(sql=sql)
-        # print(repeat_video)
         if repeat_video:
+            # 喜事多多平台 4 天去重一次
+            if self.platform == "xishiduoduo":
+                sql_2 = f"""select create_time from crawler_video where out_video_id="{out_id}";"""
+                video_time = self.mysql.select(sql=sql_2)[0][0].timestamp()
+                if int(time.time()) - video_time >= 86400 * 4:
+                    return True
             self.aliyun_log.logging(
                 code="2002",
                 trace_id=self.trace_id,
@@ -161,3 +166,14 @@ class PiaoQuanPipeline(object):
             # 记录相关日志
             return False
         return True
+
+
+# if __name__ == '__main__':
+#     sql_2 = f"""select create_time from crawler_video where video_id='18940470';"""
+#     Mysql = MysqlHelper(platform="xishiduoduo", mode="recommend")
+#     video_time = Mysql.select(sql=sql_2)
+#     print(video_time)
+#     print(video_time[0])
+#     print(video_time[0][0])
+#     print(type(video_time[0][0]))
+#     print(video_time[0][0].timestamp())

+ 1 - 1
spider/crawler_online/jiajiezhufuxishiduoduo.py

@@ -2,7 +2,6 @@
 佳节祝福喜事多多——推荐爬虫
 @author: 罗俊辉
 """
-import json
 import os
 import sys
 import time
@@ -82,6 +81,7 @@ class XiShiDuoDuoRecommend(object):
                 message="成功发送至 ETL",
                 data=mq_obj,
             )
+            time.sleep(3)
             if self.download_cnt >= int(
                     self.rule_dict.get("videos_cnt", {}).get("min", 200)
             ):