wangkun 2 年之前
父节点
当前提交
d7dcec970f
共有 1 个文件被更改,包括 16 次插入和 7 次删除
  1. 16 7
      shipinhao/shipinhao_follow.py

+ 16 - 7
shipinhao/shipinhao_follow.py

@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Time: 2022/12/14
+import difflib
 import os
 import sys
 import time
@@ -262,6 +263,8 @@ class Follow:
             # 视频号定向_已下载表
             elif video_title in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', 'KsVtLe') for x in y]:
                 Common.logger(log_type).info('视频已下载\n')
+            elif cls.title_like(log_type, video_title) is True:
+                Common.logger(log_type).info('标题相似度>=90%')
             # feeds 表去重
             elif video_title in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', 'FSDlBy') for x in y]:
                 Common.logger(log_type).info('视频已存在\n')
@@ -285,6 +288,18 @@ class Follow:
         except Exception as e:
             Common.logger(log_type).error(f'get_video_info异常:{e}\n')
 
+    @classmethod
+    def title_like(cls, log_type, title):
+        """Return True if ``title`` is >= 90% similar to a title already in the sheet.
+
+        Reads sheet 'KsVtLe' of the 'shipinhao' document through
+        ``Feishu.get_values_batch`` and compares ``title`` with the cell at
+        column index 7 of every row after the first one.
+
+        :param log_type: passed through unchanged to ``Feishu.get_values_batch``.
+        :param title: title of the candidate video.
+        :return: True as soon as one stored title reaches the similarity
+            threshold, otherwise False.
+        """
+        sheet = Feishu.get_values_batch(log_type, 'shipinhao', 'KsVtLe')
+        # NOTE(review): get_values_batch is defined elsewhere; guard against an
+        # empty/None result so a failed read does not raise here - confirm.
+        if not sheet:
+            return False
+        threshold = 0.9
+        title_column = 7
+        # The first row is skipped, as in the original loop (presumably a header row).
+        for row in sheet[1:]:
+            # Short or empty rows have no title cell to compare against.
+            if not row or len(row) <= title_column:
+                continue
+            video_title = row[title_column]
+            # Empty cells come back as None; anything that is not text is skipped too.
+            if not isinstance(video_title, str):
+                continue
+            matcher = difflib.SequenceMatcher(None, title, video_title)
+            # real_quick_ratio() and quick_ratio() are only upper bounds on
+            # ratio(): they ignore character order, so reordered titles would
+            # score 1.0. Use them as cheap filters and confirm with ratio().
+            if (matcher.real_quick_ratio() >= threshold
+                    and matcher.quick_ratio() >= threshold
+                    and matcher.ratio() >= threshold):
+                return True
+        return False
+
     @classmethod
     def share_to_windows(cls, log_type, driver: WebDriver, video_dict, env):
         Common.logger(log_type).info('分享给 windows 爬虫机器')
@@ -325,13 +340,7 @@ class Follow:
         try:
             follow_feeds_sheet = Feishu.get_values_batch(log_type, 'shipinhao', 'qzDljJ')
             for i in range(1, len(follow_feeds_sheet)):
-                download_title = follow_feeds_sheet[i][2].strip().replace('"', '') \
-                    .replace('“', '').replace('“', '…').replace("\n", "") \
-                    .replace("/", "").replace("\r", "") \
-                    .replace(".", "。").replace("\\", "").replace("&NBSP", "") \
-                    .replace(":", "").replace("*", "").replace("?", "") \
-                    .replace("?", "").replace('"', "").replace("<", "") \
-                    .replace(">", "").replace("|", "").replace(" ", "")
+                download_title = follow_feeds_sheet[i][2]
                 download_duration = follow_feeds_sheet[i][3]
                 download_like_cnt = follow_feeds_sheet[i][4]
                 download_share_cnt = follow_feeds_sheet[i][5]