Ver código fonte

update download sensitive_words and run time

wangkun 3 anos atrás
pai
commit
379f2dcb0a
4 arquivos alterados com 60 adições e 27 exclusões
  1. 17 0
      main/common.py
  2. 34 25
      main/download.py
  3. 2 2
      main/run.py
  4. 7 0
      抓取规则.txt

+ 17 - 0
main/common.py

@@ -126,6 +126,23 @@ class Common:
         with open(r"./txt/" + t_name, "r", encoding="UTF-8") as f:
             return f.readlines()
 
+    @classmethod
+    def del_content_in_txt(cls, d_content, d_filename):
+        """
+        Remove every line of a text file under ./txt/ whose second field contains d_content.
+
+        Each line is split on " + "; a line is dropped when d_content occurs in
+        its second field. Lines without that separator have no second field and
+        are therefore kept unchanged instead of raising IndexError.
+        :param d_content: text to look for in the second " + "-separated field
+        :param d_filename: name of the file inside ./txt/
+        :return: None
+        """
+        txt_path = r"./txt/" + d_filename
+        with open(txt_path, "r", encoding="UTF-8") as f_r:
+            lines = f_r.readlines()
+
+        # Decide what to keep BEFORE reopening with "w": that mode truncates the
+        # file on open, so an error raised mid-loop would lose the remaining lines.
+        kept_lines = []
+        for line in lines:
+            fields = line.split(" + ")
+            if len(fields) > 1 and d_content in fields[1]:
+                continue
+            kept_lines.append(line)
+
+        with open(txt_path, "w", encoding="utf-8") as f_w:
+            f_w.writelines(kept_lines)
+        cls.crawler_log().info("删除{}中的{}成功".format(d_filename, d_content))
+
     @classmethod
     def benshanzhufu_download_count(cls):
         videoid_path = r"./txt/benshanzhufu_videoid.txt"

+ 34 - 25
main/download.py

@@ -29,6 +29,30 @@ class BSZF:
         sensitive_words = [
             "早上好",
             "晚上好",
+            "中午好",
+            "最美祝福",
+            "祝福",
+            "新年好",
+            "立春",
+            "雨水",
+            "惊蛰",
+            "春分",
+            "清明",
+            "谷雨",
+            "小暑",
+            "大暑",
+            "立秋",
+            "处暑",
+            "白露",
+            "秋分",
+            "寒露",
+            "霜降",
+            "立冬",
+            "小雪",
+            "大雪",
+            "冬至",
+            "小寒",
+            "大寒",
         ]
         return sensitive_words
 
@@ -242,13 +266,7 @@ class BSZF:
                 if any(word if word in download_video_title else False for word in cls.sensitive_words()) is True:
                     Common.crawler_log().info("视频已中敏感词,删除该视频信息:{}".format(download_video_title))
                     # 删除该视频在benshanzhufu_feeds.txt中的信息
-                    with open(r"./txt/benshanzhufu_feeds.txt", "r", encoding="UTF-8") as f_r:
-                        lines = f_r.readlines()
-                    with open(r"./txt/benshanzhufu_feeds.txt", "w", encoding="utf-8") as f_w:
-                        for line in lines:
-                            if download_video_id in line.split(" + ")[1]:
-                                continue
-                            f_w.write(line)
+                    Common.del_content_in_txt(download_video_id, "benshanzhufu_feeds.txt")
                 else:
                     Common.crawler_log().info("开始下载视频:{}".format(download_video_title))
                     # 下载封面
@@ -285,25 +303,16 @@ class BSZF:
                         Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
                         Publish.upload_and_publish("prod", "play")
 
-                    # 删除该视频在benshanzhufu_feeds.txt中的信息
-                    Common.crawler_log().info("删除该视频在benshanzhufu_feeds.txt中的信息:{}".format(download_video_title))
-                    with open(r"./txt/benshanzhufu_feeds.txt", "r", encoding="UTF-8") as f_r3:
-                        lines = f_r3.readlines()
-                    with open(r"./txt/benshanzhufu_feeds.txt", "w", encoding="utf-8") as f_w3:
-                        for line in lines:
-                            if download_video_id in line.split(" + ")[1]:
-                                continue
-                            f_w3.write(line)
+                    try:
+                        Common.del_content_in_txt(download_video_id, "benshanzhufu_feeds.txt")
+                    except Exception as e:
+                        Common.crawler_log().error("删除benshanzhufu_feeds.txt中的{}失败,重新删除:{}".format(download_video_id, e))
+                        Common.del_content_in_txt(download_video_id, "benshanzhufu_feeds.txt")
+
             except Exception as e:
-                # 删除该视频在 recommend.txt中的信息
-                Common.crawler_log().error("该视频信息异常,删除在benshanzhufu_feeds.txt中的信息:{}".format(e))
-                with open(r"./txt/benshanzhufu_feeds.txt", "r", encoding="UTF-8") as f_r4:
-                    lines = f_r4.readlines()
-                with open(r"./txt/benshanzhufu_feeds.txt", "w", encoding="utf-8") as f_w4:
-                    for line in lines:
-                        if download_video_id in line.split(" + ")[1]:
-                            continue
-                        f_w4.write(line)
+                Common.crawler_log().error("下载视频异常:{}".format(e))
+                # Remove this video's entry from benshanzhufu_feeds.txt
+                Common.del_content_in_txt(download_video_id, "benshanzhufu_feeds.txt")
 
 
 if __name__ == "__main__":

+ 2 - 2
main/run.py

@@ -44,7 +44,7 @@ class Main:
         """
         正式环境脚本
         """
-        if len(BSZF.download_video_list) >= 300:
+        if len(BSZF.download_video_list) >= 200:
             Common.crawler_log().info("已下载视频数:{}".format(len(BSZF.download_video_list)))
             time.sleep(1800)
         else:
@@ -62,7 +62,7 @@ class Main:
         while True:
             while True:
                 main_time = datetime.datetime.now()
-                if main_time.hour >= 15:
+                if main_time.hour >= 11:
                     cls.download_job_prod()
                 else:
                     break

+ 7 - 0
抓取规则.txt

@@ -1,3 +1,10 @@
+==========2022/4/29===========
+- 增加敏感词过滤
+- 每天 11 点开始爬取,上限 200 条
+- 上传账号:[20631241, 20631242, 20631244, 20631245, 20631246, 20631247]
+==============================
+
+
 ==========2022/4/27===========
 - 全爬
 - 每天9-12点进行爬取