wangkun 2 éve
szülő
commit
00246328d4
2 módosított fájl, 47 hozzáadás és 112 törlés
  1. 44 65
      README.md
  2. 3 47
      main/bszf_recommend.py

+ 44 - 65
README.md

@@ -1,65 +1,44 @@
-本山祝福小程序爬虫
-https://git.yishihui.com/Server/crawler_benshanzhufu.git
-
-ffmpeg==1.4
-loguru==0.6.0
-oss2==2.15.0
-requests==2.27.1
-urllib3==1.26.9
-python==3.10
-
-执行入口:
-
-1.cd ./crawler_benshanzhufu
-
-2.python3 main/run_recommend.py
-
-
-
-==========2022/7/20===========
-
-项目重启:
-
-1.接入飞书文档:https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb
-
-2.代码逻辑重构
-
-3.时长限制>=60s
-
-4.宽高限制:宽>=高才会下载及上传
-
-5.标题敏感词限制
-
-
-
-==========2022/7/18===========
-
-20631262
-20631263
-20631264
-20631265
-20631266  
-20631267
-20631268
-20631269
-20631271
-20631272  
-
-每个账号上发布3条本山祝福视频
-
-
-==========2022/4/29===========
-- 增加敏感词过滤
-
-- 每天 11 点开始爬取,上限 200 条
-
-- 上传账号:[20631241, 20631242, 20631244, 20631245, 20631246, 20631247]
-
-
-
-==========2022/4/27===========
-- 全爬
-
-- 每天9-12点进行爬取
-
-- 上传账号:20631241 / 20631242
+# 本山祝福小程序爬虫
+1. git: https://git.yishihui.com/Server/crawler_benshanzhufu.git
+2. feishu: https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb?sheet=440018
+
+
+#### 软件架构
+1. ffmpeg==1.4 
+2. loguru==0.6.0 
+3. oss2==2.15.0 
+4. requests==2.27.1 
+5. urllib3==1.26.9 
+6. python==3.10
+
+
+#### 使用说明
+1. cd ./crawler_benshanzhufu
+2. python3 main/run_recommend.py
+
+
+#### 需求
+2022/10/25
+1. 下载视频时长修改为: >= 40s
+
+2022/7/20 项目重启:
+1. 接入飞书文档:https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb
+2. 代码逻辑重构
+3. 时长限制>=60s
+4. 宽高限制:宽>=高才会下载及上传
+5. 标题敏感词限制
+
+2022/7/18
+1. 每个账号上发布3条本山祝福视频
+[20631262, 20631263, 20631264, 20631265, 20631266, 20631267,
+20631268, 20631269, 20631271, 20631272]
+
+2022/4/29
+1. 增加敏感词过滤
+2. 每天 11 点开始爬取,上限 200 条
+3. 上传账号:[20631241, 20631242, 20631244, 20631245, 20631246, 20631247]
+
+2022/4/27
+1. 全爬
+2. 每天9-12点进行爬取
+3. 上传账号:20631241 / 20631242

+ 3 - 47
main/bszf_recommend.py

@@ -45,27 +45,13 @@ class Recommend:
     @classmethod
     def get_video_info_from_local(cls, video_path):
         probe = ffmpeg.probe(video_path)
-        # print('video_path: {}'.format(video_path))
-        # format1 = probe['format']
-        # bit_rate = int(format1['bit_rate']) / 1000
-        # duration = format['duration']
-        # size = int(format1['size']) / 1024 / 1024
         video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
         if video_stream is None:
             print('No video stream found!')
             return
         width = int(video_stream['width'])
         height = int(video_stream['height'])
-        # num_frames = int(video_stream['nb_frames'])
-        # fps = int(video_stream['r_frame_rate'].split('/')[0]) / int(video_stream['r_frame_rate'].split('/')[1])
         duration = float(video_stream['duration'])
-        # print('width: {}'.format(width))
-        # print('height: {}'.format(height))
-        # print('num_frames: {}'.format(num_frames))
-        # print('bit_rate: {}k'.format(bit_rate))
-        # print('fps: {}'.format(fps))
-        # print('size: {}MB'.format(size))
-        # print('duration: {}'.format(duration))
         return width, height, duration
 
     # 推荐列表获取视频
@@ -115,15 +101,6 @@ class Recommend:
             cls.visitor_key = r.json()["data"]["visitor_key"]
             cls.page += 1
 
-            # Common.logger(log_type).info("visitor_key:{}", cls.visitor_key)
-            # Common.logger(log_type).info("page:{}\n", cls.page)
-            #
-            # for k, v in parameter.items():
-            #     Common.logger(log_type).info("{}:{}", k, v)
-            # Common.logger(log_type).info("\n")
-            #
-            # Common.logger(log_type).info("response:{}\n", response)
-
             if "data" not in response:
                 Common.logger(log_type).warning("get_recommend, response:{}".format(response))
                 time.sleep(3)
@@ -147,15 +124,6 @@ class Recommend:
                     else:
                         video_url = feeds[i]["video_url"]
 
-                    # if "width" not in feeds[i] or "height" not in feeds[i]:
-                    #     video_width = 0
-                    #     video_height = 0
-                    #     video_resolution = str(video_width) + "*" + str(video_height)
-                    # else:
-                    #     video_width = feeds[i]["width"]
-                    #     video_height = feeds[i]["height"]
-                    #     video_resolution = str(video_width) + "*" + str(video_height)
-
                     if "commentCount" not in feeds[i]:
                         video_comment_cnt = 0
                     else:
@@ -186,12 +154,9 @@ class Recommend:
                     user_id = "benshanzhufu"
                     Common.logger(log_type).info("video_title:{}".format(video_title))
                     Common.logger(log_type).info("video_id:{}".format(video_id))
-                    # Common.logger(log_type).info("video_comment_cnt:{}".format(video_comment_cnt))
-                    # Common.logger(log_type).info("video_resolution:{}".format(video_resolution))
                     Common.logger(log_type).info(
                         "video_send_time:{}", time.strftime(
                             "%Y/%m/%d %H:%M:%S", time.localtime(int(video_send_time))))
-                    # Common.logger(log_type).info("video_cover:{}".format(cover_url))
                     Common.logger(log_type).info("video_url:{}".format(video_url))
 
                     # 过滤无效视频
@@ -259,15 +224,6 @@ class Recommend:
                 Common.logger(log_type).info("download_video_title:{}", download_video_title)
                 Common.logger(log_type).info("download_video_send_time:{}", download_video_send_time)
                 Common.logger(log_type).info("download_video_url:{}", download_video_url)
-                # Common.logger(log_type).info("download_video_play_cnt:{}", download_video_play_cnt)
-                # Common.logger(log_type).info("download_video_id:{}", download_video_id)
-                # Common.logger(log_type).info("download_video_comment_cnt:{}", download_video_comment_cnt)
-                # Common.logger(log_type).info("download_video_like_cnt:{}", download_video_like_cnt)
-                # Common.logger(log_type).info("download_video_share_cnt:{}", download_video_share_cnt)
-                # Common.logger(log_type).info("download_user_name:{}", download_user_name)
-                # Common.logger(log_type).info("download_user_id:{}", download_user_id)
-                # Common.logger(log_type).info("download_head_url:{}", download_head_url)
-                # Common.logger(log_type).info("download_cover_url:{}", download_cover_url)
 
                 # 过滤空行
                 if download_video_id is None or download_video_title is None or download_video_play_cnt is None:
@@ -294,13 +250,13 @@ class Recommend:
                     download_video_resolution = str(video_info[0]) + "*" + str(video_info[1])
                     download_video_duration = video_info[2]
 
-                    # 视频时长<60s,直接删除
-                    if int(download_video_duration) < 60:
+                    # 视频时长<40s,直接删除
+                    if int(download_video_duration) < 40:
                         # 删除视频文件夹
                         shutil.rmtree("./videos/" + download_video_title + "/")
                         # 删除云文档recommend_feeds中的记录
                         Feishu.dimension_range(log_type, "bszf", "CcHgO7", "ROWS", i + 1, i + 1)
-                        Common.logger(log_type).info("时长:{}<60秒,删除成功\n", int(download_video_duration))
+                        Common.logger(log_type).info("时长:{}<40秒,删除成功\n", int(download_video_duration))
                         return
                     # # 竖版视频不下载,写入竖版视频表
                     # elif int(video_info[0]) < int(video_info[1]):