2 éve · 00246328d4
--- a/README.md
+++ b/README.md
@@ -1,65 +1,44 @@
 
				-本山祝福小程序爬虫
			
 
				-https://git.yishihui.com/Server/crawler_benshanzhufu.git
			
 
				-
			
 
				-ffmpeg==1.4
			
 
				-loguru==0.6.0
			
 
				-oss2==2.15.0
			
 
				-requests==2.27.1
			
 
				-urllib3==1.26.9
			
 
				-python==3.10
			
 
				-
			
 
				-执行入口:
			
 
				-
			
 
				-1.cd ./crawler_benshanzhufu
			
 
				-
			
 
				-2.python3 main/run_recommend.py
			
 
				-
			
 
				-
			
 
				-
			
 
				-==========2022/7/20===========
			
 
				-
			
 
				-项目重启:
			
 
				-
			
 
				-1.接入飞书文档:https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb
			
 
				-
			
 
				-2.代码逻辑重构
			
 
				-
			
 
				-3.时长限制>=60s
			
 
				-
			
 
				-4.宽高限制:宽>=高才会下载及上传
			
 
				-
			
 
				-5.标题敏感词限制
			
 
				-
			
 
				-
			
 
				-
			
 
				-==========2022/7/18===========
			
 
				-
			
 
				-20631262
			
 
				-20631263
			
 
				-20631264
			
 
				-20631265
			
 
				-20631266  
			
 
				-20631267
			
 
				-20631268
			
 
				-20631269
			
 
				-20631271
			
 
				-20631272  
			
 
				-
			
 
				-每个账号上发布3条本山祝福视频
			
 
				-
			
 
				-
			
 
				-==========2022/4/29===========
			
 
				-- 增加敏感词过滤
			
 
				-
			
 
				-- 每天 11 点开始爬取，上限 200 条
			
 
				-
			
 
				-- 上传账号：[20631241, 20631242, 20631244, 20631245, 20631246, 20631247]
			
 
				-
			
 
				-
			
 
				-
			
 
				-==========2022/4/27===========
			
 
				-- 全爬
			
 
				-
			
 
				-- 每天9-12点进行爬取
			
 
				-
			
 
				-- 上传账号:20631241 / 20631242
			
 
				+# 本山祝福小程序爬虫
			
 
				+1. git: https://git.yishihui.com/Server/crawler_benshanzhufu.git
			
 
				+2. feishu: https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb?sheet=440018
			
 
				+
			
 
				+
			
 
				+#### 依赖环境
			
 
				+1. ffmpeg==1.4 
			
 
				+2. loguru==0.6.0 
			
 
				+3. oss2==2.15.0 
			
 
				+4. requests==2.27.1 
			
 
				+5. urllib3==1.26.9 
			
 
				+6. python==3.10
			
 
				+
			
 
				+
			
 
				+#### 使用说明
			
 
				+1. cd ./crawler_benshanzhufu
			
 
				+2. python3 main/run_recommend.py
			
 
				+
			
 
				+
			
 
				+#### 需求
			
 
				+2022/10/25
			
 
				+1. 下载视频的时长限制由 >= 60s 修改为 >= 40s
			
 
				+
			
 
				+2022/7/20 项目重启:
			
 
				+1. 接入飞书文档:https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb
			
 
				+2. 代码逻辑重构
			
 
				+3. 时长限制>=60s
			
 
				+4. 宽高限制:宽>=高才会下载及上传
			
 
				+5. 标题敏感词限制
			
 
				+
			
 
				+2022/7/18
			
 
				+1. 在以下每个账号上发布3条本山祝福视频:
			
 
				+[20631262, 20631263, 20631264, 20631265, 20631266, 20631267,
			
 
				+20631268, 20631269, 20631271, 20631272]
			
 
				+
			
 
				+2022/4/29
			
 
				+1. 增加敏感词过滤
			
 
				+2. 每天 11 点开始爬取，上限 200 条
			
 
				+3. 上传账号：[20631241, 20631242, 20631244, 20631245, 20631246, 20631247]
			
 
				+
			
 
				+2022/4/27
			
 
				+1. 全爬
			
 
				+2. 每天9-12点进行爬取
			
 
				+3. 上传账号:20631241 / 20631242
			
--- a/main/bszf_recommend.py
+++ b/main/bszf_recommend.py
@@ -45,27 +45,13 @@ class Recommend:
 
				     @classmethod
			
 
				     def get_video_info_from_local(cls, video_path):
			
 
				         probe = ffmpeg.probe(video_path)
			
 
				-        # print('video_path: {}'.format(video_path))
			
 
				-        # format1 = probe['format']
			
 
				-        # bit_rate = int(format1['bit_rate']) / 1000
			
 
				-        # duration = format['duration']
			
 
				-        # size = int(format1['size']) / 1024 / 1024
			
 
				         video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
			
 
				         if video_stream is None:
			
 
				             print('No video stream found!')
			
 
				             return
			
 
				         width = int(video_stream['width'])
			
 
				         height = int(video_stream['height'])
			
 
				-        # num_frames = int(video_stream['nb_frames'])
			
 
				-        # fps = int(video_stream['r_frame_rate'].split('/')[0]) / int(video_stream['r_frame_rate'].split('/')[1])
			
 
				         duration = float(video_stream['duration'])
			
 
				-        # print('width: {}'.format(width))
			
 
				-        # print('height: {}'.format(height))
			
 
				-        # print('num_frames: {}'.format(num_frames))
			
 
				-        # print('bit_rate: {}k'.format(bit_rate))
			
 
				-        # print('fps: {}'.format(fps))
			
 
				-        # print('size: {}MB'.format(size))
			
 
				-        # print('duration: {}'.format(duration))
			
 
				         return width, height, duration
			
 
				 
			
 
				     # 推荐列表获取视频
			
@@ -115,15 +101,6 @@ class Recommend:
 
				             cls.visitor_key = r.json()["data"]["visitor_key"]
			
 
				             cls.page += 1
			
 
				 
			
 
				-            # Common.logger(log_type).info("visitor_key:{}", cls.visitor_key)
			
 
				-            # Common.logger(log_type).info("page:{}\n", cls.page)
			
 
				-            #
			
 
				-            # for k, v in parameter.items():
			
 
				-            #     Common.logger(log_type).info("{}:{}", k, v)
			
 
				-            # Common.logger(log_type).info("\n")
			
 
				-            #
			
 
				-            # Common.logger(log_type).info("response:{}\n", response)
			
 
				-
			
 
				             if "data" not in response:
			
 
				                 Common.logger(log_type).warning("get_recommend, response:{}".format(response))
			
 
				                 time.sleep(3)
			
@@ -147,15 +124,6 @@ class Recommend:
 
				                     else:
			
 
				                         video_url = feeds[i]["video_url"]
			
 
				 
			
 
				-                    # if "width" not in feeds[i] or "height" not in feeds[i]:
			
 
				-                    #     video_width = 0
			
 
				-                    #     video_height = 0
			
 
				-                    #     video_resolution = str(video_width) + "*" + str(video_height)
			
 
				-                    # else:
			
 
				-                    #     video_width = feeds[i]["width"]
			
 
				-                    #     video_height = feeds[i]["height"]
			
 
				-                    #     video_resolution = str(video_width) + "*" + str(video_height)
			
 
				-
			
 
				                     if "commentCount" not in feeds[i]:
			
 
				                         video_comment_cnt = 0
			
 
				                     else:
			
@@ -186,12 +154,9 @@ class Recommend:
 
				                     user_id = "benshanzhufu"
			
 
				                     Common.logger(log_type).info("video_title:{}".format(video_title))
			
 
				                     Common.logger(log_type).info("video_id:{}".format(video_id))
			
 
				-                    # Common.logger(log_type).info("video_comment_cnt:{}".format(video_comment_cnt))
			
 
				-                    # Common.logger(log_type).info("video_resolution:{}".format(video_resolution))
			
 
				                     Common.logger(log_type).info(
			
 
				                         "video_send_time:{}", time.strftime(
			
 
				                             "%Y/%m/%d %H:%M:%S", time.localtime(int(video_send_time))))
			
 
				-                    # Common.logger(log_type).info("video_cover：{}".format(cover_url))
			
 
				                     Common.logger(log_type).info("video_url:{}".format(video_url))
			
 
				 
			
 
				                     # 过滤无效视频
			
@@ -259,15 +224,6 @@ class Recommend:
 
				                 Common.logger(log_type).info("download_video_title:{}", download_video_title)
			
 
				                 Common.logger(log_type).info("download_video_send_time:{}", download_video_send_time)
			
 
				                 Common.logger(log_type).info("download_video_url:{}", download_video_url)
			
 
				-                # Common.logger(log_type).info("download_video_play_cnt:{}", download_video_play_cnt)
			
 
				-                # Common.logger(log_type).info("download_video_id:{}", download_video_id)
			
 
				-                # Common.logger(log_type).info("download_video_comment_cnt:{}", download_video_comment_cnt)
			
 
				-                # Common.logger(log_type).info("download_video_like_cnt:{}", download_video_like_cnt)
			
 
				-                # Common.logger(log_type).info("download_video_share_cnt:{}", download_video_share_cnt)
			
 
				-                # Common.logger(log_type).info("download_user_name:{}", download_user_name)
			
 
				-                # Common.logger(log_type).info("download_user_id:{}", download_user_id)
			
 
				-                # Common.logger(log_type).info("download_head_url:{}", download_head_url)
			
 
				-                # Common.logger(log_type).info("download_cover_url:{}", download_cover_url)
			
 
				 
			
 
				                 # 过滤空行
			
 
				                 if download_video_id is None or download_video_title is None or download_video_play_cnt is None:
			
@@ -294,13 +250,13 @@ class Recommend:
 
				                     download_video_resolution = str(video_info[0]) + "*" + str(video_info[1])
			
 
				                     download_video_duration = video_info[2]
			
 
				 
			
 
				-                    # 视频时长<60s，直接删除
			
 
				-                    if int(download_video_duration) < 60:
			
 
				+                    # 视频时长<40s，直接删除
			
 
				+                    if int(download_video_duration) < 40:
			
 
				                         # 删除视频文件夹
			
 
				                         shutil.rmtree("./videos/" + download_video_title + "/")
			
 
				                         # 删除云文档recommend_feeds中的记录
			
 
				                         Feishu.dimension_range(log_type, "bszf", "CcHgO7", "ROWS", i + 1, i + 1)
			
 
				-                        Common.logger(log_type).info("时长:{}<60秒，删除成功\n", int(download_video_duration))
			
 
				+                        Common.logger(log_type).info("时长:{}<40秒，删除成功\n", int(download_video_duration))
			
 
				                         return
			
 
				                     # # 竖版视频不下载，写入竖版视频表
			
 
				                     # elif int(video_info[0]) < int(video_info[1]):