wangkun, 2 years ago
Current commit: 55595195b4

+ 11 - 7
README.MD

@@ -66,7 +66,7 @@ nohup python3 -u weixinzhishu/weixinzhishu_main/weixinzhishu_out.py >>./weixinzh
 ps aux | grep run_weixinzhishu
 ps aux | grep weixinzhishu | grep -v grep | awk '{print $2}' | xargs kill -9
 Device used to obtain wechat_key: Mac Air
-ps aux | grep weixinzhishu | grep -v grep | awk '{print $2}' | xargs kill -9 && cd /Users/piaoquan/Desktop/piaoquan_crawler && nohup python3 -u weixinzhishu/weixinzhishu_key/search_key_mac.py >> weixinzhishu/nohup.log 2>&1 &
+cd ~ && source ./base_profile && ps aux | grep weixinzhishu | grep -v grep | awk '{print $2}' | xargs kill -9 && cd /Users/piaoquan/Desktop/piaoquan_crawler && nohup python3 -u weixinzhishu/weixinzhishu_key/search_key_mac.py >> weixinzhishu/nohup.log 2>&1 &
 ```
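
The one-liner above kills every running weixinzhishu process and relaunches the wechat_key watcher under nohup. A minimal Python sketch of the same restart flow, assuming psutil is installed and using the paths shown in the README (the `source ./base_profile` step is left to the shell):

```python
# Illustration only: kill running weixinzhishu processes, then relaunch the key watcher
# detached, appending output to the log (the Python equivalent of the one-liner above).
# Assumes psutil is installed; paths are the ones from the README.
import os
import subprocess
import psutil

CRAWLER_DIR = "/Users/piaoquan/Desktop/piaoquan_crawler"
SCRIPT = "weixinzhishu/weixinzhishu_key/search_key_mac.py"
LOG = "weixinzhishu/nohup.log"

# ps aux | grep weixinzhishu | grep -v grep | awk '{print $2}' | xargs kill -9
for proc in psutil.process_iter(["pid", "cmdline"]):
    cmdline = " ".join(proc.info["cmdline"] or [])
    if "weixinzhishu" in cmdline and proc.pid != os.getpid():
        try:
            proc.kill()
        except psutil.NoSuchProcess:
            pass

# nohup python3 -u ... >> weixinzhishu/nohup.log 2>&1 &
os.chdir(CRAWLER_DIR)
with open(LOG, "a") as log:
    subprocess.Popen(
        ["python3", "-u", SCRIPT],
        stdout=log,
        stderr=subprocess.STDOUT,
        start_new_session=True,  # detach from the parent, like nohup + &
    )
```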
 
 #### 西瓜视频
@@ -104,14 +104,18 @@ ps aux | grep run_kuaishou | grep -v grep | awk '{print $2}' | xargs kill -9
 #### 小年糕
 ```commandline
 Aliyun server 102
-Targeted crawler strategy: sh ./main/shceduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_follow.py --log_type="follow" --crawler="xiaoniangao" --env="prod"  xiaoniangao/nohup.log
-Hourly ranking crawler strategy: 
-Play-count ranking crawler strategy: 
+Targeted crawler strategy: sh ./main/shceduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_follow.py --log_type="follow" --crawler="xiaoniangao" --env="prod"  xiaoniangao/nohup-follow.log
+Hourly ranking crawler strategy: sh ./main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_hour.py --log_type="hour" --crawler="xiaoniangao" --env="prod" xiaoniangao/nohup-hour.log
+Play-count ranking crawler strategy: sh ./main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_play.py --log_type="hour" --crawler="xiaoniangao" --env="prod" xiaoniangao/nohup-play.log
 
 Local debugging
-Targeted crawler strategy: sh ./main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_follow.py --log_type="follow" --crawler="xiaoniangao" --env="dev" xiaoniangao/nohup.log
-Hourly ranking crawler strategy: sh ./main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_hour.py --log_type="hour" --crawler="xiaoniangao" --env="dev" xiaoniangao/nohup.log
-Play-count ranking crawler strategy: sh ./main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_play.py --log_type="hour" --crawler="xiaoniangao" --env="dev" xiaoniangao/nohup.log
+Targeted crawler strategy: sh ./main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_follow.py --log_type="follow" --crawler="xiaoniangao" --env="dev" xiaoniangao/nohup-follow.log
+Hourly ranking crawler strategy: sh ./main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_hour.py --log_type="hour" --crawler="xiaoniangao" --env="dev" xiaoniangao/nohup-hour.log
+Play-count ranking crawler strategy: sh ./main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_play.py --log_type="hour" --crawler="xiaoniangao" --env="dev" xiaoniangao/nohup-play.log
+
+nohup python3 -u xiaoniangao/xiaoniangao_follow/insert_video_1.py >> xiaoniangao/nohup-1.log 2>&1 &
+nohup python3 -u xiaoniangao/xiaoniangao_follow/insert_video_2.py >> xiaoniangao/nohup-1.log 2>&1 &
+nohup python3 -u xiaoniangao/xiaoniangao_follow/insert_video_3.py >> xiaoniangao/nohup-1.log 2>&1 &
 
 Kill process commands
 ps aux | grep run_xiaoniangao
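
The three strategies above now each get their own log file (nohup-follow.log, nohup-hour.log, nohup-play.log). As an illustration only (production launches go through scheduling_main.sh), a small Python sketch that starts the three run scripts with one log per strategy, using the flags exactly as they appear in the README; the run scripts' own argument parsing is not shown in this diff, so treat this as a sketch:

```python
# Illustration of the per-strategy log split: start each xiaoniangao run script detached,
# appending its output to its own log file. Paths and flags are copied from the README above.
import subprocess

JOBS = [
    ("follow", "./xiaoniangao/xiaoniangao_main/run_xiaoniangao_follow.py", "nohup-follow.log"),
    ("hour", "./xiaoniangao/xiaoniangao_main/run_xiaoniangao_hour.py", "nohup-hour.log"),
    ("hour", "./xiaoniangao/xiaoniangao_main/run_xiaoniangao_play.py", "nohup-play.log"),  # README also uses "hour" here
]

for log_type, script, log_name in JOBS:
    log = open(f"xiaoniangao/{log_name}", "a")
    subprocess.Popen(
        ["python3", "-u", script,
         f"--log_type={log_type}", "--crawler=xiaoniangao", "--env=dev"],
        stdout=log,
        stderr=subprocess.STDOUT,
        start_new_session=True,  # keep running after the launching shell exits
    )
```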

Binary
xiaoniangao/.DS_Store


+ 2 - 3
xiaoniangao/xiaoniangao_follow/insert_video.py → xiaoniangao/xiaoniangao_follow/insert_video_1.py

@@ -13,8 +13,7 @@ from common.feishu import Feishu
 class Insert:
     @classmethod
     def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env, machine):
-        # xiaoniangao_sheetid_list = ['bkIrcr']  # follow
-        xiaoniangao_sheetid_list = ['InCA1I']  # hour
+        xiaoniangao_sheetid_list = ['Wu0CeL']
         for sheetid in xiaoniangao_sheetid_list:
             xiaoniangao_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
             for i in range(1, len(xiaoniangao_sheet)):
@@ -114,5 +113,5 @@ if __name__ == "__main__":
     # Insert.insert_video_from_feishu_to_mysql("insert-dev-follow", "xiaoniangao", "dev", "local")
     # Insert.insert_video_from_feishu_to_mysql("insert-dev-hour", "xiaoniangao", "dev", "local")
     # Insert.insert_video_from_feishu_to_mysql("insert-prod-follow", "xiaoniangao", "prod", "local")
-    Insert.insert_video_from_feishu_to_mysql("insert-prod-hour", "xiaoniangao", "prod", "local")
+    Insert.insert_video_from_feishu_to_mysql("insert-prod-1", "xiaoniangao", "prod", "local")
     pass

+ 117 - 0
xiaoniangao/xiaoniangao_follow/insert_video_2.py

@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/3/14
+import json
+import os
+import sys
+sys.path.append(os.getcwd())
+from common.common import Common
+from common.scheduling_db  import MysqlHelper
+from common.feishu import Feishu
+
+
+class Insert:
+    @classmethod
+    def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env, machine):
+        xiaoniangao_sheetid_list = ['yatRv2']
+        for sheetid in xiaoniangao_sheetid_list:
+            xiaoniangao_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
+            for i in range(1, len(xiaoniangao_sheet)):
+            # for i in range(1, 3):
+                if xiaoniangao_sheet[i][5] is None or xiaoniangao_sheet[i][9] is None:
+                    continue
+                video_id = xiaoniangao_sheet[i][9].replace("https://admin.piaoquantv.com/cms/post-detail/", "").replace("/info", "")
+                if video_id == "None":
+                    continue
+                video_id = int(video_id)
+                out_user_id = str(xiaoniangao_sheet[i][19])
+                platform = "小年糕"
+                strategy = "定向爬虫策略"
+                out_video_id = str(xiaoniangao_sheet[i][7])
+                video_title = str(xiaoniangao_sheet[i][8])
+                cover_url = str(xiaoniangao_sheet[i][21])
+                video_url = str(xiaoniangao_sheet[i][22])
+                duration = int(xiaoniangao_sheet[i][14])
+                publish_time = str(xiaoniangao_sheet[i][16]).replace("/", "-")
+                play_cnt = int(xiaoniangao_sheet[i][10])
+                like_cnt = int(xiaoniangao_sheet[i][12])
+                share_cnt = int(xiaoniangao_sheet[i][13])
+                # collection_cnt = 0
+                comment_cnt = int(xiaoniangao_sheet[i][11])
+                crawler_rule = json.dumps({"play_cnt": {"min": 500}, "duration": {"min": 40}, "publish_day": {"min": 3}})
+                width = int(xiaoniangao_sheet[i][15].split("*")[0])
+                height = int(xiaoniangao_sheet[i][15].split("*")[1])
+
+                # print(f"video_id:{video_id}, type:{type(video_id)}")
+                # print(f"user_id:{user_id}, type:{type(user_id)}")
+                # print(f"out_user_id:{out_user_id}, type:{type(out_user_id)}")
+                # print(f"platform:{platform}, type:{type(platform)}")
+                # print(f"strategy:{strategy}, type:{type(strategy)}")
+                # print(f"out_video_id:{out_video_id}, type:{type(out_video_id)}")
+                # print(f"video_title:{video_title}, type:{type(video_title)}")
+                # print(f"cover_url:{cover_url}, type:{type(cover_url)}")
+                # print(f"video_url:{video_url}, type:{type(video_url)}")
+                # print(f"duration:{duration}, type:{type(duration)}")
+                # print(f"publish_time:{publish_time}, type:{type(publish_time)}")
+                # print(f"play_cnt:{play_cnt}, type:{type(play_cnt)}")
+                # print(f"like_cnt:{like_cnt}, type:{type(like_cnt)}")
+                # print(f"share_cnt:{share_cnt}, type:{type(share_cnt)}")
+                # print(f"comment_cnt:{comment_cnt}, type:{type(comment_cnt)}")
+                # print(f"crawler_rule:{crawler_rule}, type:{type(crawler_rule)}")
+                # print(f"width:{width}, type:{type(width)}")
+                # print(f"height:{height}, type:{type(height)}\n")
+
+                select_sql = f""" select * from crawler_video where platform="{platform}" and out_video_id="{out_video_id}" """
+                Common.logger(log_type, crawler).info(f"select_sql:{select_sql}")
+                repeat_video = MysqlHelper.get_values(log_type, crawler, select_sql, env, machine)
+                Common.logger(log_type, crawler).info(f"repeat_video:{repeat_video}")
+
+                if repeat_video is not None and len(repeat_video) != 0:
+                    Common.logger(log_type, crawler).info(f"{video_title} 已存在数据库中\n")
+                else:
+                    # Save the video info to the database
+                    insert_sql = f""" insert into crawler_video(video_id,
+                                    out_user_id,
+                                    platform,
+                                    strategy,
+                                    out_video_id,
+                                    video_title,
+                                    cover_url,
+                                    video_url,
+                                    duration,
+                                    publish_time,
+                                    play_cnt,
+                                    like_cnt,
+                                    share_cnt,
+                                    comment_cnt,
+                                    crawler_rule,
+                                    width,
+                                    height)
+                                    values({video_id},
+                                    "{out_user_id}",
+                                    "{platform}",
+                                    "{strategy}",
+                                    "{out_video_id}",
+                                    "{video_title}",
+                                    "{cover_url}",
+                                    "{video_url}",
+                                    {duration},
+                                    "{publish_time}",
+                                    {play_cnt},
+                                    {like_cnt},
+                                    {share_cnt},
+                                    {comment_cnt},
+                                    '{crawler_rule}',
+                                    {width},
+                                    {height}) """
+                    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
+                    MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine)
+                    Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
+
+
+if __name__ == "__main__":
+    # Insert.insert_video_from_feishu_to_mysql("insert-dev-follow", "xiaoniangao", "dev", "local")
+    # Insert.insert_video_from_feishu_to_mysql("insert-dev-hour", "xiaoniangao", "dev", "local")
+    # Insert.insert_video_from_feishu_to_mysql("insert-prod-follow", "xiaoniangao", "prod", "local")
+    Insert.insert_video_from_feishu_to_mysql("insert-prod-2", "xiaoniangao", "prod", "local")
+    pass
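
Both new scripts build their SQL with f-strings, so a title or URL containing a quote character would break the statement. A hedged alternative, not part of this commit, using driver-level placeholders instead; it assumes a pymysql-style connection, since the MysqlHelper wrapper itself is not shown in this diff:

```python
# Hypothetical sketch: the same duplicate check and insert with %s placeholders, so field
# values cannot break the SQL. Assumes pymysql; connection setup is left to the caller.
import json
import pymysql


def insert_row(conn, row):
    """row: dict with the same fields the script extracts from the Feishu sheet."""
    with conn.cursor() as cur:
        # skip rows already present (platform + out_video_id, as in the scripts above)
        cur.execute(
            "select 1 from crawler_video where platform=%s and out_video_id=%s",
            (row["platform"], row["out_video_id"]),
        )
        if cur.fetchone():
            return False
        cur.execute(
            "insert into crawler_video (video_id, out_user_id, platform, strategy,"
            " out_video_id, video_title, cover_url, video_url, duration, publish_time,"
            " play_cnt, like_cnt, share_cnt, comment_cnt, crawler_rule, width, height)"
            " values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
            (
                row["video_id"], row["out_user_id"], row["platform"], row["strategy"],
                row["out_video_id"], row["video_title"], row["cover_url"], row["video_url"],
                row["duration"], row["publish_time"], row["play_cnt"], row["like_cnt"],
                row["share_cnt"], row["comment_cnt"], json.dumps(row["crawler_rule"]),
                row["width"], row["height"],
            ),
        )
    conn.commit()
    return True
```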

+ 117 - 0
xiaoniangao/xiaoniangao_follow/insert_video_3.py

@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/3/14
+import json
+import os
+import sys
+sys.path.append(os.getcwd())
+from common.common import Common
+from common.scheduling_db  import MysqlHelper
+from common.feishu import Feishu
+
+
+class Insert:
+    @classmethod
+    def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env, machine):
+        xiaoniangao_sheetid_list = ['c85k1C']
+        for sheetid in xiaoniangao_sheetid_list:
+            xiaoniangao_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
+            for i in range(1, len(xiaoniangao_sheet)):
+            # for i in range(1, 3):
+                if xiaoniangao_sheet[i][5] is None or xiaoniangao_sheet[i][9] is None:
+                    continue
+                video_id = xiaoniangao_sheet[i][9].replace("https://admin.piaoquantv.com/cms/post-detail/", "").replace("/info", "")
+                if video_id == "None":
+                    continue
+                video_id = int(video_id)
+                out_user_id = str(xiaoniangao_sheet[i][19])
+                platform = "小年糕"
+                strategy = "定向爬虫策略"
+                out_video_id = str(xiaoniangao_sheet[i][7])
+                video_title = str(xiaoniangao_sheet[i][8])
+                cover_url = str(xiaoniangao_sheet[i][21])
+                video_url = str(xiaoniangao_sheet[i][22])
+                duration = int(xiaoniangao_sheet[i][14])
+                publish_time = str(xiaoniangao_sheet[i][16]).replace("/", "-")
+                play_cnt = int(xiaoniangao_sheet[i][10])
+                like_cnt = int(xiaoniangao_sheet[i][12])
+                share_cnt = int(xiaoniangao_sheet[i][13])
+                # collection_cnt = 0
+                comment_cnt = int(xiaoniangao_sheet[i][11])
+                crawler_rule = json.dumps({"play_cnt": {"min": 500}, "duration": {"min": 40}, "publish_day": {"min": 3}})
+                width = int(xiaoniangao_sheet[i][15].split("*")[0])
+                height = int(xiaoniangao_sheet[i][15].split("*")[1])
+
+                # print(f"video_id:{video_id}, type:{type(video_id)}")
+                # print(f"user_id:{user_id}, type:{type(user_id)}")
+                # print(f"out_user_id:{out_user_id}, type:{type(out_user_id)}")
+                # print(f"platform:{platform}, type:{type(platform)}")
+                # print(f"strategy:{strategy}, type:{type(strategy)}")
+                # print(f"out_video_id:{out_video_id}, type:{type(out_video_id)}")
+                # print(f"video_title:{video_title}, type:{type(video_title)}")
+                # print(f"cover_url:{cover_url}, type:{type(cover_url)}")
+                # print(f"video_url:{video_url}, type:{type(video_url)}")
+                # print(f"duration:{duration}, type:{type(duration)}")
+                # print(f"publish_time:{publish_time}, type:{type(publish_time)}")
+                # print(f"play_cnt:{play_cnt}, type:{type(play_cnt)}")
+                # print(f"like_cnt:{like_cnt}, type:{type(like_cnt)}")
+                # print(f"share_cnt:{share_cnt}, type:{type(share_cnt)}")
+                # print(f"comment_cnt:{comment_cnt}, type:{type(comment_cnt)}")
+                # print(f"crawler_rule:{crawler_rule}, type:{type(crawler_rule)}")
+                # print(f"width:{width}, type:{type(width)}")
+                # print(f"height:{height}, type:{type(height)}\n")
+
+                select_sql = f""" select * from crawler_video where platform="{platform}" and out_video_id="{out_video_id}" """
+                Common.logger(log_type, crawler).info(f"select_sql:{select_sql}")
+                repeat_video = MysqlHelper.get_values(log_type, crawler, select_sql, env, machine)
+                Common.logger(log_type, crawler).info(f"repeat_video:{repeat_video}")
+
+                if repeat_video is not None and len(repeat_video) != 0:
+                    Common.logger(log_type, crawler).info(f"{video_title} 已存在数据库中\n")
+                else:
+                    # Save the video info to the database
+                    insert_sql = f""" insert into crawler_video(video_id,
+                                    out_user_id,
+                                    platform,
+                                    strategy,
+                                    out_video_id,
+                                    video_title,
+                                    cover_url,
+                                    video_url,
+                                    duration,
+                                    publish_time,
+                                    play_cnt,
+                                    like_cnt,
+                                    share_cnt,
+                                    comment_cnt,
+                                    crawler_rule,
+                                    width,
+                                    height)
+                                    values({video_id},
+                                    "{out_user_id}",
+                                    "{platform}",
+                                    "{strategy}",
+                                    "{out_video_id}",
+                                    "{video_title}",
+                                    "{cover_url}",
+                                    "{video_url}",
+                                    {duration},
+                                    "{publish_time}",
+                                    {play_cnt},
+                                    {like_cnt},
+                                    {share_cnt},
+                                    {comment_cnt},
+                                    '{crawler_rule}',
+                                    {width},
+                                    {height}) """
+                    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
+                    MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine)
+                    Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
+
+
+if __name__ == "__main__":
+    # Insert.insert_video_from_feishu_to_mysql("insert-dev-follow", "xiaoniangao", "dev", "local")
+    # Insert.insert_video_from_feishu_to_mysql("insert-dev-hour", "xiaoniangao", "dev", "local")
+    # Insert.insert_video_from_feishu_to_mysql("insert-prod-follow", "xiaoniangao", "prod", "local")
+    Insert.insert_video_from_feishu_to_mysql("insert-prod-3", "xiaoniangao", "prod", "local")
+    pass
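
insert_video_1.py, insert_video_2.py and insert_video_3.py are identical except for the Feishu sheet id ('Wu0CeL', 'yatRv2', 'c85k1C') and the log tag. A hypothetical consolidation sketch, not part of this commit, that takes both as command-line arguments so a single script could replace the three copies:

```python
# -*- coding: utf-8 -*-
# Hypothetical consolidation (not in this commit): insert_video_1/2/3.py differ only in the
# sheet id and log tag, so one parameterized entry point could cover all three runs.
import argparse
import os
import sys

sys.path.append(os.getcwd())
from common.common import Common
from common.feishu import Feishu
from common.scheduling_db import MysqlHelper


def insert_videos_from_sheet(sheetid, log_type, crawler="xiaoniangao", env="prod", machine="local"):
    """Read one Feishu sheet and insert each row into crawler_video, as in insert_video_1.py."""
    sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
    for row in sheet[1:]:
        if row[5] is None or row[9] is None:
            continue
        # ... same field extraction, duplicate check (MysqlHelper.get_values) and insert
        # (MysqlHelper.update_values) as in insert_video_1.py, unchanged ...
        Common.logger(log_type, crawler).info(f"processing row from sheet {sheetid}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--sheetid", required=True)   # Wu0CeL / yatRv2 / c85k1C
    parser.add_argument("--log_type", required=True)  # e.g. insert-prod-1
    args = parser.parse_args()
    insert_videos_from_sheet(args.sheetid, args.log_type)
```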