wangkun, 1 year ago
parent commit d52759cb0f
49 changed files with 1585 additions and 2730 deletions
  1. +51 -0    README.MD
  2. +90 -0    benshanzhufu/benshanzhufu_main/run_bszf_recommend.py
  3. +0 -181   benshanzhufu/benshanzhufu_recommend/insert.py
  4. +2 -2     common/common.py
  5. +26 -6    common/public.py
  6. +85 -0    douyin/douyin_main/run_dy_author.py
  7. +90 -0    douyin/douyin_main/run_dy_recommend.py
  8. +0 -32    douyin/test.py
  9. +0 -148   ganggangdouchuan/ganggangdouchuan_recommend/insert.py
  10. +0 -24   gongzhonghao/gongzhonghao_author/demo.py
  11. +0 -96   gongzhonghao/gongzhonghao_follow/insert_video.py
  12. +0 -2    gongzhonghao/gongzhonghao_main/run_gongzhonghao1_author_scheduling.py
  13. +0 -10   gongzhonghao/gongzhonghao_main/run_gongzhonghao2_author_scheduling.py
  14. +88 -0   gongzhonghao/gongzhonghao_main/run_gzh1_author.py
  15. +88 -0   gongzhonghao/gongzhonghao_main/run_gzh2_author.py
  16. +88 -0   gongzhonghao/gongzhonghao_main/run_gzh3_author.py
  17. +88 -0   gongzhonghao/gongzhonghao_main/run_gzh4_author.py
  18. +88 -0   gongzhonghao/gongzhonghao_main/run_gzh5_author.py
  19. +0 -161  jixiangxingfu/jixiangxingfu_recommend/insert.py
  20. +1 -1    kuaishou/kuaishou_author/kuaishou_author_scheduling.py
  21. +0 -216  kuaishou/kuaishou_follow/insert_videos.py
  22. +0 -596  kuaishou/kuaishou_follow/kuaishou_follow_scheduling.py
  23. +0 -28   kuaishou/kuaishou_follow/test.py
  24. +85 -0   kuaishou/kuaishou_main/run_ks_author.py
  25. +90 -0   kuaishou/kuaishou_main/run_ks_recommend.py
  26. +0 -57   kuaishou/kuaishou_main/run_kuaishou_follow_scheduling.py
  27. +57 -0   main/process_mq.sh
  28. +0 -138  suisuiniannianyingfuqi/suisuiniannianyingfuqi_main/demo.py
  29. +10 -10  suisuiniannianyingfuqi/suisuiniannianyingfuqi_main/run_ssnnyfq_recommend.py
  30. +0 -160  suisuiniannianyingfuqi/suisuiniannianyingfuqi_recommend/insert.py
  31. +6 -1    suisuiniannianyingfuqi/suisuiniannianyingfuqi_recommend/suisuiniannianyingfuqi_recommend_scheduling.py
  32. +0 -19   xiaoniangao/xiaoniangao_author/author_test.py
  33. +0 -24   xiaoniangao/xiaoniangao_follow/insert_filter_word.py
  34. +0 -117  xiaoniangao/xiaoniangao_follow/insert_video_1.py
  35. +84 -0   xiaoniangao/xiaoniangao_main/run_xng_author.py
  36. +115 -0  xiaoniangao/xiaoniangao_main/run_xng_hour.py
  37. +89 -0   xiaoniangao/xiaoniangao_main/run_xng_play.py
  38. +0 -38   xigua/xigua_follow/demo.py
  39. +0 -119  xigua/xigua_follow/insert_videos.py
  40. +0 -57   xigua/xigua_follow/xigua_test.py
  41. +85 -0   xigua/xigua_main/run_xg_author.py
  42. +90 -0   xigua/xigua_main/run_xg_recommend.py
  43. +87 -0   xigua/xigua_main/run_xg_search.py
  44. +0 -60   xigua/xigua_main/run_xigua_follow_scheduling.py
  45. +0 -78   xigua/xigua_recommend/insert.py
  46. +0 -27   xigua/xigua_search/test.py
  47. +2 -6    xigua/xigua_search/xigua_search_scheduling.py
  48. +0 -158  zhiqingtiantiankan/zhiqingtiantiankan_recommend/zhiqing_insert.py
  49. +0 -158  zhongmiaoyinxin/zhongmiaoyinxin_recommend/insert.py

+ 51 - 0
README.MD

@@ -249,4 +249,55 @@ ps aux | grep kuaishou | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep xigua_search | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep kanyikan | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep shipinhao_search | grep -v grep | awk '{print $2}' | xargs kill -9
+```
+
+
+#### Process guard for the MQ-based crawlers: main/process_mq.sh
+```commandline
+Local debugging
+/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "ssnnyfq" "suisuiniannianyingfuqi" "recommend" "dev"
+/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "gzh1" "gongzhonghao" "author" "dev"
+/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "gzh2" "gongzhonghao" "author" "dev"
+/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "recommend" "dev"
+/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "author" "dev"
+/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "search" "dev"
+/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "bszf" "benshanzhufu" "recommend" "dev"
+/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "recommend" "dev"
+/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "author" "dev"
+/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "recommend" "dev"
+/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "author" "dev"
+/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "play" "dev"
+/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "hour" "dev"
+/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "author" "dev"
+
+Server 102
+# Crontab entries for the MQ crawler process guard
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "ssnnyfq" "suisuiniannianyingfuqi" "recommend" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "gzh1" "gongzhonghao" "author" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "gzh2" "gongzhonghao" "author" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "gzh3" "gongzhonghao" "author" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "gzh4" "gongzhonghao" "author" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "gzh5" "gongzhonghao" "author" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "recommend" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "author" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "search" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "bszf" "benshanzhufu" "recommend" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "recommend" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "ks" "kuaishou" "author" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "recommend" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "dy" "douyin" "author" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "play" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "hour" "prod"
+* * * * * /usr/bin/sh /data5/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "author" "prod"
+
+Offline servers
+
+Kill processes
+ps aux | grep suisuiniannianyingfuqi | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep gongzhonghao | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep xigua | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep benshanzhufu | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep kuaishou | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep douyin | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep xiaoniangao | grep -v grep | awk '{print $2}' | xargs kill -9
 ```
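The new `main/process_mq.sh` (+57 lines) is added by this commit but its body is not shown in this diff. As a rough, hypothetical sketch of the guard pattern the crontab entries above imply (check whether a runner is alive, restart it if not), the logic might look like the following; the paths, restart command, and argument wiring are assumptions, not the committed script:

```python
# Hypothetical process-guard sketch; NOT the committed main/process_mq.sh.
# Assumes runners live at <crawler>/<crawler>_main/run_<abbr>_<mode>.py.
import os
import subprocess


def guard(abbr, crawler, mode, env, repo_dir="/data5/piaoquan_crawler"):
    script = f"{crawler}/{crawler}_main/run_{abbr}_{mode}.py"
    # Is a runner for this script already alive?
    alive = os.popen(f'ps -ef | grep "{script}" | grep -v grep').read().strip()
    if alive:
        return  # still running; this cron tick does nothing
    # Restart it in the background. The real script presumably also passes
    # --topic_name / --group_id, which are omitted here.
    cmd = (f'cd {repo_dir} && nohup python3 -u {script} '
           f'--log_type="{mode}" --crawler="{crawler}" --env="{env}" >/dev/null 2>&1 &')
    subprocess.call(cmd, shell=True)


if __name__ == "__main__":
    guard("bszf", "benshanzhufu", "recommend", "prod")
```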

+ 90 - 0
benshanzhufu/benshanzhufu_main/run_bszf_recommend.py

@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/7
+import argparse
+import random
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.public import get_consumer, ack_message, task_fun_mq
+from common.common import Common
+from common.scheduling_db import MysqlHelper
+from benshanzhufu.benshanzhufu_recommend.benshanzhufu_recommend_scheduling import BenshanzhufuRecommend
+
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    consumer = get_consumer(topic_name, group_id)
+    # Long polling: if the topic has no messages, the client request is held on the server for up to 3 seconds
+    # and returns as soon as a message becomes consumable.
+    # Long-polling wait time: 3 seconds (can be set up to 30 seconds).
+    wait_seconds = 3
+    # Consume at most 1 message per request (can be set up to 16).
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # Consume messages via long polling.
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # Handle the crawler business logic
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                task_id = task_dict['id']
+                select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
+                user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
+                our_uid_list = []
+                for user in user_list:
+                    our_uid_list.append(user["uid"])
+                our_uid = random.choice(our_uid_list)
+                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
+                Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                BenshanzhufuRecommend.get_videoList(log_type=log_type,
+                                                    crawler=crawler,
+                                                    our_uid=our_uid,
+                                                    rule_dict=rule_dict,
+                                                    env=env)
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+
+        except MQExceptionBase as err:
+            # No message available in the topic.
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  # create the argument parser
+    parser.add_argument('--log_type', type=str)  # add argument, declaring its type
+    parser.add_argument('--crawler')  # add argument
+    parser.add_argument('--topic_name')  # add argument
+    parser.add_argument('--group_id')  # add argument
+    parser.add_argument('--env')  # add argument
+    args = parser.parse_args()  # parse the arguments; values can also be supplied from the terminal
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)
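For context on the user-selection step in the runner above: all rows bound to the task are read from `crawler_user_v3`, and one `uid` is picked at random as the publishing account. A minimal sketch with made-up rows (the real column set is not shown in this diff):

```python
# Illustrative only: hypothetical rows in the shape MysqlHelper.get_values() returns.
# The runner relies solely on the "uid" column.
import random

user_list = [
    {"uid": 61431001, "task_id": 21, "nick_name": "账号A"},
    {"uid": 61431002, "task_id": 21, "nick_name": "账号B"},
]
our_uid = random.choice([user["uid"] for user in user_list])
# our_uid is then passed to BenshanzhufuRecommend.get_videoList(..., our_uid=our_uid, ...)
```

Note that `random.choice` raises `IndexError` when a task has no bound users; the runner does not guard against that case.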

+ 0 - 181
benshanzhufu/benshanzhufu_recommend/insert.py

@@ -1,181 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/4/13
-import json
-import os
-import sys
-import time
-from datetime import date, timedelta
-sys.path.append(os.getcwd())
-from common.common import Common
-from common.feishu import Feishu
-from common.scheduling_db import MysqlHelper
-
-
-class Insert:
-    @classmethod
-    def get_config(cls, log_type, crawler, text, env):
-        select_sql = f"""select * from crawler_config where source="benshanzhufu" """
-        contents = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
-        title_list = []
-        filter_list = []
-        for content in contents:
-            config = content['config']
-            config_dict = eval(config)
-            for k, v in config_dict.items():
-                if k == "title":
-                    title_list_config = v.split(",")
-                    for title in title_list_config:
-                        title_list.append(title)
-                if k == "filter":
-                    filter_list_config = v.split(",")
-                    for filter_word in filter_list_config:
-                        filter_list.append(filter_word)
-        if text == "title":
-            return title_list
-        elif text == "filter":
-            return filter_list
-
-    @classmethod
-    def before_day(cls):
-        publish_time_str_rule = (date.today() + timedelta(days=-30)).strftime("%Y-%m-%d %H:%M:%S")
-        publish_time_stamp_rule = int(time.mktime(time.strptime(publish_time_str_rule, "%Y-%m-%d %H:%M:%S")))
-        print(publish_time_str_rule)
-        print(publish_time_stamp_rule)
-
-    @classmethod
-    def insert_config(cls, log_type, crawler, env):
-        filter_sheet = Feishu.get_values_batch(log_type, crawler, "DjXfqG")
-        # title_sheet = Feishu.get_values_batch(log_type, crawler, "bHSW1p")
-        filter_list = []
-        # title_list = []
-        for x in filter_sheet:
-            for y in x:
-                if y is None:
-                    pass
-                else:
-                    filter_list.append(y)
-        # for x in title_sheet:
-        #     for y in x:
-        #         if y is None:
-        #             pass
-        #         else:
-        #             title_list.append(y)
-        # str_title = ','.join(title_list)
-        str_filter = ','.join(filter_list)
-        config_dict = {
-            # "title": str_title,
-            "filter": str_filter
-        }
-        str_config_dict = str(config_dict)
-        # print(f"config_dict:{config_dict}")
-        # print(f"str_config_dict:{str_config_dict}")
-        insert_sql = f""" insert into crawler_config(title, source, config) values("本山祝福小程序", "benshanzhufu", "{str_config_dict}") """
-        MysqlHelper.update_values(log_type, crawler, insert_sql, env)
-
-    @classmethod
-    def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env):
-        benshanzhufu_sheetid = ['440018']
-        for sheetid in benshanzhufu_sheetid:
-            xiaoniangao_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            for i in range(1, len(xiaoniangao_sheet)):
-            # for i in range(1, 3):
-                if xiaoniangao_sheet[i][5] is None or xiaoniangao_sheet[i][9] is None:
-                    continue
-                video_id = xiaoniangao_sheet[i][8].replace("https://admin.piaoquantv.com/cms/post-detail/", "").replace(
-                    "/info", "")
-                if video_id == "None":
-                    continue
-                video_id = int(video_id)
-                out_user_id = str(xiaoniangao_sheet[i][17])
-                platform = "本山祝福"
-                strategy = "推荐榜爬虫策略"
-                out_video_id = str(xiaoniangao_sheet[i][6])
-                video_title = str(xiaoniangao_sheet[i][7])
-                cover_url = str(xiaoniangao_sheet[i][19])
-                video_url = str(xiaoniangao_sheet[i][20])
-                duration = int(xiaoniangao_sheet[i][13])
-                publish_time = str(xiaoniangao_sheet[i][15]).replace("/", "-")
-                play_cnt = int(xiaoniangao_sheet[i][9])
-                like_cnt = int(xiaoniangao_sheet[i][11])
-                share_cnt = int(xiaoniangao_sheet[i][12])
-                # collection_cnt = 0
-                comment_cnt = int(xiaoniangao_sheet[i][10])
-                user_id = str(xiaoniangao_sheet[i][17])
-                crawler_rule = json.dumps({})
-                width = int(xiaoniangao_sheet[i][14].split("*")[0])
-                height = int(xiaoniangao_sheet[i][14].split("*")[1])
-
-                # print(f"video_id:{video_id}, type:{type(video_id)}")
-                # print(f"user_id:{user_id}, type:{type(user_id)}")
-                # print(f"out_user_id:{out_user_id}, type:{type(out_user_id)}")
-                # print(f"platform:{platform}, type:{type(platform)}")
-                # print(f"strategy:{strategy}, type:{type(strategy)}")
-                # print(f"out_video_id:{out_video_id}, type:{type(out_video_id)}")
-                # print(f"video_title:{video_title}, type:{type(video_title)}")
-                # print(f"cover_url:{cover_url}, type:{type(cover_url)}")
-                # print(f"video_url:{video_url}, type:{type(video_url)}")
-                # print(f"duration:{duration}, type:{type(duration)}")
-                # print(f"publish_time:{publish_time}, type:{type(publish_time)}")
-                # print(f"play_cnt:{play_cnt}, type:{type(play_cnt)}")
-                # print(f"like_cnt:{like_cnt}, type:{type(like_cnt)}")
-                # print(f"share_cnt:{share_cnt}, type:{type(share_cnt)}")
-                # print(f"comment_cnt:{comment_cnt}, type:{type(comment_cnt)}")
-                # print(f"crawler_rule:{crawler_rule}, type:{type(crawler_rule)}")
-                # print(f"width:{width}, type:{type(width)}")
-                # print(f"height:{height}, type:{type(height)}\n")
-
-                select_sql = f""" select * from crawler_video where platform="{platform}" and out_video_id="{out_video_id}" """
-                Common.logger(log_type, crawler).info(f"select_sql:{select_sql}")
-                repeat_video = MysqlHelper.get_values(log_type, crawler, select_sql, env)
-                Common.logger(log_type, crawler).info(f"repeat_video:{repeat_video}")
-
-                if repeat_video is not None and len(repeat_video) != 0:
-                    Common.logger(log_type, crawler).info(f"{video_title} 已存在数据库中\n")
-                else:
-                    # 视频信息保存数据库
-                    insert_sql = f""" insert into crawler_video(video_id,
-                                        out_user_id,
-                                        platform,
-                                        strategy,
-                                        out_video_id,
-                                        video_title,
-                                        cover_url,
-                                        video_url,
-                                        duration,
-                                        publish_time,
-                                        play_cnt,
-                                        like_cnt,
-                                        share_cnt,
-                                        comment_cnt,
-                                        crawler_rule,
-                                        width,
-                                        height)
-                                        values({video_id},
-                                        "{out_user_id}",
-                                        "{platform}",
-                                        "{strategy}",
-                                        "{out_video_id}",
-                                        "{video_title}",
-                                        "{cover_url}",
-                                        "{video_url}",
-                                        {duration},
-                                        "{publish_time}",
-                                        {play_cnt},
-                                        {like_cnt},
-                                        {share_cnt},
-                                        {comment_cnt},
-                                        '{crawler_rule}',
-                                        {width},
-                                        {height}) """
-                    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-                    MysqlHelper.update_values(log_type, crawler, insert_sql, env, action='')
-                    Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
-
-
-
-if __name__ == "__main__":
-    # Insert.insert_config("insert", "benshanzhufu", "dev")
-    # print(Insert.get_config("insert", "benshanzhufu", "filter", "dev"))
-    Insert.insert_video_from_feishu_to_mysql("insert-prod", "benshanzhufu", "prod")
-    pass

+ 2 - 2
common/common.py

@@ -69,10 +69,10 @@ class Common:
             if name == ".log":
                 all_logs.append(log)
 
-        if len(all_logs) <= 20:
+        if len(all_logs) <= 30:
             pass
         else:
-            for file in all_logs[:len(all_logs) - 20]:
+            for file in all_logs[:len(all_logs) - 30]:
                 os.remove(log_dir + file)
         cls.logger(log_type, crawler).info("清除日志成功\n")
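The hunk above only raises the log-retention threshold from 20 to 30 files. For clarity, a self-contained sketch of that retention rule (the log-directory layout is an assumption):

```python
# Keep only the 30 most recent .log files in a crawler's log directory (sketch).
import os

log_dir = "./logs/benshanzhufu/"  # assumed layout; date-stamped file names sort chronologically
all_logs = sorted(f for f in os.listdir(log_dir) if f.endswith(".log"))
if len(all_logs) > 30:
    for stale in all_logs[:len(all_logs) - 30]:
        os.remove(os.path.join(log_dir, stale))
```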
 

+ 26 - 6
common/public.py

@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Time: 2023/3/27
+from mq_http_sdk.mq_client import *
+# from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
 import os, sys
 import time
 import random
@@ -83,6 +86,7 @@ def get_config_from_mysql(log_type, source, env, text, action=''):
     elif text == "search_word":
         return search_word_list
 
+
 def random_title(log_type, crawler, env, text):
     random_title_list = get_config_from_mysql(log_type, crawler, env, text)
     return random.choice(random_title_list)
@@ -104,25 +108,41 @@ def task_fun(task_str):
     return task_dict
 
 
+def task_fun_mq(task_str):
+    task_str = task_str.replace('"[', '[').replace(']"', ']').replace('\\', '')
+    task_dict = dict(eval(task_str))
+    rule = task_dict['rule']
+    task_dict['rule'] = dict()
+    for item in rule:
+        for k, val in item.items():
+            task_dict['rule'][k] = val
+    rule_dict = task_dict['rule']
+    task_dict = {
+        "task_dict": task_dict,
+        "rule_dict": rule_dict
+    }
+    return task_dict
+
+
 def get_consumer(topic_name, group_id):
     # Initialize the client.
     mq_client = MQClient(
         # HTTP endpoint of the client; see the endpoint section on the instance details page of the ApsaraMQ for RocketMQ console.
-        "${HTTP_ENDPOINT}",
+        "http://1894469520484605.mqrest.cn-qingdao-public.aliyuncs.com",
         # AccessKey ID, the Alibaba Cloud authentication identifier; see the console docs on creating an AccessKey.
-        "${ACCESS_KEY}",
+        "LTAI4G7puhXtLyHzHQpD6H7A",
         # AccessKey Secret, the Alibaba Cloud authentication secret; see the console docs on creating an AccessKey.
-        "${SECRET_KEY}"
+        "nEbq3xWNQd1qLpdy2u71qFweHkZjSG"
     )
     # The topic the messages belong to, created in the ApsaraMQ for RocketMQ console.
     # topic_name = "${TOPIC}"
-    topic_name = f"{topic_name}"
+    topic_name = str(topic_name)
     # The Group ID created in the ApsaraMQ for RocketMQ console.
     # group_id = "${GROUP_ID}"
-    group_id = f"{group_id}"
+    group_id = str(group_id)
     # The instance ID the topic belongs to, created in the ApsaraMQ for RocketMQ console.
     # If the instance has a namespace, the instance ID must be passed; if it has none, pass an empty string. The namespace is shown on the instance details page of the console.
-    instance_id = "${INSTANCE_ID}"
+    instance_id = "MQ_INST_1894469520484605_BXhXuzkZ"
 
     consumer = mq_client.get_consumer(instance_id, topic_name, group_id)
     return consumer
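To make the new `task_fun_mq` contract concrete, here is a worked example with a made-up message body (the real payload schema is not shown in this diff). The `rule` field arrives as a string-wrapped list of single-key dicts, which the helper flattens into one `rule_dict`. The committed helper parses the body with `eval()`; `json.loads` would be the safer choice if the payload is guaranteed to be JSON.

```python
# Hypothetical MQ message body; the quoting/escaping mirrors what task_fun_mq strips out.
sample_body = '{"id": 8, "taskName": "本山祝福小程序", "rule": "[{\\"duration\\":{\\"min\\":20}},{\\"play_cnt\\":{\\"min\\":0}}]"}'

# Same normalization steps as task_fun_mq, re-implemented here for illustration.
cleaned = sample_body.replace('"[', '[').replace(']"', ']').replace('\\', '')
task_dict = dict(eval(cleaned))  # the committed code uses eval(); json.loads is safer for pure JSON
rule_dict = {}
for item in task_dict['rule']:   # flatten [{"duration": ...}, {"play_cnt": ...}] into one dict
    rule_dict.update(item)
task_dict['rule'] = rule_dict

print(rule_dict)  # {'duration': {'min': 20}, 'play_cnt': {'min': 0}}
```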

+ 85 - 0
douyin/douyin_main/run_dy_author.py

@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/7
+import argparse
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.common import Common
+from common.public import get_consumer, ack_message, task_fun_mq
+from common.scheduling_db import MysqlHelper
+from douyin.douyin_author.douyin_author_scheduling import DouyinauthorScheduling
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    consumer = get_consumer(topic_name, group_id)
+    # Long polling: if the topic has no messages, the client request is held on the server for up to 3 seconds
+    # and returns as soon as a message becomes consumable.
+    # Long-polling wait time: 3 seconds (can be set up to 30 seconds).
+    wait_seconds = 3
+    # Consume at most 1 message per request (can be set up to 16).
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # Consume messages via long polling.
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # Handle the crawler business logic
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                task_id = task_dict['id']
+                select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
+                user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
+                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                # Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
+                Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                DouyinauthorScheduling.get_author_videos(log_type=log_type,
+                                                         crawler=crawler,
+                                                         rule_dict=rule_dict,
+                                                         user_list=user_list,
+                                                         env=env)
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+
+        except MQExceptionBase as err:
+            # No message available in the topic.
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  # create the argument parser
+    parser.add_argument('--log_type', type=str)  # add argument, declaring its type
+    parser.add_argument('--crawler')  # add argument
+    parser.add_argument('--topic_name')  # add argument
+    parser.add_argument('--group_id')  # add argument
+    parser.add_argument('--env')  # add argument
+    args = parser.parse_args()  # parse the arguments; values can also be supplied from the terminal
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)

+ 90 - 0
douyin/douyin_main/run_dy_recommend.py

@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/7
+import argparse
+import random
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.common import Common
+from common.public import get_consumer, ack_message, task_fun_mq
+from common.scheduling_db import MysqlHelper
+from douyin.douyin_recommend.douyin_recommend_scheduling import DouyinrecommendScheduling
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    consumer = get_consumer(topic_name, group_id)
+    # Long polling: if the topic has no messages, the client request is held on the server for up to 3 seconds
+    # and returns as soon as a message becomes consumable.
+    # Long-polling wait time: 3 seconds (can be set up to 30 seconds).
+    wait_seconds = 3
+    # Consume at most 1 message per request (can be set up to 16).
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # Consume messages via long polling.
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # Handle the crawler business logic
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                task_id = task_dict['id']
+                select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
+                user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
+                our_uid_list = []
+                for user in user_list:
+                    our_uid_list.append(user["uid"])
+                our_uid = random.choice(our_uid_list)
+                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                # Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
+                Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                DouyinrecommendScheduling.get_videoList(log_type=log_type,
+                                                        crawler=crawler,
+                                                        rule_dict=rule_dict,
+                                                        our_uid=our_uid,
+                                                        env=env)
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+
+        except MQExceptionBase as err:
+            # No message available in the topic.
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  # create the argument parser
+    parser.add_argument('--log_type', type=str)  # add argument, declaring its type
+    parser.add_argument('--crawler')  # add argument
+    parser.add_argument('--topic_name')  # add argument
+    parser.add_argument('--group_id')  # add argument
+    parser.add_argument('--env')  # add argument
+    args = parser.parse_args()  # parse the arguments; values can also be supplied from the terminal
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)

+ 0 - 32
douyin/test.py

File diff suppressed because it is too large


+ 0 - 148
ganggangdouchuan/ganggangdouchuan_recommend/insert.py

@@ -1,148 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/4/13
-import json
-import os
-import sys
-import time
-from datetime import date, timedelta
-from hashlib import md5
-
-sys.path.append(os.getcwd())
-from common.common import Common
-from common.feishu import Feishu
-from common.scheduling_db import MysqlHelper
-
-
-class Insert:
-    @classmethod
-    def get_config(cls, log_type, crawler, text, env):
-        select_sql = f"""select * from crawler_config where source="benshanzhufu" """
-        contents = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
-        title_list = []
-        filter_list = []
-        for content in contents:
-            config = content['config']
-            config_dict = eval(config)
-            for k, v in config_dict.items():
-                if k == "title":
-                    title_list_config = v.split(",")
-                    for title in title_list_config:
-                        title_list.append(title)
-                if k == "filter":
-                    filter_list_config = v.split(",")
-                    for filter_word in filter_list_config:
-                        filter_list.append(filter_word)
-        if text == "title":
-            return title_list
-        elif text == "filter":
-            return filter_list
-
-    @classmethod
-    def before_day(cls):
-        publish_time_str_rule = (date.today() + timedelta(days=-30)).strftime("%Y-%m-%d %H:%M:%S")
-        publish_time_stamp_rule = int(time.mktime(time.strptime(publish_time_str_rule, "%Y-%m-%d %H:%M:%S")))
-        print(publish_time_str_rule)
-        print(publish_time_stamp_rule)
-
-    @classmethod
-    def insert_config(cls, log_type, crawler, env):
-        filter_sheet = Feishu.get_values_batch(log_type, crawler, "DjXfqG")
-        # title_sheet = Feishu.get_values_batch(log_type, crawler, "bHSW1p")
-        filter_list = []
-        # title_list = []
-        for x in filter_sheet:
-            for y in x:
-                if y is None:
-                    pass
-                else:
-                    filter_list.append(y)
-        # for x in title_sheet:
-        #     for y in x:
-        #         if y is None:
-        #             pass
-        #         else:
-        #             title_list.append(y)
-        # str_title = ','.join(title_list)
-        str_filter = ','.join(filter_list)
-        config_dict = {
-            # "title": str_title,
-            "filter": str_filter
-        }
-        str_config_dict = str(config_dict)
-        # print(f"config_dict:{config_dict}")
-        # print(f"str_config_dict:{str_config_dict}")
-        insert_sql = f""" insert into crawler_config(title, source, config) values("本山祝福小程序", "benshanzhufu", "{str_config_dict}") """
-        MysqlHelper.update_values(log_type, crawler, insert_sql, env)
-
-    @classmethod
-    def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env):
-        ganggangdouchuan_sheetid = ['070a67']
-        for sheetid in ganggangdouchuan_sheetid:
-            xiaoniangao_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            for i in range(1, len(xiaoniangao_sheet)):
-            # for i in range(1, 5):
-                if xiaoniangao_sheet[i][5] is None or xiaoniangao_sheet[i][7] is None:
-                    continue
-                video_id = xiaoniangao_sheet[i][11].replace("https://admin.piaoquantv.com/cms/post-detail/", "").replace(
-                    "/info", "")
-                if video_id == "None":
-                    continue
-                video_id = int(video_id)
-                out_user_id = "ganggangdouchuan"
-                platform = "刚刚都传"
-                strategy = "推荐榜爬虫策略"
-                video_title = str(xiaoniangao_sheet[i][7])
-                cover_url = str(xiaoniangao_sheet[i][12])
-                video_url = str(xiaoniangao_sheet[i][13])
-                crawler_rule = json.dumps({})
-                out_video_id = md5(video_title.encode('utf8')).hexdigest()
-
-                # print(f"video_id:{video_id}, type:{type(video_id)}")
-                # print(f"out_user_id:{out_user_id}, type:{type(out_user_id)}")
-                # print(f"platform:{platform}, type:{type(platform)}")
-                # print(f"strategy:{strategy}, type:{type(strategy)}")
-                # print(f"video_title:{video_title}, type:{type(video_title)}")
-                # print(f"cover_url:{cover_url}, type:{type(cover_url)}")
-                # print(f"video_url:{video_url}, type:{type(video_url)}")
-                # print(f"crawler_rule:{crawler_rule}, type:{type(crawler_rule)}")
-
-                select_sql = f""" select * from crawler_video where platform="{platform}" and video_url="{video_url}" """
-                Common.logger(log_type, crawler).info(f"select_sql:{select_sql}")
-                repeat_video = MysqlHelper.get_values(log_type, crawler, select_sql, env)
-                Common.logger(log_type, crawler).info(f"repeat_video:{repeat_video}")
-
-                if repeat_video is not None and len(repeat_video) != 0:
-                    Common.logger(log_type, crawler).info(f"{video_title} 已存在数据库中\n")
-                else:
-                    # 视频信息保存数据库
-                    insert_sql = f""" insert into crawler_video(video_id,
-                                        out_user_id,
-                                        platform,
-                                        strategy,
-                                        out_video_id,
-                                        video_title,
-                                        cover_url,
-                                        video_url,
-                                        crawler_rule)
-                                        values({video_id},
-                                        "{out_user_id}",
-                                        "{platform}",
-                                        "{strategy}",
-                                        "{out_video_id}",
-                                        "{video_title}",
-                                        "{cover_url}",
-                                        "{video_url}",
-                                        '{crawler_rule}') """
-                    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-                    MysqlHelper.update_values(log_type, crawler, insert_sql, env, action='')
-                    Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
-
-
-
-if __name__ == "__main__":
-    # Insert.insert_config("insert", "benshanzhufu", "dev")
-    # print(Insert.get_config("insert", "ganggangdouchuan", "filter", "dev"))
-    # Insert.insert_video_from_feishu_to_mysql("insert-dev", "ganggangdouchuan", "dev")
-    Insert.insert_video_from_feishu_to_mysql("insert-prod", "ganggangdouchuan", "prod")
-    pass

+ 0 - 24
gongzhonghao/gongzhonghao_author/demo.py

@@ -1,24 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/5/23
-import os
-import psutil
-
-
-class Demo:
-    @classmethod
-    def demo1(cls):
-        # cmd = "ps aux | grep Appium.app"
-        cmd = 'ps -ef | grep "run_gongzhonghao1_author_scheduling.py" | grep -v "grep"'
-        result = os.popen(cmd).read()
-        print(len(result))
-        if len(result) == 0:
-            print("yes")
-
-        # for process in psutil.process_iter():
-        #     # if "shipinhao" in process.name():
-        #     print(f"{process.pid}, {process.name()}")
-
-
-if __name__ == "__main__":
-    Demo.demo1()

+ 0 - 96
gongzhonghao/gongzhonghao_follow/insert_video.py

@@ -1,96 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/3/28
-import json
-import os
-import sys
-sys.path.append(os.getcwd())
-from common.common import Common
-from common.scheduling_db  import MysqlHelper
-from common.feishu import Feishu
-
-
-class Insert:
-    @classmethod
-    def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env, machine):
-        gongzhonghao_sheetid_list = ['47e39d']
-        for sheetid in gongzhonghao_sheetid_list:
-            gongzhonghao_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            for i in range(1, len(gongzhonghao_sheet)):
-            # for i in range(1, 3):
-                if gongzhonghao_sheet[i][5] is None or gongzhonghao_sheet[i][9] is None:
-                    continue
-                video_id = gongzhonghao_sheet[i][9].replace("https://admin.piaoquantv.com/cms/post-detail/", "").replace("/info", "")
-                if video_id == "None":
-                    continue
-                video_id = int(video_id)
-                out_user_id = str(gongzhonghao_sheet[i][14])
-                platform = "公众号"
-                strategy = "定向爬虫策略"
-                out_video_id = str(gongzhonghao_sheet[i][8])
-                video_title = str(gongzhonghao_sheet[i][7])
-                cover_url = str(gongzhonghao_sheet[i][16])
-                video_url = str(gongzhonghao_sheet[i][18])
-                duration = int(gongzhonghao_sheet[i][10])
-                publish_time = str(gongzhonghao_sheet[i][12]).replace("/", "-")
-                crawler_rule = json.dumps({"play_cnt": {"min": 0}, "duration": {"min": 20}, "publish_day": {"min": 0}})
-                width = int(gongzhonghao_sheet[i][11].split("*")[0])
-                height = int(gongzhonghao_sheet[i][11].split("*")[1])
-
-                # print(f"video_id:{video_id}, type:{type(video_id)}")
-                # print(f"out_user_id:{out_user_id}, type:{type(out_user_id)}")
-                # print(f"platform:{platform}, type:{type(platform)}")
-                # print(f"strategy:{strategy}, type:{type(strategy)}")
-                # print(f"out_video_id:{out_video_id}, type:{type(out_video_id)}")
-                # print(f"video_title:{video_title}, type:{type(video_title)}")
-                # print(f"cover_url:{cover_url}, type:{type(cover_url)}")
-                # print(f"video_url:{video_url}, type:{type(video_url)}")
-                # print(f"duration:{duration}, type:{type(duration)}")
-                # print(f"publish_time:{publish_time}, type:{type(publish_time)}")
-                # print(f"crawler_rule:{crawler_rule}, type:{type(crawler_rule)}")
-                # print(f"width:{width}, type:{type(width)}")
-                # print(f"height:{height}, type:{type(height)}\n")
-
-                select_sql = f""" select * from crawler_video where platform="{platform}" and out_video_id="{out_video_id}" """
-                Common.logger(log_type, crawler).info(f"select_sql:{select_sql}")
-                repeat_video = MysqlHelper.get_values(log_type, crawler, select_sql, env, machine)
-                Common.logger(log_type, crawler).info(f"repeat_video:{repeat_video}")
-
-                if repeat_video is not None and len(repeat_video) != 0:
-                    Common.logger(log_type, crawler).info(f"{video_title} 已存在数据库中\n")
-                else:
-                    # 视频信息保存数据库
-                    insert_sql = f""" insert into crawler_video(video_id,
-                                    out_user_id,
-                                    platform,
-                                    strategy,
-                                    out_video_id,
-                                    video_title,
-                                    cover_url,
-                                    video_url,
-                                    duration,
-                                    publish_time,
-                                    crawler_rule,
-                                    width,
-                                    height)
-                                    values({video_id},
-                                    "{out_user_id}",
-                                    "{platform}",
-                                    "{strategy}",
-                                    "{out_video_id}",
-                                    "{video_title}",
-                                    "{cover_url}",
-                                    "{video_url}",
-                                    {duration},
-                                    "{publish_time}",
-                                    '{crawler_rule}',
-                                    {width},
-                                    {height}) """
-                    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-                    MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine)
-                    Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
-
-
-if __name__ == "__main__":
-    Insert.insert_video_from_feishu_to_mysql("insert-prod", "gongzhonghao", "prod", "local")
-    pass

+ 0 - 2
gongzhonghao/gongzhonghao_main/run_gongzhonghao1_author_scheduling.py

@@ -4,8 +4,6 @@
 import argparse
 import os
 import sys
-import time
-
 sys.path.append(os.getcwd())
 from common.public import task_fun
 from common.common import Common

+ 0 - 10
gongzhonghao/gongzhonghao_main/run_gongzhonghao2_author_scheduling.py

@@ -4,7 +4,6 @@
 import argparse
 import os
 import sys
-import time
 sys.path.append(os.getcwd())
 from common.public import task_fun
 from common.common import Common
@@ -12,15 +11,6 @@ from gongzhonghao.gongzhonghao_author.gongzhonghao2_author import GongzhonghaoAu
 
 
 def main(log_type, crawler, task, env):
-    # while True:
-    #     cmd = 'ps -ef | grep "run_gongzhonghao2_author_scheduling.py" | grep -v "grep"'
-    #     result = os.popen(cmd).read()
-    #     Common.logger(log_type, crawler).info(f"len_result:{len(result)}")
-    #     if len(result) > 573:
-    #         Common.logger(log_type, crawler).info("公众号_2抓取未完成,无需启动新进程")
-    #         time.sleep(1)
-    #     else:
-    #         break
     task_dict = task_fun(task)['task_dict']
     rule_dict = task_fun(task)['rule_dict']
     Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")

+ 88 - 0
gongzhonghao/gongzhonghao_main/run_gzh1_author.py

@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/6
+import argparse
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.public import task_fun_mq, get_consumer, ack_message
+from common.common import Common
+from gongzhonghao.gongzhonghao_author.gongzhonghao1_author import GongzhonghaoAuthor1
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    if "gzh1" in topic_name:
+        log_type = "author1"
+    elif "gzh2" in topic_name:
+        log_type = "author2"
+    elif "gzh3" in topic_name:
+        log_type = "author3"
+    elif "gzh4" in topic_name:
+        log_type = "author4"
+    elif "gzh5" in topic_name:
+        log_type = "author5"
+    consumer = get_consumer(topic_name, group_id)
+    # Long polling: if the topic has no messages, the client request is held on the server for up to 3 seconds
+    # and returns as soon as a message becomes consumable.
+    # Long-polling wait time: 3 seconds (can be set up to 30 seconds).
+    wait_seconds = 3
+    # Consume at most 1 message per request (can be set up to 16).
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # Consume messages via long polling.
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # Handle the crawler business logic
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}\n")
+                Common.logger(log_type, crawler).info(f'开始抓取:{task_dict["taskName"]}\n')
+                GongzhonghaoAuthor1.get_all_videos(log_type=log_type,
+                                                    crawler=crawler,
+                                                    rule_dict=rule_dict,
+                                                    env=env)
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+        except MQExceptionBase as err:
+            # No message available in the topic.
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  # create the argument parser
+    parser.add_argument('--log_type', type=str)  # add argument, declaring its type
+    parser.add_argument('--crawler')  # add argument
+    parser.add_argument('--topic_name')  # add argument
+    parser.add_argument('--group_id')  # add argument
+    parser.add_argument('--env')  # add argument
+    args = parser.parse_args()  # parse the arguments; values can also be supplied from the terminal
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)
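The five `run_gzhN_author.py` entry points added in this commit differ only in which `GongzhonghaoAuthorN` class they import; the topic_name-to-log_type branching at the top of `main()` is identical in each. A compact equivalent of that mapping (a sketch, not part of the commit):

```python
# Equivalent to the if/elif chain above: derive "authorN" from a topic name containing "gzhN".
def resolve_log_type(topic_name, default_log_type):
    for i in range(1, 6):
        if f"gzh{i}" in topic_name:
            return f"author{i}"
    return default_log_type  # fall back to the --log_type argument


print(resolve_log_type("gzh3_author_prod", "author"))  # "author3" (hypothetical topic name)
```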

+ 88 - 0
gongzhonghao/gongzhonghao_main/run_gzh2_author.py

@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/6
+import argparse
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.public import task_fun_mq, get_consumer, ack_message
+from common.common import Common
+from gongzhonghao.gongzhonghao_author.gongzhonghao2_author import GongzhonghaoAuthor2
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    if "gzh1" in topic_name:
+        log_type = "author1"
+    elif "gzh2" in topic_name:
+        log_type = "author2"
+    elif "gzh3" in topic_name:
+        log_type = "author3"
+    elif "gzh4" in topic_name:
+        log_type = "author4"
+    elif "gzh5" in topic_name:
+        log_type = "author5"
+    consumer = get_consumer(topic_name, group_id)
+    # Long polling: if the topic has no messages, the client request is held on the server for up to 3 seconds
+    # and returns as soon as a message becomes consumable.
+    # Long-polling wait time: 3 seconds (can be set up to 30 seconds).
+    wait_seconds = 3
+    # Consume at most 1 message per request (can be set up to 16).
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # Consume messages via long polling.
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # Handle the crawler business logic
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}\n")
+                Common.logger(log_type, crawler).info(f'开始抓取:{task_dict["taskName"]}\n')
+                GongzhonghaoAuthor2.get_all_videos(log_type=log_type,
+                                                    crawler=crawler,
+                                                    rule_dict=rule_dict,
+                                                    env=env)
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+        except MQExceptionBase as err:
+            # No message available in the topic.
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  # create the argument parser
+    parser.add_argument('--log_type', type=str)  # add argument, declaring its type
+    parser.add_argument('--crawler')  # add argument
+    parser.add_argument('--topic_name')  # add argument
+    parser.add_argument('--group_id')  # add argument
+    parser.add_argument('--env')  # add argument
+    args = parser.parse_args()  # parse the arguments; values can also be supplied from the terminal
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)

+ 88 - 0
gongzhonghao/gongzhonghao_main/run_gzh3_author.py

@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/6
+import argparse
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.public import task_fun_mq, get_consumer, ack_message
+from common.common import Common
+from gongzhonghao.gongzhonghao_author.gongzhonghao3_author import GongzhonghaoAuthor3
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    if "gzh1" in topic_name:
+        log_type = "author1"
+    elif "gzh2" in topic_name:
+        log_type = "author2"
+    elif "gzh3" in topic_name:
+        log_type = "author3"
+    elif "gzh4" in topic_name:
+        log_type = "author4"
+    elif "gzh5" in topic_name:
+        log_type = "author5"
+    consumer = get_consumer(topic_name, group_id)
+    # Long polling: if the topic has no messages, the client request is held on the server for up to 3 seconds
+    # and returns as soon as a message becomes consumable.
+    # Long-polling wait time: 3 seconds (can be set up to 30 seconds).
+    wait_seconds = 3
+    # Consume at most 1 message per request (can be set up to 16).
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # Consume messages via long polling.
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # Process the crawl task
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}\n")
+                Common.logger(log_type, crawler).info(f'开始抓取:{task_dict["taskName"]}\n')
+                GongzhonghaoAuthor3.get_all_videos(log_type=log_type,
+                                                    crawler=crawler,
+                                                    rule_dict=rule_dict,
+                                                    env=env)
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+        except MQExceptionBase as err:
+            # No message in the topic to consume.
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  # create the argument parser
+    parser.add_argument('--log_type', type=str)  # add the argument and declare its type
+    parser.add_argument('--crawler')  # add the argument
+    parser.add_argument('--topic_name')  # add the argument
+    parser.add_argument('--group_id')  # add the argument
+    parser.add_argument('--env')  # add the argument
+    args = parser.parse_args()  # parse the arguments (they can also be supplied from the command line)
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)

+ 88 - 0
gongzhonghao/gongzhonghao_main/run_gzh4_author.py

@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/6
+import argparse
+import os
+import sys
+import time
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.public import task_fun_mq, get_consumer, ack_message
+from common.common import Common
+from gongzhonghao.gongzhonghao_author.gongzhonghao4_author import GongzhonghaoAuthor4
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    if "gzh1" in topic_name:
+        log_type = "author1"
+    elif "gzh2" in topic_name:
+        log_type = "author2"
+    elif "gzh3" in topic_name:
+        log_type = "author3"
+    elif "gzh4" in topic_name:
+        log_type = "author4"
+    elif "gzh5" in topic_name:
+        log_type = "author5"
+    consumer = get_consumer(topic_name, group_id)
+    # Long polling: if the topic has no message, the request is held on the server for 3 seconds
+    # and returns as soon as a message becomes consumable. Wait time: 3 seconds (up to 30 seconds allowed).
+    wait_seconds = 3
+    # Consume at most 1 message per request (up to 16 allowed).
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # Consume messages via long polling.
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # Process the crawl task
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}\n")
+                Common.logger(log_type, crawler).info(f'开始抓取:{task_dict["taskName"]}\n')
+                GongzhonghaoAuthor4.get_all_videos(log_type=log_type,
+                                                    crawler=crawler,
+                                                    rule_dict=rule_dict,
+                                                    env=env)
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+        except MQExceptionBase as err:
+            # No message in the topic to consume.
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  # create the argument parser
+    parser.add_argument('--log_type', type=str)  # add the argument and declare its type
+    parser.add_argument('--crawler')  # add the argument
+    parser.add_argument('--topic_name')  # add the argument
+    parser.add_argument('--group_id')  # add the argument
+    parser.add_argument('--env')  # add the argument
+    args = parser.parse_args()  # parse the arguments (they can also be supplied from the command line)
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)

+ 88 - 0
gongzhonghao/gongzhonghao_main/run_gzh5_author.py

@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/6
+import argparse
+import os
+import sys
+import time
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.public import task_fun_mq, get_consumer, ack_message
+from common.common import Common
+from gongzhonghao.gongzhonghao_author.gongzhonghao5_author import GongzhonghaoAuthor5
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    if "gzh1" in topic_name:
+        log_type = "author1"
+    elif "gzh2" in topic_name:
+        log_type = "author2"
+    elif "gzh3" in topic_name:
+        log_type = "author3"
+    elif "gzh4" in topic_name:
+        log_type = "author4"
+    elif "gzh5" in topic_name:
+        log_type = "author5"
+    consumer = get_consumer(topic_name, group_id)
+    # Long polling: if the topic has no message, the request is held on the server for 3 seconds
+    # and returns as soon as a message becomes consumable. Wait time: 3 seconds (up to 30 seconds allowed).
+    wait_seconds = 3
+    # Consume at most 1 message per request (up to 16 allowed).
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # Consume messages via long polling.
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # Process the crawl task
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}\n")
+                Common.logger(log_type, crawler).info(f'开始抓取:{task_dict["taskName"]}\n')
+                GongzhonghaoAuthor5.get_all_videos(log_type=log_type,
+                                                    crawler=crawler,
+                                                    rule_dict=rule_dict,
+                                                    env=env)
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+        except MQExceptionBase as err:
+            # No message in the topic to consume.
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  # create the argument parser
+    parser.add_argument('--log_type', type=str)  # add the argument and declare its type
+    parser.add_argument('--crawler')  # add the argument
+    parser.add_argument('--topic_name')  # add the argument
+    parser.add_argument('--group_id')  # add the argument
+    parser.add_argument('--env')  # add the argument
+    args = parser.parse_args()  # parse the arguments (they can also be supplied from the command line)
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)

+ 0 - 161
jixiangxingfu/jixiangxingfu_recommend/insert.py

@@ -1,161 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/4/17
-import json
-import os
-import sys
-import time
-from datetime import date, timedelta
-from hashlib import md5
-
-sys.path.append(os.getcwd())
-from common.common import Common
-from common.feishu import Feishu
-from common.scheduling_db import MysqlHelper
-
-
-class Insert:
-    @classmethod
-    def get_config(cls, log_type, crawler, text, env):
-        select_sql = f"""select * from crawler_config where source="benshanzhufu" """
-        contents = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
-        title_list = []
-        filter_list = []
-        for content in contents:
-            config = content['config']
-            config_dict = eval(config)
-            for k, v in config_dict.items():
-                if k == "title":
-                    title_list_config = v.split(",")
-                    for title in title_list_config:
-                        title_list.append(title)
-                if k == "filter":
-                    filter_list_config = v.split(",")
-                    for filter_word in filter_list_config:
-                        filter_list.append(filter_word)
-        if text == "title":
-            return title_list
-        elif text == "filter":
-            return filter_list
-
-    @classmethod
-    def before_day(cls):
-        publish_time_str_rule = (date.today() + timedelta(days=-30)).strftime("%Y-%m-%d %H:%M:%S")
-        publish_time_stamp_rule = int(time.mktime(time.strptime(publish_time_str_rule, "%Y-%m-%d %H:%M:%S")))
-        print(publish_time_str_rule)
-        print(publish_time_stamp_rule)
-
-    @classmethod
-    def insert_config(cls, log_type, crawler, env):
-        filter_sheet = Feishu.get_values_batch(log_type, crawler, "DjXfqG")
-        # title_sheet = Feishu.get_values_batch(log_type, crawler, "bHSW1p")
-        filter_list = []
-        # title_list = []
-        for x in filter_sheet:
-            for y in x:
-                if y is None:
-                    pass
-                else:
-                    filter_list.append(y)
-        # for x in title_sheet:
-        #     for y in x:
-        #         if y is None:
-        #             pass
-        #         else:
-        #             title_list.append(y)
-        # str_title = ','.join(title_list)
-        str_filter = ','.join(filter_list)
-        config_dict = {
-            # "title": str_title,
-            "filter": str_filter
-        }
-        str_config_dict = str(config_dict)
-        # print(f"config_dict:{config_dict}")
-        # print(f"str_config_dict:{str_config_dict}")
-        insert_sql = f""" insert into crawler_config(title, source, config) values("本山祝福小程序", "benshanzhufu", "{str_config_dict}") """
-        MysqlHelper.update_values(log_type, crawler, insert_sql, env)
-
-    @classmethod
-    def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env):
-        jixiangxingfu_sheetid = ['d9e9b1']
-        for sheetid in jixiangxingfu_sheetid:
-            xiaoniangao_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            for i in range(1, len(xiaoniangao_sheet)):
-            # for i in range(1, 5):
-                if xiaoniangao_sheet[i][5] is None or xiaoniangao_sheet[i][7] is None:
-                    continue
-                video_id = xiaoniangao_sheet[i][12].replace("https://admin.piaoquantv.com/cms/post-detail/", "").replace(
-                    "/info", "")
-                if video_id == "None":
-                    continue
-                video_id = int(video_id)
-                out_user_id = "jixiangxingfu"
-                platform = "吉祥幸福"
-                strategy = "推荐榜爬虫策略"
-                video_title = str(xiaoniangao_sheet[i][7])
-                play_cnt = int(xiaoniangao_sheet[i][9].split("万")[0])*10000
-                duration = str(xiaoniangao_sheet[i][10])
-                width = int(xiaoniangao_sheet[i][11].split("*")[0])
-                height = int(xiaoniangao_sheet[i][11].split("*")[1])
-                cover_url = str(xiaoniangao_sheet[i][13])
-                video_url = str(xiaoniangao_sheet[i][14])
-                crawler_rule = json.dumps({})
-                out_video_id = md5(video_title.encode('utf8')).hexdigest()
-
-                # print(f"video_id:{video_id}, type:{type(video_id)}")
-                # print(f"out_user_id:{out_user_id}, type:{type(out_user_id)}")
-                # print(f"platform:{platform}, type:{type(platform)}")
-                # print(f"strategy:{strategy}, type:{type(strategy)}")
-                # print(f"video_title:{video_title}, type:{type(video_title)}")
-                # print(f"cover_url:{cover_url}, type:{type(cover_url)}")
-                # print(f"video_url:{video_url}, type:{type(video_url)}")
-                # print(f"crawler_rule:{crawler_rule}, type:{type(crawler_rule)}")
-
-                select_sql = f""" select * from crawler_video where platform="{platform}" and video_url="{video_url}" """
-                Common.logger(log_type, crawler).info(f"select_sql:{select_sql}")
-                repeat_video = MysqlHelper.get_values(log_type, crawler, select_sql, env)
-                Common.logger(log_type, crawler).info(f"repeat_video:{repeat_video}")
-
-                if repeat_video is not None and len(repeat_video) != 0:
-                    Common.logger(log_type, crawler).info(f"{video_title} 已存在数据库中\n")
-                else:
-                    # 视频信息保存数据库
-                    insert_sql = f""" insert into crawler_video(video_id,
-                                        out_user_id,
-                                        platform,
-                                        strategy,
-                                        out_video_id,
-                                        video_title,
-                                        cover_url,
-                                        video_url,
-                                        duration,
-                                        play_cnt,
-                                        crawler_rule,
-                                        width,
-                                        height)
-                                        values({video_id},
-                                        "{out_user_id}",
-                                        "{platform}",
-                                        "{strategy}",
-                                        "{out_video_id}",
-                                        "{video_title}",
-                                        "{cover_url}",
-                                        "{video_url}",
-                                        {duration},
-                                        {play_cnt},
-                                        '{crawler_rule}',
-                                        {width},
-                                        {height}) """
-                    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-                    MysqlHelper.update_values(log_type, crawler, insert_sql, env, action='')
-                    Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
-
-
-
-if __name__ == "__main__":
-    # Insert.insert_config("insert", "benshanzhufu", "dev")
-    # print(Insert.get_config("insert", "ganggangdouchuan", "filter", "dev"))
-    # Insert.insert_video_from_feishu_to_mysql("insert-dev", "ganggangdouchuan", "dev")
-    # Insert.insert_video_from_feishu_to_mysql("insert-dev", "jixiangxingfu", "dev")
-    Insert.insert_video_from_feishu_to_mysql("insert-prod", "jixiangxingfu", "prod")
-    pass
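
The deleted insert.py above parsed the `crawler_config.config` column with `eval()`. If that pattern is reintroduced elsewhere, a safer sketch is to read the stored dict literal with `ast.literal_eval` (or `json.loads` when the column holds real JSON); the helper name below is hypothetical and the column format follows the removed code:

```
# Sketch only: parse a config column such as "{'filter': 'a,b,c'}" without
# eval(). Returns the comma-separated values stored under the given key.
import ast


def split_config(config_str, key):
    config_dict = ast.literal_eval(config_str)
    value = config_dict.get(key, "")
    return [item for item in value.split(",") if item]
```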

+ 1 - 1
kuaishou/kuaishou_author/kuaishou_author_scheduling.py

@@ -82,7 +82,7 @@ class KuaishouauthorScheduling:
     def get_videoList(cls, log_type, crawler, user_dict, rule_dict, env):
         pcursor = ""
         while True:
-            Common.logger(log_type, crawler).info(f'cookie:{cls.get_cookie(log_type, crawler, env)["cookie"]}')
+            # Common.logger(log_type, crawler).info(f'cookie:{cls.get_cookie(log_type, crawler, env)["cookie"]}')
             url = "https://www.kuaishou.com/graphql"
             payload = json.dumps({
                 "operationName": "visionProfilePhotoList",

+ 0 - 216
kuaishou/kuaishou_follow/insert_videos.py

@@ -1,216 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/2/27
-import json
-import os
-import random
-import string
-import sys
-import time
-
-sys.path.append(os.getcwd())
-from common.common import Common
-from common.db import MysqlHelper
-from common.feishu import Feishu
-
-
-class Insert:
-    @classmethod
-    def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env, machine):
-        kuaishou_sheetid_list = ["fYdA8F", "3cd128", "31kOdu"]
-        for sheetid in kuaishou_sheetid_list:
-            kuaishou_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            # Common.logger(log_type, crawler).info(f"kuaishou_sheet:{kuaishou_sheet}")
-            for i in range(1, len(kuaishou_sheet)):
-                # for i in range(1, 3):
-                if kuaishou_sheet[i][5] is None:
-                    continue
-                if kuaishou_sheet[i][9] is None:
-                    video_id = int(time.time())
-                else:
-                    video_id = kuaishou_sheet[i][9].replace("https://admin.piaoquantv.com/cms/post-detail/",
-                                                            "").replace("/info", "")
-                if video_id == "None":
-                    continue
-                video_id = int(video_id)
-                user_id = 0
-                out_user_id = str(kuaishou_sheet[i][18])
-                platform = "快手"
-                strategy = "定向爬虫策略"
-                out_video_id = str(kuaishou_sheet[i][7])
-                video_title = str(kuaishou_sheet[i][8])
-                cover_url = str(kuaishou_sheet[i][20])
-                video_url = str(kuaishou_sheet[i][21])
-                duration = int(kuaishou_sheet[i][14])
-                publish_time = str(kuaishou_sheet[i][16]).replace("/", "-")
-                play_cnt = int(kuaishou_sheet[i][10])
-                like_cnt = int(kuaishou_sheet[i][12])
-                share_cnt = int(kuaishou_sheet[i][13])
-                # collection_cnt = 0
-                comment_cnt = int(kuaishou_sheet[i][11])
-                crawler_rule = json.dumps(
-                    {"play_cnt": 5000, "comment_cnt": 0, "like_cnt": 5000, "share_cnt": 1000, "duration": 40,
-                     "publish_time": 7, "video_width": 0, "video_height": 0})
-                width = int(kuaishou_sheet[i][15].split("*")[0])
-                height = int(kuaishou_sheet[i][15].split("*")[1])
-
-                # print(f"video_id:{video_id}, type:{type(video_id)}")
-                # print(f"user_id:{user_id}, type:{type(user_id)}")
-                # print(f"out_user_id:{out_user_id}, type:{type(out_user_id)}")
-                # print(f"platform:{platform}, type:{type(platform)}")
-                # print(f"strategy:{strategy}, type:{type(strategy)}")
-                # print(f"out_video_id:{out_video_id}, type:{type(out_video_id)}")
-                # print(f"video_title:{video_title}, type:{type(video_title)}")
-                # print(f"cover_url:{cover_url}, type:{type(cover_url)}")
-                # print(f"video_url:{video_url}, type:{type(video_url)}")
-                # print(f"duration:{duration}, type:{type(duration)}")
-                # print(f"publish_time:{publish_time}, type:{type(publish_time)}")
-                # print(f"play_cnt:{play_cnt}, type:{type(play_cnt)}")
-                # print(f"like_cnt:{like_cnt}, type:{type(like_cnt)}")
-                # print(f"share_cnt:{share_cnt}, type:{type(share_cnt)}")
-                # print(f"collection_cnt:{collection_cnt}, type:{type(collection_cnt)}")
-                # print(f"comment_cnt:{comment_cnt}, type:{type(comment_cnt)}")
-                # print(f"crawler_rule:{crawler_rule}, type:{type(crawler_rule)}")
-                # print(f"width:{width}, type:{type(width)}")
-                # print(f"height:{height}, type:{type(height)}\n")
-
-                select_sql = f""" select * from crawler_video where platform="{platform}" and out_video_id="{out_video_id}" """
-                # Common.logger(log_type, crawler).info(f"select_sql:{select_sql}")
-                repeat_video = MysqlHelper.get_values(log_type, crawler, select_sql, env, machine)
-                Common.logger(log_type, crawler).info(f"repeat_video:{repeat_video}")
-
-                repeat_video_id_sql = f""" select * from crawler_video where out_video_id="{out_video_id}" """
-                repeat_video_id = MysqlHelper.get_values(log_type, crawler, repeat_video_id_sql, env, machine)
-                Common.logger(log_type, crawler).info(f"repeat_video_id:{repeat_video_id}")
-
-                if repeat_video is not None and len(repeat_video) != 0:
-                    Common.logger(log_type, crawler).info(f"{video_title} 已存在数据库中\n")
-                elif repeat_video_id is not None and len(repeat_video_id) != 0:
-                    Common.logger(log_type, crawler).info(f"开始更新视频信息\n")
-                    update_sql = f""" UPDATE crawler_video SET
-                                    user_id={user_id},
-                                    out_user_id="{out_user_id}",
-                                    platform="{platform}",
-                                    strategy="{strategy}",
-                                    out_video_id="{out_video_id}",
-                                    video_title="{video_title}",
-                                    cover_url="{cover_url}",
-                                    video_url="{video_url}",
-                                    duration={duration},
-                                    publish_time="{publish_time}",
-                                    play_cnt={play_cnt},
-                                    like_cnt={like_cnt},
-                                    share_cnt={share_cnt},
-                                    comment_cnt={comment_cnt},
-                                    crawler_rule='{crawler_rule}',
-                                    width={width},
-                                    height={height}
-                                    WHERE video_id={video_id}
-                                    """
-                    Common.logger(log_type, crawler).info(f"update_sql:{update_sql}")
-                    MysqlHelper.update_values(log_type, crawler, update_sql, env, machine)
-                    Common.logger(log_type, crawler).info('视频信息更新成功!\n')
-                else:
-                    # 视频信息保存数据库
-                    insert_sql = f""" insert into crawler_video(video_id,
-                                    user_id,
-                                    out_user_id,
-                                    platform,
-                                    strategy,
-                                    out_video_id,
-                                    video_title,
-                                    cover_url,
-                                    video_url,
-                                    duration,
-                                    publish_time,
-                                    play_cnt,
-                                    like_cnt,
-                                    share_cnt,
-                                    comment_cnt,
-                                    crawler_rule,
-                                    width,
-                                    height)
-                                    values({video_id},
-                                    {user_id},
-                                    "{out_user_id}",
-                                    "{platform}",
-                                    "{strategy}",
-                                    "{out_video_id}",
-                                    "{video_title}",
-                                    "{cover_url}",
-                                    "{video_url}",
-                                    {duration},
-                                    "{publish_time}",
-                                    {play_cnt},
-                                    {like_cnt},
-                                    {share_cnt},
-                                    {comment_cnt},
-                                    '{crawler_rule}',
-                                    {width},
-                                    {height}) """
-                    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-                    MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine)
-                    Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
-
-    @classmethod
-    def get_sheet(cls):
-        sheet = Feishu.get_values_batch("insert", "kuaishou", "fYdA8F")
-        print(sheet)
-
-    @classmethod
-    def random_out_uid(cls):
-        did = "web_e2901e1c5a13c60af81ba88bc7a3ee24"
-        userId = "1921947321"
-        did = "web_e2901e1c5a13c60af81ba88bc7a3ee24"
-        userId = "3352428474"
-        src_digits = string.digits  # string_数字
-        src_uppercase = string.ascii_uppercase  # string_大写字母
-        src_lowercase = string.ascii_lowercase  # string_小写字母
-        # 10位随机数的方法
-        userId = ''.join(str(random.choice(range(1, 10))) for _ in range(10))
-        print(type(userId))
-        print(userId)
-        # 生成5位随机字符,包括大小写字母和数字
-        a_str = ''.join(random.sample(string.ascii_letters + string.digits, 5))
-        out_uid = ''.join(random.sample(string.digits, 10))
-        print(type(out_uid))
-        print(out_uid)
-
-    @classmethod
-    def random_cookies(cls):
-        kuaishou_server_web_st="ChZrdWFpc2hvdS5zZXJ2ZXIud2ViLnN0EqABaRXtfRHlzKlQVj0Nm" \
-                               "_M1G2wrIN1p6g3UTwfqfez6rkLVj6mPNt3RBAsLkyemMpvTLerPw0h41Q0lowqcImvIv5dlSGDEpQoj" \
-                               "-VTAmOR2Suzm8vCRakG7XziAWyI0PXJKhvdXms" \
-                               "-9Giy" \
-                               "_4TnoniB49Oo3m7qXjXVBCzybcWS5BO90OLkhD30GYmGEnBBvkBI2oErJy3mNbafQdBQ6SxSUHhoS" \
-                               "-1Rj5" \
-                               "-IBBNoxoIePYcxZFs4oIiCvaT7sRn" \
-                               "-zrF7X2ClPhfNh6lgClmH8MUjXszUfY_TPLCgFMAE"
-        kuaishou_server_web_ph="1b62b98fc28bc23a42cd85240e1fd6025983"
-        kuaishou_server_web_st_1 = ''.join(random.sample(string.ascii_letters + string.digits, 53))
-        kuaishou_server_web_st_2 = ''.join(random.sample(string.ascii_letters + string.digits, 58))+''.join(random.sample(string.ascii_letters + string.digits, 20))
-        kuaishou_server_web_st_3 = ''.join(random.sample(string.ascii_letters + string.digits, 37))
-        kuaishou_server_web_st_4 = ''.join(random.sample(string.ascii_letters + string.digits, 4))
-        kuaishou_server_web_st_5 = ''.join(random.sample(string.ascii_letters + string.digits, 56))+''.join(random.sample(string.ascii_letters + string.digits, 20))
-        kuaishou_server_web_st_6 = ''.join(random.sample(string.ascii_letters + string.digits, 4))
-        kuaishou_server_web_st_7 = ''.join(random.sample(string.ascii_letters + string.digits, 28))
-        kuaishou_server_web_st_8 = ''.join(random.sample(string.ascii_letters + string.digits, 40))
-        kuaishou_server_web_st = f"{kuaishou_server_web_st_1}" \
-                                 f"_{kuaishou_server_web_st_2}" \
-                                 f"-{kuaishou_server_web_st_3}" \
-                                 f"-{kuaishou_server_web_st_4}" \
-                                 f"_{kuaishou_server_web_st_5}" \
-                                 f"-{kuaishou_server_web_st_6}" \
-                                 f"-{kuaishou_server_web_st_7}" \
-                                 f"-{kuaishou_server_web_st_8}"
-
-        kuaishou_server_web_ph = ''.join(random.sample(string.ascii_letters + string.digits, 36))
-        print(f"kuaishou_server_web_st:{kuaishou_server_web_st}")
-        print(f"kuaishou_server_web_ph:{kuaishou_server_web_ph}")
-
-if __name__ == "__main__":
-    # Insert.insert_video_from_feishu_to_mysql("insert-prod", "kuaishou", "prod", "local")
-    # Insert.get_sheet()
-    # Insert.random_out_uid()
-    Insert.random_cookies()
-    pass
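
The one-off backfill scripts removed here and above build their `INSERT`/`UPDATE` statements by interpolating spreadsheet values directly into the SQL text. As a hedged alternative sketch, not the repo's `MysqlHelper` API, a parameterized insert over a plain `pymysql` connection keeps quotes or emoji in `video_title` from breaking the statement:

```
# Sketch only: parameterized insert into crawler_video. The column list
# mirrors the removed script; the connection is assumed to be created by
# the caller via pymysql.connect(...).
import pymysql


def insert_video(conn, video):
    sql = (
        "insert into crawler_video "
        "(video_id, user_id, out_user_id, platform, strategy, out_video_id, "
        "video_title, cover_url, video_url, duration, publish_time, "
        "play_cnt, like_cnt, share_cnt, comment_cnt, crawler_rule, width, height) "
        "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )
    with conn.cursor() as cursor:
        cursor.execute(sql, tuple(video[k] for k in (
            "video_id", "user_id", "out_user_id", "platform", "strategy",
            "out_video_id", "video_title", "cover_url", "video_url",
            "duration", "publish_time", "play_cnt", "like_cnt", "share_cnt",
            "comment_cnt", "crawler_rule", "width", "height")))
    conn.commit()
```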

+ 0 - 596
kuaishou/kuaishou_follow/kuaishou_follow_scheduling.py

@@ -1,596 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: lierqiang
-# @Time: 2023/2/24
-import os
-import random
-import shutil
-import sys
-import time
-from hashlib import md5
-
-import requests
-import json
-
-import urllib3
-from requests.adapters import HTTPAdapter
-
-sys.path.append(os.getcwd())
-from common.common import Common
-from common.feishu import Feishu
-from common.getuser import getUser
-# from common.db import MysqlHelper
-from common.scheduling_db import MysqlHelper
-from common.publish import Publish
-from common.public import random_title, get_config_from_mysql
-from common.public import get_user_from_mysql
-from common.userAgent import get_random_user_agent
-
-
-class KuaiShouFollowScheduling:
-    platform = "快手"
-    tag = "快手爬虫,定向爬虫策略"
-
-    @classmethod
-    def get_rule(cls, log_type, crawler, index):
-        try:
-            rule_sheet = Feishu.get_values_batch(log_type, crawler, "3iqG4z")
-            if index == 1:
-                rule_dict = {
-                    "play_cnt": f"{rule_sheet[1][1]}{rule_sheet[1][2]}",
-                    "video_width": f"{rule_sheet[2][1]}{rule_sheet[2][2]}",
-                    "video_height": f"{rule_sheet[3][1]}{rule_sheet[3][2]}",
-                    "like_cnt": f"{rule_sheet[4][1]}{rule_sheet[4][2]}",
-                    "duration": f"{rule_sheet[5][1]}{rule_sheet[5][2]}",
-                    "download_cnt": f"{rule_sheet[6][1]}{rule_sheet[6][2]}",
-                    "publish_time": f"{rule_sheet[7][1]}{rule_sheet[7][2]}",
-                }
-                # for k, v in rule_dict.items():
-                #     Common.logger(log_type, crawler).info(f"{k}:{v}")
-                return rule_dict
-            elif index == 2:
-                rule_dict = {
-                    "play_cnt": f"{rule_sheet[9][1]}{rule_sheet[9][2]}",
-                    "video_width": f"{rule_sheet[10][1]}{rule_sheet[10][2]}",
-                    "video_height": f"{rule_sheet[11][1]}{rule_sheet[11][2]}",
-                    "like_cnt": f"{rule_sheet[12][1]}{rule_sheet[12][2]}",
-                    "duration": f"{rule_sheet[13][1]}{rule_sheet[13][2]}",
-                    "download_cnt": f"{rule_sheet[14][1]}{rule_sheet[14][2]}",
-                    "publish_time": f"{rule_sheet[15][1]}{rule_sheet[15][2]}",
-                }
-                # for k, v in rule_dict.items():
-                #     Common.logger(log_type, crawler).info(f"{k}:{v}")
-                return rule_dict
-        except Exception as e:
-            Common.logger(log_type, crawler).error(f"get_rule:{e}\n")
-
-    @classmethod
-    def download_rule(cls, video_dict, rule_dict):
-        if video_dict['like_cnt'] >= rule_dict['like_cnt']['min']:
-            if video_dict['publish_time'] >= rule_dict['publish_time']['min']:
-                if video_dict['duration'] >= rule_dict['duration']['min']:
-                    if video_dict['video_width'] >= rule_dict['width']['min'] \
-                            or video_dict['video_height'] >= rule_dict['height']['min']:
-                        return True
-                    else:
-                        return False
-                else:
-                    return False
-            else:
-                return False
-        else:
-            return False
-
-    # 过滤词库
-    @classmethod
-    def filter_words(cls, log_type, crawler):
-        try:
-            while True:
-                filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'HIKVvs')
-                if filter_words_sheet is None:
-                    Common.logger(log_type, crawler).warning(f"filter_words_sheet:{filter_words_sheet} 10秒钟后重试")
-                    continue
-                filter_words_list = []
-                for x in filter_words_sheet:
-                    for y in x:
-                        if y is None:
-                            pass
-                        else:
-                            filter_words_list.append(y)
-                return filter_words_list
-        except Exception as e:
-            Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
-
-    # 获取站外用户信息
-    @classmethod
-    def get_out_user_info(cls, log_type, crawler, out_uid):
-        try:
-            url = "https://www.kuaishou.com/graphql"
-
-            payload = json.dumps({
-                "operationName": "visionProfile",
-                "variables": {
-                    "userId": out_uid
-                },
-                "query": "query visionProfile($userId: String) {\n  visionProfile(userId: $userId) {\n    result\n    hostName\n    userProfile {\n      ownerCount {\n        fan\n        photo\n        follow\n        photo_public\n        __typename\n      }\n      profile {\n        gender\n        user_name\n        user_id\n        headurl\n        user_text\n        user_profile_bg_url\n        __typename\n      }\n      isFollowing\n      __typename\n    }\n    __typename\n  }\n}\n"
-            })
-            headers = {
-                'Accept': '*/*',
-                'Content-Type': 'application/json',
-                'Origin': 'https://www.kuaishou.com',
-                'Cookie': 'kpf=PC_WEB; clientid=3; did=web_921138a59a2c2a70a89fbf0e2d2db6d8; kpn=KUAISHOU_VISION',
-                'Content-Length': '552',
-                'Accept-Language': 'zh-CN,zh-Hans;q=0.9',
-                'Host': 'www.kuaishou.com',
-                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15',
-                'Referer': 'https://www.kuaishou.com/profile/{}'.format(out_uid),
-                'Accept-Encoding': 'gzip, deflate, br',
-                'Connection': 'keep-alive'
-            }
-            urllib3.disable_warnings()
-            s = requests.session()
-            # max_retries=3 重试3次
-            s.mount('http://', HTTPAdapter(max_retries=3))
-            s.mount('https://', HTTPAdapter(max_retries=3))
-            response = s.post(url=url, headers=headers, data=payload, proxies=Common.tunnel_proxies(), verify=False,
-                              timeout=5)
-            response.close()
-            # Common.logger(log_type, crawler).info(f"get_out_user_info_response:{response.text}")
-            if response.status_code != 200:
-                Common.logger(log_type, crawler).warning(f"get_out_user_info_response:{response.text}\n")
-                return
-            elif 'data' not in response.json():
-                Common.logger(log_type, crawler).warning(f"get_out_user_info_response:{response.json()}\n")
-                return
-            elif 'visionProfile' not in response.json()['data']:
-                Common.logger(log_type, crawler).warning(f"get_out_user_info_response:{response.json()['data']}\n")
-                return
-            elif 'userProfile' not in response.json()['data']['visionProfile']:
-                Common.logger(log_type, crawler).warning(
-                    f"get_out_user_info_response:{response.json()['data']['visionProfile']['userProfile']}\n")
-                return
-            else:
-                userProfile = response.json()['data']['visionProfile']['userProfile']
-                # Common.logger(log_type, crawler).info(f"userProfile:{userProfile}")
-
-                try:
-                    out_fans_str = str(userProfile['ownerCount']['fan'])
-                except Exception:
-                    out_fans_str = "0"
-
-                try:
-                    out_follow_str = str(userProfile['ownerCount']['follow'])
-                except Exception:
-                    out_follow_str = "0"
-
-                try:
-                    out_avatar_url = userProfile['profile']['headurl']
-                except Exception:
-                    out_avatar_url = ""
-
-                Common.logger(log_type, crawler).info(f"out_fans_str:{out_fans_str}")
-                Common.logger(log_type, crawler).info(f"out_follow_str:{out_follow_str}")
-                Common.logger(log_type, crawler).info(f"out_avatar_url:{out_avatar_url}")
-
-                if "万" in out_fans_str:
-                    out_fans = int(float(out_fans_str.split("万")[0]) * 10000)
-                else:
-                    out_fans = int(out_fans_str.replace(",", ""))
-                if "万" in out_follow_str:
-                    out_follow = int(float(out_follow_str.split("万")[0]) * 10000)
-                else:
-                    out_follow = int(out_follow_str.replace(",", ""))
-
-                out_user_dict = {
-                    "out_fans": out_fans,
-                    "out_follow": out_follow,
-                    "out_avatar_url": out_avatar_url
-                }
-                Common.logger(log_type, crawler).info(f"out_user_dict:{out_user_dict}")
-                return out_user_dict
-        except Exception as e:
-            Common.logger(log_type, crawler).error(f"get_out_user_info:{e}\n")
-
-    # 获取用户信息列表
-    @classmethod
-    def get_user_list(cls, log_type, crawler, sheetid, env):
-        try:
-            while True:
-                user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-                if user_sheet is None:
-                    Common.logger(log_type, crawler).warning(f"user_sheet:{user_sheet} 10秒钟后重试")
-                    continue
-                our_user_list = []
-                for i in range(1, len(user_sheet)):
-                    # for i in range(1, 2):
-                    out_uid = user_sheet[i][2]
-                    user_name = user_sheet[i][3]
-                    our_uid = user_sheet[i][6]
-                    our_user_link = user_sheet[i][7]
-                    if out_uid is None or user_name is None:
-                        Common.logger(log_type, crawler).info("空行\n")
-                    else:
-                        Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
-                        if our_uid is None:
-                            out_user_info = cls.get_out_user_info(log_type, crawler, out_uid)
-                            out_user_dict = {
-                                "out_uid": out_uid,
-                                "user_name": user_name,
-                                "out_avatar_url": out_user_info["out_avatar_url"],
-                                "out_create_time": '',
-                                "out_tag": '',
-                                "out_play_cnt": 0,
-                                "out_fans": out_user_info["out_fans"],
-                                "out_follow": out_user_info["out_follow"],
-                                "out_friend": 0,
-                                "out_like": 0,
-                                "platform": cls.platform,
-                                "tag": cls.tag,
-                            }
-                            our_user_dict = getUser.create_user(log_type=log_type, crawler=crawler,
-                                                                out_user_dict=out_user_dict, env=env)
-                            our_uid = our_user_dict['our_uid']
-                            our_user_link = our_user_dict['our_user_link']
-                            Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}',
-                                                 [[our_uid, our_user_link]])
-                            Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
-                            our_user_list.append(our_user_dict)
-                        else:
-                            our_user_dict = {
-                                'out_uid': out_uid,
-                                'user_name': user_name,
-                                'our_uid': our_uid,
-                                'our_user_link': our_user_link,
-                            }
-                            our_user_list.append(our_user_dict)
-                return our_user_list
-        except Exception as e:
-            Common.logger(log_type, crawler).error(f'get_user_list:{e}\n')
-
-    # 处理视频标题
-    @classmethod
-    def video_title(cls, log_type, crawler, env, title):
-        title_split1 = title.split(" #")
-        if title_split1[0] != "":
-            title1 = title_split1[0]
-        else:
-            title1 = title_split1[-1]
-
-        title_split2 = title1.split(" #")
-        if title_split2[0] != "":
-            title2 = title_split2[0]
-        else:
-            title2 = title_split2[-1]
-
-        title_split3 = title2.split("@")
-        if title_split3[0] != "":
-            title3 = title_split3[0]
-        else:
-            title3 = title_split3[-1]
-
-        video_title = title3.strip().replace("\n", "") \
-                          .replace("/", "").replace("快手", "").replace(" ", "") \
-                          .replace(" ", "").replace("&NBSP", "").replace("\r", "") \
-                          .replace("#", "").replace(".", "。").replace("\\", "") \
-                          .replace(":", "").replace("*", "").replace("?", "") \
-                          .replace("?", "").replace('"', "").replace("<", "") \
-                          .replace(">", "").replace("|", "").replace("@", "").replace('"', '').replace("'", '')[:40]
-        if video_title.replace(" ", "") == "" or video_title == "。。。" or video_title == "...":
-            return random_title(log_type, crawler, env, text='title')
-        else:
-            return video_title
-
-    @classmethod
-    def get_videoList(cls, log_type, crawler, strategy, task, our_uid, out_uid, oss_endpoint, env, pcursor=""):
-        rule_dict_1 = task['rule_dict']
-        url = "https://www.kuaishou.com/graphql"
-        payload = json.dumps({
-            "operationName": "visionProfilePhotoList",
-            "variables": {
-                "userId": out_uid,
-                "pcursor": "",
-                "page": "profile"
-            },
-            "query": "fragment photoContent on PhotoEntity {\n  id\n  duration\n  caption\n  originCaption\n  likeCount\n  viewCount\n  commentCount\n  realLikeCount\n  coverUrl\n  photoUrl\n  photoH265Url\n  manifest\n  manifestH265\n  videoResource\n  coverUrls {\n    url\n    __typename\n  }\n  timestamp\n  expTag\n  animatedCoverUrl\n  distance\n  videoRatio\n  liked\n  stereoType\n  profileUserTopPhoto\n  musicBlocked\n  __typename\n}\n\nfragment feedContent on Feed {\n  type\n  author {\n    id\n    name\n    headerUrl\n    following\n    headerUrls {\n      url\n      __typename\n    }\n    __typename\n  }\n  photo {\n    ...photoContent\n    __typename\n  }\n  canAddComment\n  llsid\n  status\n  currentPcursor\n  tags {\n    type\n    name\n    __typename\n  }\n  __typename\n}\n\nquery visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n  visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n    result\n    llsid\n    webPageArea\n    feeds {\n      ...feedContent\n      __typename\n    }\n    hostName\n    pcursor\n    __typename\n  }\n}\n"
-        })
-        headers = {
-            'Accept': '*/*',
-            'Content-Type': 'application/json',
-            'Origin': 'https://www.kuaishou.com',
-            'Cookie': 'kpf=PC_WEB; clientid=3; did=web_921138a59a2c2a70a89fbf0e2d2db6d8; kpn=KUAISHOU_VISION',
-            'Content-Length': '1260',
-            'Accept-Language': 'zh-CN,zh-Hans;q=0.9',
-            'Host': 'www.kuaishou.com',
-            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
-            'Referer': 'https://www.kuaishou.com/profile/{}'.format(out_uid),
-            'Accept-Encoding': 'gzip, deflate, br',
-            'Connection': 'keep-alive'
-        }
-        try:
-            response = requests.post(url=url, headers=headers, data=payload, proxies=Common.tunnel_proxies(),
-                                     verify=False, timeout=10)
-            feeds = response.json()['data']['visionProfilePhotoList']['feeds']
-        except Exception as e:
-            Common.logger(log_type, crawler).error(f"get_videoList:{e},response:{response.text}")
-            return
-        if not feeds:
-            Common.logger(log_type, crawler).info("没有更多视频啦 ~\n")
-            return
-        pcursor = response.json()['data']['visionProfilePhotoList']['pcursor']
-        for i in range(len(feeds)):
-            # video_title
-            if 'caption' not in feeds[i]['photo']:
-                video_title = random_title(log_type, crawler, env, text='title')
-            elif feeds[i]['photo']['caption'].strip() == "":
-                video_title = random_title(log_type, crawler, env, text='title')
-            else:
-                video_title = cls.video_title(log_type, crawler, env, feeds[i]['photo']['caption'])
-
-            if 'videoResource' not in feeds[i]['photo'] \
-                    and 'manifest' not in feeds[i]['photo'] \
-                    and 'manifestH265' not in feeds[i]['photo']:
-                Common.logger(log_type, crawler).warning(f"get_videoList:{feeds[i]['photo']}\n")
-                break
-            videoResource = feeds[i]['photo']['videoResource']
-
-            if 'h264' not in videoResource and 'hevc' not in videoResource:
-                Common.logger(log_type, crawler).warning(f"get_videoList:{videoResource}\n")
-                break
-
-            # video_id
-            if 'h264' in videoResource and 'videoId' in videoResource['h264']:
-                video_id = videoResource['h264']['videoId']
-            elif 'hevc' in videoResource and 'videoId' in videoResource['hevc']:
-                video_id = videoResource['hevc']['videoId']
-            else:
-                video_id = ""
-
-            # play_cnt
-            if 'viewCount' not in feeds[i]['photo']:
-                play_cnt = 0
-            else:
-                play_cnt = int(feeds[i]['photo']['viewCount'])
-
-            # like_cnt
-            if 'realLikeCount' not in feeds[i]['photo']:
-                like_cnt = 0
-            else:
-                like_cnt = feeds[i]['photo']['realLikeCount']
-
-            # publish_time
-            if 'timestamp' not in feeds[i]['photo']:
-                publish_time_stamp = 0
-                publish_time_str = ''
-                publish_time = 0
-            else:
-                publish_time_stamp = int(int(feeds[i]['photo']['timestamp']) / 1000)
-                publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
-                publish_time = int((int(time.time()) - publish_time_stamp) / (3600 * 24))
-
-            # duration
-            if 'duration' not in feeds[i]['photo']:
-                duration = 0
-            else:
-                duration = int(int(feeds[i]['photo']['duration']) / 1000)
-
-            # video_width / video_height / video_url
-            mapping = {}
-            for item in ['width', 'height']:
-                try:
-                    val = str(videoResource['h264']['adaptationSet'][0]['representation'][0][item])
-                except:
-                    val = str(videoResource['hevc']['adaptationSet'][0]['representation'][0][item])
-                mapping[item] = val
-            video_width = int(mapping['width']) if mapping['width'] else 0
-            video_height = int(mapping['height']) if mapping['height'] else 0
-            # cover_url
-            if 'coverUrl' not in feeds[i]['photo']:
-                cover_url = ""
-            else:
-                cover_url = feeds[i]['photo']['coverUrl']
-
-            # user_name / avatar_url
-            user_name = feeds[i]['author']['name']
-            avatar_url = feeds[i]['author']['headerUrl']
-
-            video_url = feeds[i]['photo']['photoUrl']
-            video_dict = {'video_title': video_title,
-                          'video_id': video_id,
-                          'play_cnt': play_cnt,
-                          'comment_cnt': 0,
-                          'like_cnt': like_cnt,
-                          'share_cnt': 0,
-                          'video_width': video_width,
-                          'video_height': video_height,
-                          'duration': duration,
-                          'publish_time': publish_time,
-                          'publish_time_stamp': publish_time_stamp,
-                          'publish_time_str': publish_time_str,
-                          'user_name': user_name,
-                          'user_id': out_uid,
-                          'avatar_url': avatar_url,
-                          'cover_url': cover_url,
-                          'video_url': video_url,
-                          'session': f"kuaishou{int(time.time())}"}
-            for k, v in video_dict.items():
-                Common.logger(log_type, crawler).info(f"{k}:{v}")
-            rule_1 = cls.download_rule(video_dict, rule_dict_1)
-            if rule_1 is True:
-                cls.download_publish(log_type=log_type,
-                                     crawler=crawler,
-                                     strategy=strategy,
-                                     video_dict=video_dict,
-                                     rule_dict=rule_dict_1,
-                                     our_uid=our_uid,
-                                     oss_endpoint=oss_endpoint,
-                                     env=env,
-                                     )
-
-
-            else:
-                Common.logger(log_type, crawler).info("不满足下载规则\n")
-
-            # if pcursor == "no_more":
-            #     Common.logger(log_type, crawler).info(f"作者,{out_uid},已经到底了,没有更多内容了\n")
-            #     return
-            # cls.get_videoList(log_type, crawler, strategy, our_uid, out_uid, oss_endpoint, env,
-            #               pcursor=pcursor)
-            # time.sleep(random.randint(1, 3))
-
-    @classmethod
-    def repeat_video(cls, log_type, crawler, video_id, video_title, publish_time, env):
-        sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}" or (platform="{cls.platform}" and video_title="{video_title}" and publish_time="{publish_time}") """
-        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
-        return len(repeat_video)
-
-    @classmethod
-    def download_publish(cls, log_type, crawler, strategy, video_dict, rule_dict, our_uid, oss_endpoint, env):
-        try:
-            filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
-            for filter_word in filter_words:
-                if filter_word in video_dict['video_title']:
-                    Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
-                    return
-            if cls.repeat_video(log_type, crawler, video_dict['video_id'], video_dict['video_title'],
-                                video_dict['publish_time_str'], env) != 0:
-                Common.logger(log_type, crawler).info('视频已下载\n')
-            else:
-                # 下载视频
-                Common.download_method(log_type=log_type, crawler=crawler, text='video',
-                                       title=video_dict['video_title'], url=video_dict['video_url'])
-                md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
-                if os.path.getsize(f"./{crawler}/videos/{md_title}/video.mp4") == 0:
-                    # 删除视频文件夹
-                    shutil.rmtree(f"./{crawler}/videos/{md_title}")
-                    Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
-                    return
-                # ffmpeg_dict = Common.ffmpeg(log_type, crawler,
-                #                             f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
-                # if ffmpeg_dict is None or ffmpeg_dict['size'] == 0:
-                #     Common.logger(log_type, crawler).warning(f"下载的视频无效,已删除\n")
-                #     # 删除视频文件夹
-                #     shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}")
-                #     return download_finished
-                # 下载封面
-                Common.download_method(log_type=log_type, crawler=crawler, text='cover',
-                                       title=video_dict['video_title'], url=video_dict['cover_url'])
-                # 保存视频信息至txt
-                Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
-
-                # 上传视频
-                Common.logger(log_type, crawler).info("开始上传视频...")
-                our_video_id = Publish.upload_and_publish(log_type=log_type,
-                                                          crawler=crawler,
-                                                          strategy=strategy,
-                                                          our_uid=our_uid,
-                                                          env=env,
-                                                          oss_endpoint=oss_endpoint)
-                if env == 'dev':
-                    our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
-                else:
-                    our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
-                Common.logger(log_type, crawler).info("视频上传完成")
-
-                if our_video_id is None:
-                    Common.logger(log_type, crawler).warning(f"our_video_id:{our_video_id} 删除该视频文件夹")
-                    # 删除视频文件夹
-                    shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}")
-                    return
-
-                # 视频信息保存数据库
-                insert_sql = f""" insert into crawler_video(video_id,
-                                                        user_id,
-                                                        out_user_id,
-                                                        platform,
-                                                        strategy,
-                                                        out_video_id,
-                                                        video_title,
-                                                        cover_url,
-                                                        video_url,
-                                                        duration,
-                                                        publish_time,
-                                                        play_cnt,
-                                                        crawler_rule,
-                                                        width,
-                                                        height)
-                                                        values({our_video_id},
-                                                        {our_uid},
-                                                        "{video_dict['user_id']}",
-                                                        "{cls.platform}",
-                                                        "定向爬虫策略",
-                                                        "{video_dict['video_id']}",
-                                                        "{video_dict['video_title']}",
-                                                        "{video_dict['cover_url']}",
-                                                        "{video_dict['video_url']}",
-                                                        {int(video_dict['duration'])},
-                                                        "{video_dict['publish_time_str']}",
-                                                        {int(video_dict['play_cnt'])},
-                                                        '{json.dumps(rule_dict)}',
-                                                        {int(video_dict['video_width'])},
-                                                        {int(video_dict['video_height'])}) """
-                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-                MysqlHelper.update_values(log_type, crawler, insert_sql, env)
-                Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
-
-                # 视频写入飞书
-                Feishu.insert_columns(log_type, 'kuaishou', "fYdA8F", "ROWS", 1, 2)
-                upload_time = int(time.time())
-                values = [[our_video_id,
-                           time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
-                           "定向榜",
-                           str(video_dict['video_id']),
-                           video_dict['video_title'],
-                           our_video_link,
-                           video_dict['play_cnt'],
-                           video_dict['comment_cnt'],
-                           video_dict['like_cnt'],
-                           video_dict['share_cnt'],
-                           video_dict['duration'],
-                           f"{video_dict['video_width']}*{video_dict['video_height']}",
-                           video_dict['publish_time_str'],
-                           video_dict['user_name'],
-                           video_dict['user_id'],
-                           video_dict['avatar_url'],
-                           video_dict['cover_url'],
-                           video_dict['video_url']]]
-                time.sleep(1)
-                Feishu.update_values(log_type, 'kuaishou', "fYdA8F", "E2:Z2", values)
-                Common.logger(log_type, crawler).info(f"视频已保存至云文档\n")
-                download_finished = True
-            return
-        except Exception as e:
-            Common.logger(log_type, crawler).error(f"download_publish:{e}\n")
-
-    @classmethod
-    def get_follow_videos(cls, log_type, crawler, task, oss_endpoint, env):
-        user_list = get_user_from_mysql(log_type, crawler, crawler, env)
-        strategy = '定向抓取策略'
-        for user in user_list:
-            try:
-                spider_link = user["link"]
-                out_uid = spider_link.split('/')[-1]
-                user_name = user["nick_name"]
-                our_uid = user["uid"]
-                Common.logger(log_type, crawler).info(f"开始抓取 {user_name} 用户主页视频\n")
-                cls.get_videoList(log_type=log_type,
-                                  crawler=crawler,
-                                  strategy=strategy,
-                                  task=task,
-                                  our_uid=our_uid,
-                                  out_uid=out_uid,
-                                  oss_endpoint=oss_endpoint,
-                                  env=env)
-            except Exception as e:
-                continue
-
-
-if __name__ == "__main__":
-    KuaiShouFollowScheduling.get_follow_videos(
-        log_type="follow",
-        crawler="kuaishou",
-        task="",
-        oss_endpoint="out",
-        env="dev",
-    )
-
-    # print(KuaiShouFollow.get_out_user_info("follow", "kuaishou", "3xnk3wbm3vfiha6"))
-    # print(Follow.get_out_user_info("follow", "kuaishou", "3x5wgjhfc7tx8ue"))

+ 0 - 28
kuaishou/kuaishou_follow/test.py

@@ -1,28 +0,0 @@
-import pymysql
-connection = pymysql.connect(
-                host="rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com",# 数据库IP地址,内网地址
-                # host="rm-bp1k5853td1r25g3ndo.mysql.rds.aliyuncs.com",  # 数据库IP地址,外网地址
-                port=3306,  # 端口号
-                user="crawler",  # mysql用户名
-                passwd="crawler123456@",  # mysql用户登录密码
-                db="piaoquan-crawler",  # 数据库名
-                # 如果数据库里面的文本是utf8编码的,charset指定是utf8
-                charset="utf8mb4")
-
-m_con = connection.cursor(cursor=pymysql.cursors.DictCursor)
-
-sql = 'select * from crawler_config where id =6'
-
-a  = m_con.execute(sql)
-data = m_con.fetchall()
-
-# 关闭数据库连接
-connection.close()
-n_data = data[0]
-emo = n_data['config']
-a = '😝'
-em = eval(emo)['emoji']
-if a in em:
-    print(11111)
-else:
-    print(222222)

+ 85 - 0
kuaishou/kuaishou_main/run_ks_author.py

@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/7
+import argparse
+import os
+import sys
+import time
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.common import Common
+from common.public import get_consumer, ack_message, task_fun_mq
+from common.scheduling_db import MysqlHelper
+from kuaishou.kuaishou_author.kuaishou_author_scheduling import KuaishouauthorScheduling
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    consumer = get_consumer(topic_name, group_id)
+    # 长轮询表示如果Topic没有消息,则客户端请求会在服务端挂起3秒,3秒内如果有消息可以消费则立即返回响应。
+    # 长轮询时间3秒(最多可设置为30秒)。
+    wait_seconds = 3
+    # 一次最多消费1条(最多可设置为16条)。
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # 长轮询消费消息。
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # 处理爬虫业务
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                task_id = task_dict['id']
+                select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
+                user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
+                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                # Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
+                Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                KuaishouauthorScheduling.get_author_videos(log_type=log_type,
+                                                           crawler=crawler,
+                                                           rule_dict=rule_dict,
+                                                           user_list=user_list,
+                                                           env=env)
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+
+        except MQExceptionBase as err:
+            # Topic中没有消息可消费。
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  ## 新建参数解释器对象
+    parser.add_argument('--log_type', type=str)  ## 添加参数,注明参数类型
+    parser.add_argument('--crawler')  ## 添加参数
+    parser.add_argument('--topic_name')  ## 添加参数
+    parser.add_argument('--group_id')  ## 添加参数
+    parser.add_argument('--env')  ## 添加参数
+    args = parser.parse_args()  ### 参数赋值,也可以通过终端赋值
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)
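
For local debugging, the new consumer can also be started by hand with the same flags that `main/process_mq.sh` passes to it. A minimal sketch, assuming the prod directory layout from that script; the `topic_name` / `group_id` values are placeholders, the real ones come from the MQ console:

```
# Manually start the Kuaishou author consumer (topic_name / group_id are placeholders)
cd /data5/piaoquan_crawler/
nohup python -u kuaishou/kuaishou_main/run_ks_author.py \
  --log_type="author" --crawler="kuaishou" \
  --topic_name="ks_author_prod" --group_id="ks_author_prod" \
  --env="prod" >> kuaishou/logs/author-shell.log 2>&1 &
```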

+ 90 - 0
kuaishou/kuaishou_main/run_ks_recommend.py

@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/7
+import argparse
+import os
+import random
+import sys
+import time
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.common import Common
+from common.public import get_consumer, ack_message, task_fun_mq
+from common.scheduling_db import MysqlHelper
+from kuaishou.kuaishou_recommend.kuaishou_recommend_shceduling import KuaiShouRecommendScheduling
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    consumer = get_consumer(topic_name, group_id)
+    # 长轮询表示如果Topic没有消息,则客户端请求会在服务端挂起3秒,3秒内如果有消息可以消费则立即返回响应。
+    # 长轮询时间3秒(最多可设置为30秒)。
+    wait_seconds = 3
+    # 一次最多消费1条(最多可设置为16条)。
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # 长轮询消费消息。
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # 处理爬虫业务
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                task_id = task_dict['id']
+                select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
+                user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
+                our_uid_list = []
+                for user in user_list:
+                    our_uid_list.append(user["uid"])
+                our_uid = random.choice(our_uid_list)
+                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                # Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
+                Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                KuaiShouRecommendScheduling.get_videoList(log_type=log_type,
+                                                          crawler=crawler,
+                                                          rule_dict=rule_dict,
+                                                          our_uid=our_uid,
+                                                          env=env)
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+
+        except MQExceptionBase as err:
+            # Topic中没有消息可消费。
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  ## 新建参数解释器对象
+    parser.add_argument('--log_type', type=str)  ## 添加参数,注明参数类型
+    parser.add_argument('--crawler')  ## 添加参数
+    parser.add_argument('--topic_name')  ## 添加参数
+    parser.add_argument('--group_id')  ## 添加参数
+    parser.add_argument('--env')  ## 添加参数
+    args = parser.parse_args()  ### 参数赋值,也可以通过终端赋值
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)

+ 0 - 57
kuaishou/kuaishou_main/run_kuaishou_follow_scheduling.py

@@ -1,57 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: lierqiang
-# @Time: 2023/4/21
-import argparse
-import os
-import sys
-
-sys.path.append(os.getcwd())
-from common.common import Common
-from kuaishou.kuaishou_follow.kuaishou_follow_scheduling import KuaiShouFollowScheduling
-from common.public import task_fun
-
-
-def main(log_type, crawler, task, oss_endpoint, env):
-    task = task_fun(task)
-    try:
-        Common.logger(log_type, crawler).info(f'开始抓取 {crawler}视频 定向榜\n')
-        KuaiShouFollowScheduling.get_follow_videos(log_type=log_type,
-                                             crawler=crawler,
-                                             task=task,
-                                             oss_endpoint=oss_endpoint,
-                                             env=env)
-        Common.del_logs(log_type, crawler)
-        Common.logger(log_type, crawler).info('抓取任务结束\n')
-    except Exception as e:
-        Common.logger(log_type, crawler).info(f"{crawler}视频异常,触发报警:{e}\n")
-        # Feishu.bot(log_type, crawler, f"{e}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()  ## 新建参数解释器对象
-    parser.add_argument('--log_type', default='author')  ## 添加参数,注明参数类型
-    parser.add_argument('--crawler', default='kuaishou')  ## 添加参数
-    parser.add_argument('--strategy', default='定向抓取')  ## 添加参数
-    parser.add_argument('--task')  ## 添加参数
-    parser.add_argument('--oss_endpoint', default='outer')  ## 添加参数
-    parser.add_argument('--env', default='dev')  ## 添加参数
-    # parser.add_argument('--machine')  ## 添加参数
-    args = parser.parse_args()  ### 参数赋值,也可以通过终端赋值
-    task = {
-        'task_dict': {'task_id': '17', 'task_name': '西瓜测试4.21', 'source': 'kuaishou', 'start_time': '1682010720000',
-                      'interval': '24', 'mode': 'author',
-                      'rule': {'duration': {'min': 40, 'max': 0}, 'play_cnt': {'min': 4000, 'max': 0},
-                               'period': {'min': 10, 'max': 0}, 'fans_cnt': {'min': 0, 'max': 0},
-                               'videos_cnt': {'min': 0, 'max': 0}, 'like_cnt': {'min': 0, 'max': 0},
-                               'width': {'min': 0, 'max': 0}, 'height': {'min': 0, 'max': 0}},
-                      'spider_name': 'run_dy_author_scheduling', 'machine': 'aliyun', 'status': '0',
-                      'create_time': '1682048632396', 'update_time': '1682048632396', 'operator': ''},
-        'rule_dict': {'duration': {'min': 0, 'max': 0}, 'play_cnt': {'min': 0, 'max': 0},
-                      'period': {'min': 0, 'max': 0}, 'fans_cnt': {'min': 0, 'max': 0}, 'videos_cnt': {'min': 0, 'max': 0},
-                      'like_cnt': {'min': 0, 'max': 0}, 'width': {'min': 0, 'max': 0},
-                      'height': {'min': 0, 'max': 0},'publish_time':{'min':0}}}
-    main(log_type=args.log_type,
-         crawler=args.crawler,
-         task=task,
-         oss_endpoint=args.oss_endpoint,
-         env=args.env)

+ 57 - 0
main/process_mq.sh

@@ -0,0 +1,57 @@
+#! /bin/bash
+
+crawler=$1  # 哪款爬虫
+path=$2     # 爬虫路径
+log_type=$3 # 爬虫策略
+env=$4      # 环境
+
+if [ "${env}" = "dev" ];then
+  piaoquan_crawler_dir=/Users/wangkun/Desktop/crawler/piaoquan_crawler/
+  profile_path=/etc/profile
+  python=python3
+  log_path=${piaoquan_crawler_dir}main/main_logs/process-mq-$(date +%Y-%m-%d).log
+elif [ "${env}" = "hk" ];then
+  piaoquan_crawler_dir=/root/piaoquan_crawler/
+  profile_path=/etc/profile
+  python=python3
+  log_path=${piaoquan_crawler_dir}main/main_logs/process-mq-$(date +%Y-%m-%d).log
+else
+  piaoquan_crawler_dir=/data5/piaoquan_crawler/
+  profile_path=/etc/profile
+  python=python
+  log_path=${piaoquan_crawler_dir}main/main_logs/process-mq-$(date +%Y-%m-%d).log
+fi
+
+echo run_${crawler}_${log_type}.py
+echo topic_${crawler}_${log_type}_${env}
+echo GID_${crawler}_${log_type}_${env}
+
+time=$(date +%H:%M:%S)
+echo "$(date "+%Y-%m-%d %H:%M:%S") 开始监测爬虫进程状态" >> ${log_path}
+echo "$(date "+%Y-%m-%d %H:%M:%S") 更新环境变量..." >> ${log_path}
+cd ~ && source ${profile_path}
+echo "$(date "+%Y-%m-%d %H:%M:%S") 更新环境变量完成!" >> ${log_path}
+
+echo "$(date "+%Y-%m-%d %H:%M:%S") 正在更新代码..." >> ${log_path}
+cd ${piaoquan_crawler_dir} && git pull origin master --force
+echo "$(date "+%Y-%m-%d %H:%M:%S") 代码更新完成!" >> ${log_path}
+
+# ====================接入爬虫平台,且调用MQ进程检测====================
+# 岁岁年年迎福气
+echo "$(date "+%Y-%m-%d %H:%M:%S") 正在监测 ${crawler}_${log_type} 进程状态" >> ${log_path}
+ps -ef | grep "run_${crawler}_${log_type}.py" | grep -v "grep"
+if [ "$?" -eq 1 ];then
+  echo "$(date "+%Y-%m-%d_%H:%M:%S") 异常停止,正在重启!" >> ${log_path}
+  cd ${piaoquan_crawler_dir} && nohup ${python} -u ${path}/${path}_main/run_${crawler}_${log_type}.py --log_type="${log_type}" --crawler="${path}" --topic_name="${crawler}_${log_type}_${env}" --group_id="${crawler}_${log_type}_${env}" --env="${env}" >> ${path}/logs/${log_type}-shell.log 2>&1 &
+  echo "$(date "+%Y-%m-%d %H:%M:%S") 重启完成!" >> ${log_path}
+else
+  echo "$(date "+%Y-%m-%d %H:%M:%S") ${crawler}_${log_type} 进程状态正常" >> ${log_path}
+fi
+# ==================================================================
+
+
+# 删除日志
+echo "$(date "+%Y-%m-%d %H:%M:%S") 开始清理 5 天前的日志文件" >> ${log_path}
+find ${piaoquan_crawler_dir}main/main_logs/ -mtime +5 -name "*.log" -exec rm -rf {} \;
+echo "$(date "+%Y-%m-%d %H:%M:%S") 日志文件清理完毕" >> ${log_path}
+exit 0
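
A hedged usage sketch for the guard script: it takes four positional arguments (crawler abbreviation, crawler directory, strategy, environment) and is meant to be triggered periodically. The crontab schedule below is illustrative only and not taken from the repository:

```
# Usage: sh process_mq.sh <crawler abbreviation> <crawler directory> <strategy> <env>
# Illustrative crontab entry: check the Kuaishou recommend consumer once per minute
* * * * * /bin/sh /data5/piaoquan_crawler/main/process_mq.sh ks kuaishou recommend prod
```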

+ 0 - 138
suisuiniannianyingfuqi/suisuiniannianyingfuqi_main/demo.py

@@ -1,138 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/5/15
-import random
-import time
-from common.scheduling_db import MysqlHelper
-
-
-class Demo:
-    @classmethod
-    def get_user(cls, log_type, crawler, env):
-        select_user_sql = f"""select * from crawler_user_v3 where task_id=36"""
-        user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
-        print(user_list)
-        our_uid_list = []
-        for user in user_list:
-            our_uid_list.append(user["uid"])
-
-        print(our_uid_list)
-        our_uid = random.choice(our_uid_list)
-        print(our_uid)
-
-    @classmethod
-    def test_dict(cls):
-        video_dict = {
-            "play_cnt": 1000,
-            "share_cnt": 1000,
-            "duration": 55,
-            "period": 5,
-            "publish_time_stamp": 1683648000,  # 2023-05-10 00:00:00
-            "video_url": "www.baidu.com"
-        }
-        rule_dict = {
-             "play_cnt": {"min": 0, "max": 0},
-             "fans_cnt": {"min": 0, "max": 0},
-             "videos_cnt": {"min": 0, "max": 0},
-             "like_cnt": {"min": 0, "max": 0},
-             "video_width": {"min": 0, "max": 0},
-             "video_height": {"min": 0, "max": 0},
-             "duration": {"min": 0, "max": 0},
-             "share_cnt": {"min": 0, "max": 0},
-             "comment_cnt": {"min": 0, "max": 0},
-             "favorite_cnt": {"min": 0, "max": 0},
-             "period": {"min": 10, "max": 0},
-             "publish_time": {"min": 1673734400000, "max": 0}
-        }
-
-        # 格式化 video_dict:publish_time_stamp
-        if "publish_time_stamp" in video_dict.keys():
-            video_dict["publish_time"] = video_dict["publish_time_stamp"]*1000
-        # 格式化 video_dict:period
-        if "period" not in video_dict.keys() and "publish_time" in video_dict.keys():
-            video_dict["period"] = int((int(time.time()*1000)-video_dict["publish_time"])/(3600*24*1000))
-        # 格式化 rule_dict 最大值取值为 0 的问题
-        for rule_value in rule_dict.values():
-            if rule_value["max"] == 0:
-                rule_value["max"] = 999999999999999
-        rule_dict["period"]["max"] = rule_dict["period"]["min"]
-        rule_dict["period"]["min"] = 0
-        for k, v in rule_dict.items():
-            print(f"{k}:{v}")
-        # 格式化 rule_dict 有的 key,video_dict 中没有的问题
-        for rule_key in rule_dict.keys():
-            if rule_key not in video_dict.keys():
-                video_dict[rule_key] = int(rule_dict[rule_key]["max"] / 2)
-        # 比较结果,输出结果:True / False
-        for video_key, video_value in video_dict.items():
-            for rule_key, rule_value in rule_dict.items():
-                if video_key == rule_key:
-                    result = rule_value["min"] <= video_value <= rule_value["max"]
-                    print(f'{video_key}: {rule_value["min"]} <= {video_value} <= {rule_value["max"]},{result}')
-                    # Common.logger(log_type, crawler).info(f'{video_key}: {rule_value["min"]} <= {video_value} <= {rule_value["max"]},{result}')
-
-                    if result is False:
-                        return False
-                    else:
-                        continue
-        return True
-
-    @classmethod
-    def save_video_info(cls):
-        video_dict = {'video_title': "测试视频标题",
-                      'video_id': "id_1",
-                      'play_cnt': 199,
-                      'publish_time_stamp': 1683648000,
-                      'publish_time_str': "2023-05-10 00:00:00",
-                      'user_name': "岁岁年年迎福气",
-                      'user_id': "suisuiniannianyingfuqi",
-                      'avatar_url': "https://cdn.jzkksp.com/2022/3/22/lnchd2.jpg?auth_key=1684223012-0-0-2f8ddcf0e5d5f164f792b77c98e1ffde",
-                      'cover_url': "https://cdn.jzkksp.com/2022/3/22/lnchd2.jpg?auth_key=1684223012-0-0-2f8ddcf0e5d5f164f792b77c98e1ffde",
-                      'video_url': "https://cdn.jzkksp.com/2022/3/22/lnchd.mp4",
-                      'session': f"suisuiniannianyingfuqi-{int(time.time())}"}
-        save_dict = {
-            "video_title": "video_title",
-            "video_id": "video_id",
-            "duration": 0,
-            "play_cnt": 0,
-            "comment_cnt": 0,
-            "like_cnt": 0,
-            "share_cnt": 0,
-            "video_width": 1920,
-            "video_height": 1080,
-            "publish_time_stamp": 946656000,  # 2000-01-01 00:00:00
-            "user_name": "crawler",
-            "avatar_url": "http://weapppiccdn.yishihui.com/resources/images/pic_normal.png",
-            "video_url": "video_url",
-            "cover_url": "cover_url",
-            "session": f"session-{int(time.time())}",
-
-        }
-        for video_key, video_value in video_dict.items():
-            for save_key, save_value in save_dict.items():
-                if save_key == video_key:
-                    save_dict[save_key] = video_value
-        for k, v in save_dict.items():
-            print(f"{k}:{v}")
-        with open(f"./info.txt", "w", encoding="UTF-8") as f_a:
-            f_a.write(str(save_dict['video_id']) + "\n" +
-                      str(save_dict['video_title']) + "\n" +
-                      str(save_dict['duration']) + "\n" +
-                      str(save_dict['play_cnt']) + "\n" +
-                      str(save_dict['comment_cnt']) + "\n" +
-                      str(save_dict['like_cnt']) + "\n" +
-                      str(save_dict['share_cnt']) + "\n" +
-                      f"{save_dict['video_width']}*{save_dict['video_height']}" + "\n" +
-                      str(save_dict['publish_time_stamp']) + "\n" +
-                      str(save_dict['user_name']) + "\n" +
-                      str(save_dict['avatar_url']) + "\n" +
-                      str(save_dict['video_url']) + "\n" +
-                      str(save_dict['cover_url']) + "\n" +
-                      str(save_dict['session']))
-
-if __name__ == "__main__":
-    # Demo.get_user("demo", "suisuiniannianyingfuqi", "dev")
-    print(Demo.test_dict())
-    # print(500 <= 1000 <= 100000000)
-    # Demo.save_video_info()
-    pass

+ 10 - 10
suisuiniannianyingfuqi/suisuiniannianyingfuqi_main/run_suisuiniannianyingfuqi_recommend_mq.py → suisuiniannianyingfuqi/suisuiniannianyingfuqi_main/run_ssnnyfq_recommend.py

@@ -7,7 +7,7 @@ from mq_http_sdk.mq_client import *
 from mq_http_sdk.mq_consumer import *
 from mq_http_sdk.mq_exception import MQExceptionBase
 sys.path.append(os.getcwd())
-from common.public import task_fun, get_consumer, ack_message
+from common.public import get_consumer, ack_message, task_fun_mq
 from common.common import Common
 from common.scheduling_db import MysqlHelper
 from suisuiniannianyingfuqi.suisuiniannianyingfuqi_recommend.suisuiniannianyingfuqi_recommend_scheduling import \
@@ -22,9 +22,9 @@ def main(log_type, crawler, topic_name, group_id, env):
     # 一次最多消费3条(最多可设置为16条)。
     batch = 1
     Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
                                           f'TopicName:{topic_name}\n'
-                                          f'MQConsumer:{group_id}\n'
-                                          f'WaitSeconds:{wait_seconds}\n')
+                                          f'MQConsumer:{group_id}')
     while True:
         try:
             # 长轮询消费消息。
@@ -35,18 +35,18 @@ def main(log_type, crawler, topic_name, group_id, env):
                                                       f"MessageBodyMD5:{msg.message_body_md5}\n"
                                                       f"MessageTag:{msg.message_tag}\n"
                                                       f"ConsumedTimes:{msg.consumed_times}\n"
-                                                      f"PublishTime:{msg.PublishTime}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
                                                       f"Body:{msg.message_body}\n"
                                                       f"NextConsumeTime:{msg.next_consume_time}\n"
                                                       f"ReceiptHandle:{msg.receipt_handle}\n"
-                                                      f"Properties:{msg.properties}\n")
+                                                      f"Properties:{msg.properties}")
                 # ack_mq_message
                 ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
 
                 # 处理爬虫业务
-                task_dict = task_fun(task)['task_dict']
-                rule_dict = task_fun(task)['rule_dict']
-                task_id = task_dict['task_id']
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                task_id = task_dict['id']
                 select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
                 user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
                 our_uid_list = []
@@ -63,7 +63,7 @@ def main(log_type, crawler, topic_name, group_id, env):
                                                                         rule_dict=rule_dict,
                                                                         env=env)
                 Common.del_logs(log_type, crawler)
-                Common.logger(log_type, crawler).info('抓取一轮\n')
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
 
         except MQExceptionBase as err:
             # Topic中没有消息可消费。
@@ -87,4 +87,4 @@ if __name__ == "__main__":
          crawler=args.crawler,
          topic_name=args.topic_name,
          group_id=args.group_id,
-         env=args.env)
+         env=args.env)

+ 0 - 160
suisuiniannianyingfuqi/suisuiniannianyingfuqi_recommend/insert.py

@@ -1,160 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/4/13
-import json
-import os
-import sys
-import time
-from datetime import date, timedelta
-sys.path.append(os.getcwd())
-from common.common import Common
-from common.feishu import Feishu
-from common.scheduling_db import MysqlHelper
-
-
-class Insert:
-    @classmethod
-    def get_config(cls, log_type, crawler, text, env):
-        select_sql = f"""select * from crawler_config where source="suisuiniannianyingfuqi" """
-        contents = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
-        title_list = []
-        filter_list = []
-        for content in contents:
-            config = content['config']
-            config_dict = eval(config)
-            for k, v in config_dict.items():
-                if k == "title":
-                    title_list_config = v.split(",")
-                    for title in title_list_config:
-                        title_list.append(title)
-                if k == "filter":
-                    filter_list_config = v.split(",")
-                    for filter_word in filter_list_config:
-                        filter_list.append(filter_word)
-        if text == "title":
-            return title_list
-        elif text == "filter":
-            return filter_list
-
-    @classmethod
-    def before_day(cls):
-        publish_time_str_rule = (date.today() + timedelta(days=-30)).strftime("%Y-%m-%d %H:%M:%S")
-        publish_time_stamp_rule = int(time.mktime(time.strptime(publish_time_str_rule, "%Y-%m-%d %H:%M:%S")))
-        print(publish_time_str_rule)
-        print(publish_time_stamp_rule)
-
-    @classmethod
-    def insert_config(cls, log_type, crawler, env):
-        filter_sheet = Feishu.get_values_batch(log_type, crawler, "DjXfqG")
-        # title_sheet = Feishu.get_values_batch(log_type, crawler, "bHSW1p")
-        filter_list = []
-        # title_list = []
-        for x in filter_sheet:
-            for y in x:
-                if y is None:
-                    pass
-                else:
-                    filter_list.append(y)
-        # for x in title_sheet:
-        #     for y in x:
-        #         if y is None:
-        #             pass
-        #         else:
-        #             title_list.append(y)
-        # str_title = ','.join(title_list)
-        str_filter = ','.join(filter_list)
-        config_dict = {
-            # "title": str_title,
-            "filter": str_filter
-        }
-        str_config_dict = str(config_dict)
-        # print(f"config_dict:{config_dict}")
-        # print(f"str_config_dict:{str_config_dict}")
-        insert_sql = f""" insert into crawler_config(title, source, config) values("本山祝福小程序", "benshanzhufu", "{str_config_dict}") """
-        MysqlHelper.update_values(log_type, crawler, insert_sql, env)
-
-    @classmethod
-    def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env):
-        benshanzhufu_sheetid = ['290bae']
-        for sheetid in benshanzhufu_sheetid:
-            xiaoniangao_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            for i in range(1, len(xiaoniangao_sheet)):
-            # for i in range(1, 3):
-                if xiaoniangao_sheet[i][5] is None or xiaoniangao_sheet[i][9] is None:
-                    continue
-                video_id = xiaoniangao_sheet[i][9].replace("https://admin.piaoquantv.com/cms/post-detail/", "").replace(
-                    "/info", "")
-                if video_id == "None":
-                    continue
-                video_id = int(video_id)
-                platform = "岁岁年年迎福气"
-                strategy = "推荐榜爬虫策略"
-                out_video_id = str(xiaoniangao_sheet[i][8])
-                video_title = str(xiaoniangao_sheet[i][7])
-                cover_url = str(xiaoniangao_sheet[i][13])
-                video_url = str(xiaoniangao_sheet[i][14])
-                duration = int(xiaoniangao_sheet[i][11])
-                play_cnt = int(xiaoniangao_sheet[i][10])
-                crawler_rule = json.dumps({})
-                width = int(xiaoniangao_sheet[i][12].split("*")[0])
-                height = int(xiaoniangao_sheet[i][12].split("*")[1])
-
-                # print(f"video_id:{video_id}, type:{type(video_id)}")
-                # print(f"platform:{platform}, type:{type(platform)}")
-                # print(f"strategy:{strategy}, type:{type(strategy)}")
-                # print(f"out_video_id:{out_video_id}, type:{type(out_video_id)}")
-                # print(f"video_title:{video_title}, type:{type(video_title)}")
-                # print(f"cover_url:{cover_url}, type:{type(cover_url)}")
-                # print(f"video_url:{video_url}, type:{type(video_url)}")
-                # print(f"duration:{duration}, type:{type(duration)}")
-                # print(f"play_cnt:{play_cnt}, type:{type(play_cnt)}")
-                # print(f"crawler_rule:{crawler_rule}, type:{type(crawler_rule)}")
-                # print(f"width:{width}, type:{type(width)}")
-                # print(f"height:{height}, type:{type(height)}\n")
-
-                select_sql = f""" select * from crawler_video where platform="{platform}" and out_video_id="{out_video_id}" """
-                Common.logger(log_type, crawler).info(f"select_sql:{select_sql}")
-                repeat_video = MysqlHelper.get_values(log_type, crawler, select_sql, env)
-                Common.logger(log_type, crawler).info(f"repeat_video:{repeat_video}")
-
-                if repeat_video is not None and len(repeat_video) != 0:
-                    Common.logger(log_type, crawler).info(f"{video_title} 已存在数据库中\n")
-                else:
-                    # 视频信息保存数据库
-                    insert_sql = f""" insert into crawler_video(video_id,
-                                        out_user_id,
-                                        platform,
-                                        strategy,
-                                        out_video_id,
-                                        video_title,
-                                        cover_url,
-                                        video_url,
-                                        duration,
-                                        play_cnt,
-                                        crawler_rule,
-                                        width,
-                                        height)
-                                        values({video_id},
-                                        "suisuiniannianyingfuqi",
-                                        "{platform}",
-                                        "{strategy}",
-                                        "{out_video_id}",
-                                        "{video_title}",
-                                        "{cover_url}",
-                                        "{video_url}",
-                                        {duration},
-                                        {play_cnt},
-                                        '{crawler_rule}',
-                                        {width},
-                                        {height}) """
-                    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-                    MysqlHelper.update_values(log_type, crawler, insert_sql, env, action='')
-                    Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
-
-
-if __name__ == "__main__":
-    # Insert.insert_config("insert", "suisuiniannianyingfuqi", "dev")
-    # print(Insert.get_config("insert", "suisuiniannianyingfuqi", "filter", "dev"))
-    # Insert.insert_video_from_feishu_to_mysql("insert-dev", "suisuiniannianyingfuqi", "dev")
-    Insert.insert_video_from_feishu_to_mysql("insert-prod", "suisuiniannianyingfuqi", "prod")
-    pass

+ 6 - 1
suisuiniannianyingfuqi/suisuiniannianyingfuqi_recommend/suisuiniannianyingfuqi_recommend_scheduling.py

@@ -9,6 +9,7 @@ import time
 from hashlib import md5
 import requests
 import urllib3
+from requests.adapters import HTTPAdapter
 sys.path.append(os.getcwd())
 from common.common import Common
 from common.feishu import Feishu
@@ -51,7 +52,11 @@ class SuisuiniannianyingfuqiRecommendScheduling:
                     'openid': 'oDAjy5SCFe7Ml3PNgiow3ncozL1o'
                 }
                 urllib3.disable_warnings()
-                response = requests.post(url=url, headers=headers, data=data, verify=False)
+                s = requests.session()
+                # max_retries=3 重试3次
+                s.mount('http://', HTTPAdapter(max_retries=3))
+                s.mount('https://', HTTPAdapter(max_retries=3))
+                response = s.post(url=url, headers=headers, data=data, verify=False, timeout=5)
                 page += 1
                 if response.status_code != 200:
                     Common.logger(log_type, crawler).warning(f'get_videoList:{response.status_code}, {response.text}\n')

+ 0 - 19
xiaoniangao/xiaoniangao_author/author_test.py

@@ -1,19 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/4/21
-from common.scheduling_db import MysqlHelper
-
-
-class AuthorTest:
-    @classmethod
-    def get_users(cls, log_type, crawler, env):
-        select_user_sql = f"""select * from crawler_user_v3 where task_id=16"""
-        user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
-        print(len(user_list))
-        for user in user_list:
-            print(type(user))
-            print(user)
-
-
-if __name__ == "__main__":
-    AuthorTest.get_users("test", "xiaoniangao", "dev")

+ 0 - 24
xiaoniangao/xiaoniangao_follow/insert_filter_word.py

@@ -1,24 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/3/27
-from common.scheduling_db import MysqlHelper
-from common.feishu import Feishu
-
-
-def insert_filter_word(log_type, crawler, env, action=''):
-    xiaoniangao_sheet = Feishu.get_values_batch('db', "xiaoniangao", "DRAnZh")
-
-    for i in range(len(xiaoniangao_sheet)):
-        filter_word = xiaoniangao_sheet[i]
-        if filter_word is None:
-            continue
-        source = "小年糕"
-        insert_sql = f""" insert into  crawler_filter_word(source,
-        filter_word)
-        values("{source}",
-        "{filter_word[0]}")"""
-        MysqlHelper.update_values(log_type, crawler, insert_sql, env, action='')
-
-
-if __name__ == "__main__":
-    insert_filter_word('insert', 'xiaoniangao', 'dev')

+ 0 - 117
xiaoniangao/xiaoniangao_follow/insert_video_1.py

@@ -1,117 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/3/14
-import json
-import os
-import sys
-sys.path.append(os.getcwd())
-from common.common import Common
-from common.scheduling_db  import MysqlHelper
-from common.feishu import Feishu
-
-
-class Insert:
-    @classmethod
-    def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env, machine):
-        xiaoniangao_sheetid_list = ['Wu0CeL']
-        for sheetid in xiaoniangao_sheetid_list:
-            xiaoniangao_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            for i in range(1, len(xiaoniangao_sheet)):
-            # for i in range(1, 3):
-                if xiaoniangao_sheet[i][5] is None or xiaoniangao_sheet[i][9] is None:
-                    continue
-                video_id = xiaoniangao_sheet[i][9].replace("https://admin.piaoquantv.com/cms/post-detail/", "").replace("/info", "")
-                if video_id == "None":
-                    continue
-                video_id = int(video_id)
-                out_user_id = str(xiaoniangao_sheet[i][19])
-                platform = "小年糕"
-                strategy = "定向爬虫策略"
-                out_video_id = str(xiaoniangao_sheet[i][7])
-                video_title = str(xiaoniangao_sheet[i][8])
-                cover_url = str(xiaoniangao_sheet[i][21])
-                video_url = str(xiaoniangao_sheet[i][22])
-                duration = int(xiaoniangao_sheet[i][14])
-                publish_time = str(xiaoniangao_sheet[i][16]).replace("/", "-")
-                play_cnt = int(xiaoniangao_sheet[i][10])
-                like_cnt = int(xiaoniangao_sheet[i][12])
-                share_cnt = int(xiaoniangao_sheet[i][13])
-                # collection_cnt = 0
-                comment_cnt = int(xiaoniangao_sheet[i][11])
-                crawler_rule = json.dumps({"play_cnt": {"min": 500}, "duration": {"min": 40}, "publish_day": {"min": 3}})
-                width = int(xiaoniangao_sheet[i][15].split("*")[0])
-                height = int(xiaoniangao_sheet[i][15].split("*")[1])
-
-                # print(f"video_id:{video_id}, type:{type(video_id)}")
-                # print(f"user_id:{user_id}, type:{type(user_id)}")
-                # print(f"out_user_id:{out_user_id}, type:{type(out_user_id)}")
-                # print(f"platform:{platform}, type:{type(platform)}")
-                # print(f"strategy:{strategy}, type:{type(strategy)}")
-                # print(f"out_video_id:{out_video_id}, type:{type(out_video_id)}")
-                # print(f"video_title:{video_title}, type:{type(video_title)}")
-                # print(f"cover_url:{cover_url}, type:{type(cover_url)}")
-                # print(f"video_url:{video_url}, type:{type(video_url)}")
-                # print(f"duration:{duration}, type:{type(duration)}")
-                # print(f"publish_time:{publish_time}, type:{type(publish_time)}")
-                # print(f"play_cnt:{play_cnt}, type:{type(play_cnt)}")
-                # print(f"like_cnt:{like_cnt}, type:{type(like_cnt)}")
-                # print(f"share_cnt:{share_cnt}, type:{type(share_cnt)}")
-                # print(f"comment_cnt:{comment_cnt}, type:{type(comment_cnt)}")
-                # print(f"crawler_rule:{crawler_rule}, type:{type(crawler_rule)}")
-                # print(f"width:{width}, type:{type(width)}")
-                # print(f"height:{height}, type:{type(height)}\n")
-
-                select_sql = f""" select * from crawler_video where platform="{platform}" and out_video_id="{out_video_id}" """
-                Common.logger(log_type, crawler).info(f"select_sql:{select_sql}")
-                repeat_video = MysqlHelper.get_values(log_type, crawler, select_sql, env, machine)
-                Common.logger(log_type, crawler).info(f"repeat_video:{repeat_video}")
-
-                if repeat_video is not None and len(repeat_video) != 0:
-                    Common.logger(log_type, crawler).info(f"{video_title} 已存在数据库中\n")
-                else:
-                    # 视频信息保存数据库
-                    insert_sql = f""" insert into crawler_video(video_id,
-                                    out_user_id,
-                                    platform,
-                                    strategy,
-                                    out_video_id,
-                                    video_title,
-                                    cover_url,
-                                    video_url,
-                                    duration,
-                                    publish_time,
-                                    play_cnt,
-                                    like_cnt,
-                                    share_cnt,
-                                    comment_cnt,
-                                    crawler_rule,
-                                    width,
-                                    height)
-                                    values({video_id},
-                                    "{out_user_id}",
-                                    "{platform}",
-                                    "{strategy}",
-                                    "{out_video_id}",
-                                    "{video_title}",
-                                    "{cover_url}",
-                                    "{video_url}",
-                                    {duration},
-                                    "{publish_time}",
-                                    {play_cnt},
-                                    {like_cnt},
-                                    {share_cnt},
-                                    {comment_cnt},
-                                    '{crawler_rule}',
-                                    {width},
-                                    {height}) """
-                    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-                    MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine)
-                    Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
-
-
-if __name__ == "__main__":
-    # Insert.insert_video_from_feishu_to_mysql("insert-dev-follow", "xiaoniangao", "dev", "local")
-    # Insert.insert_video_from_feishu_to_mysql("insert-dev-hour", "xiaoniangao", "dev", "local")
-    # Insert.insert_video_from_feishu_to_mysql("insert-prod-follow", "xiaoniangao", "prod", "local")
-    Insert.insert_video_from_feishu_to_mysql("insert-prod-1", "xiaoniangao", "prod", "local")
-    pass

+ 84 - 0
xiaoniangao/xiaoniangao_main/run_xng_author.py

@@ -0,0 +1,84 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/7
+import argparse
+import os
+import sys
+import time
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.public import get_consumer, ack_message, task_fun_mq
+from common.common import Common
+from common.scheduling_db import MysqlHelper
+from xiaoniangao.xiaoniangao_author.xiaoniangao_author_scheduling import XiaoniangaoAuthorScheduling
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    consumer = get_consumer(topic_name, group_id)
+    # 长轮询表示如果Topic没有消息,则客户端请求会在服务端挂起3秒,3秒内如果有消息可以消费则立即返回响应。
+    # 长轮询时间3秒(最多可设置为30秒)。
+    wait_seconds = 3
+    # 一次最多消费1条(最多可设置为16条)。
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # 长轮询消费消息。
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # 处理爬虫业务
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                task_id = task_dict['id']
+                select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
+                user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
+                Common.logger(log_type, crawler).info(f"调度任务:\n{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:\n{rule_dict}")
+                Common.logger(log_type, crawler).info(f"用户列表:\n{user_list}")
+                Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                XiaoniangaoAuthorScheduling.get_author_videos(log_type=log_type,
+                                                              crawler=crawler,
+                                                              user_list=user_list,
+                                                              rule_dict=rule_dict,
+                                                              env=env)
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+
+        except MQExceptionBase as err:
+            # Topic中没有消息可消费。
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  ## 新建参数解释器对象
+    parser.add_argument('--log_type', type=str)  ## 添加参数,注明参数类型
+    parser.add_argument('--crawler')  ## 添加参数
+    parser.add_argument('--topic_name')  ## 添加参数
+    parser.add_argument('--group_id')  ## 添加参数
+    parser.add_argument('--env')  ## 添加参数
+    args = parser.parse_args()  ### 参数赋值,也可以通过终端赋值
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)

+ 115 - 0
xiaoniangao/xiaoniangao_main/run_xng_hour.py

@@ -0,0 +1,115 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/7
+import argparse
+import datetime
+import os
+import random
+import sys
+import time
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.public import get_consumer, ack_message, task_fun_mq
+from common.common import Common
+from common.scheduling_db import MysqlHelper
+from xiaoniangao.xiaoniangao_hour.xiaoniangao_hour_scheduling import XiaoniangaoHourScheduling
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    consumer = get_consumer(topic_name, group_id)
+    # 长轮询表示如果Topic没有消息,则客户端请求会在服务端挂起3秒,3秒内如果有消息可以消费则立即返回响应。
+    # 长轮询时间3秒(最多可设置为30秒)。
+    wait_seconds = 3
+    # 一次最多消费1条(最多可设置为16条)。
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # 长轮询消费消息。
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # 处理爬虫业务
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                task_id = task_dict['id']
+                select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
+                user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
+                our_uid_list = []
+                for user in user_list:
+                    our_uid_list.append(user["uid"])
+                our_uid = random.choice(our_uid_list)
+                Common.logger(log_type, crawler).info(f"调度任务:\n{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:\n{rule_dict}")
+                Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                # Fetch videos that match the rules and write them to the hourly-level feeds data
+                for i in range(1, 101):
+                    try:
+                        Common.logger(log_type, crawler).info(f"正在抓取第{i}页")
+                        XiaoniangaoHourScheduling.get_videoList(log_type, crawler, rule_dict, env)
+                    except Exception as err:
+                        Common.logger(log_type, crawler).info(f"抓取第{i}页时异常:{err}\n")
+                now = datetime.datetime.now()
+                if now.hour == 10 and 0 <= now.minute <= 10:
+                    Common.logger(log_type, crawler).info("开始更新/下载上升榜")
+                    XiaoniangaoHourScheduling.update_videoList(log_type=log_type,
+                                                               crawler=crawler,
+                                                               rule_dict=rule_dict,
+                                                               our_uid=our_uid,
+                                                               env=env)
+
+                elif now.hour == 15 and now.minute <= 10:
+                    Common.logger(log_type, crawler).info("开始更新/下载上升榜")
+                    XiaoniangaoHourScheduling.update_videoList(log_type=log_type,
+                                                               crawler=crawler,
+                                                               rule_dict=rule_dict,
+                                                               our_uid=our_uid,
+                                                               env=env)
+
+                elif now.hour == 20 and now.minute <= 10:
+                    Common.logger(log_type, crawler).info("开始更新/下载上升榜")
+                    XiaoniangaoHourScheduling.update_videoList(log_type=log_type,
+                                                               crawler=crawler,
+                                                               rule_dict=rule_dict,
+                                                               our_uid=our_uid,
+                                                               env=env)
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+
+        except MQExceptionBase as err:
+            # No messages available to consume in the topic.
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  # create the argument parser
+    parser.add_argument('--log_type', type=str)  # add argument, specifying its type
+    parser.add_argument('--crawler')  # add argument
+    parser.add_argument('--topic_name')  # add argument
+    parser.add_argument('--group_id')  # add argument
+    parser.add_argument('--env')  # add argument
+    args = parser.parse_args()  # parse the arguments; values can also be supplied from the command line
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)
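In run_xng_hour.py above, the 10:00, 15:00, and 20:00 branches run identical update code, and `0 <= now.minute` is always true, so the three conditions differ only in the hour. A small sketch of an equivalent single check follows; the helper name `in_update_window` is hypothetical.

```python
# Sketch only: one membership test covering the first 10 minutes of each update hour.
import datetime


def in_update_window(now=None, hours=(10, 15, 20), max_minute=10):
    """True during the first `max_minute` minutes of any hour listed in `hours`."""
    now = now or datetime.datetime.now()
    return now.hour in hours and now.minute <= max_minute


# Equivalent to the three branches above:
# if in_update_window():
#     XiaoniangaoHourScheduling.update_videoList(log_type=log_type, crawler=crawler,
#                                                rule_dict=rule_dict, our_uid=our_uid, env=env)
```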

+ 89 - 0
xiaoniangao/xiaoniangao_main/run_xng_play.py

@@ -0,0 +1,89 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/7
+import argparse
+import random
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.public import get_consumer, ack_message, task_fun_mq
+from common.common import Common
+from common.scheduling_db import MysqlHelper
+from xiaoniangao.xiaoniangao_play.xiaoniangao_play_scheduling import XiaoniangaoplayScheduling
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    consumer = get_consumer(topic_name, group_id)
+    # Long polling: if the topic has no messages, the request is held on the server for up to 3 seconds;
+    # if a message becomes consumable within that window, the response returns immediately.
+    # Long-poll wait time: 3 seconds (30 seconds max).
+    wait_seconds = 3
+    # Consume at most 1 message per request (up to 16 can be set).
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # Consume messages via long polling.
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # Handle the crawler task carried by this message
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                task_id = task_dict['id']
+                select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
+                user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
+                our_uid_list = []
+                for user in user_list:
+                    our_uid_list.append(user["uid"])
+                our_uid = random.choice(our_uid_list)
+                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                # Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
+                Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                XiaoniangaoplayScheduling.get_videoList(log_type=log_type,
+                                                        crawler=crawler,
+                                                        rule_dict=rule_dict,
+                                                        our_uid=our_uid,
+                                                        env=env)
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+
+        except MQExceptionBase as err:
+            # No messages available to consume in the topic.
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  # create the argument parser
+    parser.add_argument('--log_type', type=str)  # add argument, specifying its type
+    parser.add_argument('--crawler')  # add argument
+    parser.add_argument('--topic_name')  # add argument
+    parser.add_argument('--group_id')  # add argument
+    parser.add_argument('--env')  # add argument
+    args = parser.parse_args()  # parse the arguments; values can also be supplied from the command line
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)
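run_xng_play.py (like the other runners) calls `task_fun_mq(msg.message_body)` twice per message and passes `random.choice` a list that is empty when the task has no bound users, which raises `IndexError`. A hedged sketch of doing both in one place; `pick_task_context` is a hypothetical name, and the function assumes the same `task_fun_mq` return shape used above.

```python
# Hypothetical helper: parse the MQ body once and pick a bound uid defensively.
import random

from common.public import task_fun_mq


def pick_task_context(message_body, user_list):
    """Return (task_dict, rule_dict, our_uid); our_uid is None if no users are bound."""
    body = task_fun_mq(message_body)  # single parse instead of two calls per message
    our_uid_list = [user["uid"] for user in user_list]
    our_uid = random.choice(our_uid_list) if our_uid_list else None  # avoid IndexError on []
    return body['task_dict'], body['rule_dict'], our_uid
```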

+ 0 - 38
xigua/xigua_follow/demo.py

@@ -1,38 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/3/7
-import time
-from datetime import date, timedelta
-
-
-class Demo:
-    @classmethod
-    def test_time(cls):
-        min_publish_day = 10
-        min_publish_day = (date.today() + timedelta(days=-min_publish_day)).strftime("%Y-%m-%d")
-        min_publish_day = int(time.mktime(time.strptime(min_publish_day, "%Y-%m-%d")))
-        print(min_publish_day)
-
-
-    @classmethod
-    def test_str(cls):
-        list1 = [('task_id','19'),('task_name','西瓜定向抓取'),('source','xigua'),('next_time','1678174642'),('interval_piaoquan','600'),('play_cnt',{'min':1000}),('video_width',{'min':720}),('video_height',{'min':720}),('video_like',{'min':0}),('share_cnt',{'min':0}),('duration_min',60),('duration_max',6000),('task_type','author'),('spider_link',['https://www.ixigua.com/home/95420624045','https://www.ixigua.com/home/6431477489']),('spider_name','run_xigua_follow'),('min_publish_time','0'),('min_publish_day','10'),('media_id','6267141'),('applets_status','0'),('app_status','3'),('user_tag','西瓜爬虫,定向爬虫策略'),('user_content_tag','搞笑博主'),('local','aliyun')]
-        dict1 = dict(list1)
-        print(type(dict1))
-        print(dict1)
-        for k, v in dict1.items():
-            print(f"{k}:{v},{type(v)}")
-
-
-
-
-        # for tuple1 in str1:
-        #     list1 = list(tuple1)
-        #     dict1 = dict(zip(list1))
-
-
-if __name__ == "__main__":
-    # Demo.test_time()
-    # Demo.test_str()
-    print(int('0'))
-    pass

+ 0 - 119
xigua/xigua_follow/insert_videos.py

@@ -1,119 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/2/23
-import json
-import os
-import sys
-
-sys.path.append(os.getcwd())
-from common.common import Common
-from common.db import MysqlHelper
-from common.feishu import Feishu
-
-
-class Insert:
-    @classmethod
-    def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env, machine):
-        xigua_sheetid_list = ["QOWqMo", "3Ul6wZ", "e075e9"]
-        for sheetid in xigua_sheetid_list:
-            xigua_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            for i in range(1, len(xigua_sheet)):
-            # for i in range(1, 3):
-                if xigua_sheet[i][5] is None:
-                    continue
-                video_id = xigua_sheet[i][9].replace("https://admin.piaoquantv.com/cms/post-detail/", "").replace("/info", "")
-                if video_id == "None":
-                    continue
-                video_id = int(video_id)
-                user_id = 0
-                out_user_id = str(xigua_sheet[i][19])
-                platform = "西瓜视频"
-                strategy = "定向爬虫策略"
-                out_video_id = str(xigua_sheet[i][8])
-                video_title = str(xigua_sheet[i][7])
-                cover_url = str(xigua_sheet[i][21])
-                video_url = str(xigua_sheet[i][22])
-                duration = int(xigua_sheet[i][15])
-                publish_time = str(xigua_sheet[i][17].replace("/", "-"))
-                play_cnt = int(xigua_sheet[i][11])
-                like_cnt = int(xigua_sheet[i][13])
-                share_cnt = int(xigua_sheet[i][14])
-                # collection_cnt = 0
-                comment_cnt = int(xigua_sheet[i][12])
-                crawler_rule = json.dumps({"play_cnt": 0, "comment_cnt": 0, "like_cnt": 0, "duration": 60, "publish_time": 10, "video_width": 720, "video_height": 720})
-                width = int(xigua_sheet[i][16].split("*")[0])
-                height = int(xigua_sheet[i][16].split("*")[1])
-
-                # print(f"video_id:{video_id}, type:{type(video_id)}")
-                # print(f"user_id:{user_id}, type:{type(user_id)}")
-                # print(f"out_user_id:{out_user_id}, type:{type(out_user_id)}")
-                # print(f"platform:{platform}, type:{type(platform)}")
-                # print(f"strategy:{strategy}, type:{type(strategy)}")
-                # print(f"out_video_id:{out_video_id}, type:{type(out_video_id)}")
-                # print(f"video_title:{video_title}, type:{type(video_title)}")
-                # print(f"cover_url:{cover_url}, type:{type(cover_url)}")
-                # print(f"video_url:{video_url}, type:{type(video_url)}")
-                # print(f"duration:{duration}, type:{type(duration)}")
-                # print(f"publish_time:{publish_time}, type:{type(publish_time)}")
-                # print(f"play_cnt:{play_cnt}, type:{type(play_cnt)}")
-                # print(f"like_cnt:{like_cnt}, type:{type(like_cnt)}")
-                # print(f"share_cnt:{share_cnt}, type:{type(share_cnt)}")
-                # print(f"collection_cnt:{collection_cnt}, type:{type(collection_cnt)}")
-                # print(f"comment_cnt:{comment_cnt}, type:{type(comment_cnt)}")
-                # print(f"crawler_rule:{crawler_rule}, type:{type(crawler_rule)}")
-                # print(f"width:{width}, type:{type(width)}")
-                # print(f"height:{height}, type:{type(height)}\n")
-
-                select_sql = f""" select * from crawler_video where platform="{platform}" and out_video_id="{out_video_id}" """
-                Common.logger(log_type, crawler).info(f"select_sql:{select_sql}")
-                repeat_video = MysqlHelper.get_values(log_type, crawler, select_sql, env, machine)
-                Common.logger(log_type, crawler).info(f"repeat_video:{repeat_video}")
-
-                if repeat_video is not None and len(repeat_video) != 0:
-                    Common.logger(log_type, crawler).info(f"{video_title} 已存在数据库中\n")
-                else:
-                    # 视频信息保存数据库
-                    insert_sql = f""" insert into crawler_video(video_id,
-                                    user_id,
-                                    out_user_id,
-                                    platform,
-                                    strategy,
-                                    out_video_id,
-                                    video_title,
-                                    cover_url,
-                                    video_url,
-                                    duration,
-                                    publish_time,
-                                    play_cnt,
-                                    like_cnt,
-                                    share_cnt,
-                                    comment_cnt,
-                                    crawler_rule,
-                                    width,
-                                    height)
-                                    values({video_id},
-                                    {user_id},
-                                    "{out_user_id}",
-                                    "{platform}",
-                                    "{strategy}",
-                                    "{out_video_id}",
-                                    "{video_title}",
-                                    "{cover_url}",
-                                    "{video_url}",
-                                    {duration},
-                                    "{publish_time}",
-                                    {play_cnt},
-                                    {like_cnt},
-                                    {share_cnt},
-                                    {comment_cnt},
-                                    '{crawler_rule}',
-                                    {width},
-                                    {height}) """
-                    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-                    MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine)
-                    Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
-
-
-
-if __name__ == "__main__":
-    Insert.insert_video_from_feishu_to_mysql("insert", "xigua", "dev", "local")

+ 0 - 57
xigua/xigua_follow/xigua_test.py

@@ -1,57 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/2/24
-import requests
-import string
-import random
-
-
-def random_signature():
-    src_digits = string.digits  # string_数字
-    src_uppercase = string.ascii_uppercase  # string_大写字母
-    src_lowercase = string.ascii_lowercase  # string_小写字母
-    digits_num = random.randint(1, 6)
-    uppercase_num = random.randint(1, 26 - digits_num - 1)
-    lowercase_num = 26 - (digits_num + uppercase_num)
-    password = random.sample(src_digits, digits_num) + random.sample(src_uppercase, uppercase_num) + random.sample(
-        src_lowercase, lowercase_num)
-    random.shuffle(password)
-    new_password = 'AAAAAAAAAA' + ''.join(password)[10:-4] + 'AAAB'
-    new_password_start = new_password[0:18]
-    new_password_end = new_password[-7:]
-    if new_password[18] == '8':
-        new_password = new_password_start + 'w' + new_password_end
-    elif new_password[18] == '9':
-        new_password = new_password_start + 'x' + new_password_end
-    elif new_password[18] == '-':
-        new_password = new_password_start + 'y' + new_password_end
-    elif new_password[18] == '.':
-        new_password = new_password_start + 'z' + new_password_end
-    else:
-        new_password = new_password_start + 'y' + new_password_end
-    return new_password
-
-
-def get_user_video(uid):
-    signature = random_signature()
-    url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
-    params = {
-        'to_user_id': uid,
-        'offset': '0',
-        'limit': '30',
-        'maxBehotTime': '0',
-        'order': 'new',
-        'isHome': '0',
-        '_signature': signature,
-    }
-    headers = {
-        'referer': f'https://www.ixigua.com/home/{uid}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
-        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41',
-    }
-    response = requests.get(url=url, headers=headers, params=params, verify=False)
-    print(response.text)
-
-
-if __name__ == '__main__':
-    uid = '3865480345435996'
-    get_user_video(uid)

+ 85 - 0
xigua/xigua_main/run_xg_author.py

@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/7
+import argparse
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.common import Common
+from common.public import get_consumer, ack_message, task_fun_mq
+from common.scheduling_db import MysqlHelper
+from xigua.xigua_author.xigua_author_scheduling import XiguaauthorScheduling
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    consumer = get_consumer(topic_name, group_id)
+    # Long polling: if the topic has no messages, the request is held on the server for up to 3 seconds;
+    # if a message becomes consumable within that window, the response returns immediately.
+    # Long-poll wait time: 3 seconds (30 seconds max).
+    wait_seconds = 3
+    # Consume at most 1 message per request (up to 16 can be set).
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # Consume messages via long polling.
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # Handle the crawler task carried by this message
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                task_id = task_dict['id']
+                select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
+                user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
+                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                # Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
+                Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                XiguaauthorScheduling.get_author_videos(log_type=log_type,
+                                                        crawler=crawler,
+                                                        rule_dict=rule_dict,
+                                                        user_list=user_list,
+                                                        env=env)
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+
+        except MQExceptionBase as err:
+            # No messages available to consume in the topic.
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  # create the argument parser
+    parser.add_argument('--log_type', type=str)  # add argument, specifying its type
+    parser.add_argument('--crawler')  # add argument
+    parser.add_argument('--topic_name')  # add argument
+    parser.add_argument('--group_id')  # add argument
+    parser.add_argument('--env')  # add argument
+    args = parser.parse_args()  # parse the arguments; values can also be supplied from the command line
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)
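Each runner builds `select * from crawler_user_v3 where task_id={task_id}` by f-string interpolation of a value taken from the MQ message body. As an illustration only, the same lookup with a bound parameter is sketched below using plain `pymysql` rather than the project's `MysqlHelper`; the connection settings are placeholders, not real credentials.

```python
# Illustrative standalone sketch (not the project's MysqlHelper): binding task_id
# as a parameter avoids quoting bugs and SQL injection if the MQ body is malformed.
import pymysql


def select_task_users(task_id, host="127.0.0.1", user="crawler", password="***", db="piaoquan"):
    connection = pymysql.connect(host=host, user=user, password=password,
                                 database=db, cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            cursor.execute("select * from crawler_user_v3 where task_id = %s", (task_id,))
            return cursor.fetchall()
    finally:
        connection.close()
```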

+ 90 - 0
xigua/xigua_main/run_xg_recommend.py

@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/7
+import argparse
+import random
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.common import Common
+from common.public import get_consumer, ack_message, task_fun_mq
+from common.scheduling_db import MysqlHelper
+from xigua.xigua_recommend.xigua_recommend_scheduling import XiguarecommendScheduling
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    consumer = get_consumer(topic_name, group_id)
+    # Long polling: if the topic has no messages, the request is held on the server for up to 3 seconds;
+    # if a message becomes consumable within that window, the response returns immediately.
+    # Long-poll wait time: 3 seconds (30 seconds max).
+    wait_seconds = 3
+    # Consume at most 1 message per request (up to 16 can be set).
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # Consume messages via long polling.
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # Handle the crawler task carried by this message
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                task_id = task_dict['id']
+                select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
+                user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
+                our_uid_list = []
+                for user in user_list:
+                    our_uid_list.append(user["uid"])
+                our_uid = random.choice(our_uid_list)
+                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                # Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
+                Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                XiguarecommendScheduling.get_videoList(log_type=log_type,
+                                                       crawler=crawler,
+                                                       rule_dict=rule_dict,
+                                                       our_uid=our_uid,
+                                                       env=env)
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+
+        except MQExceptionBase as err:
+            # No messages available to consume in the topic.
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  # create the argument parser
+    parser.add_argument('--log_type', type=str)  # add argument, specifying its type
+    parser.add_argument('--crawler')  # add argument
+    parser.add_argument('--topic_name')  # add argument
+    parser.add_argument('--group_id')  # add argument
+    parser.add_argument('--env')  # add argument
+    args = parser.parse_args()  # parse the arguments; values can also be supplied from the command line
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)
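All of these runners ack the message right after logging it and before the crawl runs, so a crash mid-task silently drops that task. A sketch of an ack-after-success variant follows; the helper `process_then_ack` is hypothetical and reuses the same `ack_message` call shown above.

```python
# Hypothetical variant: only ack once the crawl callback succeeds, so MQ can
# redeliver the message (per its NextConsumeTime) if the process dies mid-run.
from common.common import Common
from common.public import ack_message


def process_then_ack(consumer, msg, handle, log_type, crawler):
    try:
        handle(msg)  # run the crawler business first
    except Exception as err:
        Common.logger(log_type, crawler).info(f"Task failed, leaving message for redelivery: {err}\n")
        return
    ack_message(log_type=log_type, crawler=crawler, recv_msgs=[msg], consumer=consumer)
```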

+ 87 - 0
xigua/xigua_main/run_xg_search.py

@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/7
+import argparse
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.common import Common
+from common.public import get_consumer, ack_message, task_fun_mq
+from common.scheduling_db import MysqlHelper
+from xigua.xigua_search.xigua_search_scheduling import XiguasearchScheduling
+
+
+def main(log_type, crawler, topic_name, group_id, env):
+    consumer = get_consumer(topic_name, group_id)
+    # Long polling: if the topic has no messages, the request is held on the server for up to 3 seconds;
+    # if a message becomes consumable within that window, the response returns immediately.
+    # Long-poll wait time: 3 seconds (30 seconds max).
+    wait_seconds = 3
+    # Consume at most 1 message per request (up to 16 can be set).
+    batch = 1
+    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
+    while True:
+        try:
+            # Consume messages via long polling.
+            recv_msgs = consumer.consume_message(batch, wait_seconds)
+            for msg in recv_msgs:
+                Common.logger(log_type, crawler).info(f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
+                # ack_mq_message
+                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                # Handle the crawler task carried by this message
+                task_dict = task_fun_mq(msg.message_body)['task_dict']
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                task_id = task_dict['id']
+                select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
+                user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
+                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                # Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
+                Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                XiguasearchScheduling.get_search_videos(log_type=log_type,
+                                                        crawler=crawler,
+                                                        rule_dict=rule_dict,
+                                                        user_list=user_list,
+                                                        env=env)
+                os.system("ps aux | grep Chrome | grep -v grep | awk '{print $2}' | xargs kill -9")
+                os.system("ps aux | grep chromedriver | grep -v grep | awk '{print $2}' | xargs kill -9")
+                Common.del_logs(log_type, crawler)
+                Common.logger(log_type, crawler).info('抓取一轮结束\n')
+
+        except MQExceptionBase as err:
+            # No messages available to consume in the topic.
+            if err.type == "MessageNotExist":
+                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                continue
+
+            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            time.sleep(2)
+            continue
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  # create the argument parser
+    parser.add_argument('--log_type', type=str)  # add argument, specifying its type
+    parser.add_argument('--crawler')  # add argument
+    parser.add_argument('--topic_name')  # add argument
+    parser.add_argument('--group_id')  # add argument
+    parser.add_argument('--env')  # add argument
+    args = parser.parse_args()  # parse the arguments; values can also be supplied from the command line
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         topic_name=args.topic_name,
+         group_id=args.group_id,
+         env=args.env)
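run_xg_search.py cleans up Chrome and chromedriver after each round by shelling out to a `ps | grep | awk | xargs kill -9` pipeline. An alternative sketch using `pkill` via `subprocess` is shown below; the effect is the same with less quoting, and a non-zero return code just means nothing matched.

```python
# Sketch: pkill -f matches the full command line, so "Chrome" and "chromedriver"
# cover the same processes the ps/grep/awk pipeline above targets.
import subprocess


def kill_browser_processes(patterns=("Chrome", "chromedriver")):
    for pattern in patterns:
        subprocess.run(["pkill", "-9", "-f", pattern], check=False)
```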

+ 0 - 60
xigua/xigua_main/run_xigua_follow_scheduling.py

@@ -1,60 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: lierqiang
-# @Time: 2023/4/21
-import argparse
-import os
-import sys
-
-sys.path.append(os.getcwd())
-from common.common import Common
-from xigua.xigua_follow.xigua_follow_scheduling import ScheduleXiguaFollow
-from common.public import task_fun
-
-
-# from common.feishu import Feishu
-
-
-def main(log_type, crawler, task, oss_endpoint, env):
-    task = task_fun(task)
-    try:
-        Common.logger(log_type, crawler).info('开始抓取 西瓜视频 定向榜\n')
-        ScheduleXiguaFollow.get_follow_videos(log_type=log_type,
-                                              crawler=crawler,
-                                              task=task,
-                                              oss_endpoint=oss_endpoint,
-                                              env=env)
-        Common.del_logs(log_type, crawler)
-        Common.logger(log_type, crawler).info('抓取任务结束\n')
-    except Exception as e:
-        Common.logger(log_type, crawler).info(f"西瓜视频异常,触发报警:{e}\n")
-        # Feishu.bot(log_type, crawler, f"{e}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()  ## 新建参数解释器对象
-    parser.add_argument('--log_type', default='author')  ## 添加参数,注明参数类型
-    parser.add_argument('--crawler', default='xigua')  ## 添加参数
-    parser.add_argument('--strategy', default='定向抓取')  ## 添加参数
-    parser.add_argument('--task')  ## 添加参数
-    parser.add_argument('--oss_endpoint', default='outer')  ## 添加参数
-    parser.add_argument('--env', default='dev')  ## 添加参数
-    # parser.add_argument('--machine')  ## 添加参数
-    args = parser.parse_args()  ### 参数赋值,也可以通过终端赋值
-    task = {
-        'task_dict': {'task_id': '17', 'task_name': '西瓜测试4.21', 'source': 'xigua', 'start_time': '1682010720000',
-                      'interval': '24', 'mode': 'author',
-                      'rule': {'duration': {'min': 40, 'max': 0}, 'playCnt': {'min': 4000, 'max': 0},
-                               'period': {'min': 10, 'max': 0}, 'fans': {'min': 0, 'max': 0},
-                               'videos': {'min': 0, 'max': 0}, 'like': {'min': 0, 'max': 0},
-                               'videoWidth': {'min': 0, 'max': 0}, 'videoHeight': {'min': 0, 'max': 0}},
-                      'spider_name': 'run_xiguan_author_scheduling', 'machine': 'aliyun', 'status': '0',
-                      'create_time': '1682048632396', 'update_time': '1682048632396', 'operator': ''},
-        'rule_dict': {'duration': {'min': 40, 'max': 0}, 'playCnt': {'min': 4000, 'max': 0},
-                      'period': {'min': 10, 'max': 0}, 'fans': {'min': 0, 'max': 0}, 'videos': {'min': 0, 'max': 0},
-                      'like': {'min': 0, 'max': 0}, 'videoWidth': {'min': 0, 'max': 0},
-                      'videoHeight': {'min': 0, 'max': 0}}}
-    main(log_type=args.log_type,
-         crawler=args.crawler,
-         task=task,
-         oss_endpoint=args.oss_endpoint,
-         env=args.env)

+ 0 - 78
xigua/xigua_recommend/insert.py

@@ -1,78 +0,0 @@
-import json
-import os
-import sys
-import time
-from datetime import date, timedelta
-sys.path.append(os.getcwd())
-from common.scheduling_db import MysqlHelper
-from common.feishu import Feishu
-
-
-class Demo:
-    @classmethod
-    def get_config(cls, log_type, crawler, text, env):
-        select_sql = f"""select * from crawler_config where source="xigua" """
-        contents = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
-        title_list = []
-        filter_list = []
-        for content in contents:
-            config = content['config']
-            config_dict = eval(config)
-            for k, v in config_dict.items():
-                if k == "title":
-                    title_list_config = v.split(",")
-                    for title in title_list_config:
-                        title_list.append(title)
-                if k == "filter":
-                    filter_list_config = v.split(",")
-                    for filter_word in filter_list_config:
-                        filter_list.append(filter_word)
-        if text == "title":
-            return title_list
-        elif text == "filter":
-            return filter_list
-
-    @classmethod
-    def before_day(cls):
-        publish_time_str_rule = (date.today() + timedelta(days=-30)).strftime("%Y-%m-%d %H:%M:%S")
-        publish_time_stamp_rule = int(time.mktime(time.strptime(publish_time_str_rule, "%Y-%m-%d %H:%M:%S")))
-        print(publish_time_str_rule)
-        print(publish_time_stamp_rule)
-
-    @classmethod
-    def insert_config(cls, log_type, crawler, env):
-        filter_sheet = Feishu.get_values_batch(log_type, crawler, "KGB4Hc")
-        title_sheet = Feishu.get_values_batch(log_type, crawler, "bHSW1p")
-        filter_list = []
-        title_list = []
-        for x in filter_sheet:
-            for y in x:
-                if y is None:
-                    pass
-                else:
-                    filter_list.append(y)
-        for x in title_sheet:
-            for y in x:
-                if y is None:
-                    pass
-                else:
-                    title_list.append(y)
-        str_title = ','.join(title_list)
-        str_filter = ','.join(filter_list)
-        config_dict = {
-            "title": str_title,
-            "filter": str_filter
-        }
-        str_config_dict = str(config_dict)
-        # print(f"config_dict:{config_dict}")
-        # print(f"str_config_dict:{str_config_dict}")
-        insert_sql = f""" insert into crawler_config(title, source, config) values("西瓜视频", "xigua", "{str_config_dict}") """
-        MysqlHelper.update_values(log_type, crawler, insert_sql, env)
-
-
-
-if __name__ == "__main__":
-    # Demo.get_config("demo", "xiaoniangao", "dev")
-    # Demo.before_day()
-    Demo.insert_config("demo", "xigua", "prod")
-    pass

+ 0 - 27
xigua/xigua_search/test.py

@@ -1,27 +0,0 @@
-import requests
-
-class Test:
-  @classmethod
-  def test_search(cls):
-    url = "https://www.ixigua.com/search/美国禁令/?logTag=423ac644324e5c15d0b4&tab_name=home"
-    headers = {
-      "cookie": "MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; passport_csrf_token=72b2574f3c99f8ba670e42df430218fd; passport_csrf_token_default=72b2574f3c99f8ba670e42df430218fd; sid_guard=c7472b508ea631823ba765a60cf8757f|1680867422|3024002|Fri,+12-May-2023+11:37:04+GMT; uid_tt=c13f47d51767f616befe32fb3e9f485a; uid_tt_ss=c13f47d51767f616befe32fb3e9f485a; sid_tt=c7472b508ea631823ba765a60cf8757f; sessionid=c7472b508ea631823ba765a60cf8757f; sessionid_ss=c7472b508ea631823ba765a60cf8757f; sid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; ssid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; __ac_signature=_02B4Z6wo00f01cFG6wAAAIDAuYgABqMKchHBZu-AABRtEnCTGzn5TJAsKOsuT7sRkpwCdN8j7eYG90xzDd55F2cCSZ0PajfVHvgm.7NmCht3MpN9fpw444-hLirWhH5NEyo.T3R-WhtUn32C58; SEARCH_CARD_MODE=7168304743566296612_0; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=87i80MjOSI6anYlKjNm3I80Az2HQvlC28pTxBpTksNBJ.srUyoC9hHhDYrE8N6fE4b26; ttwid=1|HHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY|1683773681|d51c586327656b27492c8f406dd2530c2b4d03c38c4010b2b9d3de5dc883998f; msToken=lewvwOnFOl5Z5z_VkMYd7d4N-5y5uY0j82_1tnhWnOav09INStsHQnr0U953YQ9LzowXSPNP7m6l0nv1faSF9VEsEHGWqTg47kXuZKu9L4brbN4pmDNqZMwZ-YVQWFs=; ixigua-a-s=1",
-      "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35"
-    }
-    response = requests.get(url=url, headers=headers)
-    print(response)
-    print(response.text)
-
-  @classmethod
-  def test_dict(cls):
-    dict_1 = {}
-    if dict_1 == {}:
-      print("yes")
-    else:
-      print("no")
-
-
-if __name__ == "__main__":
-  # Test.test_search()
-  Test.test_dict()
-  pass

+ 2 - 6
xigua/xigua_search/xigua_search_scheduling.py

@@ -558,7 +558,7 @@ class XiguasearchScheduling:
         chrome_options.add_argument("--window-size=1920,1080")
         chrome_options.add_argument("--no-sandbox")
         if env == "dev":
-            chromedriver = "/Users/wangkun/Downloads/chromedriver/chromedriver_v112/chromedriver"
+            chromedriver = "/Users/wangkun/Downloads/chromedriver/chromedriver_v114/chromedriver"
         else:
             chromedriver = "/usr/bin/chromedriver"
         # driver初始化
@@ -607,14 +607,10 @@ class XiguasearchScheduling:
                         driver.quit()
                         return
                     num += 1
-                    # Common.logger(log_type, crawler).info(f"len_videos:{len(video_element_temp)}")
-                    # Common.logger(log_type, crawler).info(f"index:{index}")
-                    # Common.logger(log_type, crawler).info(f"i:{i}")
-                    # Common.logger(log_type, crawler).info(f"index+i:{index+i}")
                     Common.logger(log_type, crawler).info(f'拖动"视频"列表第{num}个至屏幕中间')
                     driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})", video_element)
                     time.sleep(3)
-                    driver.get_screenshot_as_file(f"./{crawler}/logs/{num}.jpg")
+                    # driver.get_screenshot_as_file(f"./{crawler}/logs/{num}.jpg")
                     item_id = video_element.find_elements(By.XPATH, '//*[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]')[index+i].get_attribute('href')
                     item_id = item_id.split("com/")[-1].split("?&")[0]
                     video_dict = cls.get_video_info(log_type, crawler, item_id)

+ 0 - 158
zhiqingtiantiankan/zhiqingtiantiankan_recommend/zhiqing_insert.py

@@ -1,158 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/4/18
-import json
-import os
-import sys
-import time
-from datetime import date, timedelta
-from hashlib import md5
-
-sys.path.append(os.getcwd())
-from common.common import Common
-from common.feishu import Feishu
-from common.scheduling_db import MysqlHelper
-
-
-class Insert:
-    @classmethod
-    def get_config(cls, log_type, crawler, text, env):
-        select_sql = f"""select * from crawler_config where source="benshanzhufu" """
-        contents = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
-        title_list = []
-        filter_list = []
-        for content in contents:
-            config = content['config']
-            config_dict = eval(config)
-            for k, v in config_dict.items():
-                if k == "title":
-                    title_list_config = v.split(",")
-                    for title in title_list_config:
-                        title_list.append(title)
-                if k == "filter":
-                    filter_list_config = v.split(",")
-                    for filter_word in filter_list_config:
-                        filter_list.append(filter_word)
-        if text == "title":
-            return title_list
-        elif text == "filter":
-            return filter_list
-
-    @classmethod
-    def before_day(cls):
-        publish_time_str_rule = (date.today() + timedelta(days=-30)).strftime("%Y-%m-%d %H:%M:%S")
-        publish_time_stamp_rule = int(time.mktime(time.strptime(publish_time_str_rule, "%Y-%m-%d %H:%M:%S")))
-        print(publish_time_str_rule)
-        print(publish_time_stamp_rule)
-
-    @classmethod
-    def insert_config(cls, log_type, crawler, env):
-        filter_sheet = Feishu.get_values_batch(log_type, crawler, "DjXfqG")
-        # title_sheet = Feishu.get_values_batch(log_type, crawler, "bHSW1p")
-        filter_list = []
-        # title_list = []
-        for x in filter_sheet:
-            for y in x:
-                if y is None:
-                    pass
-                else:
-                    filter_list.append(y)
-        # for x in title_sheet:
-        #     for y in x:
-        #         if y is None:
-        #             pass
-        #         else:
-        #             title_list.append(y)
-        # str_title = ','.join(title_list)
-        str_filter = ','.join(filter_list)
-        config_dict = {
-            # "title": str_title,
-            "filter": str_filter
-        }
-        str_config_dict = str(config_dict)
-        # print(f"config_dict:{config_dict}")
-        # print(f"str_config_dict:{str_config_dict}")
-        insert_sql = f""" insert into crawler_config(title, source, config) values("本山祝福小程序", "benshanzhufu", "{str_config_dict}") """
-        MysqlHelper.update_values(log_type, crawler, insert_sql, env)
-
-    @classmethod
-    def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env):
-        zhiqing_sheetid = ['1a88b3']
-        for sheetid in zhiqing_sheetid:
-            xiaoniangao_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            for i in range(1, len(xiaoniangao_sheet)):
-            # for i in range(1, 5):
-                if xiaoniangao_sheet[i][5] is None or xiaoniangao_sheet[i][7] is None:
-                    continue
-                video_id = xiaoniangao_sheet[i][12].replace("https://admin.piaoquantv.com/cms/post-detail/", "").replace(
-                    "/info", "")
-                if video_id == "None":
-                    continue
-                video_id = int(video_id)
-                out_user_id = "zhiqingtiantiankan"
-                platform = "知青天天看"
-                strategy = "推荐榜爬虫策略"
-                video_title = str(xiaoniangao_sheet[i][7])
-                play_cnt = int(float(xiaoniangao_sheet[i][9].replace("阅读数", "").strip().split("万")[0])*10000)
-                duration = str(xiaoniangao_sheet[i][10])
-                width = int(xiaoniangao_sheet[i][11].split("*")[0])
-                height = int(xiaoniangao_sheet[i][11].split("*")[1])
-                cover_url = str(xiaoniangao_sheet[i][13])
-                video_url = str(xiaoniangao_sheet[i][14])
-                crawler_rule = json.dumps({})
-                out_video_id = md5(video_title.encode('utf8')).hexdigest()
-
-                # print(f"video_id:{video_id}, type:{type(video_id)}")
-                # print(f"out_user_id:{out_user_id}, type:{type(out_user_id)}")
-                # print(f"platform:{platform}, type:{type(platform)}")
-                # print(f"strategy:{strategy}, type:{type(strategy)}")
-                # print(f"video_title:{video_title}, type:{type(video_title)}")
-                # print(f"cover_url:{cover_url}, type:{type(cover_url)}")
-                # print(f"video_url:{video_url}, type:{type(video_url)}")
-                # print(f"crawler_rule:{crawler_rule}, type:{type(crawler_rule)}")
-
-                select_sql = f""" select * from crawler_video where platform="{platform}" and video_url="{video_url}" """
-                Common.logger(log_type, crawler).info(f"select_sql:{select_sql}")
-                repeat_video = MysqlHelper.get_values(log_type, crawler, select_sql, env)
-                Common.logger(log_type, crawler).info(f"repeat_video:{repeat_video}")
-
-                if repeat_video is not None and len(repeat_video) != 0:
-                    Common.logger(log_type, crawler).info(f"{video_title} 已存在数据库中\n")
-                else:
-                    # 视频信息保存数据库
-                    insert_sql = f""" insert into crawler_video(video_id,
-                                        out_user_id,
-                                        platform,
-                                        strategy,
-                                        out_video_id,
-                                        video_title,
-                                        cover_url,
-                                        video_url,
-                                        duration,
-                                        play_cnt,
-                                        crawler_rule,
-                                        width,
-                                        height)
-                                        values({video_id},
-                                        "{out_user_id}",
-                                        "{platform}",
-                                        "{strategy}",
-                                        "{out_video_id}",
-                                        "{video_title}",
-                                        "{cover_url}",
-                                        "{video_url}",
-                                        {duration},
-                                        {play_cnt},
-                                        '{crawler_rule}',
-                                        {width},
-                                        {height}) """
-                    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-                    MysqlHelper.update_values(log_type, crawler, insert_sql, env, action='')
-                    Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
-
-
-
-if __name__ == "__main__":
-    # Insert.insert_video_from_feishu_to_mysql("insert-dev", "zhiqingtiantiankan", "dev")
-    Insert.insert_video_from_feishu_to_mysql("insert-prod", "zhiqingtiantiankan", "prod")
-    pass

+ 0 - 158
zhongmiaoyinxin/zhongmiaoyinxin_recommend/insert.py

@@ -1,158 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/4/17
-import json
-import os
-import sys
-import time
-from datetime import date, timedelta
-from hashlib import md5
-
-sys.path.append(os.getcwd())
-from common.common import Common
-from common.feishu import Feishu
-from common.scheduling_db import MysqlHelper
-
-
-class Insert:
-    @classmethod
-    def get_config(cls, log_type, crawler, text, env):
-        select_sql = f"""select * from crawler_config where source="benshanzhufu" """
-        contents = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
-        title_list = []
-        filter_list = []
-        for content in contents:
-            config = content['config']
-            config_dict = eval(config)
-            for k, v in config_dict.items():
-                if k == "title":
-                    title_list_config = v.split(",")
-                    for title in title_list_config:
-                        title_list.append(title)
-                if k == "filter":
-                    filter_list_config = v.split(",")
-                    for filter_word in filter_list_config:
-                        filter_list.append(filter_word)
-        if text == "title":
-            return title_list
-        elif text == "filter":
-            return filter_list
-
-    @classmethod
-    def before_day(cls):
-        publish_time_str_rule = (date.today() + timedelta(days=-30)).strftime("%Y-%m-%d %H:%M:%S")
-        publish_time_stamp_rule = int(time.mktime(time.strptime(publish_time_str_rule, "%Y-%m-%d %H:%M:%S")))
-        print(publish_time_str_rule)
-        print(publish_time_stamp_rule)
-
-    @classmethod
-    def insert_config(cls, log_type, crawler, env):
-        filter_sheet = Feishu.get_values_batch(log_type, crawler, "DjXfqG")
-        # title_sheet = Feishu.get_values_batch(log_type, crawler, "bHSW1p")
-        filter_list = []
-        # title_list = []
-        for x in filter_sheet:
-            for y in x:
-                if y is None:
-                    pass
-                else:
-                    filter_list.append(y)
-        # for x in title_sheet:
-        #     for y in x:
-        #         if y is None:
-        #             pass
-        #         else:
-        #             title_list.append(y)
-        # str_title = ','.join(title_list)
-        str_filter = ','.join(filter_list)
-        config_dict = {
-            # "title": str_title,
-            "filter": str_filter
-        }
-        str_config_dict = str(config_dict)
-        # print(f"config_dict:{config_dict}")
-        # print(f"str_config_dict:{str_config_dict}")
-        insert_sql = f""" insert into crawler_config(title, source, config) values("本山祝福小程序", "benshanzhufu", "{str_config_dict}") """
-        MysqlHelper.update_values(log_type, crawler, insert_sql, env)
-
-    @classmethod
-    def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env):
-        jixiangxingfu_sheetid = ['19c772']
-        for sheetid in jixiangxingfu_sheetid:
-            xiaoniangao_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            for i in range(1, len(xiaoniangao_sheet)):
-            # for i in range(1, 5):
-                if xiaoniangao_sheet[i][5] is None or xiaoniangao_sheet[i][7] is None:
-                    continue
-                video_id = xiaoniangao_sheet[i][12].replace("https://admin.piaoquantv.com/cms/post-detail/", "").replace(
-                    "/info", "")
-                if video_id == "None":
-                    continue
-                video_id = int(video_id)
-                out_user_id = "zhongmiaoyinxin"
-                platform = "众妙音信"
-                strategy = "推荐榜爬虫策略"
-                video_title = str(xiaoniangao_sheet[i][7])
-                play_cnt = int(xiaoniangao_sheet[i][9].split("万")[0])*10000
-                duration = str(xiaoniangao_sheet[i][10])
-                width = int(xiaoniangao_sheet[i][11].split("*")[0])
-                height = int(xiaoniangao_sheet[i][11].split("*")[1])
-                cover_url = str(xiaoniangao_sheet[i][13])
-                video_url = str(xiaoniangao_sheet[i][14])
-                crawler_rule = json.dumps({})
-                out_video_id = md5(video_title.encode('utf8')).hexdigest()
-
-                # print(f"video_id:{video_id}, type:{type(video_id)}")
-                # print(f"out_user_id:{out_user_id}, type:{type(out_user_id)}")
-                # print(f"platform:{platform}, type:{type(platform)}")
-                # print(f"strategy:{strategy}, type:{type(strategy)}")
-                # print(f"video_title:{video_title}, type:{type(video_title)}")
-                # print(f"cover_url:{cover_url}, type:{type(cover_url)}")
-                # print(f"video_url:{video_url}, type:{type(video_url)}")
-                # print(f"crawler_rule:{crawler_rule}, type:{type(crawler_rule)}")
-
-                select_sql = f""" select * from crawler_video where platform="{platform}" and video_url="{video_url}" """
-                Common.logger(log_type, crawler).info(f"select_sql:{select_sql}")
-                repeat_video = MysqlHelper.get_values(log_type, crawler, select_sql, env)
-                Common.logger(log_type, crawler).info(f"repeat_video:{repeat_video}")
-
-                if repeat_video is not None and len(repeat_video) != 0:
-                    Common.logger(log_type, crawler).info(f"{video_title} 已存在数据库中\n")
-                else:
-                    # 视频信息保存数据库
-                    insert_sql = f""" insert into crawler_video(video_id,
-                                        out_user_id,
-                                        platform,
-                                        strategy,
-                                        out_video_id,
-                                        video_title,
-                                        cover_url,
-                                        video_url,
-                                        duration,
-                                        play_cnt,
-                                        crawler_rule,
-                                        width,
-                                        height)
-                                        values({video_id},
-                                        "{out_user_id}",
-                                        "{platform}",
-                                        "{strategy}",
-                                        "{out_video_id}",
-                                        "{video_title}",
-                                        "{cover_url}",
-                                        "{video_url}",
-                                        {duration},
-                                        {play_cnt},
-                                        '{crawler_rule}',
-                                        {width},
-                                        {height}) """
-                    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-                    MysqlHelper.update_values(log_type, crawler, insert_sql, env, action='')
-                    Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
-
-
-
-if __name__ == "__main__":
-    Insert.insert_video_from_feishu_to_mysql("insert-dev", "zhongmiaoyinxin", "dev")
-    # Insert.insert_video_from_feishu_to_mysql("insert-prod", "zhongmiaoyinxin", "prod")
-    pass

Some files were not shown because too many files changed in this diff