wangkun 2 years ago
parent
commit
e252a66e43
5 changed files with 86 additions and 70 deletions
  1. README.MD (+2 −1)
  2. common/db.py (+22 −11)
  3. common/users.py (+30 −30)
  4. main/main.sh (+16 −21)
  5. youtube/youtube_follow/youtube_follow.py (+16 −7)

+ 2 - 1
README.MD

@@ -19,7 +19,8 @@ ${nohup_dir}:       nohup log storage path, e.g.: ./youtube/nohup.log
 ### Example run commands for crawlers in production
 ```
 youtube targeted crawler run command: 
-sh ./main/main.sh ./youtube/youtube_main/run_youtube_follow.py --log_type="follow" --crawler="youtube" --strategy="定向爬虫策略" --oss_endpoint="hk" --env="dev" --machine="aliyun_hk" ./youtube/nohup.log
+sh ./main/main.sh ./youtube/youtube_main/run_youtube_follow.py --log_type="follow" --crawler="youtube" --strategy="定向爬虫策略" --oss_endpoint="hk" --env="prod" --machine="aliyun_hk" youtube/nohup.log
 youtube targeted crawler kill-process command: 
+ps aux | grep run_youtube | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep run_youtube | grep Python | grep -v grep | awk '{print $2}' | xargs kill -9
 ```

+ 22 - 11
common/db.py

@@ -9,12 +9,23 @@ from common.common import Common
 
 class MysqlHelper:
     @classmethod
-    def connect_mysql(cls, env):
-        if env == 'prod':
+    def connect_mysql(cls, env, machine):
+        if machine == 'aliyun_hk':
             # Create a Connection object representing a database connection
             connection = pymysql.connect(
-                # host="rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com",# database host, internal address
-                host="rm-bp1159bu17li9hi94ro.mysql.rds.aliyuncs.com",# database host, public address
+                host="rm-j6cz4c6pt96000xi3.mysql.rds.aliyuncs.com",# database host, internal address
+                # host="rm-j6cz4c6pt96000xi3lo.mysql.rds.aliyuncs.com",# database host, public address
+                port=3306,                      # port
+                user="crawler",                 # MySQL username
+                passwd="crawler123456@",        # MySQL password
+                db="piaoquan-crawler" ,         # database name
+                # if the text stored in the database is utf8-encoded, set charset to utf8
+                charset = "utf8")
+        elif env == 'prod':
+            # Create a Connection object representing a database connection
+            connection = pymysql.connect(
+                host="rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com",# database host, internal address
+                # host="rm-bp1159bu17li9hi94ro.mysql.rds.aliyuncs.com",# database host, public address
                 port=3306,                      # port
                 user="crawler",                 # MySQL username
                 passwd="crawler123456@",        # MySQL password
@@ -24,8 +35,8 @@ class MysqlHelper:
         else:
             # Create a Connection object representing a database connection
             connection = pymysql.connect(
-                # host="rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com",# database host, internal address
-                host="rm-bp1k5853td1r25g3ndo.mysql.rds.aliyuncs.com",  # database host, public address
+                host="rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com",# database host, internal address
+                # host="rm-bp1k5853td1r25g3ndo.mysql.rds.aliyuncs.com",  # database host, public address
                 port=3306,  # port
                 user="crawler",  # MySQL username
                 passwd="crawler123456@",  # MySQL password
@@ -36,10 +47,10 @@ class MysqlHelper:
         return connection
 
     @classmethod
-    def get_values(cls, log_type, crawler, sql, env):
+    def get_values(cls, log_type, crawler, sql, env, machine):
         # try:
         # connect to the database
-        connect = cls.connect_mysql(env)
+        connect = cls.connect_mysql(env, machine)
         # get a Cursor object
         mysql = connect.cursor()
 
@@ -58,9 +69,9 @@ class MysqlHelper:
         #     Common.logger(log_type, crawler).error(f"get_values exception: {e}\n")
 
     @classmethod
-    def update_values(cls, log_type, crawler, sql, env):
+    def update_values(cls, log_type, crawler, sql, env, machine):
         # connect to the database
-        connect = cls.connect_mysql(env)
+        connect = cls.connect_mysql(env, machine)
         # get a Cursor object
         mysql = connect.cursor()
 
@@ -84,6 +95,6 @@ if __name__ == "__main__":
     # edit_data = MysqlHelper.edit_data(sql=sql_statement)
     # print(edit_data)
 
-    get_data = MysqlHelper.get_values("demo", "youtube", "select * from crawler_user", "dev")
+    get_data = MysqlHelper.get_values("demo", "youtube", "select * from crawler_user", "dev", "local")
     print(get_data)
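
Taken together, the db.py changes thread a new `machine` argument through every helper: `connect_mysql` now checks `machine == 'aliyun_hk'` first and only falls back to the `env`-based choice otherwise. A minimal usage sketch of the new signatures (the 'prod'/'aliyun_hk' combination is illustrative; values otherwise mirror the `__main__` example above):

```
from common.db import MysqlHelper

# machine takes priority: 'aliyun_hk' selects the Hong Kong instance regardless of env
rows = MysqlHelper.get_values("demo", "youtube", "select * from crawler_user", "prod", "aliyun_hk")
print(rows)

# any other machine value falls through to the env check: 'prod' -> prod RDS, else dev RDS
rows = MysqlHelper.get_values("demo", "youtube", "select * from crawler_user", "dev", "local")
print(rows)
```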
 

+ 30 - 30
common/users.py

@@ -23,36 +23,36 @@ class Users:
         :param env: environment
         :return: in-site UID
         """
-        try:
-            if env == 'dev':
-                # public network
-                url = 'http://videotest.yishihui.com/longvideoapi/user/virtual/crawler/registerVirtualUser'
-                # internal network
-                # url = 'http://videotest-internal.yishihui.com/longvideoapi/user/virtual/crawler/registerVirtualUser'
-            elif env == 'prod':
-                # public network
-                url = 'http://longvideoapi.piaoquantv.com/longvideoapi/user/virtual/crawler/registerVirtualUser'
-                # internal network
-                # url = 'http://longvideoapi-internal.piaoquantv.com/longvideoapi/user/virtual/crawler/registerVirtualUser'
-            else:
-                # public network
-                url = 'http://longvideoapi.piaoquantv.com/longvideoapi/user/virtual/crawler/registerVirtualUser'
-                # internal network
-                # url = 'http://longvideoapi-internal.piaoquantv.com/longvideoapi/user/virtual/crawler/registerVirtualUser'
-            params = {
-                # 'count': 1,     # (required) number of accounts: pass 1
-                # 'accountType': 4,   # (required) account type: pass 4, app virtual account
-                'pwd': '',  # password, default 12346
-                'nickName': user_dict['nickName'],  # nickname, default vuser......
-                'avatarUrl': user_dict['avatarUrl'],  # avatar URL, default http://weapppiccdn.yishihui.com/resources/images/pic_normal.png
-                'tagName': user_dict['tagName'],  # multiple values separated by commas
-            }
-            response = requests.post(url=url, params=params)
-            # print(response.text)
-            user_id = response.json()['data']
-            return user_id
-        except Exception as e:
-            Common.logger(log_type, crawler).error(f"create_user exception: {e}\n")
+        # try:
+        if env == 'dev':
+            # public network
+            url = 'https://videotest.yishihui.com/longvideoapi/user/virtual/crawler/registerVirtualUser'
+            # internal network
+            # url = 'http://videotest-internal.yishihui.com/longvideoapi/user/virtual/crawler/registerVirtualUser'
+        elif env == 'prod':
+            # public network
+            url = 'https://longvideoapi.piaoquantv.com/longvideoapi/user/virtual/crawler/registerVirtualUser'
+            # internal network
+            # url = 'http://longvideoapi-internal.piaoquantv.com/longvideoapi/user/virtual/crawler/registerVirtualUser'
+        else:
+            # public network
+            url = 'https://longvideoapi.piaoquantv.com/longvideoapi/user/virtual/crawler/registerVirtualUser'
+            # internal network
+            # url = 'http://longvideoapi-internal.piaoquantv.com/longvideoapi/user/virtual/crawler/registerVirtualUser'
+        params = {
+            # 'count': 1,     # (required) number of accounts: pass 1
+            # 'accountType': 4,   # (required) account type: pass 4, app virtual account
+            'pwd': '',  # password, default 12346
+            'nickName': user_dict['nickName'],  # nickname, default vuser......
+            'avatarUrl': user_dict['avatarUrl'],  # avatar URL, default http://weapppiccdn.yishihui.com/resources/images/pic_normal.png
+            'tagName': user_dict['tagName'],  # multiple values separated by commas
+        }
+        response = requests.post(url=url, params=params)
+        # print(response.text)
+        user_id = response.json()['data']
+        return user_id
+        # except Exception as e:
+        #     Common.logger(log_type, crawler).error(f"create_user exception: {e}\n")
 
 
 if __name__ == "__main__":

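With the try/except commented out, create_user now lets a bad response propagate to the caller instead of logging it and returning None. A hedged sketch of a caller-side guard (the helper below is illustrative, not part of this repo):

```
import requests

def register_virtual_user(url, params):
    # illustrative guard: create_user itself no longer swallows exceptions
    response = requests.post(url=url, params=params, timeout=10)
    body = response.json()      # raises ValueError if the reply is not JSON
    user_id = body.get('data')  # None when the API returns no 'data' field
    if user_id is None:
        raise RuntimeError(f"registerVirtualUser failed: {body}")
    return user_id
```
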
+ 16 - 21
main/main.sh

@@ -1,9 +1,6 @@
 #!/bin/bash
 # Kanyikan+ mini-program, Moments leaderboard
 # sh ./main/main.sh ./kanyikan/kanyikan_main/run_kanyikan_moment.py --log_type="moment" --crawler="kanyikan" --strategy="kanyikan_moment" --our_uid="kanyikan_moment" --oss_endpoint="out" --env="dev" ./kanyikan/nohup.log local
-# youtube targeted strategy
-# sh ./main/main.sh ./youtube/youtube_main/run_youtube_follow.py --log_type="follow" --crawler="youtube" --strategy="定向爬虫策略" --oss_endpoint="out" --env="dev" --machine="local" youtube/nohup.log
-# ps aux | grep run_youtube | grep Python | grep -v grep | awk '{print $2}' | xargs kill -9
 
 crawler_dir=$1  # crawler script path, e.g.: ./youtube/youtube_main/run_youtube_follow.py
 log_type=$2     # log name format, e.g.: follow produces 2023-02-08-follow.log under youtube/logs/
@@ -16,30 +13,25 @@ nohup_dir=$8    # nohup log storage path, e.g.: ./youtube/nohup.log
 
 echo "Start"
 
-if [ ${machine} = "macpro" ];then
+if [ ${machine} = "--machine=macpro" ];then
   piaoquan_crawler_dir=/Users/lieyunye/Desktop/piaoquan_crawler/
   profile_path=.bash_profile
   node_path=/usr/local/bin/node
   python=python3
-elif [ ${machine} = "macair" ];then
+elif [ ${machine} = "--machine=macair" ];then
   piaoquan_crawler_dir=/Users/piaoquan/Desktop/piaoquan_crawler/
   profile_path=./base_profile
   node_path=/usr/local/bin/node
   python=python3
-elif [ ${machine} = "aliyun_hk" ];then
+elif [ ${machine} = "--machine=aliyun_hk" ];then
   piaoquan_crawler_dir=/root/piaoquan_crawler/
   profile_path=/etc/profile
-  python=python
-elif [ ${machine} = "aliyun" ];then
+  python=python3
+elif [ ${machine} = "--machine=aliyun" ];then
   piaoquan_crawler_dir=/data5/wangkun/piaoquan_crawler/
   profile_path=/etc/profile
   python=python
-elif [ ${machine} = "local" ];then
-  piaoquan_crawler_dir=/Users/wangkun/Desktop/crawler/piaoquan_crawler/
-  profile_path=/etc/profile
-  node_path=/opt/homebrew/bin/node
-  python=python3
-else
+elif [ ${machine} = "--machine=local" ];then
   piaoquan_crawler_dir=/Users/wangkun/Desktop/crawler/piaoquan_crawler/
   profile_path=/etc/profile
   node_path=/opt/homebrew/bin/node
@@ -55,15 +47,17 @@ grep_str=run_${crawler##*=}
 ps aux | grep ${grep_str} | grep Python | grep -v grep | awk '{print $2}' | xargs kill -9
 echo "$(date "+%Y-%m-%d %H:%M:%S") Processes killed!"
 
+if [ ${machine} = "--machine=aliyun_hk" ];then
+  echo "No code update needed"
+else
+  echo "$(date "+%Y-%m-%d %H:%M:%S") Updating code..."
+  cd ${piaoquan_crawler_dir} && git pull origin master --force && rm -f ${piaoquan_crawler_dir}main/nohup.log && rm -f ${piaoquan_crawler_dir}${nohup_dir}
+  echo "$(date "+%Y-%m-%d %H:%M:%S") Code update complete!"
+fi
 
-echo "$(date "+%Y-%m-%d %H:%M:%S") Updating code..."
-cd ${piaoquan_crawler_dir} && git pull origin master --force && rm -f ${piaoquan_crawler_dir}main/nohup.log && rm -f ${piaoquan_crawler_dir}${nohup_dir}
-echo "$(date "+%Y-%m-%d %H:%M:%S") Code update complete!"
-
-
-if [ ${machine} = "--machine=aliyun" ];then
+if [ ${machine} = "--machine=aliyun_hk" ];then
   echo "No need to restart the Appium and adb services"
-elif [ ${machine} = "aliyun_hk" ];then
+elif [ ${machine} = "--machine=aliyun" ];then
   echo "No need to restart the Appium and adb services"
 else
   echo "$(date "+%Y-%m-%d %H:%M:%S") Restarting Appium..."
@@ -78,6 +72,7 @@ else
 fi
 
 echo "$(date "+%Y-%m-%d %H:%M:%S") Restarting the service..."
+cd ${piaoquan_crawler_dir}
 nohup ${python} -u ${crawler_dir} ${log_type} ${crawler} ${strategy} ${oss_endpoint} ${env} ${machine} >>${nohup_dir} 2>&1 &
 echo "$(date "+%Y-%m-%d %H:%M:%S") Service restarted!"
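
Note that main.sh now matches $7 against the full --machine=... token and forwards all flags verbatim to the Python entrypoint. A hedged sketch of how run_youtube_follow.py might consume them with argparse (the actual entrypoint is not part of this diff; flag names are copied from the run command in README.MD):

```
import argparse

# illustrative only: flag names taken from the README run command
parser = argparse.ArgumentParser()
parser.add_argument('--log_type')       # e.g. follow
parser.add_argument('--crawler')        # e.g. youtube
parser.add_argument('--strategy')       # e.g. 定向爬虫策略
parser.add_argument('--oss_endpoint')   # e.g. hk
parser.add_argument('--env')            # dev / prod
parser.add_argument('--machine')        # local / macpro / macair / aliyun / aliyun_hk
args = parser.parse_args()
print(args.machine)
```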
 

+ 16 - 7
youtube/youtube_follow/youtube_follow.py

@@ -399,7 +399,7 @@ class Follow:
             # If the in-site UID is empty and the DB lookup for (youtube + out_user_id) returns 0 rows, create a new in-site account
             if our_uid is None:
                 sql = f""" select * from crawler_user where platform="{cls.platform}" and out_user_id="{out_uid}" """
-                our_user_info = MysqlHelper.get_values(log_type, crawler, sql, env)
+                our_user_info = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
                 # If the DB lookup for (youtube + out_user_id) returns 0 rows, create an in-site UID, write it to the targeted-accounts Feishu sheet, and insert it together with the external user info into the crawler accounts database
                 if our_user_info is None or len(our_user_info) == 0:
                     # fetch the external account info and write it to the database
@@ -417,7 +417,8 @@ class Follow:
                         'tagName': tag,
                     }
                     our_uid = Users.create_user(log_type, crawler, create_user_dict, env)
-                    if 'env' == 'prod':
+                    Common.logger(log_type, crawler).info(f'Newly created in-site UID: {our_uid}')
+                    if env == 'prod':
                         our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
                     else:
                         our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
@@ -443,7 +444,7 @@ class Follow:
                                         {out_fans}, 
                                         "{cls.platform}",
                                         "{tag}") """
-                    MysqlHelper.update_values(log_type, crawler, sql, env)
+                    MysqlHelper.update_values(log_type, crawler, sql, env, machine)
                     Common.logger(log_type, crawler).info('User info inserted into the database!\n')
                 # If the DB lookup for (youtube + out_user_id) returns rows, write the existing in-site UID straight to Feishu
                 else:
@@ -690,7 +691,7 @@ class Follow:
                     # publish time <= 30 days
                     publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
                     if int(time.time()) - publish_time <= 3600*24*30:
-                        cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint)
+                        cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine)
                     else:
                         Common.logger(log_type, crawler).info('Publish time more than 30 days ago\n')
                         return
@@ -895,6 +896,11 @@ class Follow:
             if Translate.is_contains_chinese(video_title) is False:
                 video_title = Translate.google_translate(video_title, machine)  # auto-translate the title into Chinese
 
+            if 'lengthSeconds' not in videoDetails:
+                duration = 0
+            else:
+                duration = int(videoDetails['lengthSeconds'])
+
             # play_cnt
             if 'viewCount' not in videoDetails:
                 play_cnt = 0
@@ -946,6 +952,7 @@ class Follow:
             video_dict = {
                 'video_title': video_title,
                 'video_id': video_id,
+                'duration': duration,
                 'play_cnt': play_cnt,
                 'publish_time': publish_time,
                 'user_name': user_name,
@@ -956,11 +963,13 @@ class Follow:
             return video_dict
 
     @classmethod
-    def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint):
+    def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine):
         sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_dict['video_id']}" """
-        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
+        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
         if video_dict['video_title'] == '' or  video_dict['video_url'] == '':
             Common.logger(log_type, crawler).info('Invalid video\n')
+        elif video_dict['duration'] > 600 or video_dict['duration'] < 60:
+            Common.logger(log_type, crawler).info(f"Duration {video_dict['duration']} does not satisfy the rule\n")
         elif repeat_video is not None and len(repeat_video) != 0:
             Common.logger(log_type, crawler).info('Video already downloaded\n')
         else:
@@ -1065,7 +1074,7 @@ class Follow:
                 "{rule}",
                 {int(video_width)},
                 {int(video_height)}) """
-                MysqlHelper.update_values(log_type, crawler, sql, env)
+                MysqlHelper.update_values(log_type, crawler, sql, env, machine)
                 Common.logger(log_type, crawler).info('Video info inserted into the database!\n')
 
     @classmethod
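
The new duration gate reads lengthSeconds from videoDetails, defaults it to 0 when the field is missing, and rejects anything shorter than 60 s or longer than 600 s, so a missing field always fails. Restated as a standalone predicate (a sketch; the function name is not from the repo):

```
def duration_ok(video_details):
    # mirrors the commit: a missing lengthSeconds becomes 0 and fails the rule
    duration = int(video_details.get('lengthSeconds', 0))
    return 60 <= duration <= 600
```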