
Crawl 视频号 (WeChat Channels) data

zhangyong 8 months ago
parent
commit
0860860bf4
4 changed files with 180 additions and 1 deletion
  1. common/feishu_form.py (+23, -0)
  2. common/sql_help.py (+46, -1)
  3. data_channel/sph_crawling_data.py (+106, -0)
  4. sph_crawling_job.py (+5, -0)

+ 23 - 0
common/feishu_form.py

@@ -5,6 +5,7 @@ import random
 import sys
 import datetime
 
+from common.sql_help import sqlCollect
 
 sys.path.append(os.getcwd())
 from common.feishu_utils import Feishu
@@ -31,6 +32,28 @@ class Material():
                 return list
         return list
 
+    @classmethod
+    def get_sph_user(cls):
+        """Read 视频号 user ids from the Feishu sheet, keeping only ids not yet added."""
+        data = Feishu.get_values_batch("GPbhsb5vchAN3qtzot6cu1f0n1c", "cc7ef0")
+        user_data_list = []
+        try:
+            for row in data[1:]:
+                users = str(row[2])
+                if users and users != 'None':
+                    if ',' in users:
+                        user_list = users.split(',')
+                    else:
+                        user_list = [users]
+                    for user in user_list:
+                        # sph_channel_user returns True when the user is not yet in the DB.
+                        status = sqlCollect.sph_channel_user(user)
+                        if status:
+                            user_data_list.append(user)
+                else:
+                    # Stop at the first empty cell; rows below it are ignored.
+                    return user_data_list
+            return user_data_list
+        except Exception:
+            return user_data_list
+
     """
     获取对应负责人任务明细
     """

+ 46 - 1
common/sql_help.py

@@ -6,7 +6,7 @@ from common.mysql_db_aigc import AigcMysqlHelper
 
 sys.path.append(os.getcwd())
 from datetime import datetime
-from common import MysqlHelper
+from common.mysql_db import MysqlHelper
 
 
 class sqlCollect():
@@ -81,6 +81,51 @@ class sqlCollect():
             return False
         return True
 
+    """查询视频号user是否添加过"""
+    @classmethod
+    def sph_channel_user(cls, user_id):
+        sql = """
+                   SELECT user_id
+                   FROM sph_channel_user
+                   WHERE user_id = %s
+               """
+        data = MysqlHelper.get_values(sql, (str(user_id)))
+        if len(data) == 0 or data == ():
+            return True
+        return False
+
+    """抓取视频号数据入库"""
+    @classmethod
+    def sph_data_info(cls, channel: str, objectId: str, video_url: str, cover: str, video_title: str, share_cnt: str, like_cnt: str, oss_video_key: str, oss_cover_key: str, nick_name: str, user_name: str, comment_count: str, fav_count: str, create_time: str):
+        insert_sql = f"""INSERT INTO sph_data_info (channel, video_id, video_url, video_cover, video_title, share_cnt, like_cnt, oss_url, oss_cover, nick_name, user_name, comment_count, fav_count, create_time)
+                         values 
+                         ("{channel}", "{objectId}", "{video_url}", "{cover}", "{video_title}", "{share_cnt}", "{like_cnt}", "{oss_video_key}", "{oss_cover_key}", "{nick_name}", "{user_name}", "{comment_count}", "{fav_count}", "{create_time}")"""
+        res = MysqlHelper.update_values(
+            sql=insert_sql
+        )
+        return res
+
+    """查询是否有视频号数据"""
+    @classmethod
+    def sph_channel_user_list(cls):
+        sql = """
+                       SELECT user_id
+                       FROM sph_channel_user
+                       WHERE status = %s
+                   """
+        data = MysqlHelper.get_values(sql, (0))
+        if len(data) == 0 or data == ():
+            return None
+        return data
+
+    """插入视频号信息"""
+    @classmethod
+    def insert_sph_channel_user(cls, channel, user_id):
+        insert_sql = f"""INSERT INTO sph_channel_user (channel, user_id, status) values ("{channel}", "{user_id}", 0)"""
+        res = MysqlHelper.update_values(
+            sql=insert_sql
+        )
+
     @classmethod
     def insert_ks_data(cls, user_name: str, user_sex: str, time_data, caption: str, view_count: str, like_count: str,
                        share_count: str, duration: str, main_mv_url: str, thumbnail_url: str, user_id: str, status: str,
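
The INSERT helpers above build SQL by interpolating values into an f-string; a parameterized statement avoids quoting and escaping problems when titles contain quotes. A minimal sketch of the same sph_data_info insert in parameterized form, assuming access to a DB-API cursor (whether MysqlHelper.update_values can forward a params tuple is not shown in this diff):

# Hypothetical parameterized variant of the sph_data_info INSERT.
insert_sql = """
    INSERT INTO sph_data_info
        (channel, video_id, video_url, video_cover, video_title, share_cnt, like_cnt,
         oss_url, oss_cover, nick_name, user_name, comment_count, fav_count, create_time)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
params = (channel, objectId, video_url, cover, video_title, share_cnt, like_cnt,
          oss_video_key, oss_cover_key, nick_name, user_name, comment_count, fav_count, create_time)
# cursor.execute(insert_sql, params)  # placeholders are bound by the driver, not by string formatting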

+ 106 - 0
data_channel/sph_crawling_data.py

@@ -0,0 +1,106 @@
+import configparser
+import json
+import os
+import random
+import time
+
+import requests
+
+from common import Material, Oss, Common
+from common.sql_help import sqlCollect
+from data_channel.shipinhao import SPH
+
+
+class SphHistory:
+
+    """Fetch every post for each configured 视频号 account and store it."""
+    @classmethod
+    def sph_data_info(cls):
+        user_list = cls.get_sph_user()
+        if user_list is None:
+            return
+        for user in user_list:
+            Common.logger("sph_crawling").info(f"{user} starting data fetch")
+            account_id = SPH.get_account_id(user)
+            if account_id is False:
+                print(f"{user}: no account_id found, cannot crawl this user")
+                continue
+            url = "http://61.48.133.26:30001/FinderGetUpMasterNextPage"
+            last_buffer = ""
+            try:
+                while True:
+                    headers = {
+                        'Content-Type': 'application/json'
+                    }
+                    payload = json.dumps({
+                        "username": account_id,
+                        "last_buffer": last_buffer
+                    })
+
+                    response = requests.request("POST", url, headers=headers, data=payload)
+                    time.sleep(random.randint(1, 5))
+                    res_json = response.json()
+                    # Stop paging when the response carries an empty DownloadAddress.
+                    if "DownloadAddress" in res_json and not res_json["DownloadAddress"]:
+                        break
+                    if "objectId" not in response.text or response.status_code != 200:
+                        break
+                    # An empty or missing UpMasterHomePage list means there are no more pages.
+                    if not res_json.get("UpMasterHomePage"):
+                        break
+                    last_buffer = res_json.get('last_buffer')
+                    for obj in res_json["UpMasterHomePage"]:
+                        objectId = obj['objectId']
+                        objectNonceId = obj['objectNonceId']
+                        # Use a separate variable so the paging URL above is not overwritten.
+                        detail_url = "http://61.48.133.26:30001/GetFinderDownloadAddress"
+                        payload = json.dumps({
+                            "objectId": objectId,
+                            "objectNonceId": objectNonceId
+                        })
+                        headers = {
+                            'Content-Type': 'text/plain'
+                        }
+                        response = requests.request("POST", detail_url, headers=headers, data=payload)
+                        time.sleep(random.randint(0, 1))
+                        video_obj = response.json()
+                        video_url = video_obj.get('DownloadAddress')
+                        if not video_url:
+                            continue
+                        v_id = f"sph_{objectId}.mp4"
+                        c_id = f"sph_{objectId}.jpg"
+                        oss_video_path = Oss.channel_upload_oss(video_url, v_id)
+                        oss_video_key = oss_video_path.get("oss_object_key")
+                        share_cnt = int(obj['forward_count'])  # share count
+                        like_cnt = int(obj['like_count'])  # like count
+                        video_title = (video_obj.get('title') or "").split("\n")[0].split("#")[0]
+                        cover = video_obj.get('thumb_url')
+                        oss_cover_path = Oss.channel_upload_oss(cover, c_id)
+                        oss_cover_key = oss_cover_path.get("oss_object_key")
+                        Common.logger("sph_crawling").info(f"{user} OSS keys: video {oss_video_key}, cover {oss_cover_key}")
+                        create_time = obj['createtime']  # publish time
+                        user_name = obj['username']  # account identifier
+                        nick_name = obj['nickname']  # display name
+                        comment_count = obj['comment_count']  # comment count
+                        fav_count = obj['fav_count']  # thumbs-up count
+                        sqlCollect.sph_data_info('视频号', objectId, video_url, cover, video_title, str(share_cnt), str(like_cnt), oss_video_key, oss_cover_key, nick_name, user_name, comment_count, fav_count, create_time)
+                        Common.logger("sph_crawling").info(f"{nick_name} record inserted")
+            except Exception as e:
+                Common.logger("sph_crawling").info(f"{user} crawl aborted: {e}")
+                continue
+
+    @classmethod
+    def get_sph_user(cls):
+        """Return the 视频号 user_ids to crawl, seeding the table from Feishu when it is empty."""
+        data = sqlCollect.sph_channel_user_list()
+        if data is None:
+            user_list = Material.get_sph_user()
+            if not user_list:
+                return None
+            for user in user_list:
+                sqlCollect.insert_sph_channel_user("视频号", user)
+            # Re-read so the newly inserted rows come back in the same tuple-of-rows shape.
+            data = sqlCollect.sph_channel_user_list()
+            if data is None:
+                return None
+        result_list = [item for sublist in data for item in sublist]
+        return result_list
+
+
+if __name__ == '__main__':
+    SphHistory.sph_data_info()
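
The crawl above pages through an account by posting back the last_buffer value returned by each response until UpMasterHomePage comes back empty. A standalone sketch of just that pagination loop, using only the request fields and response keys that appear in the code above (the endpoint behaviour is an assumption carried over from this commit):

import json
import random
import time

import requests

FINDER_PAGE_URL = "http://61.48.133.26:30001/FinderGetUpMasterNextPage"


def iter_sph_posts(account_id):
    """Yield raw post objects for one 视频号 account, page by page."""
    last_buffer = ""
    while True:
        response = requests.post(
            FINDER_PAGE_URL,
            headers={"Content-Type": "application/json"},
            data=json.dumps({"username": account_id, "last_buffer": last_buffer}),
        )
        if response.status_code != 200:
            break
        page = response.json()
        posts = page.get("UpMasterHomePage") or []
        if not posts:
            break
        for obj in posts:
            yield obj
        last_buffer = page.get("last_buffer")
        time.sleep(random.randint(1, 5))  # pace requests, as the crawler above does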

+ 5 - 0
sph_crawling_job.py

@@ -0,0 +1,5 @@
+from data_channel.sph_crawling_data import SphHistory
+
+if __name__ == '__main__':
+    SphHistory.sph_data_info()
+    print("完成")