2 лет назад · 91a11c4357
--- a/shipinhao/shipinhao_author/shipinhao_author.py
+++ b/shipinhao/shipinhao_author/shipinhao_author.py
@@ -0,0 +1,384 @@
 
															+import os
														
 
															+import json
														
 
															+import random
														
 
															+import sys
														
 
															+import time
														
 
															+import uuid
														
 
															+import datetime
														
 
															+
														
 
															+import requests
														
 
															+import cv2
														
 
															+
														
 
															+sys.path.append(os.getcwd())
														
 
															+from datetime import datetime
														
 
															+from common.feishu import Feishu
														
 
															+from common import PiaoQuanPipeline, AliyunLogger
														
 
															+from common.db import MysqlHelper
														
 
															+from common.mq import MQ
														
 
															+from common.public import clean_title
														
 
															+
														
 
															+
														
 
															+def find_target_user(name, user_list):
														
 
															+    """
														
 
															+    在搜索到到账号列表中找目标列表
														
 
															+    """
														
 
															+    for obj in user_list:
														
 
															+        if obj['contact']["nickname"] == name:
														
 
															+            return obj
														
 
															+        else:
														
 
															+            continue
														
 
															+    return False
														
 
															+
														
 
															+
														
 
															+class ShiPinHaoAuthor(object):
														
 
															+    """
														
 
															+    视频号账号爬虫
														
 
															+    """
														
 
															+    def __init__(self, platform, mode, rule_dict, user_dict, env):
														
 
															+        self.account_name = user_dict["link"]
														
 
															+        self.platform = platform
														
 
															+        self.mode = mode
														
 
															+        self.rule_dict = rule_dict
														
 
															+        self.user_dict = user_dict
														
 
															+        self.env = env
														
 
															+        self.download_cnt = 0
														
 
															+        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
														
 
															+
														
 
															+    def get_history_id(self):
														
 
															+        """
														
 
															+        从数据库表中读取 id
														
 
															+        """
														
 
															+        select_user_sql = f"""select name_id from accounts where name = "{self.account_name}" and platform = "{self.platform}" and useful = 1 limit 1"""
														
 
															+        name_id = MysqlHelper.get_values(
														
 
															+            log_type=self.mode,
														
 
															+            crawler=self.platform,
														
 
															+            sql=select_user_sql,
														
 
															+            env=self.env,
														
 
															+            machine="",
														
 
															+        )
														
 
															+        if name_id:
														
 
															+            return name_id[0][0]
														
 
															+        else:
														
 
															+            return False
														
 
															+
														
 
															+    def get_account_id(self):
														
 
															+        """
														
 
															+        读历史数据，如果存在 id，则直接返回 id
														
 
															+        """
														
 
															+        history_id = self.get_history_id()
														
 
															+        if history_id:
														
 
															+            return history_id
														
 
															+        else:
														
 
															+            url = "http://61.48.133.26:30001/Find_Video_Content"
														
 
															+            payload = json.dumps({
														
 
															+                "content": self.account_name,
														
 
															+                "type": "19"
														
 
															+            })
														
 
															+            headers = {
														
 
															+                'Content-Type': 'application/json'
														
 
															+            }
														
 
															+            response = requests.request("POST", url, headers=headers, data=payload)
														
 
															+            info_list = response.json()['info_list']
														
 
															+            if len(info_list) == 0:
														
 
															+                return False
														
 
															+            target_user = find_target_user(name=self.account_name, user_list=info_list)
														
 
															+            # 写入 MySql 数据库
														
 
															+            if target_user:
														
 
															+                update_sql = f"""INSERT INTO accounts (name, name_id, platform, useful) values ("{self.account_name}", "{target_user['contact']['username']}", "{self.platform}", 1 )"""
														
 
															+                MysqlHelper.update_values(
														
 
															+                    log_type=self.mode,
														
 
															+                    crawler=self.platform,
														
 
															+                    sql=update_sql,
														
 
															+                    env=self.env,
														
 
															+                    machine="",
														
 
															+                )
														
 
															+                return target_user['contact']["username"]
														
 
															+            else:
														
 
															+                return False
														
 
															+
														
 
															+    def get_account_videos(self):
														
 
															+        account_id = self.get_account_id()
														
 
															+        if account_id:
														
 
															+            url = "http://61.48.133.26:30001/FinderGetUpMasterNextPage"
														
 
															+            last_buffer = ""
														
 
															+            for i in range(10):
														
 
															+                if self.download_cnt >= int(
														
 
															+                    self.rule_dict.get("videos_cnt", {}).get("min", 30)
														
 
															+                ):
														
 
															+                    return
														
 
															+                headers = {
														
 
															+                    'Content-Type': 'application/json'
														
 
															+                }
														
 
															+                payload = json.dumps({
														
 
															+                    "username": account_id,
														
 
															+                    "last_buffer": last_buffer
														
 
															+                })
														
 
															+
														
 
															+                response = requests.request("POST", url, headers=headers, data=payload)
														
 
															+                time.sleep(random.randint(1, 5))
														
 
															+                if "objectId" not in response.text or response.status_code != 200:
														
 
															+                    AliyunLogger.logging(
														
 
															+                        code="2000",
														
 
															+                        platform=self.platform,
														
 
															+                        mode=self.mode,
														
 
															+                        env=self.env,
														
 
															+                        message="没有更多视频了",
														
 
															+                    )
														
 
															+                    return
														
 
															+                res_json = response.json()
														
 
															+                if len(res_json["UpMasterHomePage"]) == 0:
														
 
															+                    AliyunLogger.logging(
														
 
															+                        code="2000",
														
 
															+                        platform=self.platform,
														
 
															+                        mode=self.mode,
														
 
															+                        env=self.env,
														
 
															+                        message="没有更多视频了",
														
 
															+                    )
														
 
															+                    return
														
 
															+
														
 
															+                if not res_json["UpMasterHomePage"]:
														
 
															+                    AliyunLogger.logging(
														
 
															+                        code="2000",
														
 
															+                        platform=self.platform,
														
 
															+                        mode=self.mode,
														
 
															+                        env=self.env,
														
 
															+                        message="没有更多视频了",
														
 
															+                    )
														
 
															+                    return
														
 
															+                else:
														
 
															+                    last_buffer = res_json.get('last_buffer')
														
 
															+                    count = 0
														
 
															+                    for obj in res_json["UpMasterHomePage"]:
														
 
															+                        try:
														
 
															+                            AliyunLogger.logging(
														
 
															+                                code="1001",
														
 
															+                                platform=self.platform,
														
 
															+                                mode=self.mode,
														
 
															+                                message="扫描到一条视频",
														
 
															+                                env=self.env,
														
 
															+                                data=obj,
														
 
															+                            )
														
 
															+                            repeat_flag = self.process_video_obj(obj, count)
														
 
															+                            count += 1
														
 
															+                            if not repeat_flag:
														
 
															+                                return
														
 
															+                        except Exception as e:
														
 
															+                            AliyunLogger.logging(
														
 
															+                                code="3000",
														
 
															+                                platform=self.platform,
														
 
															+                                mode=self.mode,
														
 
															+                                env=self.env,
														
 
															+                                message=f"抓取单条视频异常:{e}\n",
														
 
															+                            )
														
 
															+        else:
														
 
															+            AliyunLogger.logging(
														
 
															+                code="3000",
														
 
															+                platform=self.platform,
														
 
															+                mode=self.mode,
														
 
															+                env=self.env,
														
 
															+                message="{}\t获取 id 失败".format(self.account_name),
														
 
															+            )
														
 
															+
														
 
															+    def process_video_obj(self, obj, count):
														
 
															+        objectId = obj['objectId']
														
 
															+        objectNonceId = obj['objectNonceId']
														
 
															+
														
 
															+        trace_id = self.platform + "new" + str(uuid.uuid1())
														
 
															+        url = "http://61.48.133.26:30001/GetFinderDownloadAddress"
														
 
															+        payload = json.dumps({
														
 
															+            "objectId": objectId,
														
 
															+            "objectNonceId": objectNonceId
														
 
															+        })
														
 
															+        headers = {
														
 
															+            'Content-Type': 'text/plain'
														
 
															+        }
														
 
															+        response = requests.request("POST", url, headers=headers, data=payload)
														
 
															+        time.sleep(random.randint(0, 1))
														
 
															+        video_obj = response.json()
														
 
															+        publish_time_str = obj['createtime']
														
 
															+        datetime_obj = datetime.strptime(publish_time_str, '%Y-%m-%d %H:%M:%S')
														
 
															+        # 将datetime对象转换为时间戳
														
 
															+        publish_time_stamp = int(datetime_obj.timestamp())
														
 
															+        video_url = video_obj.get('DownloadAddress')
														
 
															+        duration = int(self.video_duration(video_url))
														
 
															+        share_cnt = int(obj['forward_count'])
														
 
															+        like_cnt = int(obj['like_count'])
														
 
															+        # 获取当前时间
														
 
															+        current_time = datetime.now()
														
 
															+        formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
														
 
															+        video_dict = {
														
 
															+            "video_id": objectId,
														
 
															+            "video_title": clean_title(video_obj.get('title').split("\n")[0].split("#")[0]),
														
 
															+            "out_video_id": trace_id,
														
 
															+            "publish_time_stamp": publish_time_stamp,
														
 
															+            "publish_time_str": publish_time_str,
														
 
															+            "play_cnt": 0,
														
 
															+            "fav_count": int(obj['fav_count']),
														
 
															+            "comment_cnt": int(obj['comment_count']),
														
 
															+            "like_cnt": like_cnt,
														
 
															+            "share_cnt": share_cnt,
														
 
															+            "user_id": self.user_dict["uid"],
														
 
															+            "cover_url": video_obj.get('thumb_url'),
														
 
															+            "video_url": video_url,
														
 
															+            "avatar_url": video_obj.get('thumb_url'),
														
 
															+            "width": video_obj.get('width'),
														
 
															+            "height": video_obj.get('height'),
														
 
															+            "duration": duration,
														
 
															+            "platform": self.platform,
														
 
															+            "strategy": self.mode,
														
 
															+            "crawler_rule": self.rule_dict,
														
 
															+            "session": f"shipinhao-author-{int(time.time())}",
														
 
															+        }
														
 
															+        if share_cnt == 0:
														
 
															+            divisor_cnt = 0
														
 
															+        else:
														
 
															+            divisor_cnt = int(share_cnt / like_cnt)
														
 
															+        # 视频时长小于30秒 返回
														
 
															+        if duration < 30:
														
 
															+            values = [[
														
 
															+                obj['nickname'],
														
 
															+                publish_time_str,
														
 
															+                formatted_time,
														
 
															+                int(obj['fav_count']),
														
 
															+                int(obj['comment_count']),
														
 
															+                int(obj['like_count']),
														
 
															+                int(obj['forward_count']),
														
 
															+                divisor_cnt,
														
 
															+                video_obj.get('title').split("\n")[0].split("#")[0],
														
 
															+                duration,
														
 
															+                '否',
														
 
															+                '时长小于30秒',
														
 
															+                video_obj.get('DownloadAddress')
														
 
															+            ]]
														
 
															+            Feishu.insert_columns(self.platform, 'shipinhao', "Vq7NeH", "ROWS", 1, 2)
														
 
															+            time.sleep(0.5)
														
 
															+            Feishu.update_values(self.platform, 'shipinhao', "Vq7NeH", "A2:Z2", values)
														
 
															+            return True
														
 
															+        # 分享小于1000 返回
														
 
															+        if share_cnt < 1000:
														
 
															+            values = [[
														
 
															+                obj['nickname'],
														
 
															+                publish_time_str,
														
 
															+                formatted_time,
														
 
															+                int(obj['fav_count']),
														
 
															+                int(obj['comment_count']),
														
 
															+                int(obj['like_count']),
														
 
															+                int(obj['forward_count']),
														
 
															+                divisor_cnt,
														
 
															+                video_obj.get('title').split("\n")[0].split("#")[0],
														
 
															+                duration,
														
 
															+                '否',
														
 
															+                '分享小于1000',
														
 
															+                video_obj.get('DownloadAddress')
														
 
															+            ]]
														
 
															+            Feishu.insert_columns(self.platform, 'shipinhao', "Vq7NeH", "ROWS", 1, 2)
														
 
															+            time.sleep(0.5)
														
 
															+            Feishu.update_values(self.platform, 'shipinhao', "Vq7NeH", "A2:Z2", values)
														
 
															+            return True
														
 
															+        # 分享小于等于99999
														
 
															+        if share_cnt <= 99999 and divisor_cnt < 2:
														
 
															+            values = [[
														
 
															+                obj['nickname'],
														
 
															+                publish_time_str,
														
 
															+                formatted_time,
														
 
															+                int(obj['fav_count']),
														
 
															+                int(obj['comment_count']),
														
 
															+                int(obj['like_count']),
														
 
															+                int(obj['forward_count']),
														
 
															+                divisor_cnt,
														
 
															+                video_obj.get('title').split("\n")[0].split("#")[0],
														
 
															+                duration,
														
 
															+                '否',
														
 
															+                f'分享小于100000,分享/点赞：{divisor_cnt}',
														
 
															+                video_obj.get('DownloadAddress')
														
 
															+            ]]
														
 
															+            Feishu.insert_columns(self.platform, 'shipinhao', "Vq7NeH", "ROWS", 1, 2)
														
 
															+            time.sleep(0.5)
														
 
															+            Feishu.update_values(self.platform, 'shipinhao', "Vq7NeH", "A2:Z2", values)
														
 
															+            return True
														
 
															+        pipeline = PiaoQuanPipeline(
														
 
															+            platform=self.platform,
														
 
															+            mode=self.mode,
														
 
															+            item=video_dict,
														
 
															+            rule_dict=self.rule_dict,
														
 
															+            env=self.env,
														
 
															+            trace_id=trace_id,
														
 
															+        )
														
 
															+        if not pipeline.repeat_video():
														
 
															+            values = [[
														
 
															+                obj['nickname'],
														
 
															+                publish_time_str,
														
 
															+                formatted_time,
														
 
															+                int(obj['fav_count']),
														
 
															+                int(obj['comment_count']),
														
 
															+                int(obj['like_count']),
														
 
															+                int(obj['forward_count']),
														
 
															+                divisor_cnt,
														
 
															+                video_obj.get('title').split("\n")[0].split("#")[0],
														
 
															+                duration,
														
 
															+                '否',
														
 
															+                '重复视频',
														
 
															+                video_obj.get('DownloadAddress')
														
 
															+            ]]
														
 
															+            Feishu.insert_columns(self.platform, 'shipinhao', "Vq7NeH", "ROWS", 1, 2)
														
 
															+            time.sleep(0.5)
														
 
															+            Feishu.update_values(self.platform, 'shipinhao', "Vq7NeH", "A2:Z2", values)
														
 
															+            if count > 3:
														
 
															+                return False
														
 
															+            else:
														
 
															+                return True
														
 
															+        else:
														
 
															+            values = [[
														
 
															+                obj['nickname'],
														
 
															+                publish_time_str,
														
 
															+                formatted_time,
														
 
															+                int(obj['fav_count']),
														
 
															+                int(obj['comment_count']),
														
 
															+                int(obj['like_count']),
														
 
															+                int(obj['forward_count']),
														
 
															+                divisor_cnt,
														
 
															+                video_obj.get('title').split("\n")[0].split("#")[0],
														
 
															+                duration,
														
 
															+                '是',
														
 
															+                '',
														
 
															+                video_obj.get('DownloadAddress')
														
 
															+            ]]
														
 
															+            Feishu.insert_columns(self.platform, 'shipinhao', "Vq7NeH", "ROWS", 1, 2)
														
 
															+            time.sleep(0.5)
														
 
															+            Feishu.update_values(self.platform, 'shipinhao', "Vq7NeH", "A2:Z2", values)
														
 
															+            video_dict["publish_time"] = video_dict["publish_time_str"]
														
 
															+            self.mq.send_msg(video_dict)
														
 
															+            self.download_cnt += 1
														
 
															+            AliyunLogger.logging(
														
 
															+                code="1002",
														
 
															+                platform=self.platform,
														
 
															+                mode=self.mode,
														
 
															+                env=self.env,
														
 
															+                data=video_dict,
														
 
															+                trace_id=trace_id,
														
 
															+                message="成功发送 MQ 至 ETL",
														
 
															+            )
														
 
															+            time.sleep(5)
														
 
															+        return True
														
 
															+
														
 
															+    def video_duration(self, filename):
														
 
															+        cap = cv2.VideoCapture(filename)
														
 
															+        if cap.isOpened():
														
 
															+            rate = cap.get(5)
														
 
															+            frame_num = cap.get(7)
														
 
															+            duration = frame_num / rate
														
 
															+            return duration
														
 
															+        return 0
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    SP = ShiPinHaoAuthor(
														
 
															+
														
 
															+        platform="shipinhao",
														
 
															+        mode="author",
														
 
															+        user_dict={"uid": "123456", "link": "老碗哥说文解惑", "user_id": "1234565"},
														
 
															+        rule_dict={},
														
 
															+        env="prod",
														
 
															+    )
														
 
															+
														
 
															+    SP.get_account_videos()
														
--- a/shipinhao/shipinhao_main/run_sph_author.py
+++ b/shipinhao/shipinhao_main/run_sph_author.py
@@ -1,15 +1,16 @@
 
															 import argparse
														
 
															 import random
														
 
															-import time
														
 
															 from mq_http_sdk.mq_client import *
														
 
															 from mq_http_sdk.mq_consumer import *
														
 
															 from mq_http_sdk.mq_exception import MQExceptionBase
														
 
															+
														
 
															 sys.path.append(os.getcwd())
														
 
															 from common.public import task_fun_mq, get_consumer, ack_message
														
 
															 from common.scheduling_db import MysqlHelper
														
 
															 from common import AliyunLogger
														
 
															-from shipinhao.shipinhao_author import ShiPinHaoAccount
														
 
															+from shipinhao.shipinhao_author.shipinhao_author import ShiPinHaoAuthor
														
 
															+
														
 
															 def main(log_type, crawler, topic_name, group_id, env):
														
@@ -103,7 +104,7 @@ def main(log_type, crawler, topic_name, group_id, env):
 
															                             message="开始抓取视频号{}".format(user_dict["link"]),
														
 
															                         )
														
 
															                         # 初始化
														
 
															-                        SPHA = ShiPinHaoAccount(
														
 
															+                        SPHA = ShiPinHaoAuthor(
														
 
															                             platform=crawler,
														
 
															                             mode=log_type,
														
 
															                             rule_dict=rule_dict,