罗俊辉 11 ماه پیش
والد
کامیت
67c0d5ba94
9 فایل‌های تغییر یافته به همراه 187 افزوده شده و 123 حذف شده
  1. 44 0
      applications/functions.py
  2. 33 0
      applications/migrate.py
  3. 2 1
      deal/__init__.py
  4. 33 0
      deal/db_deal.py
  5. 3 1
      deal/request_deal.py
  6. 29 17
      dev.py
  7. 38 2
      routes/vta_routes.py
  8. 0 99
      t.py
  9. 5 3
      test_req.py

+ 44 - 0
applications/functions.py

@@ -3,6 +3,8 @@
 """
 from datetime import datetime, timedelta
 
+import requests
+
 
 def generate_daily_strings(start_date, end_date):
     """
@@ -19,3 +21,45 @@ def generate_daily_strings(start_date, end_date):
         date_strings.append(current.strftime("%Y%m%d"))
         current += timedelta(days=1)
     return date_strings
+
+
+def whisper(video_id, timeout=600):
+    """
+    Ask the remote video-to-text service to transcribe one video.
+
+    :param video_id: id of the video; sent as the ``video_id`` field of the JSON body
+    :param timeout: seconds to wait for the service before ``requests`` gives up;
+        defaults to 600, the same limit the other HTTP calls in this change use
+    :return: the decoded JSON body of the response
+    :raises requests.RequestException: on connection failure, timeout or a
+        4xx/5xx status code
+    """
+    url = "http://61.48.133.26:5999/video_to_text"
+    body = {
+        "video_id": video_id
+    }
+    header = {
+        "Content-Type": "application/json",
+    }
+    # Without a timeout a stalled service would block the caller forever.
+    response = requests.post(
+        url=url,
+        json=body,
+        headers=header,
+        timeout=timeout
+    )
+    # Surface HTTP errors directly instead of trying to decode an error page.
+    response.raise_for_status()
+    return response.json()
+
+
+def get_text(video_id, timeout=60):
+    """
+    Fetch the stored text of one video from the local ``/get_text`` endpoint.
+
+    :param video_id: id of the video; sent as the ``vid`` field of the JSON body
+    :param timeout: seconds to wait for the endpoint before ``requests`` gives up;
+        defaults to 60
+    :return: the decoded JSON body of the response
+    :raises requests.RequestException: on connection failure, timeout or a
+        4xx/5xx status code
+    """
+    url = "http://localhost:8888/get_text"
+    body = {
+        "vid": video_id
+    }
+    header = {
+        "Content-Type": "application/json",
+    }
+    # Without a timeout an unresponsive endpoint would block the caller forever.
+    response = requests.post(
+        url=url,
+        json=body,
+        headers=header,
+        timeout=timeout
+    )
+    # Surface HTTP errors directly instead of trying to decode an error page.
+    response.raise_for_status()
+    return response.json()

+ 33 - 0
applications/migrate.py

@@ -0,0 +1,33 @@
+"""
+@author: luojunhui
+"""
+"""
+@author: luojunhui
+"""
+import time
+from concurrent.futures.thread import ThreadPoolExecutor
+
+from applications.mysql import MySQL
+from applications.odps_server import PyODPS
+from applications.functions import generate_daily_strings
+
+
+def migrate_daily(dt):
+    """
+    Copy one day's rows from the ODPS table into MySQL.
+
+    Rows are selected from ``loghubods.video_return_top_500_new`` for the given
+    ``dt`` and handed one by one to ``MySQL.migrate_data_to_mysql`` on a pool of
+    8 worker threads. The migration is best-effort: a row that fails is
+    reported and counted, and the remaining rows are still processed.
+
+    :param dt: partition value used in the ``dt = '...'`` filter
+    :return: None; progress is printed
+    """
+    PO = PyODPS()
+    M = MySQL()
+    select_sql = f"""select * from loghubods.video_return_top_500_new where dt = '{dt}';"""
+    data = PO.select(select_sql)
+    a = time.time()
+    with ThreadPoolExecutor(max_workers=8) as pool:
+        # submit() keeps one future per row; the lazy iterator returned by
+        # pool.map() was never consumed, so worker exceptions vanished.
+        futures = [pool.submit(M.migrate_data_to_mysql, row) for row in data]
+    # Leaving the with-block waits for every worker, so all futures are done.
+    failed = 0
+    for future in futures:
+        error = future.exception()
+        if error is not None:
+            failed += 1
+            print("{} failed to insert a row: {}".format(dt, error))
+    b = time.time()
+    print("{} successfully insert {} rows, totally cost {} seconds".format(dt, len(futures) - failed, b - a))
+
+
+# Module-level entry point: runs as soon as this file is executed or imported.
+# Both bounds are the same date and only the first generated string is migrated.
+dt_list = generate_daily_strings("20240528", "20240528")
+migrate_daily(dt_list[0])

+ 2 - 1
deal/__init__.py

@@ -1,4 +1,5 @@
 """
 @author: luojunhui
 """
-from .request_deal import RequestDeal
+from .request_deal import RequestDeal
+from .db_deal import insert_text_mysql, get_text_by_id

+ 33 - 0
deal/db_deal.py

@@ -0,0 +1,33 @@
+"""
+@author: luojunhui
+"""
+
+
+async def insert_text_mysql(mysql_client, video_id, text, title):
+    """
+    Store the text of one video in ``video_text`` unless that video already has a row.
+
+    :param mysql_client: client exposing awaitable ``select(sql)`` and ``async_insert(sql)``
+    :param video_id: numeric id of the video (int or digit string)
+    :param text: text to store in the ``video_text`` column
+    :param title: title to store in the ``title`` column
+    :return: ``{"duplicated": "vid already exists"}`` when a row for this video
+        is already present, otherwise ``None`` after inserting
+    :raises ValueError, TypeError: when ``video_id`` cannot be converted to an int
+    """
+
+    def _quote(value):
+        # Render a value as a single-quoted SQL string literal. Backslashes and
+        # quotes are doubled so free text cannot terminate the literal.
+        # NOTE(review): if mysql_client supports parameterized queries, prefer those.
+        escaped = str(value).replace("\\", "\\\\").replace("'", "''")
+        return "'" + escaped + "'"
+
+    # Coercing to int keeps anything but a number out of the SQL text.
+    vid = int(video_id)
+    # The lookup must be restricted to this video; without the WHERE clause any
+    # existing row made every later insert look like a duplicate.
+    select_sql = f"select video_id from video_text where video_id = {vid};"
+    result = await mysql_client.select(select_sql)
+    if result:
+        return {"duplicated": "vid already exists"}
+    insert_sql = f"""
+        insert into video_text
+        (video_id, video_text, title)
+        values
+        ({vid}, {_quote(text)}, {_quote(title)});
+        """
+    await mysql_client.async_insert(insert_sql)
+
+
+async def get_text_by_id(mysql_client, video_id):
+    """
+    Fetch the stored text of one video.
+
+    :param mysql_client: client exposing an awaitable ``select(sql)``
+    :param video_id: numeric id of the video (int or digit string)
+    :return: the first row returned for this video, or ``None`` when no row exists
+    :raises ValueError, TypeError: when ``video_id`` cannot be converted to an int
+    """
+    # Coercing to int keeps anything but a number out of the SQL text.
+    select_sql = f"""select video_text from video_text where video_id = {int(video_id)}"""
+    result = await mysql_client.select(select_sql)
+    # An unknown video yields an empty result; callers test the return value
+    # for truthiness, so hand back None instead of failing on result[0].
+    if not result:
+        return None
+    return result[0]

+ 3 - 1
deal/request_deal.py

@@ -1,6 +1,7 @@
 """
 @author: luojunhui
 """
+from applications.functions import get_text
 
 
 class RequestDeal(object):
@@ -25,7 +26,8 @@ class RequestDeal(object):
             {
                 "video_id": obj[0],
                 "title": obj[1],
-                "video_url": obj[2]
+                "video_url": obj[2],
+                "video_text": get_text(video_id=obj[0])['text']
             } for obj in data_list
         ]
         response = {

+ 29 - 17
dev.py

@@ -1,31 +1,43 @@
 """
 @author: luojunhui
 """
+import json
 import time
-from concurrent.futures.thread import ThreadPoolExecutor
+import requests
 
-from applications.mysql import MySQL
-from applications.odps_server import PyODPS
-from applications.functions import generate_daily_strings
+from tqdm import tqdm
 
 
-def migrate_daily(dt):
+def request_for_whisper(obj):
     """
-    迁移当天到数据
-    :param dt:
+    Send one video to the local /whisper endpoint and return its JSON reply.
+    :param obj: mapping that provides the 'video_id' and 'title' keys
     :return:
     """
-    PO = PyODPS()
-    M = MySQL()
-    select_sql = f"""select * from loghubods.video_return_top_500_new where dt = '{dt}';"""
-    data = PO.select(select_sql)
+    url = "http://localhost:8888/whisper"
+
+    body = {
+        "vid": obj['video_id'],
+        "title": obj['title']
+    }
     a = time.time()
-    with ThreadPoolExecutor(max_workers=8) as pool:
-        pool.map(M.migrate_data_to_mysql, data)
+    header = {
+        "Content-Type": "application/json",
+    }
+    # Wait up to 600 seconds for the endpoint to answer.
+    response = requests.post(url, json=body, headers=header, timeout=600)
     b = time.time()
-    print("{} successfully insert {} rows, totally cost {} seconds".format(dt, len(data), b - a))
+    # Elapsed wall-clock seconds between the two time.time() readings.
+    print(b - a)
+    return response.json()
+
 
+# Script entry point: replay every video listed in test.json against /whisper.
+if __name__ == '__main__':
+    with open("test.json", encoding="utf-8") as f:
+        today_info = json.loads(f.read())
 
-dt_list = generate_daily_strings("20240522", "20240525")
-with ThreadPoolExecutor(max_workers=8) as Pool:
-    Pool.map(migrate_daily, dt_list)
+    # The video objects are read from the 'data' key of the loaded JSON.
+    dt_list = today_info['data']
+    for video_obj in tqdm(dt_list):
+        try:
+            result = request_for_whisper(obj=video_obj)
+            print(result)
+        except Exception as e:
+            # Best-effort: print the failure and continue with the next video.
+            print(e)

+ 38 - 2
routes/vta_routes.py

@@ -6,7 +6,10 @@ import uuid
 import asyncio
 from quart import Blueprint, jsonify, request
 
-from deal import RequestDeal
+from deal import RequestDeal, insert_text_mysql, get_text_by_id
+from applications.functions import whisper
+
+
 bp = Blueprint('VideosToArticle', __name__)
 
 
@@ -24,7 +27,40 @@ def VTARoutes(mysql_client):
         """
         params = await request.get_json()
         RD = RequestDeal(params, mysql_client)
-        return await RD.deal()
+        result = await RD.deal()
+        return jsonify(result)
+
+    @bp.route('/whisper', methods=["POST"])
+    async def video_extracting():
+        """
+        Transcribe one video with whisper and store the text in MySQL.
+
+        Expects a JSON body with the ``vid`` and ``title`` keys.
+        :return: JSON response that always carries ``vid``, plus ``info`` after
+            a successful insert, ``duplicated`` when the video was already
+            stored, or ``error`` when any step raised
+        """
+        params = await request.get_json()
+        video_id = params['vid']
+        video_title = params['title']
+        try:
+            # whisper() is a blocking HTTP call; running it in the default
+            # executor keeps the event loop free to serve other requests.
+            loop = asyncio.get_running_loop()
+            response = await loop.run_in_executor(None, whisper, video_id)
+            outcome = await insert_text_mysql(mysql_client, video_id, response['text'], video_title)
+            if outcome:
+                # insert_text_mysql returns a dict only when nothing was
+                # inserted, so do not report that case as a success.
+                result = dict(outcome, vid=video_id)
+            else:
+                result = {"info": "success insert text into mysql", "vid": video_id}
+        except Exception as e:
+            result = {"error": str(e), "vid": video_id}
+        return jsonify(result)
+
+    @bp.route('/get_text', methods=["POST"])
+    async def get_video_text():
+        """
+        Return the stored text of one video.
+
+        Expects a JSON body with the ``vid`` key.
+        :return: JSON response ``{"text": ...}``; the value is ``None`` when
+            the lookup yields nothing truthy
+        """
+        payload = await request.get_json()
+        vid = payload['vid']
+        stored = await get_text_by_id(mysql_client, vid)
+        # Any falsy lookup result is normalised to None.
+        return jsonify({"text": stored if stored else None})
 
     return bp
 

+ 0 - 99
t.py

@@ -1,99 +0,0 @@
-"""
-@author: luojunhui
-"""
-"""
-@author: luojunhui
-"""
-import time
-import json
-import uuid
-import random
-import hashlib
-import requests
-import urllib.parse
-
-
-def create_gzh_path(video_id, shared_uid):
-    """
-    :param video_id: 视频 id
-    :param shared_uid: 分享 id
-    """
-
-    def auto_white(root_share_id_):
-        """
-        自动加入白名单, 保证公众号百分百出广告
-        :param root_share_id_:
-        :return:
-        """
-
-        def get_cookie():
-            """
-            获取 cookie
-            :return:
-            """
-            url = "https://admin.piaoquantv.com/manager/login?account=luojunhui&passWd=e10adc3949ba59abbe56e057f20f883e&muid=7"
-            payload = {}
-            headers = {
-                'accept': 'application/json, text/plain, */*',
-                'accept-language': 'en',
-                'priority': 'u=1, i',
-                'sec-ch-ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
-                'sec-ch-ua-mobile': '?0',
-                'sec-ch-ua-platform': '"macOS"',
-                'sec-fetch-dest': 'empty',
-                'sec-fetch-mode': 'cors',
-                'sec-fetch-site': 'same-origin',
-                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
-            }
-            response = requests.request("GET", url, headers=headers, data=payload)
-            return response.cookies.values()[0]
-
-        url = "https://admin.piaoquantv.com/manager/ad/own/white/rootShare/save"
-        dd = {
-            "rootShareId": root_share_id_,
-            "commit": "算法自动加入白名单--"
-        }
-        payload = json.dumps(dd)
-        cookie = get_cookie()
-        headers = {
-            'accept': 'application/json',
-            'accept-language': 'en',
-            'content-type': 'application/json;',
-            'cookie': "SESSION=" + cookie,
-            'origin': 'https://admin.piaoquantv.com',
-            'priority': 'u=1, i',
-            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
-        }
-        response = requests.request("POST", url, headers=headers, data=payload)
-        return response.json()['content']
-
-    def generate_source_id():
-        """
-        generate_source_id
-        :return:
-        """
-        timestamp = str(int(time.time() * 1000))
-        random_str = str(random.randint(1000, 9999))
-        hash_input = f"{timestamp}-{random_str}"
-        return hashlib.md5(hash_input.encode()).hexdigest()
-
-    root_share_id = str(uuid.uuid4())
-    source_id = "longArticles_" + generate_source_id()
-    url = f"pages/user-videos?id={video_id}&su={shared_uid}&fromGzh=1&rootShareId={root_share_id}&shareId={root_share_id}&rootSourceId={source_id}"
-    # 自动把 root_share_id 加入到白名单
-    auto_white(root_share_id)
-    return root_share_id, f"pages/category?jumpPage={urllib.parse.quote(url, safe='')}"
-
-
-obj = {"productionCover": "http://rescdn.yishihui.com/d3dba68c-0ab3-4f0c-858d-7248121833da?x-oss-process=image/resize,m_fill,w_600,h_480,limit_0/format,jpg/watermark,image_eXNoL3BpYy93YXRlcm1hcmtlci9pY29uX3BsYXlfd2hpdGUucG5nP3gtb3NzLXByb2Nlc3M9aW1hZ2UvcmVzaXplLHdfMTQ0,g_center", "productionName": "【揭秘】中国与该国无冤无仇,为何处处作对?专家:罪有应得🔥", "programAvatar": "https://rescdn.yishihui.com/0temp/lehuo.png", "programId": "wxe8f8f0e23cecad0f", "programName": "票圈乐活", "source": "Web", "rootShareId": "3b249e9e-dcdc-412b-9079-cb0df947128c", "productionPath": "pages/category?jumpPage=pages%2Fuser-videos%3Fid%3D20857581%26su%3D69637493%26fromGzh%3D1%26rootShareId%3D3b249e9e-dcdc-412b-9079-cb0df947128c%26shareId%3D3b249e9e-dcdc-412b-9079-cb0df947128c", "videoUrl": "http://rescdn.yishihui.com/7f0f3e2d-3006-4f40-9004-5ab871dd885f.mp4"}
-
-video_id = "20857581"
-share_uid = "69637493"
-root_id, path = create_gzh_path(video_id, share_uid)
-cover = obj["productionCover"]
-title = obj['productionName']
-
-print("封面:\t", cover)
-print("标题:\t", title)
-print("root_share_id:\t", root_id)
-print("SharePath: \t", path)

+ 5 - 3
test_req.py

@@ -9,8 +9,8 @@ url = "http://localhost:8888/videos"
 
 body = {
     "cate": "video_return",
-    "start_date": "2024-04-02",
-    "end_date": "2024-04-03",
+    "start_date": "2024-05-28",
+    "end_date": "2024-05-29",
     "topN": 10
 }
 a = time.time()
@@ -21,4 +21,6 @@ header = {
 response = requests.post(url, json=body, headers=header, timeout=600)
 b = time.time()
 print(b - a)
-print(json.dumps(response.json(), ensure_ascii=False, indent=4))
+print(json.dumps(response.json(), ensure_ascii=False, indent=4))
+with open("test.json", "w", encoding="utf-8") as f:
+    f.write(json.dumps(response.json(), ensure_ascii=False, indent=4))