migrate.py

  1. """
  2. @author: luojunhui
  3. """
  4. import json
  5. import pymysql
  6. import requests
  7. import pandas as pd
  8. from concurrent.futures.thread import ThreadPoolExecutor
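
# Reads a "2022 top 10000" CSV export, resolves each video's playback URL via
# the piaoquan long-video API, then inserts one row per video into MySQL.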


def request_for_info(video_id):
    """
    Fetch video info from the piaoquan long-video API.
    :param video_id: id of the video to look up
    :return: parsed JSON response body
    """
    url = "https://longvideoapi.piaoquantv.com/longvideoapi/openapi/video/batchSelectVideoInfo"
    data = {
        "videoIdList": [video_id]
    }
    header = {
        "Content-Type": "application/json",
    }
    # A timeout keeps a stalled request from hanging a worker thread forever.
    response = requests.post(url, headers=header, data=json.dumps(data), timeout=30)
    return response.json()
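
# The response is expected to carry records under "data"; process() below
# reads response["data"][0]["videoPath"] for the playback URL.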


def migrate_data_to_mysql(video_id, title, view_, return_, video_url):
    """
    Migrate one data_works row into the top_return_daily table.
    :param video_id: video id
    :param title: video title
    :param view_: view count
    :param return_: return count
    :param video_url: playback URL resolved via request_for_info
    :return: None
    """
    # Return-over-view ratio; guard against division by a zero view count.
    rov = int(return_) / int(view_) if int(view_) > 0 else 0
    insert_sql = """
        INSERT INTO top_return_daily
        (video_id, title, view_, return_, video_url, dt, rov)
        VALUES
        (%s, %s, %s, %s, %s, %s, %s);
    """
    connection = pymysql.connect(
        host='rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com',
        port=3306,
        user='crawler',
        password='crawler123456@',
        db='piaoquan-crawler',
        charset='utf8mb4'
    )
    try:
        cursor = connection.cursor()
        cursor.execute(
            insert_sql,
            (video_id, title, view_, return_, video_url, "20240715", rov)
        )
        connection.commit()
    finally:
        connection.close()
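
# Each call above opens and closes its own connection. That is heavier than a
# shared connection, but safe here: pymysql connections are not thread-safe,
# and migrate_data_to_mysql runs concurrently in the pool below.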


def process(line):
    """Handle one CSV row: resolve the playback URL, then insert the row."""
    title = line[0]
    video_id = line[1].replace('"', '')  # the id column is quoted in the export
    view = int(line[3])
    return_count = int(line[4])
    video_url = request_for_info(video_id)['data'][0]['videoPath']
    migrate_data_to_mysql(video_id, title, view, return_count, video_url)
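
# Each kept CSV row is: [0] title, [1] quoted video_id, [2] unused here,
# [3] view count, [4] return count.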

if __name__ == "__main__":
    path = "/Users/luojunhui/Downloads/2022-top10000.csv"
    with open(path, encoding="gbk", errors='ignore') as f:
        data = f.readlines()

    # Keep only well-formed five-column rows.
    rows = []
    for line in data:
        temp = line.replace("\n", "").split(",")
        if len(temp) == 5:
            rows.append(temp)

    # Migrate the rows concurrently; each worker does one API call and one insert.
    with ThreadPoolExecutor(max_workers=10) as pool:
        pool.map(process, rows)