luojunhui
/
alg


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
							"""
Process the raw data into the format expected by LightGBM.
"""

import sys
import os
import json
from tqdm import tqdm
import jieba.analyse

sys.path.append(os.getcwd())

from functions import generate_label_date, MysqlClient


class DataProcessor(object):
    """
    Build LightGBM-ready feature rows and labels from raw JSON dumps.

    :param flag: "train" or "predict"; selects the input files read by
        ``producer`` and is embedded in the names of the files it writes.
    """

    # Keys copied from each raw record into its feature row, in this order.
    _USEFUL_FEATURES = (
        "uid",
        "type",
        "channel",
        "fans",
        "view_count_user_30days",
        "share_count_user_30days",
        "return_count_user_30days",
        "rov_user",
        "str_user",
        "out_user_id",
        "mode",
        "out_play_cnt",
        "out_like_cnt",
        "out_share_cnt",
        "out_collection_cnt",
    )

    # flag -> (feature file, label file) consumed by ``producer``.
    _INPUT_PATHS = {
        "train": (
            "data/hour_train.json",
            "data/daily-label-20240101-20240320.json",
        ),
        "predict": (
            "prid_data/train_0314_0317.json",
            "data/daily-label-20240315-20240321.json",
        ),
    }

    # Directory that receives the generated x/y JSON files.
    _OUTPUT_DIR = "produce_data"

    def __init__(self, flag):
        self.client = MysqlClient()
        self.flag = flag

    def generate_train_label(self, item, y_ori_data, cate):
        """
        Build the label and the feature row for a single raw record.

        :param item: raw record; must contain "video_id", "dt" and every key
            listed in ``_USEFUL_FEATURES`` (a missing key raises ``KeyError``)
        :param y_ori_data: nested mapping
            ``{label_date: {video_id: {cate: value}}}``
        :param cate: name of the label field to read, e.g. "total_return"
        :return: ``(label, item_features)``. ``item_features`` holds the
            useful feature values followed by two keyword entries: the
            comma-joined TextRank keywords and the comma-joined TF-IDF
            keywords, or ``None, None`` when either keyword list is empty.
            ``label`` is the integer label value, or 0 when no label entry
            exists or its value is falsy.
        """
        video_id = item["video_id"]
        item_features = [item[name] for name in self._USEFUL_FEATURES]

        keywords_textrank, keywords_tf = self.title_processor(video_id)
        if keywords_tf and keywords_textrank:
            item_features.append(",".join(keywords_textrank))
            item_features.append(",".join(keywords_tf))
        else:
            # Keep the row width constant even when no keywords are available.
            item_features.extend([None, None])

        # Date key under which the label is looked up, derived from the
        # record's "dt" by generate_label_date.
        label_dt = generate_label_date(item["dt"])
        # NOTE(review): keys of a JSON-loaded mapping are always str, so an
        # int video_id would never match here -- confirm the id type stored
        # in the feature file.
        label_obj = y_ori_data.get(label_dt, {}).get(video_id)
        label = int(label_obj[cate]) if label_obj and label_obj[cate] else 0
        return label, item_features

    def title_processor(self, video_id):
        """
        Fetch the title of a video and extract keywords from it.

        Best effort: any failure (invalid id, database error, keyword
        extraction error) is printed and reported as two empty lists.

        :param video_id: the video id; must be convertible to ``int``
        :return: ``(textrank_keywords, tfidf_keywords)``, two lists with at
            most three keywords each; ``([], [])`` on failure
        """
        try:
            # The id is interpolated into the statement, so force it to an
            # integer first: this keeps the SQL identical for valid ids while
            # preventing arbitrary text from reaching the query.
            sql = f"""SELECT title from crawler_video where id = {int(video_id)};"""
            # NOTE(review): the result of select() is passed to jieba as-is;
            # confirm that it is the title text rather than a row set.
            title = self.client.select(sql)
            keywords_textrank = jieba.analyse.textrank(title, topK=3)
            keywords_tfidf = jieba.analyse.extract_tags(title, topK=3)
            return list(keywords_textrank), list(keywords_tfidf)
        except Exception as e:
            print(video_id, "\t", e)
            return [], []

    def producer(self):
        """
        Generate the feature and label files for the configured flag.

        Reads the feature and label JSON files selected by ``self.flag`` and,
        for every label category, writes
        ``produce_data/x_data_<cate>_<flag>.json`` and
        ``produce_data/y_data_<cate>_<flag>.json``. Does nothing when the
        flag is neither "train" nor "predict".

        :return: None
        """
        paths = self._INPUT_PATHS.get(self.flag)
        if paths is None:
            return
        x_path, y_path = paths

        # Explicit UTF-8: the data contains non-ASCII text and the output is
        # written with ensure_ascii=False, so the platform default encoding
        # must not be relied upon.
        with open(x_path, encoding="utf-8") as f:
            x_data = json.load(f)
        with open(y_path, encoding="utf-8") as f:
            y_data = json.load(f)

        # Create the output directory up front so a missing directory fails
        # (or is fixed) before the expensive processing, not after it.
        os.makedirs(self._OUTPUT_DIR, exist_ok=True)

        cate_list = ["total_return"]
        for c in cate_list:
            x_list = []
            y_list = []
            for video_obj in tqdm(x_data):
                our_label, features = self.generate_train_label(video_obj, y_data, c)
                x_list.append(features)
                y_list.append(our_label)

            x_out = "{}/x_data_{}_{}.json".format(self._OUTPUT_DIR, c, self.flag)
            with open(x_out, "w", encoding="utf-8") as f1:
                json.dump(x_list, f1, ensure_ascii=False)

            y_out = "{}/y_data_{}_{}.json".format(self._OUTPUT_DIR, c, self.flag)
            with open(y_out, "w", encoding="utf-8") as f2:
                json.dump(y_list, f2, ensure_ascii=False)


if __name__ == "__main__":
    # Manual smoke check: print both keyword lists for one hard-coded video id.
    processor = DataProcessor(flag="train")
    textrank_words, tfidf_words = processor.title_processor("19591529")
    print(textrank_words, tfidf_words)