|
@@ -1,7 +1,7 @@
|
|
|
"""
|
|
|
process the data to satisfy the lightgbm
|
|
|
"""
|
|
|
-
|
|
|
+import datetime
|
|
|
import sys
|
|
|
import os
|
|
|
import json
|
|
@@ -11,6 +11,7 @@ import time
|
|
|
|
|
|
from tqdm import tqdm
|
|
|
import jieba.analyse
|
|
|
+import pandas as pd
|
|
|
|
|
|
sys.path.append(os.getcwd())
|
|
|
|
|
@@ -180,6 +181,14 @@ class SpiderProcess(object):
|
|
|
|
|
|
def __init__(self):
|
|
|
self.client_spider = MySQLClientSpider()
|
|
|
+ self.spider_features = [
|
|
|
+ "channel",
|
|
|
+ "out_user_id",
|
|
|
+ "mode",
|
|
|
+ "out_play_cnt",
|
|
|
+ "out_like_cnt",
|
|
|
+ "out_share_cnt"
|
|
|
+ ]
|
|
|
|
|
|
def spider_lop(self, video_id):
|
|
|
"""
|
|
@@ -201,7 +210,22 @@ class SpiderProcess(object):
|
|
|
把 spider_duration 存储到数据库中
|
|
|
:return:
|
|
|
"""
|
|
|
- return
|
|
|
+ select_sql = "SELECT video_id, title, channel, out_user_id, mode, out_play_cnt, out_like_cnt, out_share_cnt FROM lightgbm_data WHERE type = 'spider';"
|
|
|
+ data_list = self.client_spider.select(select_sql)
|
|
|
+ df = []
|
|
|
+ for line in data_list:
|
|
|
+ temp = list(line)
|
|
|
+ video_id = line[0]
|
|
|
+ title = line[1]
|
|
|
+ lop, duration = self.spider_lop(video_id)
|
|
|
+ title_tags = list(jieba.analyse.textrank(title, topK=3))
|
|
|
+ temp.append(lop)
|
|
|
+ temp.append(duration)
|
|
|
+ for i in range(3):
|
|
|
+ temp.append(title_tags[i] if i < len(title_tags) and title_tags[i] else None)  # textrank can yield fewer than topK keywords: pad missing/empty tags with None instead of raising IndexError
|
|
|
+ df.append(temp[1:])
|
|
|
+ df = pd.DataFrame(df, columns=['title', 'channel', 'out_user_id', 'mode', 'out_play_cnt', 'out_like_cnt', 'out_share_cnt', 'lop', 'duration', 'tag1', 'tag2', 'tag3'])
|
|
|
+ df.to_excel("data/train_data/spider_data_{}.xlsx".format(datetime.datetime.today().strftime("%y%m%d")))  # date-stamped output file; "%y" restores the year directive that was written as a literal "y"
|
|
|
|
|
|
|
|
|
class UserProcess(object):
|
|
@@ -264,8 +288,10 @@ if __name__ == "__main__":
|
|
|
# mode = args.mode
|
|
|
# category = args.category
|
|
|
# dtype = args.dtype
|
|
|
- D = DataProcessor()
|
|
|
- D.producer()
|
|
|
+ # D = DataProcessor()
|
|
|
+ # D.producer()
|
|
|
+ S = SpiderProcess()
|
|
|
+ S.spider_data_produce()
|
|
|
# if mode == "train":
|
|
|
# print("Loading data and process for training.....")
|
|
|
# D = DataProcessor(flag="train", ll=category)
|