浏览代码

generate label for mysql

罗俊辉 1 年之前
父节点
当前提交
5f3e1e086b
共有 1 个文件被更改，包括 30 次插入和 4 次删除
  1. 30 4
      process_data.py

+ 30 - 4
process_data.py

@@ -1,7 +1,7 @@
 """
 process the data to satisfy the lightgbm
 """
-
+import datetime
 import sys
 import os
 import json
@@ -11,6 +11,7 @@ import time
 
 from tqdm import tqdm
 import jieba.analyse
+import pandas as pd
 
 sys.path.append(os.getcwd())
 
@@ -180,6 +181,14 @@ class SpiderProcess(object):
 
     def __init__(self):
         self.client_spider = MySQLClientSpider()
+        self.spider_features = [  # six column names; the same names are selected from lightgbm_data in this class
+            "channel",
+            "out_user_id",
+            "mode",
+            "out_play_cnt",
+            "out_like_cnt",
+            "out_share_cnt"
+        ]  # NOTE(review): no read of spider_features is visible in this diff -- confirm where it is consumed
 
     def spider_lop(self, video_id):
         """
@@ -201,7 +210,22 @@ class SpiderProcess(object):
         把 spider_duration 存储到数据库中
         :return:
         """
-        return
+        select_sql = "SELECT video_id, title, channel, out_user_id, mode, out_play_cnt, out_like_cnt, out_share_cnt FROM lightgbm_data WHERE type = 'spider';"
+        data_list = self.client_spider.select(select_sql)
+        df = []
+        for line in data_list:
+            temp = list(line)
+            video_id = line[0]
+            title = line[1]
+            lop, duration = self.spider_lop(video_id)
+            title_tags = list(jieba.analyse.textrank(title, topK=3))
+            temp.append(lop)
+            temp.append(duration)
+            for i in range(3):
+                temp.append(title_tags[i] if title_tags[i] else None)
+            df.append(temp[1:])
+        df = pd.DataFrame(df, columns=['title', 'channel', 'out_user_id', 'mode', 'out_play_cnt', 'out_like_cnt', 'out_share_cnt', 'lop', 'duration', 'tag1', 'tag2', 'tag3'])
+        df.to_excel("data/train_data/spider_data_{}.xlsx".format(datetime.datetime.today().strftime("y%m%d")))
 
 
 class UserProcess(object):
@@ -264,8 +288,10 @@ if __name__ == "__main__":
     # mode = args.mode
     # category = args.category
     # dtype = args.dtype
-    D = DataProcessor()
-    D.producer()
+    # D = DataProcessor()
+    # D.producer()
+    S = SpiderProcess()
+    S.spider_data_produce()
     # if mode == "train":
     #     print("Loading data and process for training.....")
     #     D = DataProcessor(flag="train", ll=category)