1 tahun lalu · 84e9d55bef
--- a/functions/__init__.py
+++ b/functions/__init__.py
@@ -3,4 +3,4 @@ init file for functions
 
				 """
			
 
				 from .date import *
			
 
				 from .odps_function import PyODPS
			
 
				-from .mysql import MysqlClient
			
 
				+from .mysql import MysqlClient, MySQLClientSpider
			
--- a/functions/mysql.py
+++ b/functions/mysql.py
@@ -58,9 +58,53 @@ class MysqlClient(object):
 
				         关闭连接
			
 
				         """
			
 
				         self.connection.close()
			
 
				-#
			
 
				-#
			
 
				-# M = MysqlClient()
			
 
				-# sql = "SELECT title from wx_video where id = '19591529';"
			
 
				-# w = M.select(sql)
			
 
				-# print(w)
			
 
				+
			
 
				+
			
 
				+class MySQLClientSpider(object):
				+    """
				+    MySQL client for the crawler ("spider") database.
				+
				+    A single pymysql connection is opened when the object is created and
				+    stays open until close() is called.
				+    """
				+
				+    def __init__(self):
				+        """
				+        Open the connection to the crawler database.
				+
				+        Each setting can be overridden through an environment variable
				+        (SPIDER_MYSQL_HOST, SPIDER_MYSQL_PORT, SPIDER_MYSQL_USER,
				+        SPIDER_MYSQL_PASSWORD, SPIDER_MYSQL_DB).
				+        """
				+        # Imported locally because the module's import block is not visible here.
				+        import os
				+
				+        # NOTE(review): these credentials were committed in source. The literals
				+        # are kept only as fallbacks so existing callers keep working; rotate the
				+        # password and delete the fallbacks once the variables are deployed.
				+        mysql_config = {
				+            # Database host (intranet address).
				+            "host": os.environ.get(
				+                "SPIDER_MYSQL_HOST",
				+                "rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com",
				+            ),
				+            # TCP port; environment values arrive as strings, so convert.
				+            "port": int(os.environ.get("SPIDER_MYSQL_PORT", 3306)),
				+            # MySQL user name.
				+            "user": os.environ.get("SPIDER_MYSQL_USER", "crawler"),
				+            # MySQL login password.
				+            "passwd": os.environ.get("SPIDER_MYSQL_PASSWORD", "crawler123456@"),
				+            # Database (schema) name.
				+            "db": os.environ.get("SPIDER_MYSQL_DB", "piaoquan-crawler"),
				+            # Connection character set.
				+            "charset": "utf8mb4",
				+        }
				+        self.connection = pymysql.connect(**mysql_config)
				+
				+    def select(self, sql, params=None):
				+        """
				+        Run a query and return every row.
				+
				+        :param sql: SQL text; may contain %s placeholders when params is given
				+        :param params: optional sequence/dict of values bound by the driver,
				+                       which avoids building SQL through string formatting
				+        :return: all rows as returned by cursor.fetchall()
				+        """
				+        # The context manager closes the cursor even if execute() raises.
				+        with self.connection.cursor() as cursor:
				+            cursor.execute(sql, params)
				+            return cursor.fetchall()
				+
				+    def close(self):
				+        """
				+        Close the underlying connection.
				+        """
				+        self.connection.close()
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
				+    # Manual smoke test for MySQLClientSpider. It is guarded so that importing
				+    # this module (e.g. via `from .mysql import ...`) no longer opens a database
				+    # connection, runs a query and prints at import time.
				+    M = MySQLClientSpider()
				+    try:
				+        video_id = "14126697"
				+        sql = f"""SELECT like_cnt, play_cnt, duration from crawler_video where video_id = '{video_id}';"""
				+        w = M.select(sql)
				+        if w:
				+            print(w[0])
				+            a, b, c = w[0]
				+            print(a, b, c)
				+        else:
				+            # An empty result would otherwise fail on w[0] with IndexError.
				+            print(video_id, "\t", "no row found")
				+    finally:
				+        # Always release the connection, even when the query fails.
				+        M.close()
			
--- a/main_spider.py
+++ b/main_spider.py
@@ -31,6 +31,8 @@ class LightGBM(object):
 
				             "out_play_cnt",
			
 
				             "out_like_cnt",
			
 
				             "out_share_cnt",
			
 
				+            "lop",
			
 
				+            "duration",
			
 
				             "tag1",
			
 
				             "tag2",
			
 
				             "tag3"
			
@@ -39,11 +41,13 @@ class LightGBM(object):
 
				         self.float_columns = [
			
 
				             "out_play_cnt",
			
 
				             "out_like_cnt",
			
 
				-            "out_share_cnt"
			
 
				+            "out_share_cnt",
			
 
				+            "lop",
			
 
				+            "duration"
			
 
				         ]
			
 
				         self.split_c = 0.7
			
 
				         self.yc = 0.8
			
 
				-        self.model = "lightgbm_0326_spider.bin"
			
 
				+        self.model = "lightgbm_0327_spider.bin"
			
 
				         self.flag = flag
			
 
				         self.dt = dt
			
 
				 
			
@@ -205,20 +209,20 @@ class LightGBM(object):
 
				 
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				-    i = int(input("输入 1 训练， 输入 2 预测:\n"))
			
 
				-    if i == 1:
			
 
				-        f = "train"
			
 
				-        dt = "whole"
			
 
				-        L = LightGBM(flag=f, dt=dt)
			
 
				-        L.train_model()
			
 
				-    elif i == 2:
			
 
				-        f = "predict"
			
 
				-        dt = int(input("输入日期， 16-21:\n"))
			
 
				-        L = LightGBM(flag=f, dt=dt)
			
 
				-        L.evaluate_model()
			
 
				-        L.feature_importance()
			
 
				-    # L = LightGBM("train", "whole")
			
 
				-    # study = optuna.create_study(direction='maximize')
			
 
				-    # study.optimize(L.bays_params, n_trials=100)
			
 
				-    # print('Number of finished trials:', len(study.trials))
			
 
				-    # print('Best trial:', study.best_trial.params)
			
 
				+    # i = int(input("输入 1 训练， 输入 2 预测:\n"))
			
 
				+    # if i == 1:
			
 
				+    #     f = "train"
			
 
				+    #     dt = "whole"
			
 
				+    #     L = LightGBM(flag=f, dt=dt)
			
 
				+    #     L.train_model()
			
 
				+    # elif i == 2:
			
 
				+    #     f = "predict"
			
 
				+    #     dt = int(input("输入日期， 16-21:\n"))
			
 
				+    #     L = LightGBM(flag=f, dt=dt)
			
 
				+    #     L.evaluate_model()
			
 
				+    #     L.feature_importance()
			
 
				+    # Hyper-parameter search: build the LightGBM wrapper with "train" / "whole",
				+    # then let Optuna maximise the objective L.bays_params over 100 trials.
				+    L = LightGBM("train", "whole")
			
 
				+    study = optuna.create_study(direction='maximize')
			
 
				+    study.optimize(L.bays_params, n_trials=100)
			
 
				+    # Report how many trials ran and the parameters of the best one.
				+    print('Number of finished trials:', len(study.trials))
			
 
				+    print('Best trial:', study.best_trial.params)
			
--- a/process_data.py
+++ b/process_data.py
@@ -10,7 +10,7 @@ import jieba.analyse
 
				 
			
 
				 sys.path.append(os.getcwd())
			
 
				 
			
 
				-from functions import generate_label_date, MysqlClient
			
 
				+from functions import generate_label_date, MysqlClient, MySQLClientSpider
			
 
				 
			
 
				 
			
 
				 class DataProcessor(object):
			
@@ -20,6 +20,7 @@ class DataProcessor(object):
 
				 
			
 
				     def __init__(self, flag, c="useful"):
			
 
				         self.client = MysqlClient()
			
 
				+        self.client_spider = MySQLClientSpider()
			
 
				         self.flag = flag
			
 
				         self.c = c
			
 
				 
			
@@ -76,6 +77,9 @@ class DataProcessor(object):
 
				             case "spider":
			
 
				                 if item['type'] == "spider":
			
 
				                     item_features = [item[i] for i in spider_features]
			
 
				+                    lop, duration = self.cal_lop(video_id)
			
 
				+                    item_features.append(lop)
			
 
				+                    item_features.append(duration)
			
 
				                 else:
			
 
				                     return None, None
			
 
				         keywords_textrank = self.title_processor(video_id)
			
@@ -89,6 +93,7 @@ class DataProcessor(object):
 
				             item_features.append(None)
			
 
				             item_features.append(None)
			
 
				             item_features.append(None)
			
 
				+
			
 
				         label_dt = generate_label_date(dt)
			
 
				         label_obj = y_ori_data.get(label_dt, {}).get(video_id)
			
 
				         if label_obj:
			
@@ -112,6 +117,21 @@ class DataProcessor(object):
 
				             print(video_id, "\t", e)
			
 
				             return []
			
 
				 
			
 
				+    def cal_lop(self, video_id):
				+        """
				+        Look up a video in the crawler table and compute its smoothed
				+        like-over-play ratio.
				+
				+        The ratio is (like_cnt + 700) / (play_cnt + 18000); the added constants
				+        smooth the value and keep the denominator away from zero.
				+
				+        :param video_id: id of the video to look up in crawler_video
				+        :return: tuple (lop, duration); (0, 0) when the lookup or the
				+                 computation fails for any reason
				+        """
				+        query = f"""SELECT like_cnt, play_cnt, duration from crawler_video where video_id = '{video_id}';"""
				+        try:
				+            rows = self.client_spider.select(query)
				+            # Only the first matching row is used.
				+            likes, plays, length = rows[0]
				+            return (likes + 700) / (plays + 18000), length
				+        except Exception as err:
				+            # Best effort: report the failing id and fall back to zeros.
				+            print(video_id, "\t", err)
				+            return 0, 0
			
 
				+
			
 
				     def producer(self, dt):
			
 
				         """
			
 
				         生成数据