Jelajahi Sumber

mysql.py 增加爬虫表数据库连接
process_data.py 增加 lop, 增加 duration
main_spider.py 修改为执行贝叶斯优化

罗俊辉 1 tahun lalu
induk
melakukan
84e9d55bef
4 mengubah file dengan 95 tambahan dan 27 penghapusan
  1. 1 1
      functions/__init__.py
  2. 50 6
      functions/mysql.py
  3. 23 19
      main_spider.py
  4. 21 1
      process_data.py

+ 1 - 1
functions/__init__.py

@@ -3,4 +3,4 @@ init file for functions
 """
 from .date import *
 from .odps_function import PyODPS
-from .mysql import MysqlClient
+from .mysql import MysqlClient, MySQLClientSpider

+ 50 - 6
functions/mysql.py

@@ -58,9 +58,53 @@ class MysqlClient(object):
         关闭连接
         """
         self.connection.close()
-#
-#
-# M = MysqlClient()
-# sql = "SELECT title from wx_video where id = '19591529';"
-# w = M.select(sql)
-# print(w)
+
+
+class MySQLClientSpider(object):
+    """
+    MySQL client for the crawler ("spider") database.
+
+    A single pymysql connection is opened on construction; call close()
+    when finished with it.
+    """
+
+    def __init__(self):
+        # NOTE(review): credentials are hard-coded in source; consider loading
+        # them from the environment or a config file kept out of version control.
+        mysql_config = {
+            "host": "rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com",  # database host (intranet address)
+            "port": 3306,  # port number
+            "user": "crawler",  # MySQL user name
+            "passwd": "crawler123456@",  # MySQL login password
+            "db": "piaoquan-crawler",  # database name
+            "charset": "utf8mb4"  # connection character set
+        }
+        # The dict keys are exactly the keyword arguments passed to connect().
+        self.connection = pymysql.connect(**mysql_config)
+
+    def select(self, sql):
+        """
+        Execute a query and return all of its rows.
+
+        :param sql: complete SQL statement to execute
+        :return: whatever cursor.fetchall() returns for the statement
+        """
+        # Using the cursor as a context manager closes it even when execute()
+        # raises, so repeated calls do not accumulate open cursors.
+        with self.connection.cursor() as cursor:
+            cursor.execute(sql)
+            return cursor.fetchall()
+
+    def close(self):
+        """
+        Close the underlying connection.
+        """
+        self.connection.close()
+
+
+if __name__ == "__main__":
+    # Manual smoke test. Guarded so that merely importing this module (the
+    # package __init__ imports it) does not open a database connection,
+    # run a query and print to stdout.
+    M = MySQLClientSpider()
+    video_id = "14126697"
+    sql = f"""SELECT like_cnt, play_cnt, duration from crawler_video where video_id = '{video_id}';"""
+    try:
+        w = M.select(sql)
+    finally:
+        # Release the connection whether or not the query succeeded.
+        M.close()
+    if w:
+        print(w[0])
+        a, b, c = w[0]
+        print(a, b, c)
+    else:
+        # An empty result would otherwise raise IndexError on w[0].
+        print("no row found for video_id", video_id)

+ 23 - 19
main_spider.py

@@ -31,6 +31,8 @@ class LightGBM(object):
             "out_play_cnt",
             "out_like_cnt",
             "out_share_cnt",
+            "lop",
+            "duration",
             "tag1",
             "tag2",
             "tag3"
@@ -39,11 +41,13 @@ class LightGBM(object):
         self.float_columns = [
             "out_play_cnt",
             "out_like_cnt",
-            "out_share_cnt"
+            "out_share_cnt",
+            "lop",
+            "duration"
         ]
         self.split_c = 0.7
         self.yc = 0.8
-        self.model = "lightgbm_0326_spider.bin"
+        self.model = "lightgbm_0327_spider.bin"
         self.flag = flag
         self.dt = dt
 
@@ -205,20 +209,20 @@ class LightGBM(object):
 
 
 if __name__ == "__main__":
-    i = int(input("输入 1 训练, 输入 2 预测:\n"))
-    if i == 1:
-        f = "train"
-        dt = "whole"
-        L = LightGBM(flag=f, dt=dt)
-        L.train_model()
-    elif i == 2:
-        f = "predict"
-        dt = int(input("输入日期, 16-21:\n"))
-        L = LightGBM(flag=f, dt=dt)
-        L.evaluate_model()
-        L.feature_importance()
-    # L = LightGBM("train", "whole")
-    # study = optuna.create_study(direction='maximize')
-    # study.optimize(L.bays_params, n_trials=100)
-    # print('Number of finished trials:', len(study.trials))
-    # print('Best trial:', study.best_trial.params)
+    # The interactive train / predict entry point below is disabled; the
+    # script now goes straight to the Optuna study further down.
+    # i = int(input("输入 1 训练, 输入 2 预测:\n"))
+    # if i == 1:
+    #     f = "train"
+    #     dt = "whole"
+    #     L = LightGBM(flag=f, dt=dt)
+    #     L.train_model()
+    # elif i == 2:
+    #     f = "predict"
+    #     dt = int(input("输入日期, 16-21:\n"))
+    #     L = LightGBM(flag=f, dt=dt)
+    #     L.evaluate_model()
+    #     L.feature_importance()
+    # Optuna study: maximize the objective L.bays_params over 100 trials,
+    # then report how many trials finished and the best trial's parameters.
+    L = LightGBM("train", "whole")
+    study = optuna.create_study(direction='maximize')
+    study.optimize(L.bays_params, n_trials=100)
+    print('Number of finished trials:', len(study.trials))
+    print('Best trial:', study.best_trial.params)

+ 21 - 1
process_data.py

@@ -10,7 +10,7 @@ import jieba.analyse
 
 sys.path.append(os.getcwd())
 
-from functions import generate_label_date, MysqlClient
+from functions import generate_label_date, MysqlClient, MySQLClientSpider
 
 
 class DataProcessor(object):
@@ -20,6 +20,7 @@ class DataProcessor(object):
 
     def __init__(self, flag, c="useful"):
         self.client = MysqlClient()
+        self.client_spider = MySQLClientSpider()
         self.flag = flag
         self.c = c
 
@@ -76,6 +77,9 @@ class DataProcessor(object):
             case "spider":
                 if item['type'] == "spider":
                     item_features = [item[i] for i in spider_features]
+                    lop, duration = self.cal_lop(video_id)
+                    item_features.append(lop)
+                    item_features.append(duration)
                 else:
                     return None, None
         keywords_textrank = self.title_processor(video_id)
@@ -89,6 +93,7 @@ class DataProcessor(object):
             item_features.append(None)
             item_features.append(None)
             item_features.append(None)
+
         label_dt = generate_label_date(dt)
         label_obj = y_ori_data.get(label_dt, {}).get(video_id)
         if label_obj:
@@ -112,6 +117,21 @@ class DataProcessor(object):
             print(video_id, "\t", e)
             return []
 
+    def cal_lop(self, video_id):
+        """
+        Look up a video's like count, play count and duration in the crawler
+        table and compute the smoothed like-over-play ratio ("lop").
+
+        Fixed constants are added to both numerator and denominator as
+        smoothing; for non-negative play counts this also keeps the
+        denominator away from zero.
+        :param video_id: id of the video to look up
+        :return: (lop, duration), or (0, 0) when the lookup or the
+                 computation raises (the error is printed)
+        """
+        query = f"""SELECT like_cnt, play_cnt, duration from crawler_video where video_id = '{video_id}';"""
+        try:
+            rows = self.client_spider.select(query)
+            # Only the first matching row is used; no row -> IndexError -> fallback.
+            likes, plays, duration = rows[0]
+            return (likes + 700) / (plays + 18000), duration
+        except Exception as e:
+            print(video_id, "\t", e)
+            return 0, 0
+
     def producer(self, dt):
         """
         生成数据