|
@@ -10,7 +10,7 @@ import jieba.analyse
|
|
|
|
|
|
sys.path.append(os.getcwd())
|
|
|
|
|
|
-from functions import generate_label_date, MysqlClient
|
|
|
+from functions import generate_label_date, MysqlClient, MySQLClientSpider
|
|
|
|
|
|
|
|
|
class DataProcessor(object):
|
|
@@ -20,6 +20,7 @@ class DataProcessor(object):
|
|
|
|
|
|
def __init__(self, flag, c="useful"):
|
|
|
self.client = MysqlClient()
|
|
|
+ self.client_spider = MySQLClientSpider()
|
|
|
self.flag = flag
|
|
|
self.c = c
|
|
|
|
|
@@ -76,6 +77,9 @@ class DataProcessor(object):
|
|
|
case "spider":
|
|
|
if item['type'] == "spider":
|
|
|
item_features = [item[i] for i in spider_features]
|
|
|
+ lop, duration = self.cal_lop(video_id)
|
|
|
+ item_features.append(lop)
|
|
|
+ item_features.append(duration)
|
|
|
else:
|
|
|
return None, None
|
|
|
keywords_textrank = self.title_processor(video_id)
|
|
@@ -89,6 +93,7 @@ class DataProcessor(object):
|
|
|
item_features.append(None)
|
|
|
item_features.append(None)
|
|
|
item_features.append(None)
|
|
|
+
|
|
|
label_dt = generate_label_date(dt)
|
|
|
label_obj = y_ori_data.get(label_dt, {}).get(video_id)
|
|
|
if label_obj:
|
|
@@ -112,6 +117,21 @@ class DataProcessor(object):
|
|
|
print(video_id, "\t", e)
|
|
|
return []
|
|
|
|
|
|
def cal_lop(self, video_id, like_smooth=700, play_smooth=18000):
    """Return the smoothed like/play ratio and the duration of a video.

    Reads ``like_cnt``, ``play_cnt`` and ``duration`` for ``video_id`` from
    the ``crawler_video`` table through ``self.client_spider.select`` and
    computes ``(like_cnt + like_smooth) / (play_cnt + play_smooth)``.

    :param video_id: id of the video to look up
    :param like_smooth: value added to the like count before dividing
    :param play_smooth: value added to the play count before dividing
    :return: ``(lop, duration)`` on success; ``(0, 0)`` when no row is
        found, a count is NULL, the smoothed denominator is zero, or the
        query fails for any other reason (the reason is printed)
    """
    # NOTE(review): video_id is interpolated directly into the statement.
    # If it can ever come from untrusted input, switch to a parameterized
    # query -- the signature of client_spider.select is not visible from
    # this method, so the statement text is left unchanged here.
    sql = f"""SELECT like_cnt, play_cnt, duration from crawler_video where video_id = '{video_id}';"""
    fallback = (0, 0)
    try:
        rows = self.client_spider.select(sql)
        if not rows:
            print(video_id, "\t", "no crawler_video row found")
            return fallback

        # Assumes each row is a 3-item sequence in SELECT column order.
        like_cnt, play_cnt, duration = rows[0]
        if like_cnt is None or play_cnt is None:
            print(video_id, "\t", "like_cnt or play_cnt is NULL")
            return fallback

        denominator = play_cnt + play_smooth
        if denominator == 0:
            # Explicit guard instead of relying on ZeroDivisionError.
            print(video_id, "\t", "smoothed play count is zero")
            return fallback

        lop = (like_cnt + like_smooth) / denominator
    except Exception as e:
        # Best-effort feature: any lookup failure degrades to the fallback
        # instead of aborting the caller.
        print(video_id, "\t", e)
        return fallback
    return lop, duration
|
|
|
+
|
|
|
def producer(self, dt):
|
|
|
"""
|
|
|
生成数据
|