@@ -0,0 +1,311 @@
+import warnings
+
+# Silence warnings before the heavier third-party imports below.
+warnings.filterwarnings("ignore")
+
+import datetime
+import gc
+import math
+import os
+import pickle
+import time
+from datetime import datetime as dt
+
+import lightgbm as lgb
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from odps import ODPS
+from odps.df import DataFrame as odpsdf
+from scipy import sparse
+from scipy.sparse import hstack
+from sklearn import metrics
+from sklearn.linear_model import SGDClassifier, SGDRegressor
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.model_selection import StratifiedKFold, train_test_split
+from sklearn.preprocessing import MultiLabelBinarizer
+
+import process_feature
+import process_tag
+
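+# Overall flow (a sketch inferred from the functions below): pull daily feature
+# partitions from ODPS, derive a log-transformed 7-day-return target, assemble a
+# sparse matrix of dense value features, videoid one-hot and tag/word TF-IDF
+# features, then train a 4-fold LightGBM regressor and score the latest day.
+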
+def getRovfeaturetable(dt, table):
+    # Read one daily partition of the feature table from ODPS. The credentials
+    # are read from the environment instead of being hardcoded; ODPS_ACCESS_ID
+    # and ODPS_ACCESS_KEY are assumed to be set by the caller.
+    odps = ODPS(os.environ['ODPS_ACCESS_ID'], os.environ['ODPS_ACCESS_KEY'], 'usercdm',
+                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000,
+                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
+
+    featureArray = []
+    for record in odps.read_table(table, partition='dt=%s' % dt):
+        valueFeature = {}
+        for i in process_feature.featurename:
+            if i == 'dt':
+                valueFeature[i] = dt
+            else:
+                valueFeature[i] = record[i]
+        featureArray.append(valueFeature)
+    featureArray = pd.DataFrame(featureArray)
+    print(dt, table, 'feature table finished')
+    return featureArray
+
+def getdatasample(date, max_range, table):
+    # Concatenate `max_range` daily partitions, walking backwards from `date`.
+    new_date = dt.strptime(date, '%Y%m%d')
+    datelist = []
+    testlist = []
+    for i in range(max_range):
+        delta = datetime.timedelta(days=i)
+        tar_dt = new_date - delta
+        datelist.append(tar_dt.strftime("%Y%m%d"))
+    for tm in datelist:
+        testlist.append(getRovfeaturetable(tm, table))
+    testdata = pd.concat(testlist)
+    testdata.reset_index(drop=True, inplace=True)
+    return testdata
+
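+# For example (dates are illustrative), getdatasample('20200305', 3, 'rov_feature_add_v1')
+# would stack the partitions dt=20200305, dt=20200304 and dt=20200303 into one frame.
+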
+def select_recent_video(df):
+    """Rank each video's rows by date and keep only the most recent day."""
+    df['dt'] = df['dt'].astype(int)
+    df['rk'] = df['dt'].groupby(df['videoid']).rank(ascending=False, method='first')
+    df = df[df['rk'] == 1]
+    return df
+
+def basic_cal(df):
+    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
+    # log1p-transform the return count; this is the regression target.
+    df['weighted_retrn_log'] = np.log1p(df['weighted_retrn'])
+    # Binary flag: did the video get any return at all (used for stratification).
+    df['return_back'] = (df['weighted_retrn'] > 0).astype(int)
+    return df
+
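+# Note: because the target is log1p(return), the RMSE/MAPE/R2 reported in
+# do_train() are on the log scale; np.expm1(prediction) recovers a count-scale
+# estimate if one is needed downstream.
+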
+def dataprepare(df_pre):
+    # Feed the features in directly, without adding crossed features.
+    df_pre = df_pre.fillna(0)
+    # df_new_feature = df_pre[process_feature.features]
+    df_new_feature = df_pre[process_feature.filter_recent_features()]
+    df_target = df_pre['weighted_retrn_log']
+    df_new_feature = pd.concat([df_new_feature, df_pre[process_feature.cate_feat],
+                                df_pre[process_feature.one_hot_feature]], axis=1)
+    return df_new_feature, df_target
+
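+# The returned frame is laid out as [recent value features | cate_feat |
+# one_hot_feature]. Downstream, process_train_predict_data() slices it by the
+# label range 'day1playcount':'videocategory555' to form the dense value block,
+# so that column span is assumed to cover the intended numeric features.
+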
+def featureImportance(fold1_df, fold2_df, fold3_df, fold4_df,
+                      values_lenth, video_id_lenth, tag_length, word_length):
+    # Collapse the one-hot blocks (videoid / tags / words) of each fold's
+    # importance table into single aggregate rows.
+    Feature_Data = pd.DataFrame()
+    for fold_num, df in enumerate((fold1_df, fold2_df, fold3_df, fold4_df), start=1):
+        values_df = df.iloc[0:values_lenth, :]
+        videoid_importance = df.iloc[values_lenth:values_lenth + video_id_lenth, :]['importance'].sum()
+        videoid_df = pd.DataFrame([{'Feature': 'videoid', 'importance': videoid_importance, 'fold': fold_num}])
+        tag_importance = df.iloc[values_lenth + video_id_lenth:values_lenth + video_id_lenth + tag_length, :]['importance'].sum()
+        tag_df = pd.DataFrame([{'Feature': 'tags', 'importance': tag_importance, 'fold': fold_num}])
+        words_importance = df.iloc[values_lenth + video_id_lenth + tag_length:values_lenth + video_id_lenth + tag_length + word_length, :]['importance'].sum()
+        words_df = pd.DataFrame([{'Feature': 'words', 'importance': words_importance, 'fold': fold_num}])
+
+        Feature_Data = pd.concat([Feature_Data, values_df, videoid_df, tag_df, words_df])
+
+    return Feature_Data
+
+def MAPE(true, pred):
+    # Mean absolute percentage error, skipping rows whose true value is zero
+    # (the percentage error is undefined there).
+    true = np.array(true)
+    sum_ = 0
+    count = 0
+    for i in range(len(true)):
+        if true[i] != 0:
+            sum_ = sum_ + np.abs(true[i] - pred[i]) / true[i]
+            count = count + 1
+        else:
+            continue
+
+    return sum_ / count
+
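+# A vectorized equivalent (a sketch, not used below; true/pred as numpy arrays):
+#   mask = true != 0
+#   mape = np.mean(np.abs(true[mask] - pred[mask]) / true[mask])
+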
+def process_train_predict_data():
+    now_date = datetime.date.today()
+    # day = datetime.datetime.strftime(now_date, '%Y%m%d')
+    diff_1 = datetime.timedelta(days=1)
+    diff_7 = datetime.timedelta(days=7)
+    predict_dt = now_date - diff_1
+    predict_day = datetime.datetime.strftime(predict_dt, '%Y%m%d')
+    train_dt = now_date - diff_7
+    train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')
+
+    # Read data from Aliyun ODPS: 30 days of training features and the single
+    # day to be scored.
+    train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
+    predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
+
+    # Cache the raw pulls locally so test runs can reload them instead of
+    # hitting ODPS again (see the commented-out block below).
+    with open('train_data.pickle', 'wb') as output_file:
+        pickle.dump(train_data, output_file)
+    with open('predict_data.pickle', 'wb') as output_file:
+        pickle.dump(predict_data, output_file)
+    '''
+    with open(r"train_data.pickle", "rb") as input_file:
+        train_data = pickle.load(input_file)
+    with open(r"predict_data.pickle", "rb") as input_file:
+        predict_data = pickle.load(input_file)
+    '''
+    # end pickle
+    train_data = basic_cal(train_data)
+    predict_data = basic_cal(predict_data)
+
+    # Keep only the latest snapshot of each video on the predict side.
+    predict_data = select_recent_video(predict_data)
+    # predict_data.loc[predict_data['dt'] != int(predict_day), 'futre7dayreturn'] = 0
+    predict_data = predict_data.drop(axis=1, columns='rk')
+
+    train_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
+    predict_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
+
+    train_data = train_data.fillna(0)
+    predict_data = predict_data.fillna(0)
+    train_data = process_feature.cal_feature(train_data)
+    predict_data = process_feature.cal_feature(predict_data)
+
+    predict_data['videoid'] = predict_data['videoid'].astype('int')
+
+    df_new_feature, df_target = dataprepare(train_data)
+    df_new_feature_predict, df_target_predict = dataprepare(predict_data)
+
+    # Dense value-feature block, sliced by column range and converted to CSR.
+    df_new_feature_part_one = sparse.csr_matrix(
+        np.array(df_new_feature.loc[:, 'day1playcount':'videocategory555']))
+    df_new_feature_predict_part_one = sparse.csr_matrix(
+        np.array(df_new_feature_predict.loc[:, 'day1playcount':'videocategory555']))
+
+    print('value feature generated successfully')
+
+    train_videoid = df_new_feature.loc[:, 'videoid']
+    predict_videoid = df_new_feature_predict.loc[:, 'videoid']
+
+    # MultiLabelBinarizer expects an iterable of label collections, so each
+    # videoid is wrapped in a single-element list.
+    train_videoid_list = train_videoid.to_numpy().reshape(-1, 1).tolist()
+    predict_videoid_list = predict_videoid.to_numpy().reshape(-1, 1).tolist()
+
+    allvideo_raw = list(set(np.array(pd.concat([train_videoid, predict_videoid])).tolist()))
+    allvideo = np.array(allvideo_raw).reshape(len(allvideo_raw), 1).tolist()
+
+    # Fit on the union of train and predict ids so both transform calls share
+    # one column space, then one-hot encode each side as a sparse matrix.
+    mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
+    train_videoid = mlb_model_videoid.transform(train_videoid_list)
+    predict_videoid = mlb_model_videoid.transform(predict_videoid_list)
+
+    print('videoid feature generated successfully')
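+
+    # MultiLabelBinarizer sketch: fitting on [[1], [2], [3]] and transforming
+    # [[2]] yields the sparse row [0, 1, 0], i.e. one indicator column per
+    # known videoid.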
+
+    # Tag one-hot encodings
+    tags, train_tag, predict_tag = process_tag.tag_preprocessing('tag', df_new_feature, df_new_feature_predict)
+    # Tag TF-IDF weights
+    tag_dict = process_tag.get_tag_tfidf('20200305', 'video_tag_tf_idf')
+    print('length tag_dict:', len(tag_dict))
+    # Build the tag TF-IDF sparse matrix
+    tag_corpus = tags.tolist()  # corpus
+    tag_tfidf_list = process_tag.ttfidf_list_generation(tag_corpus, tag_dict)
+    tag_tf_idf_matrix = sparse.csr_matrix(np.array(tag_tfidf_list))
+
+    # Scale each one-hot tag column by its TF-IDF weight.
+    tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)
+    tag_feature_test = predict_tag.multiply(tag_tf_idf_matrix)
+    print('tag tfidf feature generated successfully')
+    print('tag dimension:', len(tag_tfidf_list))
+
+    # Word (non-tag) one-hot encodings
+    words, train_words, test_words = process_tag.tag_preprocessing('words_no_tag', df_new_feature, df_new_feature_predict)
+    # Word TF-IDF weights
+    words_dict = process_tag.get_tag_tfidf('20200305', 'video_words_without_tags_tfidf')
+    print('length words_dict:', len(words_dict))
+    # Build the words TF-IDF sparse matrix
+    words_corpus = words.tolist()  # corpus
+    words_tfidf_list = process_tag.ttfidf_list_generation(words_corpus, words_dict)
+    words_tf_idf_matrix = sparse.csr_matrix(np.array(words_tfidf_list))
+    words_feature_train = train_words.multiply(words_tf_idf_matrix)
+    words_feature_test = test_words.multiply(words_tf_idf_matrix)
+    print('words tfidf feature generated successfully')
+    print('words dimension:', len(words_tfidf_list))
+
+    df_new_feature = hstack([df_new_feature_part_one, train_videoid, tag_feature_train, words_feature_train])
+    df_new_feature_predict = hstack([df_new_feature_predict_part_one, predict_videoid, tag_feature_test, words_feature_test])
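+    # Final sparse layout, in column order: [value features | videoid one-hot |
+    # tag TF-IDF | word TF-IDF]. The commented-out featureImportance() call in
+    # do_train() appears to assume exactly this block ordering when it slices
+    # by the values/videoid/tag/word lengths.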
+    return train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict
+
+
+def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict):
+    # Densify the regression targets into (n, 1) arrays for the metrics below.
+    df_target_predict = pd.DataFrame(df_target_predict).values
+    df_target = pd.DataFrame(df_target).values
+
+    param = {'num_leaves': 18,
+             'min_data_in_leaf': 60,
+             'objective': 'regression',
+             'max_depth': -1,
+             'learning_rate': 0.01,
+             'min_child_samples': 30,
+             'boosting': 'gbdt',
+             'feature_fraction': 0.8,
+             'bagging_freq': 1,
+             'bagging_fraction': 0.8,
+             'bagging_seed': 11,
+             'metric': 'rmse',
+             'lambda_l1': 0.1,
+             'verbosity': -1,
+             'nthread': 4,
+             # 'max_bin': 512,
+             'random_state': 4590}
+
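+    # StratifiedKFold needs a class label, so the folds are stratified on the
+    # binary return_back flag even though the model is a regressor; this keeps
+    # the zero-return / nonzero-return mix similar across folds.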
+    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
+    # oof = np.zeros(len(pd.DataFrame(df_new_feature.toarray())))
+    oof = np.zeros(len(df_target))
+    predictions = np.zeros(len(df_target_predict))
+    feature_importance_df = pd.DataFrame()
+
+    # values_lenth = len(process_feature.features + process_feature.cate_feat)
+    # video_id_lenth = len(mlb_model_videoid.classes_)
+    # tag_length = len(tag_tfidf_list)
+    # word_length = len(words_tfidf_list)
+
+    # Unused debugging view; kept commented out because it materializes the
+    # full dense predict matrix in memory.
+    # change_view = pd.DataFrame(df_new_feature_predict.toarray()).sort_index()
+
+    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, train_data['return_back'].values)):
+        print("fold {}".format(fold_))
+        trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx, :], label=pd.DataFrame(df_target).iloc[trn_idx])
+        val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx, :], label=pd.DataFrame(df_target).iloc[val_idx])
+
+        num_round = 10000
+        # verbose_eval / early_stopping_rounds are the pre-4.0 LightGBM train()
+        # arguments; newer versions expect the equivalent callbacks instead.
+        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100,
+                        early_stopping_rounds=200)
+        oof[val_idx] = clf.predict(df_new_feature.tocsr()[val_idx, :], num_iteration=clf.best_iteration)
+        predictions += clf.predict(df_new_feature_predict, num_iteration=clf.best_iteration) / folds.n_splits
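+
+        # Each sample's out-of-fold (oof) prediction comes from the one model
+        # that did not train on it, while the test predictions are averaged
+        # over all four fold models.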
+
+        fold_importance_df = pd.DataFrame()
+
+        # column = process_feature.features + process_feature.cate_feat + mlb_model_videoid.classes_.tolist() + tag_corpus + words_corpus
+        # fold_importance_df["Feature"] = np.array(column)
+        # fold_importance_df["importance"] = clf.feature_importance()
+        # fold_importance_df["fold"] = fold_ + 1
+        # feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
+
+    # fold1_df = feature_importance_df.loc[feature_importance_df['fold'] == 1]
+    # fold2_df = feature_importance_df.loc[feature_importance_df['fold'] == 2]
+    # fold3_df = feature_importance_df.loc[feature_importance_df['fold'] == 3]
+    # fold4_df = feature_importance_df.loc[feature_importance_df['fold'] == 4]
+
+    # feature_importance_df = featureImportance(fold1_df, fold2_df, fold3_df, fold4_df, values_lenth, video_id_lenth, tag_length, word_length)
+
+
|
|
|
+ print('oof_rmse:', np.sqrt(mean_squared_error(df_target, oof)))
|
|
|
+ print('oof_mse:', mean_squared_error(df_target, oof))
|
|
|
+
|
|
|
+ print('test_rmse:', np.sqrt(mean_squared_error(df_target_predict, predictions)))
|
|
|
+ print('test_mse:', mean_squared_error(df_target_predict, predictions))
|
|
|
+
|
|
|
+
|
|
|
+ print('oof_mape:', MAPE(df_target, oof))
|
|
|
+ print('test_mape:', MAPE(df_target_predict, predictions))
|
|
|
+
|
|
|
+ print('verification r2:', r2_score(df_target, oof))
|
|
|
+ print('test r2:', r2_score(df_target_predict, predictions))
|
|
|
+
|
|
|
+ sub_df_ = pd.DataFrame({"videoid": predict_data["videoid"].values})
|
|
|
+ sub_df_['score'] = predictions
|
|
|
+ print('regre ranking shape', sub_df_.shape)
|
|
|
+ sub_df_.to_csv('result.csv')
|
|
|
+
+if __name__ == '__main__':
+    train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict = process_train_predict_data()
+    do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict)