baichongyang 3 years ago
parent
commit
bf6f6f4327
7 changed files with 911 additions and 9 deletions
  1. .gitignore  +3 -1
  2. meta_data_operation.py  +323 -0
  3. nohup.out  +252 -0
  4. process_feature.py  +7 -0
  5. rov_train.py  +9 -8
  6. rov_train2.py  +311 -0
  7. sort.py  +6 -0

+ 3 - 1
.gitignore

@@ -24,7 +24,9 @@ var/
 *.egg-info/
 .installed.cfg
 *.egg
-
+*pickle
+*.swp
+*.csv
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.

+ 323 - 0
meta_data_operation.py

@@ -0,0 +1,323 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import random
+import numpy as np
+import os
+import datetime
+import json
+import oss2
+import hashlib
+import pandas as pd
+from odps import ODPS
+import time
+from tqdm import tqdm
+import pickle
+import langid
+import smtplib
+from email.mime.text import MIMEText
+from email.header import Header
+from rov_to_redis import toRedis
+
+
+def sendemail():
+    mail_host="smtp.exmail.qq.com"  #设置服务器
+    mail_user="warning@piaoquantv.com"    #用户名
+    mail_pass="Wq20160722"   #口令 
+
+    sender = 'warning@piaoquantv.com'
+    receivers = ['liqian@piaoquantv.com','liuchunlin@piaoquantv.com']  # 接收邮件,可设置为你的QQ邮箱或者其他邮箱
+
+    # 三个参数:第一个为文本内容,第二个 plain 设置文本格式,第三个 utf-8 设置编码
+    message = MIMEText('小程序ROV更新失败', 'plain', 'utf-8')
+    message['From'] = Header("warning@piaoquantv.com", 'utf-8')  
+    message['To'] =  Header("xiaoping@piaoquantv.com", 'utf-8')        
+
+    subject = '小程序ROV首屏更新预警'
+    message['Subject'] = Header(subject, 'utf-8')
+
+    try:
+        with smtplib.SMTP_SSL(host="smtp.exmail.qq.com",port=465) as smtp:
+            # log in to the mail server
+            smtp.login(user = mail_user, password = mail_pass)
+            # send to the actual recipient list
+            smtp.sendmail(sender, receivers, message.as_string())
+            print("email sent successfully")
+    except smtplib.SMTPException:
+        print("Error: failed to send email")
+
+
+now_date = datetime.date.today()
+day = datetime.datetime.strftime(now_date, '%Y%m%d')
+diff_1 = datetime.timedelta(days=1)
+input_dt = datetime.datetime.strftime(now_date - diff_1, '%Y%m%d')
+print(input_dt)
+
+
+res_dir = '/root/ROVtrain/resdir'
+res_file = os.path.join(res_dir,'video_score_'+ day[-4:] +'.json')
+
+def getRovfeaturetable(dt):
+    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'videoods',
+                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
+                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
+
+    featureArray = []
+    for record in tqdm(odps.read_table('recommendVideoDistributionEfficiency', partition='dt=%s' % dt)):
+        valueFeature = {}
+        valueFeature['videoId'] = record['videoid']
+        valueFeature['today_all_return_person'] = record['today_all_return_person']
+        valueFeature['today_return_total_real']  = record['today_return_total_real']
+        valueFeature['videoview_today'] = record['videoview_today']
+        valueFeature['videoshare_today'] = record['videoshare_today']
+        valueFeature['videoplay_today'] = record['videoplay_today']
+        valueFeature['ruosoneweek'] = record['ruosoneweek']
+        valueFeature['ruos3day'] = record['ruos3day']
+        valueFeature['today_ruov_all'] = record['today_ruov_all']
+        
+        valueFeature['ctr_today'] = record['ctr_today']
+        valueFeature['today_first_return_person'] = record['today_first_return_person']
+        valueFeature['today_second_return_person'] = record['today_second_return_person']
+        valueFeature['today_third_return_person'] = record['today_third_return_person']
+        valueFeature['today_overfour_return_person'] = record['today_overfour_return_person']
+
+        valueFeature['score'] = record['a11']
+        valueFeature['dis_title'] = record['dis_title1']
+        valueFeature['title'] = record['title']
+        valueFeature['rovscore'] = record['rovscore']
+        featureArray.append(valueFeature)
+    featureArrayDF = pd.DataFrame(featureArray)
+    print('feature table finish')
+    return featureArrayDF
+
+raw_score = getRovfeaturetable(input_dt)
+
+
+###========================================= operations score-adjustment logic: begin ================================================
+rename_dict={'videoId':'videoId',
+ 'today_all_return_person':'今日曝光总回流人数',
+ 'today_return_total_real':'今日总回流_真实值',
+ 'videoview_today':'今日首页曝光次数',
+ 'videoshare_today':'今日首页分享次数',
+ 'videoplay_today':'今日首页播放次数',
+ 'ruosoneweek':'前7_1日曝光今日RUOV',
+ 'ruos3day':'前3_1日曝光今日RUOV',
+ 'today_ruov_all':'今日首页RUOV',
+ 'ctr_today':'今日首页CTR',
+ 'today_first_return_person':'今日曝光一层回流人数',
+ 'today_second_return_person':'今日曝光二层回流人数',
+ 'today_third_return_person':'今日曝光三层回流人数',
+ 'today_overfour_return_person':'今日曝光四加层回流人数',
+ 'score':'全局总分',
+ 'dis_title':'分发标题',
+ 'title':'标题',
+ 'rovscore':'rov分数'
+}
+
+raw_score.rename(columns=rename_dict,inplace=True)
+#raw_score.columns = ['今日首页CTR', '分发标题', 'rov分数', '前3_1日曝光今日RUOV','前7_1日曝光今日RUOV','全局总分','标题', '今日曝光总回流人数', '今日曝光一层回流人数','今日曝光四加层回流人数',
+#                    '今日总回流_真实值','今日首页RUOV','今日曝光二层回流人数', '今日曝光三层回流人数','videoId', '今日首页播放次数','今日首页分享次数',
+#                    '今日首页曝光次数']
+
+raw_score['二层/一层'] = raw_score.apply(lambda x:x['今日曝光二层回流人数'] / max(10,x['今日曝光一层回流人数']),axis=1)
+raw_score['三层/二层'] = raw_score.apply(lambda x:x['今日曝光三层回流人数'] / max(10,x['今日曝光二层回流人数']),axis=1)
+raw_score['四加层/三层'] = raw_score.apply(lambda x:x['今日曝光四加层回流人数'] / max(10,x['今日曝光三层回流人数']),axis=1)
+raw_score['回流比均值'] = (raw_score['二层/一层'] + raw_score['三层/二层'] + raw_score['四加层/三层']) / 3
+
+raw_score = raw_score[['videoId','分发标题','标题','今日首页曝光次数','今日首页播放次数',
+                       '今日首页分享次数','今日曝光总回流人数','今日总回流_真实值','前7_1日曝光今日RUOV',
+                       '前3_1日曝光今日RUOV','今日首页RUOV','今日首页CTR','今日曝光一层回流人数',
+                       '今日曝光二层回流人数', '今日曝光三层回流人数','今日曝光四加层回流人数','二层/一层',
+                       '三层/二层','四加层/三层','回流比均值',
+                       'rov分数','全局总分']]
+
+raw_score['新分数'] = raw_score.apply(lambda x: (x['今日曝光总回流人数']*2 + min(x['今日总回流_真实值'],x['今日首页曝光次数'])*0.3 + x['今日首页分享次数']*0.2 + x['今日首页播放次数']*0.02) / max(x['今日首页曝光次数'],1000) * np.log10(x['今日首页曝光次数']+10),axis=1)
+
+raw_score['旧排名'] = [i+1 for i in range(len(raw_score))]
+
+raw_score = raw_score.sort_values(by='新分数',ascending=False).reset_index(drop=True)
+
+raw_score['新排名'] = [i+1 for i in range(len(raw_score))]
+
+raw_score['排名差值'] = raw_score['旧排名'] - raw_score['新排名']
+
+raw_score['提升至'] =  raw_score.apply(lambda x: int(x['旧排名']- x['排名差值']/2 if x['排名差值']>=0 and x['旧排名']<=200 else 100+x['旧排名']%100),axis=1)
+raw_score = pd.merge(raw_score,raw_score[['旧排名','全局总分']],left_on='提升至',right_on='旧排名',how='left')
+
+raw_score.drop(['旧排名_y'],axis=1,inplace=True)
+raw_score.rename(columns={'全局总分_x':'全局总分','旧排名_x':'旧排名','全局总分_y':'提升分数至'},inplace=True)
+
+raw_score['降至'] =  raw_score.apply(lambda x: int(x['旧排名']- x['排名差值']/2 if(x['排名差值']<=0 and x['新排名']<=200) else 100+x['新排名']%100),axis=1)
+
+raw_score = pd.merge(raw_score,raw_score[['旧排名','全局总分']],left_on='降至',right_on='旧排名',how='left')
+raw_score.drop(['旧排名_y'],axis=1,inplace=True)
+
+raw_score.rename(columns={'全局总分_x':'全局总分','旧排名_x':'旧排名','全局总分_y':'降低分数至'},inplace=True)
+raw_score['一周ruov均值'] = (raw_score['前7_1日曝光今日RUOV'] + raw_score['今日首页RUOV'] + raw_score['前3_1日曝光今日RUOV']) / 3
+
+## select videos to boost
+add_score_video = raw_score[(raw_score['新排名']<=100)&(raw_score['排名差值']>20)]
+add_score_video['最终分数'] = add_score_video.apply(lambda x : x['提升分数至']  if x['排名差值']>20 else x['全局总分'],axis=1)
+
+## select videos to demote
+reduce_score_video = raw_score[(raw_score['排名差值']<-50)&(raw_score['旧排名']<=100)]
+reduce_score_video['最终分数'] = reduce_score_video['降低分数至']
+
+score_adjust_videos = pd.concat([add_score_video,reduce_score_video])
+raw_score = raw_score.merge(score_adjust_videos[['videoId','最终分数']],how='left')
+raw_score['最终分数'] = raw_score.apply(lambda x: x['最终分数'] if x['最终分数']>0 else x['全局总分'],axis=1)
+
+## progression logic for the new algorithm
+raw_score['最终分数'] = raw_score.apply(lambda x: x['最终分数']  + 10 if (x['一周ruov均值']>=0.045 and x['新排名']<=100)  else x['最终分数'],axis=1)
+
+## low-score viral (fission) videos
+raw_score['最终分数'] =  raw_score.apply(lambda x: x['最终分数']%10 + 30 if (x['今日首页CTR']>=0.04999 and x['回流比均值'] >= 0.3999 and x['最终分数']<=30 and x['旧排名']<=3000) else x['最终分数'],axis=1)
+
+###========================================= operations score-adjustment logic: end ================================================
+
+raw_score = raw_score[['videoId','最终分数','分发标题','标题']]
+raw_score.columns = ['videoId','score','dis_title','title']
+
+
+
+raw_score = raw_score[['videoId','score','dis_title','title']]
+raw_score = raw_score.dropna(axis=0,how='any')
+raw_score.drop_duplicates(subset=['videoId'],inplace=True) 
+# raw_score = raw_score.loc[raw_score['score'] >= 2.5]
+
+raw_score = raw_score.sort_values(by="score" , ascending=False).iloc[0:60000,:]
+
+raw_score.reset_index(inplace = True)
+
+# filter out foreign-language (non-Chinese) titles
+print(raw_score.shape)
+score_initial = pd.DataFrame([])
+for index in range(len(raw_score)):
+#     print(index)
+    title = raw_score.iloc[[index]]['title'][index]
+    dis_title = raw_score.iloc[[index]]['dis_title'][index]
+#     print(dis_title)
+    title_language = []
+    dis_title_language = []
+    if title == '' and  dis_title == '' :
+        score_initial = score_initial.append(raw_score.iloc[[index]])
+    else :
+        if len(title) > 0:
+            for index1 in range (len(title)):
+                title_lan_label = langid.classify(title[index1])[0]
+                title_language.append(title_lan_label)
+                if 'zh' in title_language:
+                    break
+        if len(dis_title) > 0:
+            for index2 in range(len(dis_title)):
+                dis_title_label = langid.classify(dis_title[index2])[0]
+                dis_title_language.append(dis_title_label)
+                if 'zh' in dis_title_language:
+                    break
+        if 'zh' in title_language or 'zh' in dis_title_language:
+            score_initial = score_initial.append(raw_score.iloc[[index]])
+# second pass: filter out non-standard characters
+score_initial.reset_index( inplace=True)
+score = pd.DataFrame([])
+for index in range(len(score_initial)):
+    title = score_initial.iloc[[index]]['title'][index]
+    dis_title = score_initial.iloc[[index]]['dis_title'][index]
+
+    if title == '' and  dis_title == '' :
+        score = score.append(score_initial.iloc[[index]])
+    else :
+        if 'ၼ' in title and dis_title == '':
+            continue
+        elif 'ၼ' in dis_title and title == '':
+            continue
+        else:
+            score = score.append(score_initial.iloc[[index]])
+        
+        
+score['videoId'] = score['videoId'].astype('int')
+
+score = score.sort_values(by="score" , ascending=False)
+score = score[['videoId','score']]
+### old-video retest injection (disabled)
+#old_video = pd.read_csv('/root/ROVtrain/readonlinetable/old_videoid_retest.csv')
+#step = (score.loc[49].score  - score.loc[79].score) / 30
+
+#old_video_df = old_video.rename(columns={'videoid':'videoId'}).head(30)
+
+
+#old_video_score = []
+#for i in range(len(old_video_df)):
+#    old_video_score.append(score.loc[49].score+step*(i+1))
+#old_video_df['score']  = old_video_score
+
+#score = pd.concat([score[~score.videoId.isin(old_video_df.videoId)],old_video_df])
+
+#score = score.sort_values(by="score" , ascending=False)
+
+
+#old_video[~old_video.videoid.isin(old_video_df.videoId)].to_csv('/root/ROVtrain/readonlinetable/old_videoid_retest.csv')
+#dt = datetime.datetime.strftime(datetime.date.today(), '%Y%m%d')
+#old_video_df.to_csv('/root/ROVtrain/readonlinetable/oldvideo/'+dt+'.csv')
+#score.to_csv('/root/ROVtrain/readonlinetable/oldvideo/score.csv')
+###
+
+score.to_json(
+    res_file, orient='records')
+
+with open(res_file,'r') as score_json:
+    data = json.load(score_json)
+score_df = pd.DataFrame(data)
+threshold = score_df.iloc[500,:]['score']
+
+# threshold
+
+def CalcMD5(filepath):
+    if not os.path.isfile(filepath):
+        return
+    with open(filepath,'rb') as f:
+        md5obj = hashlib.md5()
+        md5obj.update(f.read())
+        myhash = md5obj.hexdigest()
+    return myhash
+
+
+print(CalcMD5(res_file))
+
+
+metadata = {}
+metadata['rootDir'] = 'recommend'
+metadata['dataDir'] = 'data'
+metadata['modularName'] = 'model'
+metadata['modelName'] = 'rov'
+metadata['version'] = 'v1'
+metadata['date'] = day
+metadata['defaultRovScore'] = threshold
+metadata['itemRovName'] = 'video_score.json'
+metadata['videoScoreMd5'] = CalcMD5(res_file)
+meta_file = os.path.join(res_dir,'rov_success.json')
+with open(meta_file,'w') as f:
+    json.dump(metadata,f)
+    
+# 'LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc'
+#if rov_max > 5:
+''' 
+rov_max = max(score.score)
+if rov_max > 50:
+    auth = oss2.Auth('LTAI9EBa0bd5PrDa', 'vAalxds7YxhfOA2yVv8GziCg3Y87v5')
+    bucket = oss2.Bucket(auth, 'http://oss-cn-hangzhou-internal.aliyuncs.com', 'art-recommend')
+    bucket.put_object_from_file('recommend/metadata/rov/'+'rov_success.json', meta_file)
+    bucket.put_object_from_file('recommend/data/model/rov/v1/'+ day +'/video_score.json', res_file)
+    print('upload success')
+else:
+    print('error: rovscore under threshold')
+    sendemail()
+'''
+
+auth = oss2.Auth('LTAI9EBa0bd5PrDa', 'vAalxds7YxhfOA2yVv8GziCg3Y87v5')
+bucket = oss2.Bucket(auth, 'http://oss-cn-hangzhou-internal.aliyuncs.com', 'art-recommend')
+bucket.put_object_from_file('recommend/metadata/rov/'+'rov_success.json', meta_file)
+bucket.put_object_from_file('recommend/data/model/rov/v1/'+ day +'/video_score.json', res_file)
+print('upload success')
+# push the rov scores to Redis
+toRedis(res_file)
+print('rov score to redis success')
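Reviewer's note: the "new score" (新分数) formula applied above is easier to audit outside the DataFrame apply. A minimal sketch with illustrative inputs; the helper name new_score is not from this repo:

import numpy as np

def new_score(returns, returns_real, views, shares, plays):
    # exposure-normalized engagement, with views floored at 1000,
    # then scaled by log10 of the exposure volume
    numerator = (returns * 2 + min(returns_real, views) * 0.3
                 + shares * 0.2 + plays * 0.02)
    return numerator / max(views, 1000) * np.log10(views + 10)

# e.g. 500 returns, 800 real total returns, 20000 views, 300 shares, 5000 plays:
print(new_score(500, 800, 20000, 300, 5000))  # ≈ 0.30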

+ 252 - 0
nohup.out

@@ -0,0 +1,252 @@
+[LightGBM] [Warning] min_data_in_leaf is set=60, min_child_samples=30 will be ignored. Current value: min_data_in_leaf=60
+[LightGBM] [Warning] min_data_in_leaf is set=60, min_child_samples=30 will be ignored. Current value: min_data_in_leaf=60
+[LightGBM] [Warning] min_data_in_leaf is set=60, min_child_samples=30 will be ignored. Current value: min_data_in_leaf=60
+[LightGBM] [Warning] min_data_in_leaf is set=60, min_child_samples=30 will be ignored. Current value: min_data_in_leaf=60
+stage 1: time cost : 0.12420 sec
+stage 2: time cost : 0.12090 sec
+stage 3: time cost : 0.12970 sec
+stage 4: time cost : 0.13302 sec
+stage 1: time cost : 0.00954 sec
+stage 2: time cost : 0.00869 sec
+stage 3: time cost : 0.00879 sec
+stage 4: time cost : 0.00902 sec
+value features generated successfully
+videoid features generated successfully
+length tag_dict: 47609
+tag tfidf features generated successfully
+tag dimension: 59393
+length words_dict: 189467
+words tfidf features generated successfully
+words dimension: 59390
+folds 0
+Training until validation scores don't improve for 200 rounds
+[100]	training's rmse: 0.226424	valid_1's rmse: 0.225884
+[200]	training's rmse: 0.0973668	valid_1's rmse: 0.0971656
+[300]	training's rmse: 0.057936	valid_1's rmse: 0.0586671
+[400]	training's rmse: 0.0475353	valid_1's rmse: 0.0492602
+[500]	training's rmse: 0.0447344	valid_1's rmse: 0.0471132
+[600]	training's rmse: 0.0435196	valid_1's rmse: 0.0465598
+[700]	training's rmse: 0.0426755	valid_1's rmse: 0.0462999
+[800]	training's rmse: 0.0419673	valid_1's rmse: 0.0461619
+[900]	training's rmse: 0.0413599	valid_1's rmse: 0.0461027
+[1000]	training's rmse: 0.0408172	valid_1's rmse: 0.0460891
+[1100]	training's rmse: 0.0403146	valid_1's rmse: 0.0461019
+Early stopping, best iteration is:
+[977]	training's rmse: 0.0409576	valid_1's rmse: 0.046076
+folds 1
+Training until validation scores don't improve for 200 rounds
+[100]	training's rmse: 0.226544	valid_1's rmse: 0.225426
+[200]	training's rmse: 0.0971282	valid_1's rmse: 0.0981563
+[300]	training's rmse: 0.0573839	valid_1's rmse: 0.0604765
+[400]	training's rmse: 0.0468133	valid_1's rmse: 0.0513215
+[500]	training's rmse: 0.04392	valid_1's rmse: 0.0493315
+[600]	training's rmse: 0.0426995	valid_1's rmse: 0.0488075
+[700]	training's rmse: 0.0418527	valid_1's rmse: 0.0485998
+[800]	training's rmse: 0.0411682	valid_1's rmse: 0.0485575
+[900]	training's rmse: 0.040566	valid_1's rmse: 0.048484
+[1000]	training's rmse: 0.0400298	valid_1's rmse: 0.0484444
+[1100]	training's rmse: 0.0395374	valid_1's rmse: 0.0484197
+[1200]	training's rmse: 0.0390424	valid_1's rmse: 0.0484038
+[1300]	training's rmse: 0.0385941	valid_1's rmse: 0.0483951
+[1400]	training's rmse: 0.0381847	valid_1's rmse: 0.048402
+[1500]	training's rmse: 0.0377891	valid_1's rmse: 0.048371
+[1600]	training's rmse: 0.037364	valid_1's rmse: 0.0483954
+[1700]	training's rmse: 0.0370085	valid_1's rmse: 0.0483913
+Early stopping, best iteration is:
+[1502]	training's rmse: 0.0377785	valid_1's rmse: 0.0483691
+folds 2
+Training until validation scores don't improve for 200 rounds
+[100]	training's rmse: 0.227932	valid_1's rmse: 0.229577
+[200]	training's rmse: 0.100058	valid_1's rmse: 0.101833
+[300]	training's rmse: 0.0584845	valid_1's rmse: 0.060349
+[400]	training's rmse: 0.0480128	valid_1's rmse: 0.050103
+[500]	training's rmse: 0.0451386	valid_1's rmse: 0.0474865
+[600]	training's rmse: 0.0438463	valid_1's rmse: 0.0467042
+[700]	training's rmse: 0.0430083	valid_1's rmse: 0.0463218
+[800]	training's rmse: 0.0423034	valid_1's rmse: 0.0461414
+[900]	training's rmse: 0.0416902	valid_1's rmse: 0.0460446
+[1000]	training's rmse: 0.0411211	valid_1's rmse: 0.0460113
+[1100]	training's rmse: 0.0406169	valid_1's rmse: 0.0459762
+[1200]	training's rmse: 0.0401204	valid_1's rmse: 0.0459549
+[1300]	training's rmse: 0.0396321	valid_1's rmse: 0.0459738
+[1400]	training's rmse: 0.0392071	valid_1's rmse: 0.0459784
+Early stopping, best iteration is:
+[1206]	training's rmse: 0.0400926	valid_1's rmse: 0.0459515
+folds 3
+Training until validation scores don't improve for 200 rounds
+[100]	training's rmse: 0.226307	valid_1's rmse: 0.227556
+[200]	training's rmse: 0.0974512	valid_1's rmse: 0.098307
+[300]	training's rmse: 0.0581506	valid_1's rmse: 0.0590657
+[400]	training's rmse: 0.047742	valid_1's rmse: 0.048951
+[500]	training's rmse: 0.0448986	valid_1's rmse: 0.0465751
+[600]	training's rmse: 0.0436506	valid_1's rmse: 0.0459655
+[700]	training's rmse: 0.0427668	valid_1's rmse: 0.0457833
+[800]	training's rmse: 0.0421014	valid_1's rmse: 0.0456473
+[900]	training's rmse: 0.0414596	valid_1's rmse: 0.0455921
+[1000]	training's rmse: 0.0408817	valid_1's rmse: 0.0455654
+[1100]	training's rmse: 0.0403698	valid_1's rmse: 0.0455323
+[1200]	training's rmse: 0.0398803	valid_1's rmse: 0.0455336
+[1300]	training's rmse: 0.0394193	valid_1's rmse: 0.0455275
+Early stopping, best iteration is:
+[1164]	training's rmse: 0.0400544	valid_1's rmse: 0.0455123
+oof_rmse: 0.046490504968166625
+oof_mse: 0.0021613670521951254
+test_rmse: 0.10498955445882613
+test_mse: 0.011022806545462817
+oof_mape: [0.04896782]
+test_mape: [0.13560733]
+verification r2: 0.99351663488233
+test r2: 0.9630887068109539
+regre ranking shape (27116, 2)
+[LightGBM] [Warning] min_data_in_leaf is set=60, min_child_samples=30 will be ignored. Current value: min_data_in_leaf=60
+[LightGBM] [Warning] min_data_in_leaf is set=60, min_child_samples=30 will be ignored. Current value: min_data_in_leaf=60
+[LightGBM] [Warning] min_data_in_leaf is set=60, min_child_samples=30 will be ignored. Current value: min_data_in_leaf=60
+[LightGBM] [Warning] min_data_in_leaf is set=60, min_child_samples=30 will be ignored. Current value: min_data_in_leaf=60
+20210919 rov_feature_add_v1 feature table finish
+20210918 rov_feature_add_v1 feature table finish
+20210917 rov_feature_add_v1 feature table finish
+20210916 rov_feature_add_v1 feature table finish
+20210915 rov_feature_add_v1 feature table finish
+20210914 rov_feature_add_v1 feature table finish
+20210913 rov_feature_add_v1 feature table finish
+20210912 rov_feature_add_v1 feature table finish
+20210911 rov_feature_add_v1 feature table finish
+20210910 rov_feature_add_v1 feature table finish
+20210909 rov_feature_add_v1 feature table finish
+20210908 rov_feature_add_v1 feature table finish
+20210907 rov_feature_add_v1 feature table finish
+20210906 rov_feature_add_v1 feature table finish
+20210905 rov_feature_add_v1 feature table finish
+20210904 rov_feature_add_v1 feature table finish
+20210903 rov_feature_add_v1 feature table finish
+20210902 rov_feature_add_v1 feature table finish
+20210901 rov_feature_add_v1 feature table finish
+20210831 rov_feature_add_v1 feature table finish
+20210830 rov_feature_add_v1 feature table finish
+20210829 rov_feature_add_v1 feature table finish
+20210828 rov_feature_add_v1 feature table finish
+20210827 rov_feature_add_v1 feature table finish
+20210826 rov_feature_add_v1 feature table finish
+20210825 rov_feature_add_v1 feature table finish
+20210824 rov_feature_add_v1 feature table finish
+20210823 rov_feature_add_v1 feature table finish
+20210822 rov_feature_add_v1 feature table finish
+20210821 rov_feature_add_v1 feature table finish
+20210925 rov_predict_table_add_v1 feature table finish
+stage 1: time cost : 0.07594 sec
+stage 2: time cost : 0.07060 sec
+stage 3: time cost : 0.08065 sec
+stage 4: time cost : 0.08395 sec
+stage 1: time cost : 0.00955 sec
+stage 2: time cost : 0.00875 sec
+stage 3: time cost : 0.00866 sec
+stage 4: time cost : 0.00883 sec
+151
+121
+(776799, 121)
+151
+121
+(21060, 121)
+0        5.953243
+1        6.812345
+2        4.634729
+3        4.990433
+4        3.931826
+           ...   
+21055    0.000000
+21056    0.000000
+21057    0.000000
+21058    0.000000
+21059    0.000000
+Name: weighted_retrn_log, Length: 21060, dtype: float64
+value features generated successfully
+videoid features generated successfully
+length tag_dict: 47609
+tag tfidf features generated successfully
+tag dimension: 59393
+length words_dict: 189467
+words tfidf features generated successfully
+words dimension: 59390
+folds 0
+Training until validation scores don't improve for 200 rounds
+[100]	training's rmse: 0.217296	valid_1's rmse: 0.219055
+[200]	training's rmse: 0.0933997	valid_1's rmse: 0.0957921
+[300]	training's rmse: 0.054594	valid_1's rmse: 0.0577442
+[400]	training's rmse: 0.044312	valid_1's rmse: 0.0479771
+[500]	training's rmse: 0.0415086	valid_1's rmse: 0.0457105
+[600]	training's rmse: 0.0402481	valid_1's rmse: 0.0450285
+[700]	training's rmse: 0.0393805	valid_1's rmse: 0.0447715
+[800]	training's rmse: 0.0386728	valid_1's rmse: 0.0446799
+[900]	training's rmse: 0.0380495	valid_1's rmse: 0.0446337
+[1000]	training's rmse: 0.0374717	valid_1's rmse: 0.0446179
+[1100]	training's rmse: 0.0369606	valid_1's rmse: 0.0445955
+[1200]	training's rmse: 0.0364684	valid_1's rmse: 0.0445982
+[1300]	training's rmse: 0.0359899	valid_1's rmse: 0.0446047
+[1400]	training's rmse: 0.0355138	valid_1's rmse: 0.0446262
+Early stopping, best iteration is:
+[1220]	training's rmse: 0.0363808	valid_1's rmse: 0.0445906
+folds 1
+Training until validation scores don't improve for 200 rounds
+[100]	training's rmse: 0.215789	valid_1's rmse: 0.214513
+[200]	training's rmse: 0.0931099	valid_1's rmse: 0.092297
+[300]	training's rmse: 0.0546024	valid_1's rmse: 0.0552234
+[400]	training's rmse: 0.0443371	valid_1's rmse: 0.0464082
+[500]	training's rmse: 0.0414791	valid_1's rmse: 0.0446326
+[600]	training's rmse: 0.0402743	valid_1's rmse: 0.0442435
+[700]	training's rmse: 0.0394273	valid_1's rmse: 0.0440974
+[800]	training's rmse: 0.0387161	valid_1's rmse: 0.0440401
+[900]	training's rmse: 0.0380969	valid_1's rmse: 0.044004
+[1000]	training's rmse: 0.0375615	valid_1's rmse: 0.0439732
+[1100]	training's rmse: 0.0370498	valid_1's rmse: 0.0439087
+[1200]	training's rmse: 0.0365759	valid_1's rmse: 0.0438935
+[1300]	training's rmse: 0.036118	valid_1's rmse: 0.0439083
+Early stopping, best iteration is:
+[1172]	training's rmse: 0.036709	valid_1's rmse: 0.0438807
+folds 2
+Training until validation scores don't improve for 200 rounds
+[100]	training's rmse: 0.220769	valid_1's rmse: 0.220882
+[200]	training's rmse: 0.0971585	valid_1's rmse: 0.0977065
+[300]	training's rmse: 0.0570418	valid_1's rmse: 0.0587242
+[400]	training's rmse: 0.04556	valid_1's rmse: 0.0483838
+[500]	training's rmse: 0.0421882	valid_1's rmse: 0.0457825
+[600]	training's rmse: 0.0408069	valid_1's rmse: 0.0449982
+[700]	training's rmse: 0.0398654	valid_1's rmse: 0.0446595
+[800]	training's rmse: 0.0391188	valid_1's rmse: 0.0444782
+[900]	training's rmse: 0.0384418	valid_1's rmse: 0.0443802
+[1000]	training's rmse: 0.0378518	valid_1's rmse: 0.0443594
+[1100]	training's rmse: 0.037322	valid_1's rmse: 0.0443377
+[1200]	training's rmse: 0.036808	valid_1's rmse: 0.0443101
+[1300]	training's rmse: 0.0363635	valid_1's rmse: 0.0442829
+[1400]	training's rmse: 0.0359017	valid_1's rmse: 0.0442798
+[1500]	training's rmse: 0.0354922	valid_1's rmse: 0.0443043
+Early stopping, best iteration is:
+[1324]	training's rmse: 0.036253	valid_1's rmse: 0.0442693
+folds 3
+Training until validation scores don't improve for 200 rounds
+[100]	training's rmse: 0.215786	valid_1's rmse: 0.216227
+[200]	training's rmse: 0.0932123	valid_1's rmse: 0.094359
+[300]	training's rmse: 0.0548723	valid_1's rmse: 0.0562601
+[400]	training's rmse: 0.0448026	valid_1's rmse: 0.0462832
+[500]	training's rmse: 0.0420321	valid_1's rmse: 0.0437981
+[600]	training's rmse: 0.0408233	valid_1's rmse: 0.0429686
+[700]	training's rmse: 0.0400402	valid_1's rmse: 0.042586
+[800]	training's rmse: 0.0393215	valid_1's rmse: 0.0424025
+[900]	training's rmse: 0.0386931	valid_1's rmse: 0.0422996
+[1000]	training's rmse: 0.0381022	valid_1's rmse: 0.0422256
+[1100]	training's rmse: 0.0375841	valid_1's rmse: 0.0421743
+[1200]	training's rmse: 0.0370754	valid_1's rmse: 0.042115
+[1300]	training's rmse: 0.0365871	valid_1's rmse: 0.0420674
+[1400]	training's rmse: 0.0361483	valid_1's rmse: 0.0420477
+[1500]	training's rmse: 0.0356979	valid_1's rmse: 0.0420595
+[1600]	training's rmse: 0.0352963	valid_1's rmse: 0.0420646
+Early stopping, best iteration is:
+[1430]	training's rmse: 0.0360223	valid_1's rmse: 0.042041
+oof_rmse: 0.04370656860219149
+oof_mse: 0.0019102641389780713
+test_rmse: 0.030188856957077185
+test_mse: 0.0009113670843748676
+oof_mape: [0.04676871]
+test_mape: [0.13009724]
+verification r2: 0.9937893006007936
+test r2: 0.9634331570023774
+regre ranking shape (21060, 2)
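The repeated warnings at the top of this log come from the param dict in rov_train2.py, which sets both min_data_in_leaf and its LightGBM alias min_child_samples; LightGBM keeps the former and ignores the latter, exactly as the warning says. Setting only one would silence them. A sketch of the relevant part of the dict, not the full config:

param = {
    'num_leaves': 18,
    'min_data_in_leaf': 60,  # 'min_child_samples' is an alias of this; set only one
    'objective': 'regression',
    'learning_rate': 0.01,
    'metric': 'rmse',
}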

+ 7 - 0
process_feature.py

@@ -240,6 +240,13 @@ features = ['day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount
             'day3sharecount_dif_day1sharecount', 'day3viewcount_divide_day1viewcount', 
             'day3viewcount_dif_day1viewcount']
 
+
+def filter_recent_features():
+    print(len(features))
+    res = [f for f in features if (f.find('30') == -1 and f.find('60') == -1)]
+    print(len(res))
+    return res
+
 def cal_feature(df):
     start = time.time()
     for i in range(len(root_page_1day)):
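filter_recent_features() keeps only feature names containing neither '30' nor '60', i.e. it drops the 30- and 60-day aggregates; f.find('30') == -1 is equivalent to the more idiomatic '30' not in f. A quick sketch with hypothetical feature names:

features = ['day1playcount', 'day30playcount', 'day60returncount', 'day3sharecount']
res = [f for f in features if '30' not in f and '60' not in f]
print(res)  # ['day1playcount', 'day3sharecount']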

+ 9 - 8
rov_train.py

@@ -56,7 +56,6 @@ def getdatasample(date, max_range, table):
         delta = datetime.timedelta(days=i)
         tar_dt = new_date - delta
         datelist.append(tar_dt.strftime("%Y%m%d"))
-    print(datelist)
     for tm in datelist:
         testlist.append(getRovfeaturetable(tm, table))
     testdata = pd.concat(testlist)
@@ -101,7 +100,9 @@ def dataprepare(df_pre):
     #  feed the features in directly; no cross features are added.
     # whether to zero-fill the data
     df_pre = df_pre.fillna(0)
-    df_new_feature = df_pre[process_feature.features]
+    #df_new_feature = df_pre[process_feature.features]
+    df_new_feature = df_pre[process_feature.filter_recent_features()]
+    print(df_new_feature.shape)
     df_target = df_pre['weighted_retrn_log']
     df_new_feature = pd.concat([df_new_feature, df_pre[process_feature.cate_feat],df_pre[process_feature.one_hot_feature]], axis=1)
     return df_new_feature, df_target
@@ -149,22 +150,21 @@ def process_train_predict_data():
     train_dt = now_date - diff_5
     train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')
     #read data from ali
-    #train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
-    #predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
+    train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
+    predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
     #pickle for test
     import _pickle as cPickle
-    ''''
     with open('train_data.pickle','wb') as output_file:
         cPickle.dump(train_data, output_file)
     with open('predict_data.pickle','wb') as output_file:
         cPickle.dump(predict_data, output_file) 
-    exit()
-    '''
     #with open(r"train_data.pickle", "rb") as input_file:
+    '''
     with open(r"train_data.pickle", "rb") as input_file:
         train_data = cPickle.load(input_file)    
     with open(r"predict_data.pickle", "rb") as input_file:
         predict_data = cPickle.load(input_file)       
+    '''
     #end pickle
     train_data = basic_cal(train_data)
     predict_data = basic_cal(predict_data)
@@ -186,6 +186,7 @@ def process_train_predict_data():
 
     df_new_feature,df_target= dataprepare(train_data)
     df_new_feature_predict, df_target_predict = dataprepare(predict_data)
+    print(df_target_predict)
 
     df_new_feature_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature).loc[:,'day1playcount':'videocategory555']))
     df_new_feature_predict_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature_predict).loc[:,'day1playcount':'videocategory555']))
@@ -330,7 +331,7 @@ def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feat
     sub_df_ = pd.DataFrame({"videoid": predict_data["videoid"].values})
     sub_df_['score'] = predictions
     print('regre ranking shape', sub_df_.shape)
-
+    sub_df_.to_csv('result.csv')
 
 if __name__ == '__main__':
     train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict = process_train_predict_data()
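This hunk flips the pickle cache by hand: the ODPS pulls are re-enabled, their results are dumped to pickle, and the reload block is commented out. A sketch of the same toggle behind a single flag; USE_CACHE and load_or_fetch are illustrative, not in the repo:

import os
import _pickle as cPickle

USE_CACHE = False  # True: reuse the last dump instead of querying ODPS

def load_or_fetch(path, fetch):
    # reload a cached pickle if allowed, otherwise fetch and cache it
    if USE_CACHE and os.path.exists(path):
        with open(path, 'rb') as f:
            return cPickle.load(f)
    data = fetch()
    with open(path, 'wb') as f:
        cPickle.dump(data, f)
    return data

# train_data = load_or_fetch('train_data.pickle',
#                            lambda: getdatasample(train_day, 30, 'rov_feature_add_v1'))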

+ 311 - 0
rov_train2.py

@@ -0,0 +1,311 @@
+import warnings
+
+warnings.filterwarnings("ignore")
+from sklearn.metrics import r2_score
+import os
+import pandas as pd
+import gc
+import math
+import numpy as np
+import time
+from sklearn.linear_model import SGDRegressor
+from sklearn.linear_model import SGDClassifier
+import lightgbm as lgb
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import StratifiedKFold
+from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn import metrics
+import pickle
+from sklearn.metrics import mean_squared_error
+import seaborn as sns
+import matplotlib.pylab as plt
+from odps import ODPS
+from odps.df import DataFrame as odpsdf
+from datetime import datetime as dt
+import datetime
+from scipy import sparse
+from scipy.sparse import hstack
+
+import process_feature
+import process_tag
+
+
+def getRovfeaturetable(dt, table):
+    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
+                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
+                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
+
+    featureArray = []
+    for record in odps.read_table(table, partition='dt=%s' % dt):
+        valueFeature = {}
+        for i in process_feature.featurename:
+            if i == 'dt':
+                valueFeature[i] = dt
+            else:
+                valueFeature[i] = record[i]
+        featureArray.append(valueFeature)
+    featureArray = pd.DataFrame(featureArray)
+    print(dt, table, 'feature table finish')
+    return featureArray
+
+def getdatasample(date, max_range, table):
+    new_date = dt.strptime(date, '%Y%m%d')
+    datelist = []
+    testlist = []
+    for i in range(0, max_range):
+        delta = datetime.timedelta(days=i)
+        tar_dt = new_date - delta
+        datelist.append(tar_dt.strftime("%Y%m%d"))
+    for tm in datelist:
+        testlist.append(getRovfeaturetable(tm, table))
+    testdata = pd.concat(testlist)
+    testdata.reset_index(inplace=True)
+    testdata = testdata.drop(axis=1, columns='index')
+    return testdata
+
+def select_recent_video(df):
+    """对每一个视频添加row number,按照日期排序,最后选取最近的那一天"""
+    df['dt'] = df['dt'].astype(int)
+    df['rk'] = df['dt'].groupby(df['videoid']).rank(ascending=0, method='first')
+    df = df[df['rk'] == 1]
+    return df
+
+def basic_cal(df):
+    df['weighted_retrn'] = df['futre7dayreturn'].astype('int') 
+    df['weighted_retrn_log'] = df.apply(lambda x: np.log(x['weighted_retrn'] + 1),axis=1)
+    df['return_back'] = df.apply(lambda x:1 if x['weighted_retrn']> 0 else 0,axis=1)
+    return df 
+
+def dataprepare(df_pre):
+    #  feed the features in directly; no cross features are added.
+    df_pre = df_pre.fillna(0)
+    #df_new_feature = df_pre[process_feature.features]
+    df_new_feature = df_pre[process_feature.filter_recent_features()]
+    df_target = df_pre['weighted_retrn_log']
+    df_new_feature = pd.concat([df_new_feature, df_pre[process_feature.cate_feat],df_pre[process_feature.one_hot_feature]], axis=1)
+    return df_new_feature, df_target
+
+
+def featureImportance(fold1_df,fold2_df,fold3_df,fold4_df,values_lenth,video_id_lenth,tag_length,word_length):
+    Feature_Data= pd.DataFrame()
+    for df in (fold1_df,fold2_df,fold3_df,fold4_df):
+        fold1_df1 = df.iloc[0:values_lenth,:]
+        videoid_fold1_importance = df.iloc[values_lenth:values_lenth+video_id_lenth,:]['importance'].sum()
+        fold1_df2 = pd.DataFrame([{'Feature':'videoid','importance':videoid_fold1_importance,'fold':1}])
+        tag_fold1_importance = df.iloc[values_lenth+video_id_lenth:values_lenth+video_id_lenth+tag_length,:]['importance'].sum()
+        fold1_df3 = pd.DataFrame([{'Feature':'tags','importance':tag_fold1_importance,'fold':1}])
+        words_fold1_importance = df.iloc[values_lenth+video_id_lenth+tag_length:values_lenth+video_id_lenth+tag_length+word_length,:]['importance'].sum()
+        fold1_df4 = pd.DataFrame([{'Feature':'words','importance':words_fold1_importance,'fold':1}])
+        
+        
+        Feature_Data = pd.concat([Feature_Data,fold1_df1,fold1_df2,fold1_df3,fold1_df4])
+        
+    return Feature_Data
+
+
+def MAPE(true, pred):
+    true = np.array(true)
+    sum_ = 0
+    count = 0
+    for i in range(len(true)):
+        if true[i] != 0:
+            sum_ = sum_ + np.abs(true[i] - pred[i]) / true[i]
+            count = count + 1
+        else:
+            continue
+
+    return sum_ / count
+
+
+def process_train_predict_data():
+    now_date = datetime.date.today() 
+    # day = datetime.datetime.strftime(now_date, '%Y%m%d')
+    diff_1 = datetime.timedelta(days=1)
+    diff_5 = datetime.timedelta(days=7)
+    predict_dt = now_date - diff_1
+    predict_day = datetime.datetime.strftime(predict_dt, '%Y%m%d')
+    train_dt = now_date - diff_5
+    train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')
+    #read data from ali
+    train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
+    predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
+    #pickle for test
+    import _pickle as cPickle
+    with open('train_data.pickle','wb') as output_file:
+        cPickle.dump(train_data, output_file)
+    with open('predict_data.pickle','wb') as output_file:
+        cPickle.dump(predict_data, output_file) 
+    #with open(r"train_data.pickle", "rb") as input_file:
+    '''
+    with open(r"train_data.pickle", "rb") as input_file:
+        train_data = cPickle.load(input_file)    
+    with open(r"predict_data.pickle", "rb") as input_file:
+        predict_data = cPickle.load(input_file)       
+    '''
+    #end pickle
+    train_data = basic_cal(train_data)
+    predict_data = basic_cal(predict_data)
+
+    predict_data = select_recent_video(predict_data)
+    #predict_data.loc[predict_data['dt'] != int(predict_day), 'futre7dayreturn'] = 0
+    predict_data = predict_data.drop(axis=1, columns='rk')
+
+    train_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
+    predict_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
+
+    train_data = train_data.fillna(0)
+    predict_data = predict_data.fillna(0)
+    train_data = process_feature.cal_feature(train_data)
+    predict_data = process_feature.cal_feature(predict_data)
+
+    predict_data['videoid'] = predict_data['videoid'].astype('int')
+
+    df_new_feature,df_target= dataprepare(train_data)
+    df_new_feature_predict, df_target_predict = dataprepare(predict_data)
+
+    df_new_feature_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature).loc[:,'day1playcount':'videocategory555']))
+    df_new_feature_predict_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature_predict).loc[:,'day1playcount':'videocategory555']))
+
+    print('value features generated successfully')
+
+    train_videoid = pd.DataFrame(df_new_feature).loc[:,'videoid']
+    predict_videoid = pd.DataFrame(df_new_feature_predict).loc[:,'videoid']
+
+    train_videoid_list = pd.DataFrame(df_new_feature).loc[:,'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature).loc[:,'videoid']),1).tolist()
+    predict_videoid_list = pd.DataFrame(df_new_feature_predict).loc[:,'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature_predict).loc[:,'videoid']),1).tolist()
+
+
+    allvideo_raw = list(set(np.array(pd.concat([train_videoid,predict_videoid])).tolist()))
+    allvideo = np.array(allvideo_raw).reshape(len(allvideo_raw),1).tolist()
+    
+
+    mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
+    train_videoid = mlb_model_videoid.transform(train_videoid_list)
+    predict_videoid = mlb_model_videoid.transform(predict_videoid_list)
+
+    print('videoid features generated successfully')
+
+    # get the tag one-hot encoding
+    tags ,train_tag,predict_tag = process_tag.tag_preprocessing('tag', df_new_feature, df_new_feature_predict)
+    # get the tag tf-idf weights
+    tag_dict = process_tag.get_tag_tfidf('20200305','video_tag_tf_idf')
+    print('length tag_dict:',len(tag_dict))
+    # build the tag tf-idf sparse matrix
+    tag_corpus = tags.tolist()  # corpus
+    tag_tfidf_list = process_tag.ttfidf_list_generation(tag_corpus,tag_dict )
+    tag_tf_idf_matrix  = sparse.csr_matrix(np.array(tag_tfidf_list))
+
+    tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)
+    tag_feature_test = predict_tag.multiply(tag_tf_idf_matrix)
+    print('tag tfidf features generated successfully')
+    print('tag dimension:', len(tag_tfidf_list))
+
+    # get the word values without tags
+    words ,train_words,test_words = process_tag.tag_preprocessing('words_no_tag', df_new_feature, df_new_feature_predict)
+    # get the words tf-idf weights
+    words_dict = process_tag.get_tag_tfidf('20200305','video_words_without_tags_tfidf')
+    print('length words_dict:',len(words_dict))
+    # build the words tf-idf sparse matrix
+    words_corpus = words.tolist()  # corpus
+    words_tfidf_list = process_tag.ttfidf_list_generation(words_corpus,words_dict )
+    words_tf_idf_matrix  = sparse.csr_matrix(np.array(words_tfidf_list))
+    words_feature_train = train_words.multiply(words_tf_idf_matrix)
+    words_feature_test = test_words.multiply(words_tf_idf_matrix)
+    print('words tfidf features generated successfully')
+    print('words dimension:', len(words_tfidf_list))
+
+    df_new_feature = hstack([df_new_feature_part_one,train_videoid,tag_feature_train, words_feature_train])
+    df_new_feature_predict = hstack([df_new_feature_predict_part_one,predict_videoid,tag_feature_test,words_feature_test])
+    return train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict 
+
+
+def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict):
+
+    #target
+    df_target_predict = sparse.csr_matrix(pd.DataFrame(df_target_predict).values).toarray()
+    df_target = sparse.csr_matrix(pd.DataFrame(df_target).values).toarray()
+
+
+    param = {'num_leaves': 18,
+         'min_data_in_leaf': 60,
+         'objective': 'regression',
+         'max_depth': -1,
+         'learning_rate': 0.01,
+         "min_child_samples": 30,
+         "boosting": "gbdt",
+         "feature_fraction": 0.8,
+         "bagging_freq": 1,
+         "bagging_fraction": 0.8,
+         "bagging_seed": 11,
+         "metric": 'rmse',
+         "lambda_l1": 0.1,
+         "verbosity": -1,
+         "nthread": 4,
+         #          'max_bin': 512,
+         "random_state": 4590}
+
+    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
+    #oof = np.zeros(len(pd.DataFrame(df_new_feature.toarray())))
+    oof = np.zeros(len(df_target))
+    predictions = np.zeros(len(df_target_predict))
+    feature_importance_df = pd.DataFrame()
+
+
+    # values_lenth = len(process_feature.features + process_feature.cate_feat)
+    # video_id_lenth = len(mlb_model_videoid.classes_)
+    # tag_length = len(tag_tfidf_list)
+    # word_length = len(words_tfidf_list)
+
+    change_view = pd.DataFrame(pd.DataFrame(df_new_feature_predict.toarray()))
+    change_view = change_view.sort_index()  
+
+    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, train_data['return_back'].values)):
+        print("folds {}".format(fold_))
+        trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx,:], label=pd.DataFrame(df_target).iloc[trn_idx])
+        val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx,:], label=pd.DataFrame(df_target).iloc[val_idx])
+
+        num_round = 10000
+        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100,
+                early_stopping_rounds=200)
+        oof[val_idx] = clf.predict(df_new_feature.tocsr()[val_idx,:], num_iteration=clf.best_iteration)
+        predictions += clf.predict(df_new_feature_predict, num_iteration=clf.best_iteration) / folds.n_splits
+
+        fold_importance_df = pd.DataFrame()
+        
+        # column = process_feature.features+process_feature.cate_feat+mlb_model_videoid.classes_.tolist()+ tag_corpus + words_corpus
+        # fold_importance_df["Feature"] = np.array(column)
+        
+        # fold_importance_df["importance"] = clf.feature_importance()
+        # fold_importance_df["fold"] = fold_ + 1
+        # feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
+
+
+    # fold1_df = feature_importance_df.loc[feature_importance_df['fold']==1]
+    # fold2_df = feature_importance_df.loc[feature_importance_df['fold']==2]
+    # fold3_df = feature_importance_df.loc[feature_importance_df['fold']==3]
+    # fold4_df = feature_importance_df.loc[feature_importance_df['fold']==4]
+
+
+    # feature_importance_df = featureImportance(fold1_df,fold2_df,fold3_df,fold4_df,values_lenth,video_id_lenth,tag_length,word_length)
+
+    print('oof_rmse:', np.sqrt(mean_squared_error(df_target, oof)))
+    print('oof_mse:', mean_squared_error(df_target, oof))
+
+    print('test_rmse:', np.sqrt(mean_squared_error(df_target_predict, predictions)))
+    print('test_mse:', mean_squared_error(df_target_predict, predictions))
+
+
+    print('oof_mape:', MAPE(df_target, oof))
+    print('test_mape:', MAPE(df_target_predict, predictions))
+
+    print('verification r2:', r2_score(df_target, oof))
+    print('test r2:', r2_score(df_target_predict, predictions))
+
+    sub_df_ = pd.DataFrame({"videoid": predict_data["videoid"].values})
+    sub_df_['score'] = predictions
+    print('regre ranking shape', sub_df_.shape)
+    sub_df_.to_csv('result.csv')
+
+if __name__ == '__main__':
+    train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict = process_train_predict_data()
+    do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict)
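do_train() stratifies a regression problem: folds.split() is fed train_data['return_back'], the binary zero/non-zero return flag, so every fold keeps the same share of videos with no returns. A self-contained sketch with synthetic data; shapes and parameters are illustrative:

import numpy as np
from sklearn.model_selection import StratifiedKFold

y = np.concatenate([np.zeros(80), np.random.rand(20) * 5])  # 80% zero-return targets
X = np.random.rand(100, 4)
strata = (y > 0).astype(int)  # binary flag used only for stratification

folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, strata)):
    print(fold_, strata[val_idx].mean())  # ≈ 0.2 non-zero share in every fold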

+ 6 - 0
sort.py

@@ -0,0 +1,6 @@
+import pandas as pd
+
+df = pd.read_csv('result.csv')
+df2 = df[df['score']>0.001]  # drop near-zero scores
+df2 = df2.sort_values('score', axis=0, ascending=False)  # rank high to low
+df2.to_csv('result2.csv')
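As the diff stands, sort.py post-processes the result.csv written by do_train(): keep scores above 0.001, rank descending, write result2.csv. An equivalent one-liner sketch, same file names as in the repo:

import pandas as pd
pd.read_csv('result.csv').query('score > 0.001') \
  .sort_values('score', ascending=False).to_csv('result2.csv')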