#!/usr/bin/env python
# coding: utf-8
# In[2]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import r2_score
import os
import pandas as pd
import gc
import math
import numpy as np
import time
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import SGDClassifier
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import pickle
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pylab as plt
from odps import ODPS
from odps.df import DataFrame as odpsdf
from datetime import datetime as dt
import datetime
# In[3]:
now_date = datetime.date.today()
# day = datetime.datetime.strftime(now_date, '%Y%m%d')
diff_1 = datetime.timedelta(days=1)
diff_5 = datetime.timedelta(days=7)  # note: a 7-day offset despite the name; it anchors the training date
input_dt = now_date - diff_1
input_day = datetime.datetime.strftime(input_dt, '%Y%m%d')
now_day = datetime.datetime.strftime(now_date, '%Y%m%d')
train_dt = now_date - diff_5
train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')
# In[4]:
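# Feature-name definitions used throughout the script. `add_feature` lists the newly
# added return-flow / ratio columns (the trailing #numbers mark feature groups),
# `featurename` lists the base schema columns read from ODPS, and the two are
# concatenated together with the text columns into the full column list below.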
add_feature = [
    'all_return_day1_return_count',  # -- total returns within 1/3/7/14 days #12
    'all_return_day3_return_count',
    'all_return_day7_return_count',
    'all_return_day14_return_count',
    'three_return_day1_return_count',  # -- returns from the first three share layers within 1/3/7/14 days #14
    'three_return_day3_return_count',
    'three_return_day7_return_count',
    'three_return_day14_return_count',
    'four_up_return_day1_return_count',  # -- returns from layer four and deeper within 1/3/7/14 days #15
    'four_up_return_day3_return_count',
    'four_up_return_day7_return_count',
    'four_up_return_day14_return_count',
    'one_return_day1_return_count',  # -- first-layer returns within 1/3/7/14 days #13
    'one_return_day3_return_count',
    'one_return_day7_return_count',
    'one_return_day14_return_count',
    'four_up_return_div_three_return_day1',  # -- layer-4+ returns / first-three-layer returns within 1/3/7/14 days #23
    'four_up_return_div_three_return_day3',
    'four_up_return_div_three_return_day7',
    'four_up_return_div_three_return_day14',
    'all_return_day1_view_day1_return_count',  # -- returns within 1/3/7/14 days from impressions within 1/3/7/14 days #8
    'all_return_day3_view_day3_return_count',
    'all_return_day7_view_day7_return_count',
    'all_return_day14_view_day14_return_count',
    'three_return_day1_view_day1_return_count',  # -- first-three-layer returns within 1/3/7/14 days from impressions within 1/3/7/14 days #10
    'three_return_day3_view_day3_return_count',
    'three_return_day7_view_day7_return_count',
    'three_return_day14_view_day14_return_count',
    'four_up_return_day1_view_day1_return_count',  # -- layer-4+ returns within 1/3/7/14 days from impressions within 1/3/7/14 days #11
    'four_up_return_day3_view_day3_return_count',
    'four_up_return_day7_view_day7_return_count',
    'four_up_return_day14_view_day14_return_count',
    'one_return_day1_view_day1_return_count',  # -- first-layer returns within 1/3/7/14 days from impressions within 1/3/7/14 days #9
    'one_return_day3_view_day3_return_count',
    'one_return_day7_view_day7_return_count',
    'one_return_day14_view_day14_return_count',
    'all_return_day1_on_day1_return_count',  # -- total returns on day1 from impressions between day1+1/day1+3/day1+7/day1+14 days ago and day1+1 days ago #16
    'all_return_day3_on_day1_return_count',
    'all_return_day7_on_day1_return_count',
    'all_return_day14_on_day1_return_count',
    'four_up_return_day1_view_day1_return_div_three_d1',  # -- (layer-4+ returns / first-three-layer returns) within 1/3/7/14 days from impressions within 1/3/7/14 days #22
    'four_up_return_day3_view_day3_return_div_three_d3',
    'four_up_return_day7_view_day7_return_div_three_d7',
    'four_up_return_day14_view_day14_return_div_three_d14',
    'day1ctr',  # -- plays / impressions within 1/3/7/14/30/60 days #17
    'day3ctr',
    'day7ctr',
    'day14ctr',
    'day30ctr',
    'day60ctr',
    'day1sov',  # -- shares / impressions within 1/3/7/14/30/60 days #18
    'day3sov',
    'day7sov',
    'day14sov',
    'day30sov',
    'day60sov',
    'day1rov',  # -- returns from impressions within 1/3/7/14 days / impressions #19
    'day3rov',
    'day7rov',
    'day14rov',
    'day1soc',  # -- shares / plays within 1/3/7/14/30/60 days #20
    'day3soc',
    'day7soc',
    'day14soc',
    'day30soc',
    'day60soc',
    'day1roc',  # -- returns from impressions within 1/3/7/14 days / plays #21
    'day3roc',
    'day7roc',
    'day14roc',
    'oneday_day1rov',  # -- today's returns from impressions within 1/3/7/14 days / impressions within 1/3/7/14 days #24
    'oneday_day3rov',
    'oneday_day7rov',
    'oneday_day14rov',
    'futre7dayreturn',
    'todyviewcount_rank',
    'day1viewcount_rank'
]
featurename = [
    'dt',
    'videoid',
    'day1playcount',
    'day1returncount',
    'day1sharecount',
    'day1viewcount',
    'day14playcount',
    'day14returncount',
    'day14sharecount',
    'day14viewcount',
    'day30playcount',
    'day30returncount',
    'day30sharecount',
    'day30viewcount',
    'day3playcount',
    'day3returncount',
    'day3sharecount',
    'day3viewcount',
    'day60playcount',
    'day60returncount',
    'day60sharecount',
    'day60viewcount',
    'day7playcount',
    'day7returncount',
    'day7sharecount',
    'day7viewcount',
    'videocategory11',
    'videocategory12',
    'videocategory45',
    'videocategory49',
    'videocategory1',
    'videocategory2',
    'videocategory3',
    'videocategory4',
    'videocategory5',
    'videocategory6',
    'videocategory7',
    'videocategory8',
    'videocategory9',
    'videocategory85',
    'videocategory10',
    'videocategory555',
    'usercategory1',
    'usercategory2',
    'usercategory3',
    'usercategory4',
    'usercategory5',
    'usercategory6',
    'usercategory7',
    'usercategory8',
    'usercategory9',
    'usercategory10',
    'usercategory11',
    'usercategory12',
    'usercategory45',
    'usercategory49',
    'usercategory85',
    'usercategory555',
    'todyviewcount',
    'day5returncount_1_stage',
    'day5returncount_2_stage',
    'day5returncount_3_stage',
    'day5returncount_4_stage',
    'stage_one_retrn',
    'stage_two_retrn',
    'stage_three_retrn',
    'stage_four_retrn']
words = ['videotags', 'words_without_tags']
featurename = featurename + add_feature + words
print(len(featurename))
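# Data loading: the helpers below read one daily partition at a time from the ODPS
# tables rov_feature_add_v1 (training features) and rov_predict_table_add_v1
# (prediction candidates); getrainingdata / getestingdata then stack 30 days and
# 1 day of partitions respectively into pandas DataFrames.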
# In[5]:
def getRovfeaturetable(dt):
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000,
                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
    featureArray = []
    for record in odps.read_table('rov_feature_add_v1', partition='dt=%s' % dt):
        valueFeature = {}
        for i in featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, 'feature table finish')
    return featureArray


def getRovtestable(dt):
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000,
                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
    featureArray = []
    for record in odps.read_table('rov_predict_table_add_v1', partition='dt=%s' % dt):
        valueFeature = {}
        for i in featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, 'test table finish')
    return featureArray


def getestingdata(date):
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, 1):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    print(datelist)
    for tm in datelist:
        testlist.append(getRovtestable(tm))
    testdata = pd.concat(testlist)
    testdata.reset_index(inplace=True)
    testdata = testdata.drop(axis=1, columns='index')
    return testdata


def getrainingdata(date):
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    trainlist = []
    for i in range(0, 30):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    print(datelist)
    for tm in datelist:
        trainlist.append(getRovfeaturetable(tm))
    traindata = pd.concat(trainlist)
    traindata.reset_index(inplace=True)
    traindata = traindata.drop(axis=1, columns='index')
    return traindata


traindata = getrainingdata(train_day)
data_test_ori_rk = getestingdata(input_day)
# In[6]:
def select_recent_video(df):
    """Add a row number per video, ordered by date descending, and keep only the most recent day."""
    df['dt'] = df['dt'].astype(int)
    df['rk'] = df['dt'].groupby(df['videoid']).rank(ascending=0, method='first')
    df = df[df['rk'] == 1]
    return df


data_test_ori = select_recent_video(data_test_ori_rk)
data_test_ori.loc[data_test_ori['dt'] != int(input_day), 'futre7dayreturn'] = 0
data_test_ori = data_test_ori.drop(axis=1, columns='rk')
# In[7]:
## Deduplicate so that each video has exactly one row per day.
traindata.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
data_test_ori.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
# In[8]:
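# Target construction: weighted_retrn is the raw 7-day future return count,
# weighted_retrn_log = log(1 + weighted_retrn) is the regression target, and
# return_back is a 0/1 flag (any future return) used later to stratify the CV folds.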
def basic_cal(df):
    # df['weighted_retrn'] = df['stage_one_retrn'].astype('int')*0.4 + \
    #                        df['stage_two_retrn'].astype('int')*0.3 + \
    #                        df['stage_three_retrn'].astype('int')*0.3
    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
    # df['weighted_retrn'] = df['futr5returncount'].astype('int')
    # if day1viewcount is zero, rov and rov_log fall back to zero
    df['weighted_retrn_log'] = df.apply(lambda x: np.log(x['weighted_retrn'] + 1), axis=1)
    # df['rov'] = df.apply(lambda x: x['weighted_retrn'] / x['todyviewcount'] if x['todyviewcount'] != 0 else 0, axis=1)
    # df['rov_log'] = df.apply(lambda x: np.log(x['rov'] + 1), axis=1)
    # thresh = np.percentile(df[df['weighted_retrn'] > 0]['weighted_retrn'], 50)
    ## label is 1 when the return count exceeds the threshold; videos with no shares, or with shares but zero returns, are labelled 0
    df['return_back'] = df.apply(lambda x: 1 if x['weighted_retrn'] > 0 else 0, axis=1)
    return df


data_train = basic_cal(traindata)
data_test = basic_cal(data_test_ori)
# In[9]:
def today_view_category(df):
    ### Bucket today's exposure by day1viewcount_rank and fill todyviewcount with the bucket mean (category-style feature)
    data_test1_view1 = df.loc[df['day1viewcount_rank'] > 10000]['day1viewcount'].mean()
    data_test1_view2 = df.loc[(df['day1viewcount_rank'] > 3000) & (df['day1viewcount_rank'] <= 10000)]['day1viewcount'].mean()
    data_test1_view3 = df.loc[(df['day1viewcount_rank'] > 1000) & (df['day1viewcount_rank'] <= 3000)]['day1viewcount'].mean()
    data_test1_view4 = df.loc[(df['day1viewcount_rank'] > 300) & (df['day1viewcount_rank'] <= 1000)]['day1viewcount'].mean()
    data_test1_view5 = df.loc[(df['day1viewcount_rank'] > 100) & (df['day1viewcount_rank'] <= 300)]['day1viewcount'].mean()
    data_test1_view6 = df.loc[(df['day1viewcount_rank'] > 30) & (df['day1viewcount_rank'] <= 100)]['day1viewcount'].mean()
    data_test1_view7 = df.loc[(df['day1viewcount_rank'] > 0) & (df['day1viewcount_rank'] <= 30)]['day1viewcount'].mean()
    # note: the two highest rank buckets both receive the view3 mean and the two lowest both receive the view7 mean
    df.loc[df['day1viewcount_rank'] > 10000, 'todyviewcount'] = data_test1_view3
    df.loc[(df['day1viewcount_rank'] > 3000) & (df['day1viewcount_rank'] <= 10000), 'todyviewcount'] = data_test1_view3
    df.loc[(df['day1viewcount_rank'] > 1000) & (df['day1viewcount_rank'] <= 3000), 'todyviewcount'] = data_test1_view4
    df.loc[(df['day1viewcount_rank'] > 300) & (df['day1viewcount_rank'] <= 1000), 'todyviewcount'] = data_test1_view5
    df.loc[(df['day1viewcount_rank'] > 100) & (df['day1viewcount_rank'] <= 300), 'todyviewcount'] = data_test1_view6
    df.loc[(df['day1viewcount_rank'] > 30) & (df['day1viewcount_rank'] <= 100), 'todyviewcount'] = data_test1_view7
    df.loc[(df['day1viewcount_rank'] > 0) & (df['day1viewcount_rank'] <= 30), 'todyviewcount'] = data_test1_view7
    return df


data_test = today_view_category(data_test)
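# Feature groups: root-page play/return/share/view counters over 1/3/7/14/30/60-day
# windows, staged 5-day return counts, video-category one-hot columns, and the
# high-cardinality text/id columns that are encoded separately further below.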
# In[10]:
# Root-page (home feed) counter features
root_page_1day = ['day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount']
root_page_3day = ['day3playcount', 'day3returncount', 'day3sharecount', 'day3viewcount']
root_page_7day = ['day7playcount', 'day7returncount', 'day7sharecount', 'day7viewcount']
root_page_14day = ['day14playcount', 'day14returncount', 'day14sharecount', 'day14viewcount']
root_page_30day = ['day30playcount', 'day30returncount', 'day30sharecount', 'day30viewcount']
root_page_60day = ['day60playcount', 'day60returncount', 'day60sharecount', 'day60viewcount']
return_5day = ['day5returncount_1_stage', 'day5returncount_2_stage', 'day5returncount_3_stage',
               'day5returncount_4_stage']
cate_feat = ['videocategory1', 'videocategory10', 'videocategory11', 'videocategory12',
             'videocategory2', 'videocategory3', 'videocategory4', 'videocategory45',
             'videocategory49', 'videocategory5', 'videocategory6',
             'videocategory7', 'videocategory8', 'videocategory85', 'videocategory9', 'videocategory555']
one_hot_feature = ['videotags', 'words_without_tags', 'videoid']
# cate_view_feat = ['todyview_low', 'todyview_median', 'todyview_high']
# cate_view_feat = ['todyview_1', 'todyview_2', 'todyview_3', 'todyview_4', 'todyview_5', 'todyview_6', 'todyview_7',
#                   'todyview_8']
# In[11]:
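# Trend features: for each root-page counter, cal_feature adds a ratio of the shorter
# window to the longer window (e.g. day30playcount / day60playcount, stored as
# day60playcount_divide_day30playcount) and the difference between the longer and the
# shorter window (stored as ..._dif_...), then replaces inf/NaN from zero denominators with 0.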
def cal_feature(df):
    start = time.time()
    for i in range(len(root_page_1day)):
        newfeat_div = root_page_60day[i] + '_divide_' + root_page_30day[i]
        # df[newfeat_div] = df.apply(lambda s: s[root_page_30day[i]] / s[root_page_60day[i]] if s[root_page_60day[i]] != 0 else 0, axis=1)
        df[newfeat_div] = df[root_page_30day[i]] / df[root_page_60day[i]]
        newfeat_diff = root_page_60day[i] + '_dif_' + root_page_30day[i]
        # df[newfeat_diff] = df.apply(lambda s: s[root_page_60day[i]] - s[root_page_30day[i]], axis=1)
        df[newfeat_diff] = df[root_page_60day[i]] - df[root_page_30day[i]]
    end = time.time()
    running_time = end - start
    print('stage 1: time cost : %.5f sec' % running_time)

    start = time.time()
    for i in range(len(root_page_1day)):
        newfeat_div = root_page_30day[i] + '_divide_' + root_page_7day[i]
        # df[newfeat_div] = df.apply(lambda s: s[root_page_7day[i]] / s[root_page_30day[i]] if s[root_page_30day[i]] != 0 else 0, axis=1)
        df[newfeat_div] = df[root_page_7day[i]] / df[root_page_30day[i]]
        newfeat_diff = root_page_30day[i] + '_dif_' + root_page_7day[i]
        # df[newfeat_diff] = df.apply(lambda s: s[root_page_7day[i]] - s[root_page_3day[i]], axis=1)
        df[newfeat_diff] = df[root_page_30day[i]] - df[root_page_7day[i]]
    end = time.time()
    running_time = end - start
    print('stage 2: time cost : %.5f sec' % running_time)

    start = time.time()
    for i in range(len(root_page_1day)):
        newfeat_div = root_page_7day[i] + '_divide_' + root_page_3day[i]
        # df[newfeat_div] = df.apply(lambda s: s[root_page_3day[i]] / s[root_page_7day[i]] if s[root_page_7day[i]] != 0 else 0, axis=1)
        df[newfeat_div] = df[root_page_3day[i]] / df[root_page_7day[i]]
        newfeat_diff = root_page_7day[i] + '_dif_' + root_page_3day[i]
        # df[newfeat_diff] = df.apply(lambda s: s[root_page_7day[i]] - s[root_page_3day[i]], axis=1)
        df[newfeat_diff] = df[root_page_7day[i]] - df[root_page_3day[i]]
    end = time.time()
    running_time = end - start
    print('stage 3: time cost : %.5f sec' % running_time)

    start = time.time()
    for i in range(len(root_page_1day)):
        newfeat_div = root_page_3day[i] + '_divide_' + root_page_1day[i]
        # df[newfeat_div] = df.apply(lambda s: s[root_page_1day[i]] / s[root_page_3day[i]] if s[root_page_3day[i]] != 0 else 0, axis=1)
        df[newfeat_div] = df[root_page_1day[i]] / df[root_page_3day[i]]
        newfeat_diff = root_page_3day[i] + '_dif_' + root_page_1day[i]
        # df[newfeat_diff] = df.apply(lambda s: s[root_page_3day[i]] - s[root_page_1day[i]], axis=1)
        df[newfeat_diff] = df[root_page_3day[i]] - df[root_page_1day[i]]
    end = time.time()
    running_time = end - start
    print('stage 4: time cost : %.5f sec' % running_time)

    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0)
    return df
# In[12]:
data_train = data_train.fillna(0)
data_test = data_test.fillna(0)
data_train = cal_feature(data_train)
data_test = cal_feature(data_test)
# In[13]:
print('data_train shape:', data_train.shape)
print('data_test shape:', data_test.shape)
# In[14]:
features = ['day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount', 'day30playcount', 'day30returncount',
            'day30sharecount', 'day30viewcount', 'day3playcount', 'day3returncount', 'day3sharecount', 'day3viewcount',
            'day60playcount', 'day60returncount', 'day60sharecount', 'day60viewcount', 'day7playcount', 'day7returncount',
            'day7sharecount', 'day7viewcount', 'usercategory1', 'usercategory2', 'usercategory3', 'usercategory4',
            'usercategory5', 'usercategory6', 'usercategory7', 'usercategory8', 'usercategory9', 'usercategory10',
            'usercategory11', 'usercategory12', 'usercategory45', 'usercategory49', 'usercategory85', 'usercategory555',
            'todyviewcount',
            'day5returncount_1_stage', 'day5returncount_2_stage', 'day5returncount_3_stage', 'day5returncount_4_stage',
            'stage_one_retrn', 'stage_two_retrn', 'stage_three_retrn', 'stage_four_retrn', 'all_return_day1_return_count',
            'all_return_day3_return_count', 'all_return_day7_return_count', 'all_return_day14_return_count',
            'three_return_day1_return_count', 'three_return_day3_return_count', 'three_return_day7_return_count',
            'three_return_day14_return_count', 'four_up_return_day1_return_count', 'four_up_return_day3_return_count',
            'four_up_return_day7_return_count', 'four_up_return_day14_return_count', 'one_return_day1_return_count',
            'one_return_day3_return_count', 'one_return_day7_return_count', 'one_return_day14_return_count',
            'four_up_return_div_three_return_day1', 'four_up_return_div_three_return_day3',
            'four_up_return_div_three_return_day7', 'four_up_return_div_three_return_day14',
            'all_return_day1_view_day1_return_count', 'all_return_day3_view_day3_return_count',
            'all_return_day7_view_day7_return_count', 'all_return_day14_view_day14_return_count',
            'three_return_day1_view_day1_return_count', 'three_return_day3_view_day3_return_count',
            'three_return_day7_view_day7_return_count', 'three_return_day14_view_day14_return_count',
            'four_up_return_day1_view_day1_return_count', 'four_up_return_day3_view_day3_return_count',
            'four_up_return_day7_view_day7_return_count', 'four_up_return_day14_view_day14_return_count',
            'one_return_day1_view_day1_return_count', 'one_return_day3_view_day3_return_count',
            'one_return_day7_view_day7_return_count', 'one_return_day14_view_day14_return_count',
            'all_return_day1_on_day1_return_count', 'all_return_day3_on_day1_return_count',
            'all_return_day7_on_day1_return_count', 'all_return_day14_on_day1_return_count',
            'four_up_return_day1_view_day1_return_div_three_d1', 'four_up_return_day3_view_day3_return_div_three_d3',
            'four_up_return_day7_view_day7_return_div_three_d7', 'four_up_return_day14_view_day14_return_div_three_d14',
            'day1ctr', 'day3ctr', 'day7ctr', 'day14ctr', 'day30ctr', 'day60ctr', 'day1sov', 'day3sov', 'day7sov',
            'day14sov', 'day30sov', 'day60sov', 'day1rov', 'day3rov', 'day7rov', 'day14rov', 'day1soc', 'day3soc',
            'day7soc', 'day14soc', 'day30soc', 'day60soc', 'day1roc', 'day3roc', 'day7roc', 'day14roc', 'oneday_day1rov',
            'oneday_day3rov', 'oneday_day7rov', 'oneday_day14rov',
            'day60playcount_divide_day30playcount', 'day60playcount_dif_day30playcount',
            'day60returncount_divide_day30returncount', 'day60returncount_dif_day30returncount',
            'day60sharecount_divide_day30sharecount', 'day60sharecount_dif_day30sharecount',
            'day60viewcount_divide_day30viewcount', 'day60viewcount_dif_day30viewcount',
            'day30playcount_divide_day7playcount', 'day30playcount_dif_day7playcount',
            'day30returncount_divide_day7returncount', 'day30returncount_dif_day7returncount',
            'day30sharecount_divide_day7sharecount', 'day30sharecount_dif_day7sharecount',
            'day30viewcount_divide_day7viewcount', 'day30viewcount_dif_day7viewcount',
            'day7playcount_divide_day3playcount', 'day7playcount_dif_day3playcount',
            'day7returncount_divide_day3returncount', 'day7returncount_dif_day3returncount',
            'day7sharecount_divide_day3sharecount', 'day7sharecount_dif_day3sharecount',
            'day7viewcount_divide_day3viewcount', 'day7viewcount_dif_day3viewcount', 'day3playcount_divide_day1playcount',
            'day3playcount_dif_day1playcount', 'day3returncount_divide_day1returncount',
            'day3returncount_dif_day1returncount', 'day3sharecount_divide_day1sharecount',
            'day3sharecount_dif_day1sharecount', 'day3viewcount_divide_day1viewcount',
            'day3viewcount_dif_day1viewcount']
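# `features` enumerates every numeric model input, including the derived *_divide_*
# and *_dif_* trend columns created by cal_feature; dataprepare below selects these
# plus the category and text/id columns and returns them together with the
# log-return target.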
# In[15]:
def dataprepare(df_pre):
    # Feed the features in directly, without adding cross features.
    # Whether to zero-fill the data
    df_pre = df_pre.fillna(0)
    df_new_feature = df_pre[features]
    # df_onehot_feature = df_pre[one_hot_feature]
    # df_new_feature = pd.concat([df_pre.loc[:, 'all_return_day14_on_day1_return_count':'day7viewcount'],
    #                             df_pre.loc[:, 'four_up_return_day14_return_count':'four_up_return_div_three_return_day7'],
    #                             df_pre.loc[:, 'one_return_day14_return_count':'oneday_day7rov'],
    #                             df_pre.loc[:, 'three_return_day14_return_count':'three_return_day7_view_day7_return_count'],
    #                             df_pre.loc[:, 'usercategory1':'usercategory9'],
    #                             df_pre.loc[:, 'day60playcount_divide_day30playcount':'day3viewcount_dif_day1viewcount']],
    #                            axis=1)
    # df_new_feature = pd.concat([df_pre.loc[:, 'day1playcount':'day7viewcount'],
    #                             df_pre.loc[:, 'day60playcount_divide_day30playcount':'day5returncount_4_stage_dif_day5returncount_3_stage'],
    #                             df_pre.loc[:, 'usercategory1':'usercategory9']], axis=1)
    df_target = df_pre['weighted_retrn_log']
    df_new_feature = pd.concat([df_new_feature, df_pre[cate_feat], df_pre[one_hot_feature]], axis=1)
    return df_new_feature, df_target
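# Recall-stage filter: load yesterday's recall scores, keep candidates scoring above
# 0.4 (or above the minimum recalled score when that minimum is already >= 0.4), and
# inner-join them onto the test frame so only recalled videos are ranked by the model.
# Training rows are also restricted to videos with at least one future return.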
# In[16]:
recall_video = pd.read_csv('/root/ROVtrain/readonlinetable/result/recall_' + input_day[-4:] + '.csv')
# In[17]:
ten_percent_thresh = recall_video['score'].min()
if ten_percent_thresh < 0.4:
    recall_video_stage_one = recall_video[recall_video['score'] > 0.4]
else:
    recall_video_stage_one = recall_video[recall_video['score'] > ten_percent_thresh]
# In[18]:
data_test['videoid'] = data_test['videoid'].astype('int')
data_train = data_train[data_train['weighted_retrn'] > 0]
print(data_train.shape, 'train shape')
data_test = pd.merge(data_test, recall_video_stage_one, on=['videoid'], how='inner')
print('recalled video count after score filter:', data_test.shape)
# In[19]:
df_new_feature, df_target = dataprepare(data_train)
df_new_feature_test, df_target_test = dataprepare(data_test)
# In[20]:
# Numeric + category (value) feature block as a sparse matrix
from scipy import sparse

df_new_feature_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature).loc[:, 'day1playcount':'videocategory555']))
df_new_feature_test_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature_test).loc[:, 'day1playcount':'videocategory555']))
print('value feature generate successfully')
# In[21]:
# videoid one-hot block
train_videoid = pd.DataFrame(df_new_feature).loc[:, 'videoid']
test_videoid = pd.DataFrame(df_new_feature_test).loc[:, 'videoid']
train_videoid_list = pd.DataFrame(df_new_feature).loc[:, 'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature).loc[:, 'videoid']), 1).tolist()
test_videoid_list = pd.DataFrame(df_new_feature_test).loc[:, 'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature_test).loc[:, 'videoid']), 1).tolist()
allvideo_raw = list(set(np.array(pd.concat([train_videoid, test_videoid])).tolist()))
allvideo = np.array(allvideo_raw).reshape(len(allvideo_raw), 1).tolist()
from sklearn.preprocessing import MultiLabelBinarizer

mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
train_videoid = mlb_model_videoid.transform(train_videoid_list)
test_videoid = mlb_model_videoid.transform(test_videoid_list)
print('videoid feature generate successfully')
# In[23]:
print(len(mlb_model_videoid.classes_))
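# Text features: tag_preprocessing one-hot encodes the tag / word vocabulary read from
# the local txt files, get_tag_tfidf pulls per-term tf-idf weights from ODPS, and each
# one-hot matrix is multiplied element-wise by its tf-idf row vector so the resulting
# sparse features are tf-idf weighted indicators.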
# In[24]:
def tag_preprocessing(filename):
    # read the tag tokenization results
    tag_txt = open("/root/ROVtrain/tfidfCompution/" + filename + ".txt", "r")  # file object
    ftextlist = tag_txt.readlines()  # read all lines
    tag_txt.close()  # close the file
    # convert to a corpus
    tagList = str(ftextlist).replace('[', '').replace(']', '').replace("'", "").replace("'", "").split(',')
    tag = np.array(tagList).reshape(len(tagList), 1).tolist()
    # convert the word features to list form
    train_tag_feature = pd.DataFrame(df_new_feature).loc[:, 'videotags'].to_numpy().reshape(len(pd.DataFrame(df_new_feature).loc[:, 'videotags']), 1).tolist()
    test_tag_feature = pd.DataFrame(df_new_feature_test).loc[:, 'videotags'].to_numpy().reshape(len(pd.DataFrame(df_new_feature_test).loc[:, 'videotags']), 1).tolist()
    # sparse one-hot features
    mlb_model_tag = MultiLabelBinarizer(sparse_output=True).fit(tag)
    train_tag = mlb_model_tag.transform(train_tag_feature)
    test_tag = mlb_model_tag.transform(test_tag_feature)
    return mlb_model_tag.classes_, train_tag, test_tag


# In[25]:
# read the tf-idf weights
def get_tag_tfidf(dt, tfidf_table_name):
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'videoods',
                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000,
                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
    tag_dict = {}
    for record in odps.read_table(tfidf_table_name, partition='dt=%s' % dt):
        tag_dict[record[0]] = record[1]
    return tag_dict


# In[26]:
def ttfidf_list_generation(tag_corpus, tag_dict):
    tag_tfidf_list = []
    for i in tag_corpus:
        try:
            tag_tfidf_list.append(tag_dict[i])
        except KeyError:
            tag_tfidf_list.append(0)
    return tag_tfidf_list
# In[27]:
# get the tag one-hot encoding
tags, train_tag, test_tag = tag_preprocessing('tag')
# get the tag tf-idf weights
tag_dict = get_tag_tfidf('20200305', 'video_tag_tf_idf')
print('length tag_dict:', len(tag_dict))
# build the tag tf-idf sparse matrix
tag_corpus = tags.tolist()  # corpus
tag_tfidf_list = ttfidf_list_generation(tag_corpus, tag_dict)
tag_tf_idf_matrix = sparse.csr_matrix(np.array(tag_tfidf_list))
tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)
tag_feature_test = test_tag.multiply(tag_tf_idf_matrix)
print('tag tfidf feature generate successfully')
print('tag dimension:', len(tag_tfidf_list))
# In[28]:
# get the words-without-tags features
words, train_words, test_words = tag_preprocessing('words_no_tag')
# get the words tf-idf weights
words_dict = get_tag_tfidf('20200305', 'video_words_without_tags_tfidf')
print('length words_dict:', len(words_dict))
# build the words tf-idf sparse matrix
words_corpus = words.tolist()  # corpus
words_tfidf_list = ttfidf_list_generation(words_corpus, words_dict)
words_tf_idf_matrix = sparse.csr_matrix(np.array(words_tfidf_list))
words_feature_train = train_words.multiply(words_tf_idf_matrix)
words_feature_test = test_words.multiply(words_tf_idf_matrix)
print('words tfidf feature generate successfully')
print('words dimension:', len(words_tfidf_list))
# In[32]:
# concatenate the feature blocks
from scipy.sparse import hstack

# training / test features
df_new_feature = hstack([df_new_feature_part_one, train_videoid, tag_feature_train, words_feature_train])
df_new_feature_test = hstack([df_new_feature_test_part_one, test_videoid, tag_feature_test, words_feature_test])
# targets
df_target_test = sparse.csr_matrix(pd.DataFrame(df_target_test).values).toarray()
df_target = sparse.csr_matrix(pd.DataFrame(df_target).values).toarray()
# In[33]:
print(df_target.size)
# In[34]:
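# Model setup: a LightGBM GBDT regressor on the log-return target with RMSE as the
# validation metric, evaluated with a 4-fold StratifiedKFold (stratified on the binary
# return_back flag so each fold sees a similar share of videos with returns).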
param = {'num_leaves': 18,
         'min_data_in_leaf': 60,
         'objective': 'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 30,
         "boosting": "gbdt",
         "feature_fraction": 0.8,
         "bagging_freq": 1,
         "bagging_fraction": 0.8,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         # 'max_bin': 512,
         "random_state": 4590}
folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
oof = np.zeros(len(pd.DataFrame(df_new_feature.toarray())))
predictions = np.zeros(len(df_target_test))
feature_importance_df = pd.DataFrame()
# In[ ]:
# In[46]:
values_lenth = len(features + cate_feat)
video_id_lenth = len(mlb_model_videoid.classes_)
tag_length = len(tag_tfidf_list)
word_length = len(words_tfidf_list)
print(values_lenth)
print(video_id_lenth)
print(tag_length)
print(word_length)
# In[36]:
change_view = pd.DataFrame(pd.DataFrame(df_new_feature_test.toarray()))
change_view = change_view.sort_index()
# In[64]:
# In[67]:
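# Cross-validated training: each fold trains with early stopping on its validation
# split, writes out-of-fold predictions into `oof`, adds its test predictions
# (averaged over the 4 folds) into `predictions`, and records per-column feature
# importances in the same order the sparse blocks were stacked
# (numeric + category, videoid, tags, words).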
for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, data_train['return_back'].values)):
    print("folds {}".format(fold_))
    trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx, :], label=pd.DataFrame(df_target).iloc[trn_idx])
    val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx, :], label=pd.DataFrame(df_target).iloc[val_idx])
    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100,
                    early_stopping_rounds=200)
    oof[val_idx] = clf.predict(df_new_feature.tocsr()[val_idx, :], num_iteration=clf.best_iteration)
    predictions += clf.predict(df_new_feature_test, num_iteration=clf.best_iteration) / folds.n_splits
    fold_importance_df = pd.DataFrame()
    column = features + cate_feat + mlb_model_videoid.classes_.tolist() + tag_corpus + words_corpus
    fold_importance_df["Feature"] = np.array(column)
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
# In[72]:
print(values_lenth)
print(video_id_lenth)
print(tag_length)
print(word_length)
fold1_df = feature_importance_df.loc[feature_importance_df['fold'] == 1]
fold2_df = feature_importance_df.loc[feature_importance_df['fold'] == 2]
fold3_df = feature_importance_df.loc[feature_importance_df['fold'] == 3]
fold4_df = feature_importance_df.loc[feature_importance_df['fold'] == 4]
# In[95]:
def featureImportance(fold1_df, fold2_df, fold3_df, fold4_df, values_lenth, video_id_lenth, tag_length, word_length):
    Feature_Data = pd.DataFrame()
    for df in (fold1_df, fold2_df, fold3_df, fold4_df):
        fold1_df1 = df.iloc[0:values_lenth, :]
        videoid_fold1_importance = df.iloc[values_lenth:values_lenth + video_id_lenth, :]['importance'].sum()
        fold1_df2 = pd.DataFrame([{'Feature': 'videoid', 'importance': videoid_fold1_importance, 'fold': 1}])
        tag_fold1_importance = df.iloc[values_lenth + video_id_lenth:values_lenth + video_id_lenth + tag_length, :]['importance'].sum()
        fold1_df3 = pd.DataFrame([{'Feature': 'tags', 'importance': tag_fold1_importance, 'fold': 1}])
        words_fold1_importance = df.iloc[values_lenth + video_id_lenth + tag_length:values_lenth + video_id_lenth + tag_length + word_length, :]['importance'].sum()
        fold1_df4 = pd.DataFrame([{'Feature': 'words', 'importance': words_fold1_importance, 'fold': 1}])
        Feature_Data = pd.concat([Feature_Data, fold1_df1, fold1_df2, fold1_df3, fold1_df4])
    return Feature_Data


# In[96]:
feature_importance_df = featureImportance(fold1_df, fold2_df, fold3_df, fold4_df, values_lenth, video_id_lenth, tag_length, word_length)
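# Evaluation: RMSE/MSE, MAPE (over non-zero targets only) and R^2 are reported for the
# out-of-fold predictions and for the test-set predictions; note that futre7dayreturn
# was zeroed for test rows older than input_day, so the test metrics are indicative only.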
# In[98]:
print('oof_rmse:', np.sqrt(mean_squared_error(df_target, oof)))
print('oof_mse:', mean_squared_error(df_target, oof))
print('test_rmse:', np.sqrt(mean_squared_error(df_target_test, predictions)))
print('test_mse:', mean_squared_error(df_target_test, predictions))
# In[99]:
def MAPE(true, pred):
    true = np.array(true)
    sum_ = 0
    count = 0
    for i in range(len(true)):
        if true[i] != 0:
            sum_ = sum_ + np.abs(true[i] - pred[i]) / true[i]
            count = count + 1
        else:
            continue
    return sum_ / count


print('oof_mape:', MAPE(df_target, oof))
print('test_mape:', MAPE(df_target_test, predictions))
# In[100]:
# from sklearn.metrics import r2_score
print('verification r2:', r2_score(df_target, oof))
print('test r2:', r2_score(df_target_test, predictions))
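# Output: assemble (videoid, predicted score) for the recalled candidates, sort by
# score descending, export the ranking as JSON for downstream serving, and dump a
# per-video comparison table plus the aggregated feature importances as CSVs.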
sub_df_ = pd.DataFrame({"videoid": data_test["videoid"].values})
sub_df_['score'] = predictions
print('regre ranking shape', sub_df_.shape)
# In[101]:
if ten_percent_thresh < 0.4:
    rest_video = recall_video[recall_video['score'] <= 0.35]
else:
    rest_video = recall_video[recall_video['score'] <= ten_percent_thresh]
# recall_all = pd.concat([rest_video, sub_df_], axis=0).sort_values(by=['score'], ascending=False)
# recall_all.columns = ['videoId', 'score']
recall_all = sub_df_.sort_values(by=['score'], ascending=False)
recall_all.columns = ['videoId', 'score']
print('result score shape', recall_all.shape)
# In[102]:
# recall_all.to_json('/root/ROVtrain/readonlinetable/video_score_add_newfeature' + datetime.datetime.strftime(now_date, '%Y%m%d')[-4:] + '.json', orient='records')
# print('save json success')
recall_all.to_json(
    '/root/ROVtrain/readonlinetable/result/video_score_' + datetime.datetime.strftime(now_date, '%Y%m%d')[-4:] + '.json',
    orient='records')
print('save json success')
# In[103]:
sub_df_ = pd.DataFrame({"videoid": data_test["videoid"].values})
sub_df_['score'] = predictions
compare_col_ = data_test[
    ['videoid', 'weighted_retrn_log', 'weighted_retrn', 'todyviewcount', 'day3viewcount', 'day1viewcount',
     'day3returncount', 'day1returncount']]
merge_ = pd.merge(compare_col_, sub_df_, on=['videoid'])
# In[104]:
# merge_.shape
merge_.to_csv('/root/ROVtrain/readonlinetable/video_metric_score/' + now_day[-4:] + '/' + 'video_metric' + '.csv',
              index=False)
feature_importance_df.to_csv(
    '/root/ROVtrain/readonlinetable/video_metric_score/' + now_day[-4:] + '/' + 'feature_importance' + '.csv',
    index=False)
print('end')