meta_data_operation.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. import random
  4. import numpy as np
  5. import os
  6. import datetime
  7. import json
  8. import oss2
  9. import hashlib
  10. import pandas as pd
  11. from odps import ODPS
  12. import time
  13. from tqdm import tqdm
  14. import pickle
  15. import langid
  16. import smtplib
  17. from email.mime.text import MIMEText
  18. from email.header import Header
  19. from rov_to_redis import toRedis
  20. def sendemail():
  21. mail_host="smtp.exmail.qq.com" #设置服务器
  22. mail_user="warning@piaoquantv.com" #用户名
  23. mail_pass="Wq20160722" #口令
  24. sender = 'warning@piaoquantv.com'
  25. receivers = ['liqian@piaoquantv.com','liuchunlin@piaoquantv.com'] # 接收邮件,可设置为你的QQ邮箱或者其他邮箱
  26. # 三个参数:第一个为文本内容,第二个 plain 设置文本格式,第三个 utf-8 设置编码
  27. message = MIMEText('小程序ROV更新失败', 'plain', 'utf-8')
  28. message['From'] = Header("warning@piaoquantv.com", 'utf-8')
  29. message['To'] = Header("xiaoping@piaoquantv.com", 'utf-8')
  30. subject = '小程序ROV首屏更新预警'
  31. message['Subject'] = Header(subject, 'utf-8')
  32. try:
  33. with smtplib.SMTP_SSL(host="smtp.exmail.qq.com",port=465) as smtp:
  34. # 登录发邮件服务器
  35. smtp.login(user = mail_user, password = mail_pass)
  36. # 实际发送、接收邮件配置
  37. smtp.sendmail(sender, receivers, message.as_string())
  38. print ("邮件发送成功")
  39. except smtplib.SMTPException:
  40. print ("Error: 无法发送邮件")
  41. now_date = datetime.date.today()-datetime.timedelta(days=0)
  42. day = datetime.datetime.strftime(now_date, '%Y%m%d')
  43. diff_1 = datetime.timedelta(days=1)
  44. input_dt = datetime.datetime.strftime(now_date - diff_1, '%Y%m%d')
  45. print(input_dt)
  46. res_dir = '/root/ROVtrain/resdir'
  47. res_file = os.path.join(res_dir,'video_score_'+ day[-4:] +'.json')
  48. def getRovfeaturetable(dt):
  49. odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'videoods',
  50. endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
  51. read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
  52. featureArray = []
  53. for record in tqdm(odps.read_table('recommendVideoDistributionEfficiency', partition='dt=%s' % dt)):
  54. valueFeature = {}
  55. valueFeature['videoId'] = record['videoid']
  56. valueFeature['today_all_return_person'] = record['today_all_return_person']
  57. valueFeature['today_return_total_real'] = record['today_return_total_real']
  58. valueFeature['videoview_today'] = record['videoview_today']
  59. valueFeature['videoshare_today'] = record['videoshare_today']
  60. valueFeature['videoplay_today'] = record['videoplay_today']
  61. valueFeature['ruosoneweek'] = record['ruosoneweek']
  62. valueFeature['ruos3day'] = record['ruos3day']
  63. valueFeature['today_ruov_all'] = record['today_ruov_all']
  64. valueFeature['ctr_today'] = record['ctr_today']
  65. valueFeature['today_first_return_person'] = record['today_first_return_person']
  66. valueFeature['today_second_return_person'] = record['today_second_return_person']
  67. valueFeature['today_third_return_person'] = record['today_third_return_person']
  68. valueFeature['today_overfour_return_person'] = record['today_overfour_return_person']
  69. valueFeature['score'] = record['a11']
  70. valueFeature['dis_title'] = record['dis_title1']
  71. valueFeature['title'] = record['title']
  72. valueFeature['rovscore'] = record['rovscore']
  73. featureArray.append(valueFeature)
  74. featureArrayDF = pd.DataFrame(featureArray)
  75. print('feature table finish')
  76. return featureArrayDF
  77. raw_score = getRovfeaturetable(input_dt)
  78. ###=========================================运营改分逻辑添加================================================
  79. rename_dict={'videoId':'videoId',
  80. 'today_all_return_person':'今日曝光总回流人数',
  81. 'today_return_total_real':'今日总回流_真实值',
  82. 'videoview_today':'今日首页曝光次数',
  83. 'videoshare_today':'今日首页分享次数',
  84. 'videoplay_today':'今日首页播放次数',
  85. 'ruosoneweek':'前7_1日曝光今日RUOV',
  86. 'ruos3day':'前3_1日曝光今日RUOV',
  87. 'today_ruov_all':'今日首页RUOV',
  88. 'ctr_today':'今日首页CTR',
  89. 'today_first_return_person':'今日曝光一层回流人数',
  90. 'today_second_return_person':'今日曝光二层回流人数',
  91. 'today_third_return_person':'今日曝光三层回流人数',
  92. 'today_overfour_return_person':'今日曝光四加层回流人数',
  93. 'score':'全局总分',
  94. 'dis_title':'分发标题',
  95. 'title':'标题',
  96. 'rovscore':'rov分数'
  97. }
  98. raw_score.rename(columns=rename_dict,inplace=True)
  99. #raw_score.columns = ['今日首页CTR', '分发标题', 'rov分数', '前3_1日曝光今日RUOV','前7_1日曝光今日RUOV','全局总分','标题', '今日曝光总回流人数', '今日曝光一层回流人数','今日曝光四加层回流人数',
  100. # '今日总回流_真实值','今日首页RUOV','今日曝光二层回流人数', '今日曝光三层回流人数','videoId', '今日首页播放次数','今日首页分享次数',
  101. # '今日首页曝光次数']
  102. raw_score['二层/一层'] = raw_score.apply(lambda x:x['今日曝光二层回流人数'] / max(10,x['今日曝光一层回流人数']),axis=1)
  103. raw_score['三层/二层'] = raw_score.apply(lambda x:x['今日曝光三层回流人数'] / max(10,x['今日曝光二层回流人数']),axis=1)
  104. raw_score['四加层/三层'] = raw_score.apply(lambda x:x['今日曝光四加层回流人数'] / max(10,x['今日曝光三层回流人数']),axis=1)
  105. raw_score['回流比均值'] = (raw_score['二层/一层'] + raw_score['三层/二层'] + raw_score['四加层/三层']) / 3
  106. raw_score = raw_score[['videoId','分发标题','标题','今日首页曝光次数','今日首页播放次数',
  107. '今日首页分享次数','今日曝光总回流人数','今日总回流_真实值','前7_1日曝光今日RUOV',
  108. '前3_1日曝光今日RUOV','今日首页RUOV','今日首页CTR','今日曝光一层回流人数',
  109. '今日曝光二层回流人数', '今日曝光三层回流人数','今日曝光四加层回流人数','二层/一层',
  110. '三层/二层','四加层/三层','回流比均值',
  111. 'rov分数','全局总分']]
  112. raw_score['新分数'] = raw_score.apply(lambda x: (x['今日曝光总回流人数']*2 + min(x['今日总回流_真实值'],x['今日首页曝光次数'])*0.3 + x['今日首页分享次数']*0.2 + x['今日首页播放次数']*0.02) / max(x['今日首页曝光次数'],1000) * np.log10(x['今日首页曝光次数']+10),axis=1)
  113. raw_score['旧排名'] = [i+1 for i in range(len(raw_score))]
  114. raw_score = raw_score.sort_values(by='新分数',ascending=False).reset_index(drop=True)
  115. raw_score['新排名'] = [i+1 for i in range(len(raw_score))]
  116. raw_score['排名差值'] = raw_score['旧排名'] - raw_score['新排名']
  117. raw_score['提升至'] = raw_score.apply(lambda x: int(x['旧排名']- x['排名差值']/2 if x['排名差值']>=0 and x['旧排名']<=200 else 100+x['旧排名']%100),axis=1)
  118. raw_score = pd.merge(raw_score,raw_score[['旧排名','全局总分']],left_on='提升至',right_on='旧排名',how='left')
  119. raw_score.drop(['旧排名_y'],axis=1,inplace=True)
  120. raw_score.rename(columns={'全局总分_x':'全局总分','旧排名_x':'旧排名','全局总分_y':'提升分数至'},inplace=True)
  121. raw_score['降至'] = raw_score.apply(lambda x: int(x['旧排名']- x['排名差值']/2 if(x['排名差值']<=0 and x['新排名']<=200) else 100+x['新排名']%100),axis=1)
  122. raw_score = pd.merge(raw_score,raw_score[['旧排名','全局总分']],left_on='降至',right_on='旧排名',how='left')
  123. raw_score.drop(['旧排名_y'],axis=1,inplace=True)
  124. raw_score.rename(columns={'全局总分_x':'全局总分','旧排名_x':'旧排名','全局总分_y':'降低分数至'},inplace=True)
  125. raw_score['一周ruov均值'] = (raw_score['前7_1日曝光今日RUOV'] + raw_score['今日首页RUOV'] + raw_score['前3_1日曝光今日RUOV']) / 3
  126. ## 筛选加分视频
  127. add_score_video = raw_score[(raw_score['新排名']<=100)&(raw_score['排名差值']>20)]
  128. add_score_video['最终分数'] = add_score_video.apply(lambda x : x['提升分数至'] if x['排名差值']>20 else x['全局总分'],axis=1)
  129. # 筛选 减分视频
  130. reduce_score_video = raw_score[(raw_score['排名差值']<-50)&(raw_score['旧排名']<=100)]
  131. reduce_score_video['最终分数'] = reduce_score_video['降低分数至']
  132. score_adjust_videos = pd.concat([add_score_video,reduce_score_video])
  133. raw_score = raw_score.merge(score_adjust_videos[['videoId','最终分数']],how='left')
  134. raw_score['最终分数'] = raw_score.apply(lambda x: x['最终分数'] if x['最终分数']>0 else x['全局总分'],axis=1)
  135. ## 新算法递进逻辑
  136. raw_score['最终分数'] = raw_score.apply(lambda x: x['最终分数'] + 10 if (x['一周ruov均值']>=0.045 and x['新排名']<=100) else x['最终分数'],axis=1)
  137. ## 低分裂变视频
  138. raw_score['最终分数'] = raw_score.apply(lambda x: x['最终分数']%10 + 30 if (x['今日首页CTR']>=0.04999 and x['回流比均值'] >= 0.3999 and x['最终分数']<=30 and x['旧排名']<=3000) else x['最终分数'],axis=1)
  139. ###=========================================运营改分逻辑结束================================================
  140. raw_score = raw_score[['videoId','最终分数','分发标题','标题']]
  141. raw_score.columns = ['videoId','score','dis_title','title']
  142. raw_score = raw_score[['videoId','score','dis_title','title']]
  143. raw_score = raw_score.dropna(axis=0,how='any')
  144. raw_score.drop_duplicates(subset=['videoId'],inplace=True)
  145. # raw_score = raw_score.loc[raw_score['score'] >= 2.5]
  146. raw_score = raw_score.sort_values(by="score" , ascending=False).iloc[0:60000,:]
  147. raw_score.reset_index(inplace = True)
  148. #过滤外文
  149. print(raw_score.shape)
  150. score_initial = pd.DataFrame([])
  151. for index in range(len(raw_score)):
  152. # print(index)
  153. title = raw_score.iloc[[index]]['title'][index]
  154. dis_title = raw_score.iloc[[index]]['dis_title'][index]
  155. # print(dis_title)
  156. title_language = []
  157. dis_title_language = []
  158. if title == '' and dis_title == '' :
  159. score_initial = score_initial.append(raw_score.iloc[[index]])
  160. else :
  161. if len(title) > 0:
  162. for index1 in range (len(title)):
  163. title_lan_label = langid.classify(title[index1])[0]
  164. title_language.append(title_lan_label)
  165. if 'zh' in title_language:
  166. break
  167. if len(dis_title) > 0:
  168. for index2 in range(len(dis_title)):
  169. dis_title_label = langid.classify(dis_title[index2])[0]
  170. dis_title_language.append(dis_title_label)
  171. if 'zh' in dis_title_language:
  172. break
  173. if 'zh' in title_language or 'zh' in dis_title_language:
  174. score_initial = score_initial.append(raw_score.iloc[[index]])
  175. #二次过滤非标准字符
  176. score_initial.reset_index( inplace=True)
  177. score = pd.DataFrame([])
  178. for index in range(len(score_initial)):
  179. title = score_initial.iloc[[index]]['title'][index]
  180. dis_title = score_initial.iloc[[index]]['dis_title'][index]
  181. if title == '' and dis_title == '' :
  182. score = score.append(score_initial.iloc[[index]])
  183. else :
  184. if 'ၼ' in title and dis_title == '':
  185. continue
  186. elif 'ၼ' in dis_title and title == '':
  187. continue
  188. else:
  189. score = score.append(score_initial.iloc[[index]])
  190. score['videoId'] = score['videoId'].astype('int')
  191. score = score.sort_values(by="score" , ascending=False)
  192. score = score[['videoId','score']]
  193. ### 添加旧视频测试
  194. #old_video = pd.read_csv('/root/ROVtrain/readonlinetable/old_videoid_retest.csv')
  195. #step = (score.loc[49].score - score.loc[79].score) / 30
  196. #old_video_df = old_video.rename(columns={'videoid':'videoId'}).head(30)
  197. #old_video_score = []
  198. #for i in range(len(old_video_df)):
  199. # old_video_score.append(score.loc[49].score+step*(i+1))
  200. #old_video_df['score'] = old_video_score
  201. #score = pd.concat([score[~score.videoId.isin(old_video_df.videoId)],old_video_df])
  202. #score = score.sort_values(by="score" , ascending=False)
  203. #old_video[~old_video.videoid.isin(old_video_df.videoId)].to_csv('/root/ROVtrain/readonlinetable/old_videoid_retest.csv')
  204. #dt = datetime.datetime.strftime(datetime.date.today(), '%Y%m%d')
  205. #old_video_df.to_csv('/root/ROVtrain/readonlinetable/oldvideo/'+dt+'.csv')
  206. #score.to_csv('/root/ROVtrain/readonlinetable/oldvideo/score.csv')
  207. ###
  208. score.to_json(
  209. res_file, orient='records')
  210. with open(res_file,'r') as score_json:
  211. data = json.load(score_json)
  212. score_df = pd.DataFrame(data)
  213. threshold = score_df.iloc[500,:]['score']
  214. # threshold
  215. def CalcMD5(filepath):
  216. if not os.path.isfile(filepath):
  217. return
  218. with open(filepath,'rb') as f:
  219. md5obj = hashlib.md5()
  220. md5obj.update(f.read())
  221. myhash = md5obj.hexdigest()
  222. return myhash
  223. print(CalcMD5(res_file))
  224. metadata = {}
  225. metadata['rootDir'] = 'recommend'
  226. metadata['dataDir'] = 'data'
  227. metadata['modularName'] = 'model'
  228. metadata['modelName'] = 'rov'
  229. metadata['version'] = 'v1'
  230. metadata['date'] = day
  231. metadata['defaultRovScore'] = threshold
  232. metadata['itemRovName'] = 'video_score.json'
  233. metadata['videoScoreMd5'] = CalcMD5(res_file)
  234. meta_file = os.path.join(res_dir,'rov_success.json')
  235. with open(meta_file,'w') as f:
  236. json.dump(metadata,f)
  237. # 'LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc'
  238. #if rov_max > 5:
  239. '''
  240. rov_max = max(score.score)
  241. if rov_max > 50:
  242. auth = oss2.Auth('LTAI9EBa0bd5PrDa', 'vAalxds7YxhfOA2yVv8GziCg3Y87v5')
  243. bucket = oss2.Bucket(auth, 'http://oss-cn-hangzhou-internal.aliyuncs.com', 'art-recommend')
  244. bucket.put_object_from_file('recommend/metadata/rov/'+'rov_success.json', meta_file)
  245. bucket.put_object_from_file('recommend/data/model/rov/v1/'+ day +'/video_score.json', res_file)
  246. print('upload success')
  247. else:
  248. print('error: rovscore under threshold')
  249. sendemail()
  250. '''
  251. auth = oss2.Auth('LTAI9EBa0bd5PrDa', 'vAalxds7YxhfOA2yVv8GziCg3Y87v5')
  252. bucket = oss2.Bucket(auth, 'http://oss-cn-hangzhou-internal.aliyuncs.com', 'art-recommend')
  253. bucket.put_object_from_file('recommend/metadata/rov/'+'rov_success.json', meta_file)
  254. bucket.put_object_from_file('recommend/data/model/rov/v1/'+ day +'/video_score.json', res_file)
  255. print('upload success')
  256. # 上传rov score到Redis中
  257. toRedis(res_file)
  258. print('rov score to redis success')