123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323 |
- #!/usr/bin/env python
- # coding: utf-8
- import random
- import numpy as np
- import os
- import datetime
- import json
- import oss2
- import hashlib
- import pandas as pd
- from odps import ODPS
- import time
- from tqdm import tqdm
- import pickle
- import langid
- import smtplib
- from email.mime.text import MIMEText
- from email.header import Header
- from rov_to_redis import toRedis
def sendemail():
    """Send the fixed "ROV update failed" alert e-mail.

    Connects to the SMTP server over SSL on port 465, logs in with the
    alert account and sends a plain-text UTF-8 message to ``receivers``.

    This is a best-effort notification: any SMTP or network failure is
    reported on stdout and swallowed so that the caller is never aborted
    by the alert itself.

    Returns:
        None
    """
    mail_host = "smtp.exmail.qq.com"      # SMTP server
    mail_user = "warning@piaoquantv.com"  # login user
    # NOTE(review): credential is hard-coded in source; it should be moved to
    # a secret store / environment variable and rotated.
    mail_pass = "Wq20160722"              # login password
    sender = 'warning@piaoquantv.com'
    # Envelope recipients (the addresses the message is actually delivered to).
    receivers = ['liqian@piaoquantv.com', 'liuchunlin@piaoquantv.com']

    # MIMEText arguments: body text, subtype ("plain"), charset.
    message = MIMEText('小程序ROV更新失败', 'plain', 'utf-8')
    message['From'] = Header("warning@piaoquantv.com", 'utf-8')
    # NOTE(review): the displayed "To" header differs from the envelope
    # recipients above - confirm this is intentional.
    message['To'] = Header("xiaoping@piaoquantv.com", 'utf-8')
    subject = '小程序ROV首屏更新预警'
    message['Subject'] = Header(subject, 'utf-8')

    try:
        with smtplib.SMTP_SSL(host=mail_host, port=465) as smtp:
            # Authenticate against the mail server.
            smtp.login(user=mail_user, password=mail_pass)
            # Deliver the message to the envelope recipients.
            smtp.sendmail(sender, receivers, message.as_string())
            print("邮件发送成功")
    except (smtplib.SMTPException, OSError):
        # OSError covers DNS / connection / TLS failures that are raised
        # before any SMTP-level error can occur.
        print("Error: 无法发送邮件")
# Run date (today) and its YYYYMMDD form, used in output paths.
now_date = datetime.date.today()
day = now_date.strftime('%Y%m%d')

# Input partition is the previous day's data.
diff_1 = datetime.timedelta(days=1)
input_dt = (now_date - diff_1).strftime('%Y%m%d')
print(input_dt)

# Result file is named after the month/day part (last four digits) of `day`.
res_dir = '/root/ROVtrain/resdir'
res_file = os.path.join(res_dir, 'video_score_{}.json'.format(day[-4:]))
def getRovfeaturetable(dt):
    """Read one partition of the ROV feature table into a DataFrame.

    Args:
        dt: partition value; rows are read from partition ``dt=<dt>`` of
            the ``recommendVideoDistributionEfficiency`` table.

    Returns:
        pandas.DataFrame with one row per record and the output columns
        listed in ``column_map`` below (in that order).
    """
    # NOTE(review): access key/secret are hard-coded in source; they should be
    # moved to configuration and rotated.
    odps = ODPS(
        'LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'videoods',
        endpoint='http://service.cn.maxcompute.aliyun.com/api',
        connect_timeout=3000,
        read_timeout=500000,
        pool_maxsize=1000,
        pool_connections=1000,
    )

    # (output column, source column in the ODPS record)
    column_map = (
        ('videoId', 'videoid'),
        ('today_all_return_person', 'today_all_return_person'),
        ('today_return_total_real', 'today_return_total_real'),
        ('videoview_today', 'videoview_today'),
        ('videoshare_today', 'videoshare_today'),
        ('videoplay_today', 'videoplay_today'),
        ('ruosoneweek', 'ruosoneweek'),
        ('ruos3day', 'ruos3day'),
        ('today_ruov_all', 'today_ruov_all'),
        ('ctr_today', 'ctr_today'),
        ('today_first_return_person', 'today_first_return_person'),
        ('today_second_return_person', 'today_second_return_person'),
        ('today_third_return_person', 'today_third_return_person'),
        ('today_overfour_return_person', 'today_overfour_return_person'),
        ('score', 'a11'),
        ('dis_title', 'dis_title1'),
        ('title', 'title'),
        ('rovscore', 'rovscore'),
    )

    records = odps.read_table('recommendVideoDistributionEfficiency', partition='dt=%s' % dt)
    rows = [
        {out_name: record[src_name] for out_name, src_name in column_map}
        for record in tqdm(records)
    ]
    featureArrayDF = pd.DataFrame(rows)
    print('feature table finish')
    return featureArrayDF
# Load the feature table for the input partition (previous day).
raw_score = getRovfeaturetable(input_dt)

### ================= operations score-adjustment logic: begin =================
# Map the technical column names to the Chinese report names used below.
rename_dict = {
    'videoId': 'videoId',
    'today_all_return_person': '今日曝光总回流人数',
    'today_return_total_real': '今日总回流_真实值',
    'videoview_today': '今日首页曝光次数',
    'videoshare_today': '今日首页分享次数',
    'videoplay_today': '今日首页播放次数',
    'ruosoneweek': '前7_1日曝光今日RUOV',
    'ruos3day': '前3_1日曝光今日RUOV',
    'today_ruov_all': '今日首页RUOV',
    'ctr_today': '今日首页CTR',
    'today_first_return_person': '今日曝光一层回流人数',
    'today_second_return_person': '今日曝光二层回流人数',
    'today_third_return_person': '今日曝光三层回流人数',
    'today_overfour_return_person': '今日曝光四加层回流人数',
    'score': '全局总分',
    'dis_title': '分发标题',
    'title': '标题',
    'rovscore': 'rov分数',
}
raw_score.rename(columns=rename_dict, inplace=True)

# Layer-to-layer return ratios. The denominator is floored at 10 so that
# very small lower-layer counts cannot inflate the ratio.
raw_score['二层/一层'] = raw_score.apply(
    lambda x: x['今日曝光二层回流人数'] / max(10, x['今日曝光一层回流人数']), axis=1)
raw_score['三层/二层'] = raw_score.apply(
    lambda x: x['今日曝光三层回流人数'] / max(10, x['今日曝光二层回流人数']), axis=1)
raw_score['四加层/三层'] = raw_score.apply(
    lambda x: x['今日曝光四加层回流人数'] / max(10, x['今日曝光三层回流人数']), axis=1)
raw_score['回流比均值'] = (raw_score['二层/一层'] + raw_score['三层/二层'] + raw_score['四加层/三层']) / 3

# Fix the column order. `.copy()` makes the result an independent frame so
# the column assignments below never write to a slice of the previous frame.
raw_score = raw_score[['videoId', '分发标题', '标题', '今日首页曝光次数', '今日首页播放次数',
                       '今日首页分享次数', '今日曝光总回流人数', '今日总回流_真实值', '前7_1日曝光今日RUOV',
                       '前3_1日曝光今日RUOV', '今日首页RUOV', '今日首页CTR', '今日曝光一层回流人数',
                       '今日曝光二层回流人数', '今日曝光三层回流人数', '今日曝光四加层回流人数', '二层/一层',
                       '三层/二层', '四加层/三层', '回流比均值',
                       'rov分数', '全局总分']].copy()

# New score: weighted returns/shares/plays divided by exposure (floored at
# 1000), scaled by log10(exposure + 10).
raw_score['新分数'] = raw_score.apply(
    lambda x: (x['今日曝光总回流人数']*2
               + min(x['今日总回流_真实值'], x['今日首页曝光次数'])*0.3
               + x['今日首页分享次数']*0.2
               + x['今日首页播放次数']*0.02)
    / max(x['今日首页曝光次数'], 1000) * np.log10(x['今日首页曝光次数']+10),
    axis=1)

# Old rank = row position in the table as read; new rank = position after
# sorting by the new score. A positive rank difference means the video moved up.
raw_score['旧排名'] = [i+1 for i in range(len(raw_score))]
raw_score = raw_score.sort_values(by='新分数', ascending=False).reset_index(drop=True)
raw_score['新排名'] = [i+1 for i in range(len(raw_score))]
raw_score['排名差值'] = raw_score['旧排名'] - raw_score['新排名']

# Target rank when boosting: halfway between old and new rank for videos that
# moved up and were within the old top 200; otherwise a rank in 100..199.
raw_score['提升至'] = raw_score.apply(
    lambda x: int(x['旧排名'] - x['排名差值']/2 if x['排名差值'] >= 0 and x['旧排名'] <= 200 else 100+x['旧排名'] % 100),
    axis=1)
# Look up the global score currently held by the video at the target old rank.
raw_score = pd.merge(raw_score, raw_score[['旧排名', '全局总分']], left_on='提升至', right_on='旧排名', how='left')
raw_score.drop(['旧排名_y'], axis=1, inplace=True)
raw_score.rename(columns={'全局总分_x': '全局总分', '旧排名_x': '旧排名', '全局总分_y': '提升分数至'}, inplace=True)

# Target rank when demoting, resolved to a score the same way.
raw_score['降至'] = raw_score.apply(
    lambda x: int(x['旧排名'] - x['排名差值']/2 if (x['排名差值'] <= 0 and x['新排名'] <= 200) else 100+x['新排名'] % 100),
    axis=1)
raw_score = pd.merge(raw_score, raw_score[['旧排名', '全局总分']], left_on='降至', right_on='旧排名', how='left')
raw_score.drop(['旧排名_y'], axis=1, inplace=True)
raw_score.rename(columns={'全局总分_x': '全局总分', '旧排名_x': '旧排名', '全局总分_y': '降低分数至'}, inplace=True)

raw_score['一周ruov均值'] = (raw_score['前7_1日曝光今日RUOV'] + raw_score['今日首页RUOV'] + raw_score['前3_1日曝光今日RUOV']) / 3

## Videos to boost: new top 100 that climbed more than 20 places.
# `.copy()` avoids assigning a column onto a filtered slice of raw_score.
add_score_video = raw_score[(raw_score['新排名'] <= 100) & (raw_score['排名差值'] > 20)].copy()
# Every selected row already satisfies 排名差值 > 20, so the boosted score
# always applies; no per-row condition is needed.
add_score_video['最终分数'] = add_score_video['提升分数至']

## Videos to demote: old top 100 that dropped more than 50 places.
reduce_score_video = raw_score[(raw_score['排名差值'] < -50) & (raw_score['旧排名'] <= 100)].copy()
reduce_score_video['最终分数'] = reduce_score_video['降低分数至']

score_adjust_videos = pd.concat([add_score_video, reduce_score_video])
# videoId is the only column shared by both frames, so it is the join key.
raw_score = raw_score.merge(score_adjust_videos[['videoId', '最终分数']], on='videoId', how='left')
# Videos without a positive adjusted score (including NaN from the left
# join) keep their global score.
raw_score['最终分数'] = raw_score.apply(lambda x: x['最终分数'] if x['最终分数'] > 0 else x['全局总分'], axis=1)

## Progressive rule of the new algorithm: +10 for new-top-100 videos whose
## mean RUOV is at least 0.045.
raw_score['最终分数'] = raw_score.apply(
    lambda x: x['最终分数'] + 10 if (x['一周ruov均值'] >= 0.045 and x['新排名'] <= 100) else x['最终分数'],
    axis=1)

## Low-score videos with strong CTR and return ratio (old rank <= 3000) are
## lifted into the 30..40 range.
raw_score['最终分数'] = raw_score.apply(
    lambda x: x['最终分数'] % 10 + 30 if (x['今日首页CTR'] >= 0.04999 and x['回流比均值'] >= 0.3999 and x['最终分数'] <= 30 and x['旧排名'] <= 3000) else x['最终分数'],
    axis=1)
### ================= operations score-adjustment logic: end ===================
# Keep only the published columns under their technical names.
raw_score = raw_score[['videoId', '最终分数', '分发标题', '标题']].copy()
raw_score.columns = ['videoId', 'score', 'dis_title', 'title']
raw_score = raw_score.dropna(axis=0, how='any')
raw_score = raw_score.drop_duplicates(subset=['videoId'])
# raw_score = raw_score.loc[raw_score['score'] >= 2.5]
# Keep at most the 60000 highest-scoring videos.
raw_score = raw_score.sort_values(by="score", ascending=False).iloc[0:60000, :]
raw_score = raw_score.reset_index()
print(raw_score.shape)


def _has_chinese_char(text):
    """Return True if langid classifies any single character of *text* as 'zh'."""
    return any(langid.classify(ch)[0] == 'zh' for ch in text)


def _keep_title_pair(title, dis_title):
    """Decide whether a video with these two titles stays in the result.

    A video is kept when both titles are empty. Otherwise at least one of
    the titles must contain a character detected as Chinese, and the video
    is dropped when its only non-empty title contains the character 'ၼ'.
    """
    if title == '' and dis_title == '':
        return True
    # First pass: filter out foreign-language titles.
    if not (_has_chinese_char(title) or _has_chinese_char(dis_title)):
        return False
    # Second pass: filter out titles with the non-standard character.
    if 'ၼ' in title and dis_title == '':
        return False
    if 'ၼ' in dis_title and title == '':
        return False
    return True


# One boolean per row instead of growing a DataFrame row by row with
# DataFrame.append (quadratic, and removed in pandas 2.0).
_keep_mask = np.array(
    [_keep_title_pair(t, d) for t, d in zip(raw_score['title'], raw_score['dis_title'])],
    dtype=bool,
)
score = raw_score.loc[_keep_mask].copy()

score['videoId'] = score['videoId'].astype('int')
score = score.sort_values(by="score", ascending=False)
score = score[['videoId', 'score']]
- ### 添加旧视频测试
- #old_video = pd.read_csv('/root/ROVtrain/readonlinetable/old_videoid_retest.csv')
- #step = (score.loc[49].score - score.loc[79].score) / 30
- #old_video_df = old_video.rename(columns={'videoid':'videoId'}).head(30)
- #old_video_score = []
- #for i in range(len(old_video_df)):
- # old_video_score.append(score.loc[49].score+step*(i+1))
- #old_video_df['score'] = old_video_score
- #score = pd.concat([score[~score.videoId.isin(old_video_df.videoId)],old_video_df])
- #score = score.sort_values(by="score" , ascending=False)
- #old_video[~old_video.videoid.isin(old_video_df.videoId)].to_csv('/root/ROVtrain/readonlinetable/old_videoid_retest.csv')
- #dt = datetime.datetime.strftime(datetime.date.today(), '%Y%m%d')
- #old_video_df.to_csv('/root/ROVtrain/readonlinetable/oldvideo/'+dt+'.csv')
- #score.to_csv('/root/ROVtrain/readonlinetable/oldvideo/score.csv')
- ###
# Write the ranked (videoId, score) rows as a JSON array of records.
score.to_json(res_file, orient='records')

# Read the file back so the threshold comes from exactly what was written.
with open(res_file, 'r') as score_json:
    data = json.load(score_json)
score_df = pd.DataFrame(data)

# Default score = score of the row at position 500 (the 501st record).
# NOTE(review): raises IndexError when the file holds fewer than 501 rows.
threshold = score_df.iloc[500]['score']
def CalcMD5(filepath):
    """Return the hexadecimal MD5 digest of a file.

    The file is hashed in fixed-size chunks so that large files are never
    loaded into memory in one piece.

    Args:
        filepath: path of the file to hash.

    Returns:
        The 32-character lowercase hex digest, or ``None`` when *filepath*
        does not refer to an existing regular file.
    """
    if not os.path.isfile(filepath):
        return None
    md5obj = hashlib.md5()
    with open(filepath, 'rb') as f:
        # iter() with a sentinel stops at the empty bytes object read at EOF.
        for chunk in iter(lambda: f.read(1 << 20), b''):
            md5obj.update(chunk)
    return md5obj.hexdigest()
# Checksum of the score file; printed for the log and stored in the metadata.
video_score_md5 = CalcMD5(res_file)
print(video_score_md5)

# Descriptor written next to the score file and uploaded with it.
metadata = {
    'rootDir': 'recommend',
    'dataDir': 'data',
    'modularName': 'model',
    'modelName': 'rov',
    'version': 'v1',
    'date': day,
    'defaultRovScore': threshold,
    'itemRovName': 'video_score.json',
    'videoScoreMd5': video_score_md5,
}
meta_file = os.path.join(res_dir, 'rov_success.json')
with open(meta_file, 'w') as f:
    json.dump(metadata, f)

# A guard that used to sit here is disabled: it uploaded only when
# max(score.score) > 50 and otherwise printed an error and called sendemail().
# The upload below is unconditional.
# NOTE(review): OSS access key/secret are hard-coded in source; they should be
# moved to configuration and rotated.
auth = oss2.Auth('LTAI9EBa0bd5PrDa', 'vAalxds7YxhfOA2yVv8GziCg3Y87v5')
bucket = oss2.Bucket(auth, 'http://oss-cn-hangzhou-internal.aliyuncs.com', 'art-recommend')
bucket.put_object_from_file('recommend/metadata/rov/rov_success.json', meta_file)
bucket.put_object_from_file('recommend/data/model/rov/v1/{}/video_score.json'.format(day), res_file)
print('upload success')

# Push the ROV scores to Redis as well.
toRedis(res_file)
print('rov score to redis success')
|