algorithm
/
RovOpt


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
							import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from odps import ODPS

def _videotags_as_rows(feature_frame):
    """Return the 'videotags' column of *feature_frame* as one-element rows.

    Each value of the column is wrapped in its own single-item list, i.e. the
    result has the form ``[[v0], [v1], ...]``.
    """
    column = pd.DataFrame(feature_frame).loc[:, 'videotags'].to_numpy()
    return column.reshape(len(column), 1).tolist()


def tag_preprocessing(filename,df_new_feature, df_new_feature_predict):
    """Binarize the 'videotags' column of two feature sets against a tag file.

    The vocabulary is read from
    ``/root/ROVtrain/tfidfCompution/<filename>.txt`` and used to fit a
    ``MultiLabelBinarizer`` with sparse output. Both feature sets are then
    transformed with the fitted binarizer.

    Args:
        filename: Base name (without ``.txt``) of the tag file.
        df_new_feature: Training features; anything ``pd.DataFrame`` accepts
            that yields a 'videotags' column.
        df_new_feature_predict: Prediction features, same requirements.

    Returns:
        A tuple ``(classes, train_tag, predict_tag)``: the fitted
        ``classes_`` array and the two sparse indicator matrices.

    Raises:
        OSError: If the tag file cannot be opened or read.
        KeyError: If a feature set has no 'videotags' column.
    """
    # Read the tag segmentation result; the context manager closes the file
    # even when reading fails.
    with open("/root/ROVtrain/tfidfCompution/"+ filename +".txt","r") as tag_txt:
        ftextlist = tag_txt.readlines()

    # Build the corpus: stringify the list of lines, drop the list brackets
    # and single quotes, then split on commas.
    # NOTE(review): str() of a list uses repr() for each line, so a line
    # terminator ends up as a literal backslash-n inside a token, and tokens
    # that follow the ", " between lines keep a leading space. This looks
    # unintended but is preserved because callers may rely on classes_.
    tagList = str(ftextlist).translate(str.maketrans('', '', "[]'")).split(',')
    tag = [[token] for token in tagList]

    # Turn the tag feature into list form: each row carries its whole
    # 'videotags' value as one single label.
    train_tag_feature = _videotags_as_rows(df_new_feature)
    predict_tag_feature = _videotags_as_rows(df_new_feature_predict)

    # Sparse indicator features.
    mlb_model_tag = MultiLabelBinarizer(sparse_output=True).fit(tag)
    train_tag = mlb_model_tag.transform(train_tag_feature)
    predict_tag = mlb_model_tag.transform(predict_tag_feature)

    return mlb_model_tag.classes_,train_tag,predict_tag


def get_tag_tfidf(dt, tfidf_table_name):
    """Read one partition of an ODPS table into a dict.

    Connects to the 'videoods' project and iterates the records of
    ``tfidf_table_name`` in partition ``dt=<dt>``. For every record, field 0
    becomes the key and field 1 the value; a later record with the same
    field 0 overwrites an earlier one.

    Args:
        dt: Partition value substituted into ``'dt=%s'``.
        tfidf_table_name: Name of the table to read.

    Returns:
        dict mapping ``record[0]`` to ``record[1]``.
    """
    import os  # local import so this function does not depend on file-level edits

    # SECURITY: the literal key pair below is committed in source and should
    # be rotated and removed. When both environment variables are set they
    # take precedence; the literals remain only as a backward-compatible
    # fallback for existing deployments.
    access_id = os.environ.get('ALIBABA_CLOUD_ACCESS_KEY_ID')
    secret_access_key = os.environ.get('ALIBABA_CLOUD_ACCESS_KEY_SECRET')
    if not (access_id and secret_access_key):
        access_id = 'LTAI4FtW5ZzxMvdw35aNkmcp'
        secret_access_key = '0VKnydcaHK3ITjylbgUsLubX6rwiwc'

    odps = ODPS(access_id, secret_access_key, 'videoods',
                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000,
                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
    return {
        record[0]: record[1]
        for record in odps.read_table(tfidf_table_name, partition='dt=%s' % dt)
    }


def ttfidf_list_generation(tag_corpus,tag_dict):
    """Look up a value for every tag in *tag_corpus*, defaulting to 0.

    Args:
        tag_corpus: Iterable of tags, processed in order.
        tag_dict: Mapping from tag to value.

    Returns:
        A list with one entry per tag: ``tag_dict[tag]`` when the lookup
        succeeds, otherwise ``0``.
    """
    tag_tfidf_list = []
    for tag in tag_corpus:
        try:
            weight = tag_dict[tag]
        except (KeyError, TypeError):
            # KeyError: tag not present. TypeError: tag is unhashable or
            # tag_dict is not subscriptable. Anything else (including
            # KeyboardInterrupt) now propagates instead of being swallowed.
            weight = 0
        tag_tfidf_list.append(weight)
    return tag_tfidf_list