import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from odps import ODPS


def _column_as_singletons(frame_like, column):
    """Return the values of *column* as a list of one-element lists.

    ``frame_like`` is passed through ``pd.DataFrame`` so that anything the
    DataFrame constructor accepts can be used.  Each row value is wrapped in
    its own list, which is the per-sample shape handed to
    ``MultiLabelBinarizer.transform`` below.
    """
    values = pd.DataFrame(frame_like).loc[:, column].to_numpy()
    return values.reshape(len(values), 1).tolist()


def tag_preprocessing(filename, df_new_feature, df_new_feature_predict,
                      tag_dir="/root/ROVtrain/tfidfCompution/"):
    """Binarize the ``videotags`` column against a vocabulary read from disk.

    The vocabulary is read from ``tag_dir + filename + ".txt"``.  The list of
    lines is stringified, the characters ``[``, ``]`` and ``'`` are removed,
    and the result is split on commas.  Tokens are used verbatim: no
    whitespace stripping is applied.

    Args:
        filename: Base name (without ``.txt``) of the tag segmentation file.
        df_new_feature: Training data containing a ``videotags`` column.
        df_new_feature_predict: Prediction data containing a ``videotags``
            column.
        tag_dir: Directory prefix (including the trailing slash) of the tag
            file.  Defaults to the original hard-coded location.

    Returns:
        A tuple ``(classes, train_tag, predict_tag)``: the fitted binarizer's
        ``classes_`` and the sparse transforms of the training and prediction
        ``videotags`` values.
    """
    # Read the tag word-segmentation result; the context manager closes the
    # file even if reading fails.
    # NOTE(review): opened with the platform default encoding, as before.
    with open(tag_dir + filename + ".txt", "r") as tag_file:
        lines = tag_file.readlines()

    # Convert to a corpus: flatten the list repr into comma-separated tokens.
    tokens = (
        str(lines)
        .replace('[', '')
        .replace(']', '')
        .replace("'", "")
        .split(',')
    )
    # One single-element label set per vocabulary token.
    vocabulary = np.array(tokens).reshape(len(tokens), 1).tolist()

    # Convert the tag features into list form (one label set per row).
    train_tag_feature = _column_as_singletons(df_new_feature, 'videotags')
    predict_tag_feature = _column_as_singletons(
        df_new_feature_predict, 'videotags'
    )

    # Sparse features.
    mlb_model_tag = MultiLabelBinarizer(sparse_output=True).fit(vocabulary)
    train_tag = mlb_model_tag.transform(train_tag_feature)
    predict_tag = mlb_model_tag.transform(predict_tag_feature)
    return mlb_model_tag.classes_, train_tag, predict_tag


def get_tag_tfidf(dt, tfidf_table_name):
    """Load a mapping from an ODPS table partition.

    Args:
        dt: Value of the ``dt`` partition to read.
        tfidf_table_name: Name of the table to read.

    Returns:
        A dict mapping the first field of every record to its second field.
        When a key occurs more than once, the last record wins.
    """
    # SECURITY NOTE(review): the access key id and secret are hard-coded in
    # source.  They are left unchanged here to preserve behaviour, but they
    # should be rotated and loaded from configuration or the environment.
    odps = ODPS(
        'LTAI4FtW5ZzxMvdw35aNkmcp',
        '0VKnydcaHK3ITjylbgUsLubX6rwiwc',
        'videoods',
        endpoint='http://service.cn.maxcompute.aliyun.com/api',
        connect_timeout=3000,
        read_timeout=500000,
        pool_maxsize=1000,
        pool_connections=1000,
    )
    records = odps.read_table(tfidf_table_name, partition='dt=%s' % dt)
    return {record[0]: record[1] for record in records}


def ttfidf_list_generation(tag_corpus, tag_dict):
    """Look up every entry of *tag_corpus* in *tag_dict*.

    Args:
        tag_corpus: Iterable of tags to look up.
        tag_dict: Mapping from tag to value.

    Returns:
        A list with one element per tag, in order: the mapped value, or ``0``
        when the tag cannot be looked up.
    """
    tag_tfidf_list = []
    for tag in tag_corpus:
        try:
            value = tag_dict[tag]
        except (LookupError, TypeError):
            # Missing (or unhashable) tags contribute a zero weight.  Only
            # lookup failures are handled, so interrupts still propagate.
            value = 0
        tag_tfidf_list.append(value)
    return tag_tfidf_list