123456789101112131415161718192021222324252627282930313233343536373839404142434445464748 |
- import numpy as np
- import pandas as pd
- from sklearn.preprocessing import MultiLabelBinarizer
- from odps import ODPS
def _videotags_as_rows(features):
    """Return the ``videotags`` column of *features* as a list of 1-element rows."""
    tags = pd.DataFrame(features).loc[:, 'videotags']
    # One row per record, each holding the whole cell value as its only
    # label, which is the shape MultiLabelBinarizer.transform iterates over.
    return tags.to_numpy().reshape(len(tags), 1).tolist()


def tag_preprocessing(filename, df_new_feature, df_new_feature_predict):
    """Fit a sparse tag binarizer on a vocabulary file and encode two data sets.

    The vocabulary is read from
    ``/root/ROVtrain/tfidfCompution/<filename>.txt``; every comma-separated
    token found there becomes one class of the binarizer.

    Args:
        filename: Base name (without the ``.txt`` suffix) of the tag
            segmentation result file.
        df_new_feature: Training features; anything ``pd.DataFrame`` accepts,
            and it must provide a ``videotags`` column.
        df_new_feature_predict: Prediction features, same requirements as
            ``df_new_feature``.

    Returns:
        A tuple ``(classes, train_tag, predict_tag)``: the fitted binarizer's
        ``classes_`` array and the sparse encodings of the training and
        prediction ``videotags`` columns.
    """
    # Read the tag segmentation result. The context manager closes the handle
    # even when reading fails part-way through.
    with open("/root/ROVtrain/tfidfCompution/" + filename + ".txt", "r") as tag_txt:
        ftextlist = tag_txt.readlines()

    # Build the corpus: take the repr of the list of lines, drop brackets and
    # single quotes, then split on commas.
    # NOTE(review): since the repr is parsed, a line break in the file shows up
    # inside a tag as the two characters backslash + "n", and tokens keep any
    # surrounding spaces -- confirm the file layout makes this harmless.
    strip_table = str.maketrans("", "", "[]'")
    tag_list = str(ftextlist).translate(strip_table).split(',')
    tag = np.array(tag_list).reshape(len(tag_list), 1).tolist()

    # Turn the tag features into list-of-rows form.
    # NOTE(review): each videotags cell is treated as ONE label, so it only
    # matches a class when the whole cell equals a vocabulary token.
    train_tag_feature = _videotags_as_rows(df_new_feature)
    predict_tag_feature = _videotags_as_rows(df_new_feature_predict)

    # Sparse features.
    mlb_model_tag = MultiLabelBinarizer(sparse_output=True).fit(tag)
    train_tag = mlb_model_tag.transform(train_tag_feature)
    predict_tag = mlb_model_tag.transform(predict_tag_feature)

    return mlb_model_tag.classes_, train_tag, predict_tag
def get_tag_tfidf(dt, tfidf_table_name):
    """Load one ``dt`` partition of an ODPS table into a dictionary.

    Args:
        dt: Value of the ``dt`` partition to read.
        tfidf_table_name: Name of the ODPS table to read from.

    Returns:
        A dict mapping the first field of every record to its second field
        (presumably tag -> TF-IDF weight; verify against the table schema).
        When a key occurs more than once, the last record wins.
    """
    # NOTE(review): the access key pair is hard-coded in source; it should be
    # moved to configuration/secret storage and rotated.
    client = ODPS(
        'LTAI4FtW5ZzxMvdw35aNkmcp',
        '0VKnydcaHK3ITjylbgUsLubX6rwiwc',
        'videoods',
        endpoint='http://service.cn.maxcompute.aliyun.com/api',
        connect_timeout=3000,
        read_timeout=500000,
        pool_maxsize=1000,
        pool_connections=1000,
    )
    partition_spec = 'dt=%s' % dt
    records = client.read_table(tfidf_table_name, partition=partition_spec)
    return {record[0]: record[1] for record in records}
def ttfidf_list_generation(tag_corpus, tag_dict):
    """Look up a weight for every tag in *tag_corpus*, defaulting to 0.

    Args:
        tag_corpus: Iterable of tags, in the order the weights are wanted.
        tag_dict: Mapping from tag to weight.

    Returns:
        A list with one entry per tag in ``tag_corpus``: ``tag_dict[tag]``
        when the tag is present, otherwise ``0``.
    """
    tag_tfidf_list = []
    for tag in tag_corpus:
        try:
            weight = tag_dict[tag]
        except (KeyError, TypeError):
            # KeyError: the tag has no entry. TypeError: the tag is unhashable.
            # Both fall back to 0; anything else (KeyboardInterrupt, errors
            # raised by the mapping itself) is allowed to propagate.
            weight = 0
        tag_tfidf_list.append(weight)
    return tag_tfidf_list
|