123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354 |
def tag_preprocessing(filename):
    """Fit a tag binarizer on a vocabulary file and encode train/test tags.

    The vocabulary is read from
    ``/root/ROVtrain/tfidfCompution/<filename>.txt``; every line is split on
    commas and each resulting token becomes one class of a
    ``MultiLabelBinarizer``. The ``videotags`` column of the module-level
    ``df_new_feature`` and ``df_new_feature_test`` objects is then encoded
    with that binarizer, each row's value being treated as a single label.

    Args:
        filename: Base name of the tag file, without the ``.txt`` suffix.

    Returns:
        A tuple ``(classes, train_tag, test_tag)`` where ``classes`` is the
        fitted binarizer's ``classes_`` and the other two are the sparse
        outputs of ``transform`` for the train and test features.
    """
    # Read the tag segmentation result. The context manager guarantees the
    # file is closed even if reading raises.
    with open("/root/ROVtrain/tfidfCompution/" + filename + ".txt", "r") as tag_file:
        raw_lines = tag_file.readlines()

    # Build the corpus. The previous implementation split ``str(list)``,
    # which left repr artifacts in the tags (a literal backslash-n at line
    # ends and a leading space on later lines). Splitting each line directly
    # avoids that. The characters ``[``, ``]`` and ``'`` are still removed,
    # as before, so tags that never contained artifacts are unchanged.
    stripped_chars = str.maketrans("", "", "[]'")
    tag_list = []
    for line in raw_lines:
        content = line.rstrip("\r\n")
        if not content:
            # A blank line carries no tags.
            continue
        tag_list.extend(content.translate(stripped_chars).split(","))
    # One single-element label set per vocabulary tag.
    tag = [[token] for token in tag_list]

    # Wrap every row's videotags value in its own list so the binarizer
    # treats it as one label.
    train_column = pd.DataFrame(df_new_feature).loc[:, 'videotags']
    train_tag_feature = train_column.to_numpy().reshape(len(train_column), 1).tolist()
    test_column = pd.DataFrame(df_new_feature_test).loc[:, 'videotags']
    test_tag_feature = test_column.to_numpy().reshape(len(test_column), 1).tolist()

    # Sparse features.
    mlb_model_tag = MultiLabelBinarizer(sparse_output=True).fit(tag)
    train_tag = mlb_model_tag.transform(train_tag_feature)
    test_tag = mlb_model_tag.transform(test_tag_feature)

    return mlb_model_tag.classes_, train_tag, test_tag
- # In[25]:
# Read tf, idf
def get_tag_tfidf(dt, tfidf_table_name):
    """Read one partition of a MaxCompute table into a dictionary.

    Args:
        dt: Partition value; the partition read is ``dt=<dt>``.
        tfidf_table_name: Name of the table to read.

    Returns:
        A dict mapping the first field of each record to its second field
        (presumably tag -> TF-IDF weight; confirm against the table schema).
        If a first-field value occurs more than once, the last record wins.
    """
    # NOTE(review): the access credentials are hard-coded below. They should
    # be rotated and loaded from configuration instead of living in source.
    client = ODPS(
        'LTAI4FtW5ZzxMvdw35aNkmcp',
        '0VKnydcaHK3ITjylbgUsLubX6rwiwc',
        'videoods',
        endpoint='http://service.cn.maxcompute.aliyun.com/api',
        connect_timeout=3000,
        read_timeout=500000,
        pool_maxsize=1000,
        pool_connections=1000,
    )
    partition_spec = 'dt=%s' % dt
    rows = client.read_table(tfidf_table_name, partition=partition_spec)
    return {row[0]: row[1] for row in rows}
- # In[26]:
def ttfidf_list_generation(tag_corpus, tag_dict):
    """Look up the TF-IDF value of every tag in ``tag_corpus``.

    Args:
        tag_corpus: Iterable of tags, in the order the result should follow.
        tag_dict: Mapping from tag to its TF-IDF value.

    Returns:
        A list with one entry per tag in ``tag_corpus``: the value stored in
        ``tag_dict`` for that tag, or ``0`` when the tag is absent.
    """
    # ``dict.get`` expresses "missing tag -> 0" directly. The previous bare
    # ``except:`` also hid unrelated errors (and even KeyboardInterrupt)
    # behind a silent 0.
    return [tag_dict.get(tag, 0) for tag in tag_corpus]
|