process_tag.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
  1. def tag_preprocessing(filename):
  2. #读取tag分词结果
  3. tag_txt = open("/root/ROVtrain/tfidfCompution/"+ filename +".txt","r") #设置文件对象
  4. ftextlist = tag_txt.readlines() # 同上
  5. tag_txt.close() #关闭文件
  6. #转为corpus
  7. tagList = str(ftextlist).replace('[','').replace(']','').replace("'","").replace("'","").split(',')
  8. tag = np.array(tagList).reshape(len(tagList),1).tolist()
  9. #将词特征转为list形式
  10. train_tag_feature = pd.DataFrame(df_new_feature).loc[:,'videotags'].to_numpy().reshape(len(pd.DataFrame(df_new_feature).loc[:,'videotags']),1).tolist()
  11. test_tag_feature = pd.DataFrame(df_new_feature_test).loc[:,'videotags'].to_numpy().reshape(len(pd.DataFrame(df_new_feature_test).loc[:,'videotags']),1).tolist()
  12. #稀疏特征
  13. mlb_model_tag = MultiLabelBinarizer(sparse_output=True).fit(tag)
  14. train_tag = mlb_model_tag.transform(train_tag_feature)
  15. test_tag = mlb_model_tag.transform(test_tag_feature)
  16. return mlb_model_tag.classes_,train_tag,test_tag
  17. # In[25]:
# Read the tf / idf values
  19. def get_tag_tfidf(dt, tfidf_table_name):
  20. odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'videoods',
  21. endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
  22. read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
  23. tag_dict = {}
  24. for record in odps.read_table(tfidf_table_name, partition='dt=%s' % dt):
  25. tag_dict[record[0]] = record[1]
  26. return tag_dict
  27. # In[26]:
  28. def ttfidf_list_generation(tag_corpus,tag_dict):
  29. tag_tfidf_list = []
  30. for i in tag_corpus:
  31. try :
  32. tag_tfidf_list.append(tag_dict[i])
  33. except:
  34. tag_tfidf_list.append(0)
  35. return tag_tfidf_list