process_tag.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. import numpy as np
  2. import pandas as pd
  3. from sklearn.preprocessing import MultiLabelBinarizer
  4. from odps import ODPS
  5. def tag_preprocessing(filename,df_new_feature, df_new_feature_predict):
  6. #读取tag分词结果
  7. tag_txt = open("/root/ROVtrain/tfidfCompution/"+ filename +".txt","r") #设置文件对象
  8. ftextlist = tag_txt.readlines() # 同上
  9. tag_txt.close() #关闭文件
  10. #转为corpus
  11. tagList = str(ftextlist).replace('[','').replace(']','').replace("'","").replace("'","").split(',')
  12. tag = np.array(tagList).reshape(len(tagList),1).tolist()
  13. #将词特征转为list形式
  14. train_tag_feature = pd.DataFrame(df_new_feature).loc[:,'videotags'].to_numpy().reshape(len(pd.DataFrame(df_new_feature).loc[:,'videotags']),1).tolist()
  15. predict_tag_feature = pd.DataFrame(df_new_feature_predict).loc[:,'videotags'].to_numpy().reshape(len(pd.DataFrame(df_new_feature_predict).loc[:,'videotags']),1).tolist()
  16. #稀疏特征
  17. mlb_model_tag = MultiLabelBinarizer(sparse_output=True).fit(tag)
  18. train_tag = mlb_model_tag.transform(train_tag_feature)
  19. predict_tag = mlb_model_tag.transform(predict_tag_feature)
  20. return mlb_model_tag.classes_,train_tag,predict_tag
  21. def get_tag_tfidf(dt, tfidf_table_name):
  22. odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'videoods',
  23. endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
  24. read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
  25. tag_dict = {}
  26. for record in odps.read_table(tfidf_table_name, partition='dt=%s' % dt):
  27. tag_dict[record[0]] = record[1]
  28. return tag_dict
  29. def ttfidf_list_generation(tag_corpus,tag_dict):
  30. tag_tfidf_list = []
  31. for i in tag_corpus:
  32. try :
  33. tag_tfidf_list.append(tag_dict[i])
  34. except:
  35. tag_tfidf_list.append(0)
  36. return tag_tfidf_list