predict.py 1.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455
# -*- coding: utf-8 -*-
  2. import sys
  3. from gensim import models
  4. import numpy as np
  5. if __name__=="__main__":
  6. #model = models.word2vec.Word2Vec.load('word2vec.txt')
  7. #print(model.wx)
  8. f1 = open('word2vec.txt')
  9. word_dict = {}
  10. while True:
  11. line = f1.readline()
  12. if not line:
  13. break
  14. items = line.strip().split(" ")
  15. if len(items)<100:
  16. continue
  17. arr = []
  18. for w in items[1:]:
  19. arr.append(float(w))
  20. word_dict[items[0]] = arr
  21. #print(word_dict)
  22. f = open(sys.argv[1])
  23. num = 0
  24. while True:
  25. line = f.readline()
  26. if not line:
  27. break
  28. num = num+1
  29. if num == 1:
  30. continue
  31. items = line.split("\t")
  32. if len(items)<2:
  33. continue
  34. vid = items[0]
  35. title_arr = items[1].split(" ")
  36. title_info = np.zeros(100)
  37. word_len = 0
  38. for word in title_arr:
  39. if word in word_dict:
  40. #print(title_info)
  41. #print(word)
  42. word_vec = word_dict[word]
  43. #print(word_vec)
  44. title_info = np.add(title_info, word_vec)
  45. word_len +=1
  46. title_info_list = []
  47. if word_len<=0:
  48. continue
  49. for j in title_info:
  50. title_info_list.append(j/word_len)
  51. #print("title_info_list:", title_info_list)
  52. print(vid,title_info_list)