predict.py 1.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. #coding utf-8
  2. import sys
  3. from gensim import models
  4. import numpy as np
  5. if __name__=="__main__":
  6. #model = models.word2vec.Word2Vec.load('word2vec.txt')
  7. #print(model.wx)
  8. f1 = open('word2vec.txt')
  9. word_dict = {}
  10. while True:
  11. line = f1.readline()
  12. if not line:
  13. break
  14. items = line.strip().split(" ")
  15. if len(items)<64:
  16. continue
  17. arr = []
  18. for w in items[1:]:
  19. arr.append(float(w))
  20. word_dict[items[0]] = arr
  21. #print(word_dict)
  22. f = open(sys.argv[1])
  23. num = 0
  24. while True:
  25. line = f.readline()
  26. if not line:
  27. break
  28. num = num+1
  29. if num == 1:
  30. continue
  31. items = line.split("\t")
  32. if len(items)<2:
  33. continue
  34. vid = items[0]
  35. title_arr = items[1].split(" ")
  36. title_info = np.zeros(64)
  37. word_len = 0
  38. for word in title_arr:
  39. if word in word_dict:
  40. #print(title_info)
  41. #print(word)
  42. word_vec = word_dict[word]
  43. #print(word_vec)
  44. title_info = np.add(title_info, word_vec)
  45. word_len +=1
  46. #print(title_info)
  47. title_info_list = []
  48. if word_len<=0:
  49. continue
  50. for j in title_info:
  51. title_info_list.append(j/word_len)
  52. #print("title_info_list:", title_info_list)
  53. print(vid,"\t",title_info_list)