# calI2I3.py
# coding: utf-8
import json
import sys
from collections import defaultdict
from operator import itemgetter
  5. if __name__=="__main__":
  6. #1.load data
  7. nowdate=sys.argv[1]
  8. f = open("./data/user_item_share_"+nowdate)
  9. user_item_dict={}
  10. item_dict = {}
  11. while True:
  12. line = f.readline()
  13. if not line:
  14. break
  15. items = line.strip().split("\t")
  16. if len(items)<3:
  17. continue
  18. key = (items[1],items[2])
  19. #print(key)
  20. if key not in user_item_dict:
  21. user_item_dict[key] = 1
  22. else:
  23. user_item_dict[key] = user_item_dict[key]+1
  24. if items[2] not in item_dict:
  25. item_dict[items[2]] = 1
  26. else:
  27. item_dict[items[2]] = item_dict[items[2]]+1
  28. f.close()
  29. #((user,item), score)
  30. #print(user_item_dict)
  31. #2. (uid, [(vid, score)....])
  32. user_group_dict = {}
  33. for k, v in user_item_dict.items():
  34. uid = k[0]
  35. vid = k[1]
  36. score = v
  37. #if score <3:
  38. # continue
  39. vid_list = []
  40. if uid not in user_group_dict:
  41. vid_list.append((vid, score))
  42. user_group_dict[uid] = vid_list
  43. else:
  44. vid_list = user_group_dict[uid]
  45. vid_list.append((vid, score))
  46. user_group_dict[uid] = vid_list
  47. #print(user_group_dict)
  48. item_pair_dict = {}
  49. #3. expand item
  50. for k, v_list in user_group_dict.items():
  51. v_n = len(v_list)
  52. if v_n<2:
  53. continue
  54. for i in range(v_n):
  55. for j in range(1, v_n):
  56. if v_list[i][0] == v_list[j][0]:
  57. continue
  58. item_key = (v_list[i][0], v_list[j][0])
  59. item_score = 1
  60. if item_key not in item_pair_dict:
  61. item_pair_dict[item_key] = item_score
  62. else:
  63. item_pair_dict[item_key] = item_pair_dict[item_key]+item_score
  64. #print(item_pair_dict)
  65. print(item_pair_dict)
  66. print(item_dict)
  67. left_pair_num = 0
  68. rec_item_dict = {}
  69. #4. rec item
  70. for k, v in item_pair_dict.items():
  71. if v<2:
  72. continue
  73. left_pair_num+=1
  74. item1 = k[0]
  75. item2 = k[1]
  76. pair_score = v
  77. if item1 in item_dict:
  78. item_score1 = item_dict[item1]
  79. i2i_pro = pair_score/(item_score1+5)
  80. rec_list1 = []
  81. if item2 not in rec_item_dict:
  82. rec_list1.append((item1, i2i_pro, pair_score, item_score1))
  83. rec_item_dict[item2] = rec_list1
  84. else:
  85. rec_list1 = rec_item_dict[item2]
  86. rec_list1.append((item1, i2i_pro, pair_score, item_score1))
  87. rec_item_dict[item2] = rec_list1
  88. if item2 in item_dict:
  89. item_score2 = item_dict[item2]
  90. i2i_pro = pair_score/(item_score2+5)
  91. rec_list2 = []
  92. if item1 not in rec_item_dict:
  93. rec_list2.append((item2, i2i_pro, pair_score, item_score2))
  94. rec_item_dict[item1] = rec_list2
  95. else:
  96. rec_list2 = rec_item_dict[item1]
  97. rec_list2.append((item2, i2i_pro, pair_score, item_score2))
  98. rec_item_dict[item1] = rec_list2
  99. #(item, share_count)
  100. print(left_pair_num)
  101. #print(rec_item_dict)
  102. final_rec_list = []
  103. #f = open("rec_result", "w")
  104. #5. sorted item_list
  105. for k,v in rec_item_dict.items():
  106. v_set = set('')
  107. value_list = v
  108. dup_list = []
  109. for item in value_list:
  110. if item[0] in v_set:
  111. continue
  112. v_set.add(item[0])
  113. dup_list.append(item)
  114. sorted_v = sorted(dup_list, key=itemgetter(1), reverse=True)
  115. final_rec_list.append((k, sorted_v))
  116. #print(final_rec_list[:1])
  117. #json_str = json.dumps(final_rec_list)
  118. with open("./data/rec_result2_"+nowdate+".json", "w") as f :
  119. json.dump(final_rec_list, f)