# calI2I2.py

#coding utf-8
import json
import sys
from collections import Counter, defaultdict
from itertools import combinations
from operator import itemgetter
  5. if __name__=="__main__":
  6. #1.load data
  7. nowdate=sys.argv[1]
  8. f = open("./data/user_item_share_"+nowdate)
  9. user_item_dict={}
  10. item_dict = {}
  11. while True:
  12. line = f.readline()
  13. if not line:
  14. break
  15. items = line.strip().split("\t")
  16. if len(items)<3:
  17. continue
  18. vid = -1
  19. try:
  20. vid = int(items[2])
  21. except:
  22. continue
  23. if vid == -1:
  24. continue
  25. key = (items[1],vid)
  26. #print(key)
  27. if key not in user_item_dict:
  28. user_item_dict[key] = 1
  29. else:
  30. user_item_dict[key] = user_item_dict[key]+1
  31. if vid not in item_dict:
  32. item_dict[vid] = 1
  33. else:
  34. item_dict[vid] = item_dict[vid]+1
  35. f.close()
  36. #((user,item), score)
  37. #print(user_item_dict)
  38. #2. (uid, [(vid, score)....])
  39. user_group_dict = {}
  40. for k, v in user_item_dict.items():
  41. uid = k[0]
  42. vid = k[1]
  43. score = v
  44. #if score <3:
  45. # continue
  46. vid_list = []
  47. if uid not in user_group_dict:
  48. vid_list.append((vid, score))
  49. user_group_dict[uid] = vid_list
  50. else:
  51. vid_list = user_group_dict[uid]
  52. vid_list.append((vid, score))
  53. user_group_dict[uid] = vid_list
  54. #print(user_group_dict)
  55. item_pair_dict = {}
  56. #3. expand item
  57. for k, v_list in user_group_dict.items():
  58. v_n = len(v_list)
  59. if v_n<2:
  60. continue
  61. for i in range(v_n):
  62. for j in range(1, v_n):
  63. if v_list[i][0] == v_list[j][0]:
  64. continue
  65. item_key = (v_list[i][0], v_list[j][0])
  66. item_score = 1
  67. if item_key not in item_pair_dict:
  68. item_pair_dict[item_key] = item_score
  69. else:
  70. item_pair_dict[item_key] = item_pair_dict[item_key]+item_score
  71. print(len(item_pair_dict))
  72. #print(item_pair_dict)
  73. #print(item_dict)
  74. left_pair_num = 0
  75. rec_item_dict = {}
  76. #4. rec item
  77. for k, v in item_pair_dict.items():
  78. if v<3:
  79. continue
  80. left_pair_num+=1
  81. #print(k[0])
  82. #print(k[1])
  83. item1 = int(k[0])
  84. item2 = int(k[1])
  85. pair_score = v
  86. if item1 in item_dict:
  87. item_score1 = item_dict[item1]
  88. if item_score1<10:
  89. continue
  90. i2i_pro = float(pair_score)/(float(item_score1)+5)
  91. if i2i_pro<0.000001:
  92. continue
  93. rec_list1 = []
  94. if item2 not in rec_item_dict:
  95. rec_list1.append((item1, i2i_pro, pair_score, item_score1))
  96. rec_item_dict[item2] = rec_list1
  97. else:
  98. rec_list1 = rec_item_dict[item2]
  99. rec_list1.append((item1, i2i_pro, pair_score, item_score1))
  100. rec_item_dict[item2] = rec_list1
  101. if item2 in item_dict:
  102. item_score2 = item_dict[item2]
  103. if item_score2<10:
  104. continue
  105. i2i_pro = float(pair_score)/(float(item_score2)+5)
  106. if i2i_pro<0.000001:
  107. continue
  108. rec_list2 = []
  109. if item1 not in rec_item_dict:
  110. rec_list2.append((item2, i2i_pro, pair_score, item_score2))
  111. rec_item_dict[item1] = rec_list2
  112. else:
  113. rec_list2 = rec_item_dict[item1]
  114. rec_list2.append((item2, i2i_pro, pair_score, item_score2))
  115. rec_item_dict[item1] = rec_list2
  116. #(item, share_count)
  117. print(left_pair_num)
  118. #print(rec_item_dict)
  119. final_rec_list = []
  120. #f = open("rec_result", "w")
  121. #5. sorted item_list
  122. for k,v in rec_item_dict.items():
  123. v_set = set('')
  124. value_list = v
  125. dup_list = []
  126. for item in value_list:
  127. if item[0] in v_set:
  128. continue
  129. v_set.add(item[0])
  130. #print(item[1])
  131. #if float(items[1])<0.000001:
  132. # continue
  133. dup_list.append(item)
  134. sorted_v = sorted(dup_list, key=itemgetter(1), reverse=True)
  135. final_rec_list.append((k, sorted_v))
  136. #print(final_rec_list[:1])
  137. #json_str = json.dumps(final_rec_list)
  138. with open("./data/rec_result_"+nowdate+".json", "w") as f :
  139. json.dump(final_rec_list, f)