# calI2I.py (4.5 KB)
  1. #coding utf-8
  2. import sys
  3. from operator import itemgetter
  4. import json
  5. if __name__=="__main__":
  6. #1.load data
  7. nowdate=sys.argv[1]
  8. f = open("./data/user_item_share_"+nowdate)
  9. user_item_dict={}
  10. item_dict = {}
  11. while True:
  12. line = f.readline()
  13. if not line:
  14. break
  15. items = line.strip().split("\t")
  16. if len(items)<3:
  17. continue
  18. key = (items[1],items[2])
  19. #print(key)
  20. if key not in user_item_dict:
  21. user_item_dict[key] = 1
  22. else:
  23. user_item_dict[key] = user_item_dict[key]+1
  24. if items[2] not in item_dict:
  25. item_dict[items[2]] = 1
  26. else:
  27. item_dict[items[2]] = item_dict[items[2]]+1
  28. f.close()
  29. nowhour=sys.argv[2]
  30. f1 = open("./data/user_cur_day_item_share_"+nowhour)
  31. while True:
  32. line = f1.readline()
  33. if not line:
  34. break
  35. items = line.strip().split("\t")
  36. if len(items)<3:
  37. continue
  38. key = (items[1],items[2])
  39. #print(key)
  40. if key not in user_item_dict:
  41. user_item_dict[key] = 1
  42. else:
  43. user_item_dict[key] = user_item_dict[key]+1
  44. if items[2] not in item_dict:
  45. item_dict[items[2]] = 1
  46. else:
  47. item_dict[items[2]] = item_dict[items[2]]+1
  48. f1.close()
  49. #((user,item), score)
  50. #print(user_item_dict)
  51. #2. (uid, [(vid, score)....])
  52. user_group_dict = {}
  53. for k, v in user_item_dict.items():
  54. uid = k[0]
  55. vid = k[1]
  56. score = v
  57. vid_list = []
  58. if uid not in user_group_dict:
  59. vid_list.append((vid, score))
  60. user_group_dict[uid] = vid_list
  61. else:
  62. vid_list = user_group_dict[uid]
  63. vid_list.append((vid, score))
  64. user_group_dict[uid] = vid_list
  65. #print(user_group_dict)
  66. item_pair_dict = {}
  67. #3. expand item
  68. for k, v_list in user_group_dict.items():
  69. v_n = len(v_list)
  70. if v_n<2:
  71. continue
  72. for i in range(v_n):
  73. for j in range(1, v_n):
  74. if v_list[i][0] == v_list[j][0]:
  75. continue
  76. item_key = (v_list[i][0], v_list[j][0])
  77. item_score = min(v_list[i][1], v_list[j][1])
  78. if item_key not in item_pair_dict:
  79. item_pair_dict[item_key] = item_score
  80. else:
  81. item_pair_dict[item_key] = item_pair_dict[item_key]+item_score
  82. #print(item_pair_dict)
  83. print(len(item_pair_dict))
  84. print(len(item_dict))
  85. left_pair_num = 0
  86. rec_item_dict = {}
  87. #4. rec item
  88. for k, v in item_pair_dict.items():
  89. if v<2:
  90. continue
  91. left_pair_num+=1
  92. item1 = k[0]
  93. item2 = k[1]
  94. pair_score = v
  95. if item1 in item_dict:
  96. item_score1 = item_dict[item1]
  97. i2i_pro = pair_score/(item_score1+5)
  98. rec_list1 = []
  99. if item2 not in rec_item_dict:
  100. rec_list1.append((item1, i2i_pro, pair_score, item_score1))
  101. rec_item_dict[item2] = rec_list1
  102. else:
  103. rec_list1 = rec_item_dict[item2]
  104. rec_list1.append((item1, i2i_pro, pair_score, item_score1))
  105. rec_item_dict[item2] = rec_list1
  106. if item2 in item_dict:
  107. item_score2 = item_dict[item2]
  108. i2i_pro = pair_score/(item_score2+5)
  109. rec_list2 = []
  110. if item1 not in rec_item_dict:
  111. rec_list2.append((item2, i2i_pro, pair_score, item_score2))
  112. rec_item_dict[item1] = rec_list2
  113. else:
  114. rec_list2 = rec_item_dict[item1]
  115. rec_list2.append((item2, i2i_pro, pair_score, item_score2))
  116. rec_item_dict[item1] = rec_list2
  117. #(item, share_count)
  118. print(left_pair_num)
  119. #print(rec_item_dict)
  120. final_rec_list = []
  121. #f = open("rec_result", "w")
  122. #5. sorted item_list
  123. for k,v in rec_item_dict.items():
  124. v_set = set('')
  125. value_list = v
  126. dup_list = []
  127. for item in value_list:
  128. if item[0] in v_set:
  129. continue
  130. v_set.add(item[0])
  131. dup_list.append(item)
  132. sorted_v = sorted(dup_list, key=itemgetter(1), reverse=True)
  133. final_rec_list.append((k, sorted_v))
  134. #print(final_rec_list[:1])
  135. #json_str = json.dumps(final_rec_list)
  136. with open("./data/rec_result_"+nowhour+".json", "w") as f :
  137. json.dump(final_rec_list, f)