# calI2I2.py 5.2 KB
# coding: utf-8
import json
import sys
from collections import Counter
from itertools import permutations
from operator import itemgetter
  5. if __name__=="__main__":
  6. #1.load data
  7. nowdate=sys.argv[1]
  8. f = open("./data/user_item_share_filter_"+nowdate)
  9. user_item_dict={}
  10. item_dict = {}
  11. while True:
  12. line = f.readline()
  13. if not line:
  14. break
  15. items = line.strip().split("\t")
  16. if len(items)<3:
  17. continue
  18. vid = -1
  19. try:
  20. vid = int(items[2])
  21. except:
  22. continue
  23. if vid == -1:
  24. continue
  25. key = (items[1],vid)
  26. #print(key)
  27. if key not in user_item_dict:
  28. user_item_dict[key] = 1
  29. else:
  30. user_item_dict[key] = user_item_dict[key]+1
  31. if vid not in item_dict:
  32. item_dict[vid] = 1
  33. else:
  34. item_dict[vid] = item_dict[vid]+1
  35. f.close()
  36. nowhour=sys.argv[2]
  37. f1 = open("./data/user_cur_day_item_share_filter_"+nowhour)
  38. while True:
  39. line = f1.readline()
  40. if not line:
  41. break
  42. items = line.strip().split("\t")
  43. if len(items)<3:
  44. continue
  45. vid = -1
  46. try:
  47. vid = int(items[2])
  48. except:
  49. continue
  50. if vid == -1:
  51. continue
  52. key = (items[1],vid)
  53. #print(key)
  54. if key not in user_item_dict:
  55. user_item_dict[key] = 1
  56. else:
  57. user_item_dict[key] = user_item_dict[key]+1
  58. if vid not in item_dict:
  59. item_dict[vid] = 1
  60. else:
  61. item_dict[vid] = item_dict[vid]+1
  62. f1.close()
  63. #((user,item), score)
  64. #print(user_item_dict)
  65. #2. (uid, [(vid, score)....])
  66. user_group_dict = {}
  67. for k, v in user_item_dict.items():
  68. uid = k[0]
  69. vid = k[1]
  70. score = v
  71. #if score <3:
  72. # continue
  73. vid_list = []
  74. if uid not in user_group_dict:
  75. vid_list.append((vid, score))
  76. user_group_dict[uid] = vid_list
  77. else:
  78. vid_list = user_group_dict[uid]
  79. vid_list.append((vid, score))
  80. user_group_dict[uid] = vid_list
  81. #print(user_group_dict)
  82. item_pair_dict = {}
  83. #3. expand item
  84. for k, v_list in user_group_dict.items():
  85. v_n = len(v_list)
  86. if v_n<2:
  87. continue
  88. for i in range(v_n):
  89. for j in range(1, v_n):
  90. if v_list[i][0] == v_list[j][0]:
  91. continue
  92. item_key = (v_list[i][0], v_list[j][0])
  93. item_score = 1
  94. if item_key not in item_pair_dict:
  95. item_pair_dict[item_key] = item_score
  96. else:
  97. item_pair_dict[item_key] = item_pair_dict[item_key]+item_score
  98. print(len(item_pair_dict))
  99. #print(item_pair_dict)
  100. #print(item_dict)
  101. left_pair_num = 0
  102. rec_item_dict = {}
  103. #4. rec item
  104. for k, v in item_pair_dict.items():
  105. if v<3:
  106. continue
  107. left_pair_num+=1
  108. #print(k[0])
  109. #print(k[1])
  110. item1 = int(k[0])
  111. item2 = int(k[1])
  112. pair_score = v
  113. if item1 in item_dict:
  114. item_score1 = item_dict[item1]
  115. #if item_score1<10:
  116. # continue
  117. item_score1 = 1
  118. i2i_pro = float(pair_score)/(float(item_score1)+5)
  119. if i2i_pro<0.000001:
  120. continue
  121. rec_list1 = []
  122. if item2 not in rec_item_dict:
  123. rec_list1.append((item1, i2i_pro, pair_score, item_score1))
  124. rec_item_dict[item2] = rec_list1
  125. else:
  126. rec_list1 = rec_item_dict[item2]
  127. rec_list1.append((item1, i2i_pro, pair_score, item_score1))
  128. rec_item_dict[item2] = rec_list1
  129. if item2 in item_dict:
  130. item_score2 = item_dict[item2]
  131. #if item_score2<10:
  132. # continue
  133. item_score2 = 1.0
  134. i2i_pro = float(pair_score)/(float(item_score2)+5)
  135. if i2i_pro<0.000001:
  136. continue
  137. rec_list2 = []
  138. if item1 not in rec_item_dict:
  139. rec_list2.append((item2, i2i_pro, pair_score, item_score2))
  140. rec_item_dict[item1] = rec_list2
  141. else:
  142. rec_list2 = rec_item_dict[item1]
  143. rec_list2.append((item2, i2i_pro, pair_score, item_score2))
  144. rec_item_dict[item1] = rec_list2
  145. #(item, share_count)
  146. print(left_pair_num)
  147. #print(rec_item_dict)
  148. final_rec_list = []
  149. #f = open("rec_result", "w")
  150. #5. sorted item_list
  151. for k,v in rec_item_dict.items():
  152. v_set = set('')
  153. value_list = v
  154. dup_list = []
  155. for item in value_list:
  156. if item[0] in v_set:
  157. continue
  158. v_set.add(item[0])
  159. #print(item[1])
  160. #if float(items[1])<0.000001:
  161. # continue
  162. dup_list.append(item)
  163. sorted_v = sorted(dup_list, key=itemgetter(1), reverse=True)
  164. final_rec_list.append((k, sorted_v))
  165. #print(final_rec_list[:1])
  166. #json_str = json.dumps(final_rec_list)
  167. with open("./data/rec_result3_"+nowhour+".json", "w") as f :
  168. json.dump(final_rec_list, f)