# 13_groupItemHour.py
  # coding: utf-8
"""Group the users who clicked on a share by the item that was shared.

For one batch identified by ``nowdate`` the script reads two tab-separated
files from the data directory and writes a third:

* ``user_item_share_hour_<nowdate>`` (input): rows with at least 4 columns;
  column 1 is the sharing user id, column 2 the share id and column 3 the
  item id. Column 0 is not used.
* ``user_item_click_hour_<nowdate>`` (input): rows with at least 3 columns;
  column 1 is the clicking user id and column 2 the share id that was
  clicked. Column 0 is not used.
* ``return_item_hour_<nowdate>`` (output): one line per item,
  ``<item id>\\t<JSON list of the user ids that clicked a share of it>``.

Rows with too few columns are skipped silently.
"""
import sys
from operator import itemgetter
import json

# NOTE(review): the nesting below was reconstructed from the statements
# themselves (the counts are printed once per input file, after its loop).


def load_share_items(path):
    """Return a dict mapping share id -> item id read from the share file.

    Each line is stripped of surrounding whitespace and split on tabs; lines
    with fewer than 4 columns are ignored. If a share id occurs more than
    once, the last row wins.
    """
    share_to_item = {}
    with open(path) as share_file:
        for line in share_file:
            fields = line.strip().split("\t")
            if len(fields) < 4:
                continue
            # fields[1] (the sharing user) is not needed for the grouping.
            share_to_item[fields[2]] = fields[3]
    return share_to_item


def group_clickers_by_item(path, share_to_item):
    """Return a dict mapping item id -> set of user ids that clicked it.

    Each click row names a user (column 1) and a share id (column 2). The
    share id is resolved to an item through ``share_to_item``; clicks on
    shares that are not in that mapping are ignored, as are lines with fewer
    than 3 columns.
    """
    item_to_users = {}
    with open(path) as click_file:
        for line in click_file:
            fields = line.strip().split("\t")
            if len(fields) < 3:
                continue
            user_id, share_id = fields[1], fields[2]
            if share_id not in share_to_item:
                continue
            item_to_users.setdefault(share_to_item[share_id], set()).add(user_id)
    return item_to_users


def write_item_groups(path, item_to_users):
    """Write one ``<item id>\\t<JSON list of user ids>`` line per item.

    Items are written in the iteration order of ``item_to_users``. The user
    ids are sorted so that the file content does not depend on set ordering
    (which varies between runs for strings).
    """
    with open(path, "w") as out_file:
        for item_id, users in item_to_users.items():
            out_file.write(item_id + "\t" + json.dumps(sorted(users)) + "\n")


def main(nowdate, data_dir="./data"):
    """Build and write the item -> clicking-users groups for one batch.

    Args:
        nowdate: suffix of the input and output file names.
        data_dir: directory holding the input files and receiving the output.

    Returns:
        The dict mapping item id -> set of user ids that was written.

    Prints the number of distinct share ids loaded and the number of items
    that received at least one click.
    """
    share_to_item = load_share_items(data_dir + "/user_item_share_hour_" + nowdate)
    print(len(share_to_item))

    item_to_users = group_clickers_by_item(
        data_dir + "/user_item_click_hour_" + nowdate, share_to_item
    )
    print(len(item_to_users))

    write_item_groups(data_dir + "/return_item_hour_" + nowdate, item_to_users)
    return item_to_users


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit("usage: {} <nowdate>".format(sys.argv[0]))
    main(sys.argv[1])