15_getI2IHour.py 1.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849
  1. #coding utf-8
  2. import sys
  3. import json
  4. if __name__=="__main__":
  5. f = open(sys.argv[1])
  6. user_dict = {}
  7. while True:
  8. line = f.readline()
  9. if not line:
  10. break
  11. items = line.split("\t")
  12. if len(items)<3:
  13. continue
  14. uid = items[1]
  15. vid = items[2]
  16. vid_set = set('')
  17. if uid in user_dict:
  18. vid_set = user_dict[uid]
  19. vid_set.add(vid)
  20. user_dict[uid] = vid_set
  21. f.close()
  22. f1 = open(sys.argv[2])
  23. f2 = open(sys.argv[3], 'w')
  24. while True:
  25. line = f1.readline()
  26. if not line:
  27. break
  28. items = line.split("\t")
  29. if len(items)<2:
  30. continue
  31. vid = items[0]
  32. uid_list = json.loads(items[1])
  33. if len(uid_list)<5: # 小于5个mid的视频过滤掉
  34. continue
  35. item_dict = {}
  36. for uid in uid_list:
  37. if uid in user_dict:
  38. item_list = user_dict[uid] # 这个人曝光的视频
  39. for item in item_list:
  40. item = item.strip()
  41. if item in item_dict:
  42. item_dict[item] = item_dict[item]+1
  43. else:
  44. item_dict[item] = 1
  45. item_list= sorted(item_dict.items(), key=lambda s:s[1], reverse=True)
  46. f2.write(vid+"\t"+json.dumps(item_list[:100])+"\t"+items[1]+"\n")
  47. f1.close()
  48. f2.close()