# -*- coding: utf-8 -*-
"""Rank, for each video, the videos most often seen by that video's users.

Usage: <script> USER_VIDEO_FILE VIDEO_USERS_FILE OUTPUT_FILE

USER_VIDEO_FILE   tab-separated lines; the 2nd field is read as a user id
                  and the 3rd as a video id (per the original comments these
                  are presumably exposure records -- confirm with the
                  producer of the file).
VIDEO_USERS_FILE  tab-separated lines; the 1st field is a video id and the
                  2nd is a JSON array of user ids.
OUTPUT_FILE       one line per kept video:
                  vid <TAB> JSON [[video id, count], ...] <TAB> original JSON
"""
import sys
import json
from collections import Counter


def load_user_videos(lines):
    """Build a ``{uid: set(vid, ...)}`` mapping from tab-separated lines.

    Args:
        lines: iterable of text lines (an open file works).

    Returns:
        dict mapping each user id (2nd field) to the set of video ids
        (3rd field) found on that user's lines.

    Lines with fewer than three fields are skipped.
    """
    user_dict = {}
    for line in lines:
        items = line.rstrip("\n").split("\t")
        if len(items) < 3:
            continue
        # Strip here, not at counting time, so "v1" and "v1\n" collapse into
        # one set entry and a video is counted at most once per user.
        user_dict.setdefault(items[1], set()).add(items[2].strip())
    return user_dict


def related_videos(uid_list, user_dict, top_n=100):
    """Count how many of the given users have each video in their set.

    Args:
        uid_list: iterable of user ids.
        user_dict: mapping as returned by ``load_user_videos``.
        top_n: maximum number of entries to return.

    Returns:
        list of ``(vid, count)`` tuples, highest count first, at most
        ``top_n`` long. Users missing from ``user_dict`` contribute nothing.
    """
    counts = Counter()
    for uid in uid_list:
        # Videos recorded for this user.
        counts.update(user_dict.get(uid, ()))
    return counts.most_common(top_n)


def process_video_line(line, user_dict, min_users=5, top_n=100):
    """Turn one VIDEO_USERS_FILE line into an output line.

    Args:
        line: raw input line, ``vid<TAB>json_uid_list[<TAB>...]``.
        user_dict: mapping as returned by ``load_user_videos``.
        min_users: videos with fewer user ids than this are filtered out.
        top_n: maximum number of related videos to emit.

    Returns:
        The newline-terminated output line, or ``None`` when the line has
        fewer than two fields or lists fewer than ``min_users`` users.

    Raises:
        ValueError: if the second field is not valid JSON.
    """
    # Drop the line terminator first; otherwise a two-field line would carry
    # it inside the last field and the output would gain a blank line.
    items = line.rstrip("\n").split("\t")
    if len(items) < 2:
        return None
    vid = items[0]
    uid_list = json.loads(items[1])
    if len(uid_list) < min_users:
        return None
    top = related_videos(uid_list, user_dict, top_n)
    return vid + "\t" + json.dumps(top) + "\t" + items[1] + "\n"


def main(argv):
    """Run the script; ``argv`` has the same layout as ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 1 on a usage error.
    """
    if len(argv) < 4:
        sys.stderr.write(
            "usage: %s USER_VIDEO_FILE VIDEO_USERS_FILE OUTPUT_FILE\n"
            % (argv[0] if argv else "script")
        )
        return 1

    with open(argv[1]) as user_video_file:
        user_dict = load_user_videos(user_video_file)

    # Context managers close both files even if a line fails to parse.
    with open(argv[2]) as video_users_file, open(argv[3], "w") as out_file:
        for line in video_users_file:
            out_line = process_video_line(line, user_dict)
            if out_line is not None:
                out_file.write(out_line)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))