# coding: utf-8
"""Build, for each video, a ranked list of videos co-exposed to its users.

Usage:
    script.py EXPOSURE_FILE VIDEO_USERS_FILE OUTPUT_FILE

EXPOSURE_FILE is tab-separated with at least three fields per line; field 1
is read as a user id and field 2 as a video id (field 0 is ignored).

VIDEO_USERS_FILE is tab-separated with at least two fields per line; field 0
is a video id and field 1 is a JSON array of user ids.

For every video that lists at least MIN_USERS_PER_VIDEO users, one line is
written to OUTPUT_FILE:

    <video id> TAB <JSON list of [video id, user count]> TAB <raw user JSON>

where the JSON list holds at most MAX_RELATED_VIDEOS entries, ordered by
descending user count.
"""
import json
import sys
from collections import Counter, defaultdict

# Videos whose user list has fewer entries than this are filtered out.
MIN_USERS_PER_VIDEO = 5
# Upper bound on the number of (video, count) pairs written per output line.
MAX_RELATED_VIDEOS = 100


def _load_user_videos(path):
    """Return a mapping of user id -> set of video ids exposed to that user.

    Lines with fewer than three tab-separated fields are skipped.
    """
    user_videos = defaultdict(set)
    with open(path) as exposures:
        for line in exposures:
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 3:
                continue
            # Strip before inserting so the set really de-duplicates: a video
            # id must count at most once per user even if it appears with
            # different surrounding whitespace.
            user_videos[fields[1]].add(fields[2].strip())
    return user_videos


def _related_videos(uids, user_videos):
    """Count, over ``uids``, how many users were exposed to each video.

    Returns a list of ``(video id, user count)`` pairs, highest count first,
    truncated to MAX_RELATED_VIDEOS entries.  Ties are broken by video id so
    the result does not depend on set iteration order.
    """
    counts = Counter()
    for uid in uids:
        # .get() keeps unknown users from being inserted into the mapping.
        counts.update(user_videos.get(uid, ()))
    ranked = sorted(counts.items(), key=lambda pair: (-pair[1], pair[0]))
    return ranked[:MAX_RELATED_VIDEOS]


def main(argv=None):
    """Run the script; ``argv`` defaults to ``sys.argv``.

    Exits with a usage message when fewer than three paths are supplied.
    A video line whose second field is not valid JSON raises ``ValueError``
    from ``json.loads``, as before; the open files are still closed.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        sys.exit("usage: <script> EXPOSURE_FILE VIDEO_USERS_FILE OUTPUT_FILE")

    user_videos = _load_user_videos(argv[1])

    with open(argv[2]) as video_users, open(argv[3], "w") as out:
        for line in video_users:
            # Drop the line ending first; otherwise a two-field line would
            # carry "\n" inside the user JSON and emit a blank output line.
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 2:
                continue
            vid, raw_uids = fields[0], fields[1]
            uids = json.loads(raw_uids)
            if len(uids) < MIN_USERS_PER_VIDEO:
                # Filter out videos with too few users.
                continue
            related = _related_videos(uids, user_videos)
            out.write(vid + "\t" + json.dumps(related) + "\t" + raw_uids + "\n")


if __name__ == "__main__":
    main()