# -*- coding: utf-8 -*-
"""Build item-to-item recommendations from user/item share logs.

Usage: python <script> NOWDATE NOWHOUR

Reads  ./data/user_item_share_filter_<NOWDATE>
and    ./data/user_cur_day_item_share_filter_<NOWHOUR>,
writes ./data/rec_result3_<NOWHOUR>.json.

Each input line is tab-separated; the second field is used as the user id
and the third field is parsed as an integer item id.
"""
import sys
from operator import itemgetter
import json

# An item pair must have been counted at least this many times to be kept.
MIN_PAIR_COUNT = 3
# Constant added to the denominator of the pair score.
SCORE_SMOOTHING = 5
# Pairs scoring below this value are dropped.
MIN_SCORE = 0.000001


def accumulate_share_counts(lines, user_item_counts, item_counts):
    """Fold an iterable of log lines into the two count dictionaries.

    ``user_item_counts`` maps ``(uid, vid)`` to the number of lines seen for
    that pair and ``item_counts`` maps ``vid`` to the number of lines seen for
    that item.  Both dictionaries are updated in place.

    Lines with fewer than three tab-separated fields, a non-integer third
    field, or an item id of -1 are skipped.
    """
    for line in lines:
        fields = line.strip().split("\t")
        if len(fields) < 3:
            continue
        try:
            vid = int(fields[2])
        except ValueError:
            # Malformed item id: skip the line instead of aborting the run.
            continue
        if vid == -1:
            continue
        key = (fields[1], vid)
        user_item_counts[key] = user_item_counts.get(key, 0) + 1
        item_counts[vid] = item_counts.get(vid, 0) + 1


def load_share_counts(path, user_item_counts, item_counts):
    """Read the log file at ``path`` and accumulate its counts in place.

    The file is closed even if parsing raises.
    """
    with open(path) as handle:
        accumulate_share_counts(handle, user_item_counts, item_counts)


def group_by_user(user_item_counts):
    """Turn ``{(uid, vid): score}`` into ``{uid: [(vid, score), ...]}``."""
    groups = {}
    for (uid, vid), score in user_item_counts.items():
        groups.setdefault(uid, []).append((vid, score))
    return groups


def count_item_pairs(user_groups):
    """Count ordered item pairs that occur together in a user's item list.

    Returns ``{(vid_a, vid_b): count}`` where each user contributes 1 per
    generated pair.  Users with fewer than two items contribute nothing.
    """
    pair_counts = {}
    for vid_list in user_groups.values():
        size = len(vid_list)
        if size < 2:
            continue
        for i in range(size):
            # NOTE(review): the inner index starts at 1 (not 0 or i + 1), so a
            # user's first item never appears as the second element of a pair.
            # Kept as-is to preserve the existing output -- confirm intent.
            for j in range(1, size):
                first = vid_list[i][0]
                second = vid_list[j][0]
                if first == second:
                    continue
                key = (first, second)
                pair_counts[key] = pair_counts.get(key, 0) + 1
    return pair_counts


def build_rec_candidates(pair_counts, item_counts):
    """Expand frequent item pairs into per-item recommendation candidates.

    Returns ``(candidates, kept)`` where ``candidates`` maps an item id to a
    list of ``(other_item, score, pair_count, item_score)`` tuples and
    ``kept`` is the number of pairs whose count reached ``MIN_PAIR_COUNT``.
    """
    candidates = {}
    kept = 0
    for (raw_a, raw_b), pair_count in pair_counts.items():
        if pair_count < MIN_PAIR_COUNT:
            continue
        kept += 1
        item_a = int(raw_a)
        item_b = int(raw_b)
        # The per-item share count is currently not used for normalisation:
        # a fixed item score of 1 is substituted, so both directions of the
        # pair receive the same score.
        score = float(pair_count) / (float(1) + SCORE_SMOOTHING)
        if score < MIN_SCORE:
            continue
        if item_a in item_counts:
            # Integer 1 here and float 1.0 below are kept distinct so the
            # serialized JSON stays identical to previous runs.
            candidates.setdefault(item_b, []).append(
                (item_a, score, pair_count, 1))
        if item_b in item_counts:
            candidates.setdefault(item_a, []).append(
                (item_b, score, pair_count, 1.0))
    return candidates, kept


def rank_recommendations(candidates):
    """De-duplicate and sort each item's candidates by descending score.

    For a repeated candidate item only the first occurrence is kept.
    Returns a list of ``(item, sorted_candidates)`` tuples.
    """
    ranked = []
    for item, entries in candidates.items():
        seen = set()
        unique = []
        for entry in entries:
            if entry[0] in seen:
                continue
            seen.add(entry[0])
            unique.append(entry)
        ranked.append(
            (item, sorted(unique, key=itemgetter(1), reverse=True)))
    return ranked


def main(argv):
    """Run the full pipeline; ``argv`` is ``[prog, NOWDATE, NOWHOUR]``.

    Returns a process exit code: 0 on success, 1 on bad usage.
    """
    if len(argv) < 3:
        sys.stderr.write("usage: <script> NOWDATE NOWHOUR\n")
        return 1
    nowdate = argv[1]
    nowhour = argv[2]

    # 1. load data: ((user, item) -> count) and (item -> count)
    user_item_counts = {}
    item_counts = {}
    load_share_counts(
        "./data/user_item_share_filter_" + nowdate,
        user_item_counts, item_counts)
    load_share_counts(
        "./data/user_cur_day_item_share_filter_" + nowhour,
        user_item_counts, item_counts)

    # 2. (uid, [(vid, score), ...])
    user_groups = group_by_user(user_item_counts)

    # 3. expand item pairs
    pair_counts = count_item_pairs(user_groups)
    print(len(pair_counts))

    # 4. recommendation candidates per item
    candidates, kept = build_rec_candidates(pair_counts, item_counts)
    print(kept)

    # 5. sorted item list
    final_rec_list = rank_recommendations(candidates)
    with open("./data/rec_result3_" + nowhour + ".json", "w") as out:
        json.dump(final_rec_list, out)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))