123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142 |
# coding: utf-8
"""Build item-to-item recommendations from user share logs.

Usage:
    python <this_script> NOWDATE NOWHOUR

Inputs (tab-separated text; only columns 1 and 2 are read, used as the
user id and the item id respectively; lines with fewer than 3 columns
are skipped):
    ./data/user_item_share_<NOWDATE>
    ./data/user_cur_day_item_share_<NOWHOUR>

Output:
    ./data/rec_result_<NOWHOUR>.json -- a JSON list of
    [item, [[candidate, score, pair_score, candidate_share_count], ...]]
    entries, each candidate list sorted by score, highest first.

Three counters are printed to stdout along the way: the number of item
pairs, the number of distinct items, and the number of pairs that passed
the minimum-score filter.
"""
import json
import sys
from collections import defaultdict
from itertools import combinations
from operator import itemgetter

# Item pairs whose accumulated co-share score is below this are dropped.
MIN_PAIR_SCORE = 2
# Constant added to an item's share count in the score denominator
# (presumably to damp items with very few shares -- confirm with owner).
SHARE_COUNT_OFFSET = 5


def load_share_counts(path, user_item_counts, item_counts):
    """Accumulate share counts from one log file into the given dicts.

    Args:
        path: file to read; one tab-separated record per line.
        user_item_counts: dict updated in place, (user, item) -> number of
            lines seen for that combination.
        item_counts: dict updated in place, item -> number of lines seen.
    """
    # `with` guarantees the handle is closed even if parsing raises.
    with open(path) as handle:
        for line in handle:
            fields = line.strip().split("\t")
            if len(fields) < 3:
                continue
            user, item = fields[1], fields[2]
            key = (user, item)
            user_item_counts[key] = user_item_counts.get(key, 0) + 1
            item_counts[item] = item_counts.get(item, 0) + 1


def group_by_user(user_item_counts):
    """Turn {(user, item): score} into {user: [(item, score), ...]}."""
    grouped = defaultdict(list)
    for (user, item), score in user_item_counts.items():
        grouped[user].append((item, score))
    return dict(grouped)


def build_item_pairs(user_groups):
    """Score every unordered pair of items shared by the same user.

    For each user, every pair of distinct items contributes
    min(score_a, score_b) to that pair's total. Keys are canonical
    (smaller item id first) so that evidence for the same two items is
    accumulated under a single key no matter in which order a user's
    items were encountered; without this, the totals would be split
    between (a, b) and (b, a) and could each fall below the filter
    threshold.

    Args:
        user_groups: {user: [(item, score), ...]}.

    Returns:
        {(item_lo, item_hi): accumulated_score}.
    """
    pair_scores = {}
    for shared in user_groups.values():
        # combinations() yields each unordered pair exactly once; users
        # with fewer than two items yield nothing.
        for (item_a, score_a), (item_b, score_b) in combinations(shared, 2):
            if item_a == item_b:
                continue
            key = (item_a, item_b) if item_a <= item_b else (item_b, item_a)
            pair_scores[key] = pair_scores.get(key, 0) + min(score_a, score_b)
    return pair_scores


def build_recommendations(pair_scores, item_counts,
                          min_pair_score=MIN_PAIR_SCORE,
                          offset=SHARE_COUNT_OFFSET):
    """Expand scored pairs into per-item candidate lists.

    Each surviving pair (a, b) adds `a` as a candidate for `b` and `b` as
    a candidate for `a`. A candidate's score is
    pair_score / (candidate_share_count + offset).

    Args:
        pair_scores: {(item_a, item_b): score}.
        item_counts: {item: share_count}; candidates missing from this
            dict are not emitted.
        min_pair_score: pairs scoring below this are skipped.
        offset: constant added to the share count in the denominator.

    Returns:
        (recs, kept) where recs is
        {item: [(candidate, score, pair_score, candidate_share_count), ...]}
        and kept is the number of pairs that passed the filter.
    """
    recs = defaultdict(list)
    kept = 0
    for (item_a, item_b), pair_score in pair_scores.items():
        if pair_score < min_pair_score:
            continue
        kept += 1
        for source, candidate in ((item_b, item_a), (item_a, item_b)):
            if candidate not in item_counts:
                continue
            count = item_counts[candidate]
            recs[source].append(
                (candidate, pair_score / (count + offset), pair_score, count)
            )
    return dict(recs), kept


def rank_recommendations(recs):
    """Deduplicate and sort each item's candidates by score, descending.

    If a candidate appears more than once for the same item, only its
    first occurrence is kept. Ties keep their relative input order.

    Returns:
        [(item, [candidate_tuple, ...]), ...]
    """
    ranked = []
    for item, candidates in recs.items():
        seen = set()
        unique = []
        for candidate in candidates:
            if candidate[0] in seen:
                continue
            seen.add(candidate[0])
            unique.append(candidate)
        unique.sort(key=itemgetter(1), reverse=True)
        ranked.append((item, unique))
    return ranked


def main(argv=None):
    """Run the full pipeline; `argv` defaults to sys.argv[1:]."""
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        # Fail before any file is processed rather than midway through.
        sys.exit("usage: NOWDATE NOWHOUR")
    nowdate, nowhour = args[0], args[1]

    # 1. load data from both logs into the same counters
    user_item_counts = {}
    item_counts = {}
    load_share_counts("./data/user_item_share_" + nowdate,
                      user_item_counts, item_counts)
    load_share_counts("./data/user_cur_day_item_share_" + nowhour,
                      user_item_counts, item_counts)

    # 2. (uid, [(vid, score), ...])  3. expand into item pairs
    pair_scores = build_item_pairs(group_by_user(user_item_counts))
    print(len(pair_scores))
    print(len(item_counts))

    # 4. per-item candidates
    recs, kept = build_recommendations(pair_scores, item_counts)
    print(kept)

    # 5. sort each candidate list and write the result
    with open("./data/rec_result_" + nowhour + ".json", "w") as out:
        json.dump(rank_recommendations(recs), out)


if __name__ == "__main__":
    main()
|