Browse Source

update i2i

linfan 2 years ago
parent
commit
048f308f5c
10 changed files with 495 additions and 18 deletions
  1. 141 0
      calI2I2.py
  2. 124 0
      calI2I3.py
  3. 47 0
      cut_title_top.py
  4. 83 0
      get_batch_sim_k.py
  5. 16 11
      get_sim_k.py
  6. 4 3
      predict.py
  7. 53 0
      process_video.py
  8. 11 0
      test_faiss.py
  9. 12 0
      train_vec.sh
  10. 4 4
      word2vec.py

+ 141 - 0
calI2I2.py

@@ -0,0 +1,141 @@
+#coding utf-8
+import sys
+from operator import itemgetter
+import json
+
+if __name__=="__main__":
+    #1.load data
+    nowdate=sys.argv[1]
+    f = open("./data/user_item_share_"+nowdate)
+    user_item_dict={}
+    item_dict = {}  
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        items = line.strip().split("\t")
+        if len(items)<3:
+            continue
+        vid = -1
+        try:
+            vid = int(items[2])
+        except:
+            continue
+        if vid == -1:
+            continue
+        key = (items[1],vid)
+        #print(key)
+        if key not in user_item_dict:
+            user_item_dict[key] = 1
+        else:
+            user_item_dict[key] = user_item_dict[key]+1
+        if items[2] not in item_dict:
+            item_dict[items[2]] = 1
+        else:
+            item_dict[items[2]] = item_dict[items[2]]+1
+    f.close()
+    #((user,item), score)
+    #print(user_item_dict)
+    #2. (uid, [(vid, score)....])
+    user_group_dict = {}
+    for k, v in user_item_dict.items():
+        uid = k[0]
+        vid = k[1]
+        score = v
+        #if score <3:
+        #    continue
+        vid_list = []
+        if uid not in user_group_dict:
+            vid_list.append((vid, score))
+            user_group_dict[uid] = vid_list
+        else:
+            vid_list = user_group_dict[uid]
+            vid_list.append((vid, score))
+            user_group_dict[uid] = vid_list
+    #print(user_group_dict)
+    item_pair_dict = {}
+    #3. expand item
+    for k, v_list in user_group_dict.items():
+         v_n = len(v_list)
+         if v_n<2:
+             continue
+         for i in range(v_n):
+             for j in range(1, v_n):
+                if v_list[i][0] == v_list[j][0]:
+                    continue
+                item_key = (v_list[i][0], v_list[j][0])
+                item_score = 1
+                if item_key not in item_pair_dict:
+                    item_pair_dict[item_key] = item_score
+                else:
+                    item_pair_dict[item_key] = item_pair_dict[item_key]+item_score
+    print(len(item_pair_dict))
+    #print(item_pair_dict)
+    #print(item_dict)
+    left_pair_num = 0
+    rec_item_dict = {}
+    #4. rec item
+    for k, v in item_pair_dict.items():
+        if v<3:
+            continue
+        left_pair_num+=1
+        #print(k[0])
+        #print(k[1])
+        
+        item1 = int(k[0])
+        item2 = int(k[1])
+        pair_score = v
+        if item1 in item_dict:
+            item_score1 = item_dict[item1]
+            i2i_pro = float(pair_score)/(float(item_score1)+5)
+            if i2i_pro<0.000001:
+                continue
+            rec_list1 = []
+            if item2 not in rec_item_dict:
+                rec_list1.append((item1, i2i_pro, pair_score, item_score1))
+                rec_item_dict[item2] = rec_list1
+            else:
+                rec_list1 = rec_item_dict[item2]
+                rec_list1.append((item1, i2i_pro, pair_score, item_score1))
+                rec_item_dict[item2] = rec_list1
+        if item2 in item_dict:
+            item_score2 = item_dict[item2]
+            i2i_pro = float(pair_score)/(float(item_score2)+5)
+            if i2i_pro<0.000001:
+                continue
+            rec_list2 = []
+            if item1 not in rec_item_dict:
+                rec_list2.append((item2, i2i_pro, pair_score, item_score2))
+                rec_item_dict[item1] = rec_list2
+            else:
+                rec_list2 = rec_item_dict[item1]
+                rec_list2.append((item2, i2i_pro, pair_score, item_score2))
+                rec_item_dict[item1] = rec_list2   
+     
+    #(item, share_count)
+    print(left_pair_num)
+    #print(rec_item_dict)
+    final_rec_list = []
+    #f = open("rec_result", "w")
+    #5. sorted item_list
+    
+    for k,v in rec_item_dict.items():
+        v_set = set('')
+        value_list = v
+        dup_list = []
+        for item in value_list:
+            if item[0] in v_set:
+                continue
+            v_set.add(item[0])
+            #print(item[1])
+            #if float(items[1])<0.000001:
+            #    continue
+            dup_list.append(item)
+        sorted_v = sorted(dup_list, key=itemgetter(1), reverse=True)
+        final_rec_list.append((k, sorted_v))
+    #print(final_rec_list[:1])
+    #json_str = json.dumps(final_rec_list)
+    with open("./data/rec_result_"+nowdate+".json", "w") as f :
+        json.dump(final_rec_list, f)
+    
+     

+ 124 - 0
calI2I3.py

@@ -0,0 +1,124 @@
+#coding utf-8
+import sys
+from operator import itemgetter
+import json
+
+if __name__=="__main__":
+    #1.load data
+    nowdate=sys.argv[1]
+    f = open("./data/user_item_share_"+nowdate)
+    user_item_dict={}
+    item_dict = {}  
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        items = line.strip().split("\t")
+        if len(items)<3:
+            continue
+        key = (items[1],items[2])
+        #print(key)
+        if key not in user_item_dict:
+            user_item_dict[key] = 1
+        else:
+            user_item_dict[key] = user_item_dict[key]+1
+        if items[2] not in item_dict:
+            item_dict[items[2]] = 1
+        else:
+            item_dict[items[2]] = item_dict[items[2]]+1
+    f.close()
+    #((user,item), score)
+    #print(user_item_dict)
+    #2. (uid, [(vid, score)....])
+    user_group_dict = {}
+    for k, v in user_item_dict.items():
+        uid = k[0]
+        vid = k[1]
+        score = v
+        #if score <3:
+        #    continue
+        vid_list = []
+        if uid not in user_group_dict:
+            vid_list.append((vid, score))
+            user_group_dict[uid] = vid_list
+        else:
+            vid_list = user_group_dict[uid]
+            vid_list.append((vid, score))
+            user_group_dict[uid] = vid_list
+    #print(user_group_dict)
+    item_pair_dict = {}
+    #3. expand item
+    for k, v_list in user_group_dict.items():
+         v_n = len(v_list)
+         if v_n<2:
+             continue
+         for i in range(v_n):
+             for j in range(1, v_n):
+                if v_list[i][0] == v_list[j][0]:
+                    continue
+                item_key = (v_list[i][0], v_list[j][0])
+                item_score = 1
+                if item_key not in item_pair_dict:
+                    item_pair_dict[item_key] = item_score
+                else:
+                    item_pair_dict[item_key] = item_pair_dict[item_key]+item_score
+    #print(item_pair_dict)
+    print(item_pair_dict)
+    print(item_dict)
+    left_pair_num = 0
+    rec_item_dict = {}
+    #4. rec item
+    for k, v in item_pair_dict.items():
+        if v<2:
+            continue
+        left_pair_num+=1
+        item1 = k[0]
+        item2 = k[1]
+        pair_score = v
+        if item1 in item_dict:
+             item_score1 = item_dict[item1]
+             i2i_pro = pair_score/(item_score1+5)
+             rec_list1 = []
+             if item2 not in rec_item_dict:
+                 rec_list1.append((item1, i2i_pro, pair_score, item_score1))
+                 rec_item_dict[item2] = rec_list1
+             else:
+                 rec_list1 = rec_item_dict[item2]
+                 rec_list1.append((item1, i2i_pro, pair_score, item_score1))
+                 rec_item_dict[item2] = rec_list1
+        if item2 in item_dict:
+             item_score2 = item_dict[item2]
+             i2i_pro = pair_score/(item_score2+5)
+             rec_list2 = []
+             if item1 not in rec_item_dict:
+                 rec_list2.append((item2, i2i_pro, pair_score, item_score2))
+                 rec_item_dict[item1] = rec_list2
+             else:
+                 rec_list2 = rec_item_dict[item1]
+                 rec_list2.append((item2, i2i_pro, pair_score, item_score2))
+                 rec_item_dict[item1] = rec_list2          
+     
+    #(item, share_count)
+    print(left_pair_num)
+    #print(rec_item_dict)
+    final_rec_list = []
+    #f = open("rec_result", "w")
+    #5. sorted item_list
+    
+    for k,v in rec_item_dict.items():
+        v_set = set('')
+        value_list = v
+        dup_list = []
+        for item in value_list:
+            if item[0] in v_set:
+                continue
+            v_set.add(item[0])
+            dup_list.append(item)
+        sorted_v = sorted(dup_list, key=itemgetter(1), reverse=True)
+        final_rec_list.append((k, sorted_v))
+    #print(final_rec_list[:1])
+    #json_str = json.dumps(final_rec_list)
+    with open("./data/rec_result2_"+nowdate+".json", "w") as f :
+        json.dump(final_rec_list, f)
+    
+     

+ 47 - 0
cut_title_top.py

@@ -0,0 +1,47 @@
+#coding utf-8
+import sys
+import jieba 
+import os
+
+if __name__=="__main__":
+    #f1 = open(sys.argv[1])
+    stop_words = set('')
+    path = sys.argv[1]
+    files_dir = os.listdir(path)
+    #print(files_dir)
+    for file_name in files_dir:
+        if file_name.find('.txt')>-1:
+            f1 = open(path+"/"+file_name)
+            while True:
+                file_line = f1.readline()
+                if not file_line:
+                    break
+                file_line = file_line.strip()
+                stop_words.add(file_line)
+            f1.close()
+    #print(len(stop_words))
+    f = open(sys.argv[2])
+    f3 = open(sys.argv[3], 'w')
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        line = line.strip()
+        items = line.split("\t")
+        if len(items)<2:
+            continue
+        vid = items[0]
+        title = items[1] 
+        cut_info =  jieba.lcut(title)
+        cut_arr = []
+        for cut_item in cut_info:
+            #print("cut_item:", cut_item)
+            if cut_item==' ':
+                continue
+            if cut_item in stop_words:
+                continue
+            cut_arr.append(cut_item)
+        vid_info = vid+'\t'+" ".join(cut_arr)
+        f3.write(vid_info.strip()+"\n")
+    f3.close()
+       

+ 83 - 0
get_batch_sim_k.py

@@ -0,0 +1,83 @@
+#coding utf-8
+import sys
+import pandas as pd
+import numpy as np
+import faiss
+import time
+
+def gen_i2i(index_item, embeddings,i2i):
+    fw=open(i2i,"w")
+    #print(i2i)
+    start_time = time.time()
+    #xb = embeddings
+    xb=np.array(embeddings).astype('float32')
+    #print(xb)
+    #index.add(xb)
+    dim, measure = 64, faiss.METRIC_L2  
+    param =  'IVF100,PQ16'
+    index = faiss.index_factory(dim, param, measure) 
+    #print(index.is_trained)                          # 此时输出为False,因为倒排索引需要训练k-means, 
+    index.train(xb) 
+    end_time = time.time()
+    print("time:", (end_time-start_time))
+    #index=faiss.IndexFlatL2(100)
+    #index.add(embed_matrix)
+    #the candicate matrix is embed_matrix,but the search matrix is the same.
+    #if the search vector is in the candicate matrix, the return idx>> the first is the search vector itself
+    #if the search vector is not in the candicate matrix, the return idx>>the first is the index of the candicate
+    batch = 10000
+    num = len(embeddings)
+    per_rounds = int(num/batch)+1
+    #index=faiss.IndexFlatL2(64)
+    index.add(xb)
+    print("cost time:", (end_time-start_time))
+    #distence_matrix,recall_list=index.search(xb, 20)
+    #print(distence_matrix)
+    #print(recall_list)
+    for i in range(per_rounds):
+        per_embedding = xb[i:(i+1)*batch]
+        #print(per_embedding)
+        #print(len(per_embedding))
+        distence_matrix,recall_list=index.search(per_embedding, 20)
+        #print("distence_matrix:", distence_matrix)
+        #print("recall_list:", recall_list)
+       	for idx,rec_arr in enumerate(recall_list):
+            #print("idx:", idx)
+            orgin_item=str(index_item[idx])
+            #print("orgin_item:", orgin_item)
+            #print("rec_arr:", rec_arr)
+            recall_str=""
+            for re_id in rec_arr[1:]:
+                if re_id in index_item:
+                    recall_idstr=str(index_item[re_id])
+                    recall_str=recall_str+","+recall_idstr
+            fw.write(orgin_item+"\t"+recall_str[1:]+"\n")
+
+if __name__ == '__main__':
+    f = open(sys.argv[1])
+    index = 0
+    index_dict = {}
+    index_arr = []
+    while True:
+        line = f.readline()
+        if not line:
+           break
+        items = line.strip().split(" ")
+        try:
+            vid = int(items[0])
+            vid_vec = eval(" ".join(items[1:]))
+            vid_vec=np.array(vid_vec)
+            float_arr = vid_vec.astype(np.float64).tolist()
+            #print(float_arr)
+            index_arr.append(float_arr)
+            #index +=1
+            index_dict[index] = vid
+            index +=1
+            #break
+            #print(index_arr)
+        except:
+            #break
+            continue
+    f.close()
+    #print(index_arr)
+    gen_i2i(index_dict, index_arr, "i2i_result")

+ 16 - 11
get_sim_k.py

@@ -3,7 +3,7 @@ import sys
 import pandas as pd
 import numpy as np
 import faiss
-
+import time
 
 def gen_i2i(index_item, embeddings,i2i):
     fw=open(i2i,"w")
@@ -30,25 +30,30 @@ def gen_i2i(index_item, embeddings,i2i):
 if __name__ == '__main__':
     f = open(sys.argv[1])
     index = 0
+    start_time = time.time()
     index_dict = {}
     index_arr = []
     while True:
         line = f.readline()
         if not line:
            break
-        line = line.strip()
+        line = line.strip().replace("[","").replace("]","")
         #print(eval(line))
         items = line.split(" ")
+        if len(items)<2:
+            continue
         try:
-            vid = int(items[0])
-            vid_vec = eval(" ".join(items[1:]))
-            index_arr.append(vid_vec)
-            #index +=1
-            index_dict[index] = vid
-            index +=1
-            #print(index_arr)
+           vid = int(items[0])
+           #vid_vec = items[1:]
+           print(line.split(" "))
+           vid_vec = eval(" ".join(items[1:]))
+           index_arr.append(vid_vec)
+           index_dict[index] = vid
+           index +=1
         except:
-            continue
+           continue
     f.close()
     print(len(index_arr))
-    gen_i2i(index_dict, index_arr, "i2i_result")
+    end_time = time.time()
+    print("time:", (end_time-start_time))
+    #gen_i2i(index_dict, index_arr, "i2i_result")

+ 4 - 3
predict.py

@@ -13,7 +13,7 @@ if __name__=="__main__":
         if not line:
             break
         items = line.strip().split(" ")
-        if len(items)<100:
+        if len(items)<64:
             continue
         arr = []
         for w in items[1:]:
@@ -34,7 +34,7 @@ if __name__=="__main__":
             continue
         vid = items[0]
         title_arr = items[1].split(" ")
-        title_info = np.zeros(100)
+        title_info = np.zeros(64)
         word_len = 0
         for word in title_arr:
             if word in word_dict:
@@ -44,12 +44,13 @@ if __name__=="__main__":
                  #print(word_vec)
                  title_info = np.add(title_info, word_vec)
                  word_len +=1
+        #print(title_info)
         title_info_list = []
         if word_len<=0:
             continue
         for j in title_info:
             title_info_list.append(j/word_len)
         #print("title_info_list:", title_info_list)
-        print(vid,title_info_list)
+        print(vid,"\t",title_info_list)
         
     

+ 53 - 0
process_video.py

@@ -0,0 +1,53 @@
+#coding utf-8
+import sys
+import pandas as pd
+import numpy as np
+import faiss
+
+
+def gen_i2i(index_item, embeddings,i2i):
+    fw=open(i2i,"w")
+    #print(i2i)
+    embed_matrix=np.array(embeddings).astype('float32')
+    #print(embed_matrix)
+    index=faiss.IndexFlatL2(100)
+    index.add(embed_matrix)
+    #the candicate matrix is embed_matrix,but the search matrix is the same.
+    #if the search vector is in the candicate matrix, the return idx>> the first is the search vector itself
+    #if the search vector is not in the candicate matrix, the return idx>>the first is the index of the candicate
+    distence_matrix,recall_list=index.search(embed_matrix, 20)
+    for idx,rec_arr in enumerate(recall_list):
+        #print("idx:", idx)
+        orgin_item=str(index_item[idx])
+        recall_str=""
+        #rec_arr=[0 6 3 8 7 1]
+        for re_id in rec_arr[1:]:
+            recall_idstr=str(index_item[re_id])
+            #print(recall_idstr)
+            recall_str=recall_str+","+recall_idstr
+        fw.write(orgin_item+"\t"+recall_str[1:]+"\n")
+
+if __name__ == '__main__':
+    f = open(sys.argv[1])
+    index = 0
+    index_dict = {}
+    index_arr = []
+    while True:
+        line = f.readline()
+        if not line:
+           break
+        line = line.strip()
+        #print(line)
+        items = line.split(" ")
+        #print(int(items[0]))
+        try:
+            vid = int(items[0])
+            print(line)
+            #print(str(vid)+"\t"+items[1:])
+            #print(index_arr)
+        except:
+            #print(int(items[0]))
+            continue
+    f.close()
+    #print(len(index_arr))
+    #gen_i2i(index_dict, index_arr, "i2i_result")

+ 11 - 0
test_faiss.py

@@ -0,0 +1,11 @@
+import numpy as np
+d = 64                                           # 向量维度
+nb = 100000                                      # index向量库的数据量
+nq = 10000                                       # 待检索query的数目
+np.random.seed(1234)             
+xb = np.random.random((nb, d)).astype('float32')
+#xb[:, 0] += np.arange(nb) / 1000.                # index向量库的向量
+xq = np.random.random((nq, d)).astype('float32')
+#xq[:, 0] += np.arange(nq) / 1000.
+
+print(xb)

+ 12 - 0
train_vec.sh

@@ -0,0 +1,12 @@
+cd /home/rec/project/git_project/OffLineRec
+
+#1.cut_title
+nowday=`date  +"%Y%m%d" -d -0days`
+nowday=20230512
+
+#python cut_title.py ./stopwords/ ./data/video_title_${nowday} ./data/video_cut_title_${nowday}
+
+#python word2vec.py 
+
+
+#python predict.py ./data/word2vec_cut_title  > ./data/video_title_embedding

+ 4 - 4
word2vec.py

@@ -15,14 +15,14 @@ if __name__=="__main__":
         if  num == 1:
             continue
         items = line.strip().split("\t")
-        print(items)
+        #print(items)
         if len(items)<2:
             continue
         arr.append(items[1].split(" "))
-        print(arr)
+        #print(arr)
     f.close()
-    '''model = word2vec.Word2Vec(arr, vector_size=100, min_count=1,sg=1)
-    model.wv.save_word2vec_format('word2vec.txt',binary=False)'''
+    model = word2vec.Word2Vec(arr, vector_size=64, min_count=2,sg=1, workers=10)
+    model.wv.save_word2vec_format('word2vec.txt',binary=False)
     #model.save('word2vec.model')