Browse Source

update word2vec

linfan 2 years ago
parent
commit
cfabe2b033
4 changed files with 119 additions and 7 deletions
  1. 54 0
      get_sim_k.py
  2. 55 0
      predict.py
  3. 7 5
      run.sh
  4. 3 2
      run_ctr.sh

+ 54 - 0
get_sim_k.py

@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+import sys
+import pandas as pd
+import numpy as np
+import faiss
+
+
+def gen_i2i(index_item, embeddings,i2i):
+    """Write the nearest neighbours of every item to the file ``i2i``.
+
+    Builds an exact L2 faiss index over ``embeddings`` and queries it with
+    the very same vectors, so each item is compared against all the others.
+
+    Args:
+        index_item: mapping from row position in ``embeddings`` to item id.
+        embeddings: sequence of equal-length float vectors, one per item.
+        i2i: path of the output file. One line is written per item, as
+            ``item_id<TAB>neighbour_id,neighbour_id,...`` (at most 19
+            neighbours, the item itself excluded).
+    """
+    top_k = 20  # results requested per query, the query item included
+    embed_matrix = np.array(embeddings).astype('float32')
+    # The context manager closes (and flushes) the output even on errors.
+    with open(i2i, "w") as fw:
+        if embed_matrix.size == 0:
+            # Nothing to index: leave an empty result file behind.
+            return
+        # Take the dimension from the data instead of hard-coding 100.
+        index = faiss.IndexFlatL2(embed_matrix.shape[1])
+        index.add(embed_matrix)
+        _, recall_list = index.search(embed_matrix, top_k)
+        for idx, rec_arr in enumerate(recall_list):
+            # faiss pads the result with -1 when the index holds fewer than
+            # top_k vectors, so negative ids are skipped. The query vector
+            # normally comes back first, but a duplicate embedding can
+            # outrank it, so it is removed by id rather than by rank.
+            neighbours = [str(index_item[re_id]) for re_id in rec_arr
+                          if re_id >= 0 and re_id != idx]
+            fw.write(str(index_item[idx]) + "\t"
+                     + ",".join(neighbours[:top_k - 1]) + "\n")
+
+if __name__ == '__main__':
+    # Usage: python get_sim_k.py <embedding_file>
+    # Each input line is "<vid> <vector expression>": an integer id, a space,
+    # then a Python expression that evaluates to the item's vector.
+    # Lines that cannot be parsed are skipped. The number of vectors kept is
+    # printed and the neighbour lists are written to "i2i_result".
+    index_dict = {}  # row position in index_arr -> vid
+    index_arr = []   # one vector per successfully parsed line
+    with open(sys.argv[1]) as f:
+        for line in f:
+            items = line.strip().split(" ")
+            try:
+                vid = int(items[0])
+                # SECURITY: eval() executes arbitrary code found in the input
+                # file. Only run this on files produced by a trusted job.
+                # TODO(review): switch to a safe parser (ast.literal_eval or
+                # json) once the producer is known to emit plain literals.
+                vid_vec = eval(" ".join(items[1:]))
+            except Exception:
+                # Malformed line: skip it. Unlike a bare "except:", this lets
+                # KeyboardInterrupt and SystemExit propagate.
+                continue
+            index_dict[len(index_arr)] = vid
+            index_arr.append(vid_vec)
+    print(len(index_arr))
+    gen_i2i(index_dict, index_arr, "i2i_result")

+ 55 - 0
predict.py

@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+import sys
+from gensim import models
+import numpy as np
+
+if __name__=="__main__":
+    # Usage: python predict.py <title_file>
+    # Loads word vectors from 'word2vec.txt' (text rows "<word> <v1> ... <v100>")
+    # and prints, for every row of the tab-separated title file, the id in
+    # column 0 followed by the average vector of the known words in column 1.
+    vector_dim = 100  # length of every word vector and of the printed average
+    word_dict = {}    # word -> list of vector_dim floats
+    with open('word2vec.txt') as f1:
+        for line in f1:
+            items = line.strip().split(" ")
+            # Keep only rows made of a word plus exactly vector_dim values.
+            # Shorter rows (such as a "<count> <dim>" header) and longer ones
+            # could not be added to the vector_dim-sized accumulator below.
+            if len(items) != vector_dim + 1:
+                continue
+            word_dict[items[0]] = [float(w) for w in items[1:]]
+    with open(sys.argv[1]) as f:
+        for num, line in enumerate(f, 1):
+            # The first line of the title file is skipped (treated as a header).
+            if num == 1:
+                continue
+            items = line.split("\t")
+            if len(items)<2:
+                continue
+            vid = items[0]
+            # Strip first: when the title is the last column its final word
+            # would otherwise keep the newline and never match word_dict.
+            title_arr = items[1].strip().split(" ")
+            title_info = np.zeros(vector_dim)
+            word_len = 0
+            for word in title_arr:
+                if word in word_dict:
+                    title_info = np.add(title_info, word_dict[word])
+                    word_len +=1
+            # Titles without any known word produce no output line.
+            if word_len<=0:
+                continue
+            # tolist() yields plain Python floats, so the printed list looks
+            # the same whatever the installed numpy version prints for scalars.
+            print(vid, (title_info / word_len).tolist())
+        
+    

+ 7 - 5
run.sh

@@ -1,9 +1,9 @@
 #!/bin/bash
 source ~/.bash_profile
 source ~/.bashrc
-conda activate base 
+conda activate python36 
 
-cd /data/rec_project/OffLineRec
+cd /home/rec/project/git_project/OffLineRec
 
 #1. download data
 nowday=`date  +"%Y%m%d" -d -0days`
@@ -11,12 +11,14 @@ last7day=`date  +"%Y%m%d" -d -8days`
 echo ${nowday} 
 echo ${last7day}
 mkdir -p ./data/
+mkdir -p ./logs/
+#conda activate py36
 
 python extract_share_log.py ${last7day} ${nowday}
 if [ $? -ne 0 ];
 then
     msg = "[ERROR] simrecall extract_share_log"
-    #sh sendmsg.sh  $nowday  $msg
+    sh sendmsg.sh  $nowday  $msg
     echo "[ERROR] echo 'extract_share_log"
     exit 255
 fi
@@ -26,7 +28,7 @@ python calI2I.py ${nowday}
 if [ $? -ne 0 ];
 then
     msg = "[ERROR] simrecall calI2I.py"
-    #sh sendmsg.sh $nowday $msg
+    sh sendmsg.sh $nowday $msg
     echo $msg
     exit -1
 fi
@@ -36,7 +38,7 @@ python import_redist.py "./data/rec_result_"${nowday}".json"  "./data/redis_cls_
 if [ $? -ne 0 ];
 then
     msg = "[ERROR] simhot recall import_redist.py"
-    #sh sendmsg.sh  $nowday  $msg
+    sh sendmsg.sh  $nowday  $msg
     echo $msg
     exit -1
 fi

+ 3 - 2
run_ctr.sh

@@ -2,8 +2,9 @@
 source ~/.bash_profile
 source ~/.bashrc
 
-conda activate base 
-cd /data/rec_project/OffLineRec
+conda activate python36 
+cd /home/rec/project/git_project/OffLineRec 
+#cd /data/rec_project/OffLineRec
 #1. download data
 nowday=`date  +"%Y%m%d%H" -d -0days`
 echo ${nowday}