2 years ago · 048f308f5c
--- a/calI2I2.py
+++ b/calI2I2.py
@@ -0,0 +1,141 @@
 
				+#coding utf-8
			
 
				+import sys
			
 
				+from operator import itemgetter
			
 
				+import json
			
 
				+
			
 
				+if __name__=="__main__":
			
 
				+    #1.load data
			
 
				+    nowdate=sys.argv[1]
			
 
				+    f = open("./data/user_item_share_"+nowdate)
			
 
				+    user_item_dict={}
			
 
				+    item_dict = {}  
			
 
				+    while True:
			
 
				+        line = f.readline()
			
 
				+        if not line:
			
 
				+            break
			
 
				+        items = line.strip().split("\t")
			
 
				+        if len(items)<3:
			
 
				+            continue
			
 
				+        vid = -1
			
 
				+        try:
			
 
				+            vid = int(items[2])
			
 
				+        except:
			
 
				+            continue
			
 
				+        if vid == -1:
			
 
				+            continue
			
 
				+        key = (items[1],vid)
			
 
				+        #print(key)
			
 
				+        if key not in user_item_dict:
			
 
				+            user_item_dict[key] = 1
			
 
				+        else:
			
 
				+            user_item_dict[key] = user_item_dict[key]+1
			
 
				+        if items[2] not in item_dict:
			
 
				+            item_dict[items[2]] = 1
			
 
				+        else:
			
 
				+            item_dict[items[2]] = item_dict[items[2]]+1
			
 
				+    f.close()
			
 
				+    #((user,item), score)
			
 
				+    #print(user_item_dict)
			
 
				+    #2. (uid, [(vid, score)....])
			
 
				+    user_group_dict = {}
			
 
				+    for k, v in user_item_dict.items():
			
 
				+        uid = k[0]
			
 
				+        vid = k[1]
			
 
				+        score = v
			
 
				+        #if score <3:
			
 
				+        #    continue
			
 
				+        vid_list = []
			
 
				+        if uid not in user_group_dict:
			
 
				+            vid_list.append((vid, score))
			
 
				+            user_group_dict[uid] = vid_list
			
 
				+        else:
			
 
				+            vid_list = user_group_dict[uid]
			
 
				+            vid_list.append((vid, score))
			
 
				+            user_group_dict[uid] = vid_list
			
 
				+    #print(user_group_dict)
			
 
				+    item_pair_dict = {}
			
 
				+    #3. expand item
			
 
				+    for k, v_list in user_group_dict.items():
			
 
				+         v_n = len(v_list)
			
 
				+         if v_n<2:
			
 
				+             continue
			
 
				+         for i in range(v_n):
			
 
				+             for j in range(1, v_n):
			
 
				+                if v_list[i][0] == v_list[j][0]:
			
 
				+                    continue
			
 
				+                item_key = (v_list[i][0], v_list[j][0])
			
 
				+                item_score = 1
			
 
				+                if item_key not in item_pair_dict:
			
 
				+                    item_pair_dict[item_key] = item_score
			
 
				+                else:
			
 
				+                    item_pair_dict[item_key] = item_pair_dict[item_key]+item_score
			
 
				+    print(len(item_pair_dict))
			
 
				+    #print(item_pair_dict)
			
 
				+    #print(item_dict)
			
 
				+    left_pair_num = 0
			
 
				+    rec_item_dict = {}
			
 
				+    #4. rec item
			
 
				+    for k, v in item_pair_dict.items():
			
 
				+        if v<3:
			
 
				+            continue
			
 
				+        left_pair_num+=1
			
 
				+        #print(k[0])
			
 
				+        #print(k[1])
			
 
				+        
			
 
				+        item1 = int(k[0])
			
 
				+        item2 = int(k[1])
			
 
				+        pair_score = v
			
 
				+        if item1 in item_dict:
			
 
				+            item_score1 = item_dict[item1]
			
 
				+            i2i_pro = float(pair_score)/(float(item_score1)+5)
			
 
				+            if i2i_pro<0.000001:
			
 
				+                continue
			
 
				+            rec_list1 = []
			
 
				+            if item2 not in rec_item_dict:
			
 
				+                rec_list1.append((item1, i2i_pro, pair_score, item_score1))
			
 
				+                rec_item_dict[item2] = rec_list1
			
 
				+            else:
			
 
				+                rec_list1 = rec_item_dict[item2]
			
 
				+                rec_list1.append((item1, i2i_pro, pair_score, item_score1))
			
 
				+                rec_item_dict[item2] = rec_list1
			
 
				+        if item2 in item_dict:
			
 
				+            item_score2 = item_dict[item2]
			
 
				+            i2i_pro = float(pair_score)/(float(item_score2)+5)
			
 
				+            if i2i_pro<0.000001:
			
 
				+                continue
			
 
				+            rec_list2 = []
			
 
				+            if item1 not in rec_item_dict:
			
 
				+                rec_list2.append((item2, i2i_pro, pair_score, item_score2))
			
 
				+                rec_item_dict[item1] = rec_list2
			
 
				+            else:
			
 
				+                rec_list2 = rec_item_dict[item1]
			
 
				+                rec_list2.append((item2, i2i_pro, pair_score, item_score2))
			
 
				+                rec_item_dict[item1] = rec_list2   
			
 
				+     
			
 
				+    #(item, share_count)
			
 
				+    print(left_pair_num)
			
 
				+    #print(rec_item_dict)
			
 
				+    final_rec_list = []
			
 
				+    #f = open("rec_result", "w")
			
 
				+    #5. sorted item_list
			
 
				+    
			
 
				+    for k,v in rec_item_dict.items():
			
 
				+        v_set = set('')
			
 
				+        value_list = v
			
 
				+        dup_list = []
			
 
				+        for item in value_list:
			
 
				+            if item[0] in v_set:
			
 
				+                continue
			
 
				+            v_set.add(item[0])
			
 
				+            #print(item[1])
			
 
				+            #if float(items[1])<0.000001:
			
 
				+            #    continue
			
 
				+            dup_list.append(item)
			
 
				+        sorted_v = sorted(dup_list, key=itemgetter(1), reverse=True)
			
 
				+        final_rec_list.append((k, sorted_v))
			
 
				+    #print(final_rec_list[:1])
			
 
				+    #json_str = json.dumps(final_rec_list)
			
 
				+    with open("./data/rec_result_"+nowdate+".json", "w") as f :
			
 
				+        json.dump(final_rec_list, f)
			
 
				+    
			
 
				+     
			
--- a/calI2I3.py
+++ b/calI2I3.py
@@ -0,0 +1,124 @@
 
				+#coding utf-8
			
 
				+import sys
			
 
				+from operator import itemgetter
			
 
				+import json
			
 
				+
			
 
				+if __name__=="__main__":
			
 
				+    #1.load data
			
 
				+    nowdate=sys.argv[1]
			
 
				+    f = open("./data/user_item_share_"+nowdate)
			
 
				+    user_item_dict={}
			
 
				+    item_dict = {}  
			
 
				+    while True:
			
 
				+        line = f.readline()
			
 
				+        if not line:
			
 
				+            break
			
 
				+        items = line.strip().split("\t")
			
 
				+        if len(items)<3:
			
 
				+            continue
			
 
				+        key = (items[1],items[2])
			
 
				+        #print(key)
			
 
				+        if key not in user_item_dict:
			
 
				+            user_item_dict[key] = 1
			
 
				+        else:
			
 
				+            user_item_dict[key] = user_item_dict[key]+1
			
 
				+        if items[2] not in item_dict:
			
 
				+            item_dict[items[2]] = 1
			
 
				+        else:
			
 
				+            item_dict[items[2]] = item_dict[items[2]]+1
			
 
				+    f.close()
			
 
				+    #((user,item), score)
			
 
				+    #print(user_item_dict)
			
 
				+    #2. (uid, [(vid, score)....])
			
 
				+    user_group_dict = {}
			
 
				+    for k, v in user_item_dict.items():
			
 
				+        uid = k[0]
			
 
				+        vid = k[1]
			
 
				+        score = v
			
 
				+        #if score <3:
			
 
				+        #    continue
			
 
				+        vid_list = []
			
 
				+        if uid not in user_group_dict:
			
 
				+            vid_list.append((vid, score))
			
 
				+            user_group_dict[uid] = vid_list
			
 
				+        else:
			
 
				+            vid_list = user_group_dict[uid]
			
 
				+            vid_list.append((vid, score))
			
 
				+            user_group_dict[uid] = vid_list
			
 
				+    #print(user_group_dict)
			
 
				+    item_pair_dict = {}
			
 
				+    #3. expand item
			
 
				+    for k, v_list in user_group_dict.items():
			
 
				+         v_n = len(v_list)
			
 
				+         if v_n<2:
			
 
				+             continue
			
 
				+         for i in range(v_n):
			
 
				+             for j in range(1, v_n):
			
 
				+                if v_list[i][0] == v_list[j][0]:
			
 
				+                    continue
			
 
				+                item_key = (v_list[i][0], v_list[j][0])
			
 
				+                item_score = 1
			
 
				+                if item_key not in item_pair_dict:
			
 
				+                    item_pair_dict[item_key] = item_score
			
 
				+                else:
			
 
				+                    item_pair_dict[item_key] = item_pair_dict[item_key]+item_score
			
 
				+    #print(item_pair_dict)
			
 
				+    print(item_pair_dict)
			
 
				+    print(item_dict)
			
 
				+    left_pair_num = 0
			
 
				+    rec_item_dict = {}
			
 
				+    #4. rec item
			
 
				+    for k, v in item_pair_dict.items():
			
 
				+        if v<2:
			
 
				+            continue
			
 
				+        left_pair_num+=1
			
 
				+        item1 = k[0]
			
 
				+        item2 = k[1]
			
 
				+        pair_score = v
			
 
				+        if item1 in item_dict:
			
 
				+             item_score1 = item_dict[item1]
			
 
				+             i2i_pro = pair_score/(item_score1+5)
			
 
				+             rec_list1 = []
			
 
				+             if item2 not in rec_item_dict:
			
 
				+                 rec_list1.append((item1, i2i_pro, pair_score, item_score1))
			
 
				+                 rec_item_dict[item2] = rec_list1
			
 
				+             else:
			
 
				+                 rec_list1 = rec_item_dict[item2]
			
 
				+                 rec_list1.append((item1, i2i_pro, pair_score, item_score1))
			
 
				+                 rec_item_dict[item2] = rec_list1
			
 
				+        if item2 in item_dict:
			
 
				+             item_score2 = item_dict[item2]
			
 
				+             i2i_pro = pair_score/(item_score2+5)
			
 
				+             rec_list2 = []
			
 
				+             if item1 not in rec_item_dict:
			
 
				+                 rec_list2.append((item2, i2i_pro, pair_score, item_score2))
			
 
				+                 rec_item_dict[item1] = rec_list2
			
 
				+             else:
			
 
				+                 rec_list2 = rec_item_dict[item1]
			
 
				+                 rec_list2.append((item2, i2i_pro, pair_score, item_score2))
			
 
				+                 rec_item_dict[item1] = rec_list2          
			
 
				+     
			
 
				+    #(item, share_count)
			
 
				+    print(left_pair_num)
			
 
				+    #print(rec_item_dict)
			
 
				+    final_rec_list = []
			
 
				+    #f = open("rec_result", "w")
			
 
				+    #5. sorted item_list
			
 
				+    
			
 
				+    for k,v in rec_item_dict.items():
			
 
				+        v_set = set('')
			
 
				+        value_list = v
			
 
				+        dup_list = []
			
 
				+        for item in value_list:
			
 
				+            if item[0] in v_set:
			
 
				+                continue
			
 
				+            v_set.add(item[0])
			
 
				+            dup_list.append(item)
			
 
				+        sorted_v = sorted(dup_list, key=itemgetter(1), reverse=True)
			
 
				+        final_rec_list.append((k, sorted_v))
			
 
				+    #print(final_rec_list[:1])
			
 
				+    #json_str = json.dumps(final_rec_list)
			
 
				+    with open("./data/rec_result2_"+nowdate+".json", "w") as f :
			
 
				+        json.dump(final_rec_list, f)
			
 
				+    
			
 
				+     
			
--- a/cut_title_top.py
+++ b/cut_title_top.py
@@ -0,0 +1,47 @@
 
				+#coding utf-8
			
 
				+import sys
			
 
				+import jieba 
			
 
				+import os
			
 
				+
			
 
				+if __name__=="__main__":
			
 
				+    #f1 = open(sys.argv[1])
			
 
				+    stop_words = set('')
			
 
				+    path = sys.argv[1]
			
 
				+    files_dir = os.listdir(path)
			
 
				+    #print(files_dir)
			
 
				+    for file_name in files_dir:
			
 
				+        if file_name.find('.txt')>-1:
			
 
				+            f1 = open(path+"/"+file_name)
			
 
				+            while True:
			
 
				+                file_line = f1.readline()
			
 
				+                if not file_line:
			
 
				+                    break
			
 
				+                file_line = file_line.strip()
			
 
				+                stop_words.add(file_line)
			
 
				+            f1.close()
			
 
				+    #print(len(stop_words))
			
 
				+    f = open(sys.argv[2])
			
 
				+    f3 = open(sys.argv[3], 'w')
			
 
				+    while True:
			
 
				+        line = f.readline()
			
 
				+        if not line:
			
 
				+            break
			
 
				+        line = line.strip()
			
 
				+        items = line.split("\t")
			
 
				+        if len(items)<2:
			
 
				+            continue
			
 
				+        vid = items[0]
			
 
				+        title = items[1] 
			
 
				+        cut_info =  jieba.lcut(title)
			
 
				+        cut_arr = []
			
 
				+        for cut_item in cut_info:
			
 
				+            #print("cut_item:", cut_item)
			
 
				+            if cut_item==' ':
			
 
				+                continue
			
 
				+            if cut_item in stop_words:
			
 
				+                continue
			
 
				+            cut_arr.append(cut_item)
			
 
				+        vid_info = vid+'\t'+" ".join(cut_arr)
			
 
				+        f3.write(vid_info.strip()+"\n")
			
 
				+    f3.close()
			
 
				+       
			
--- a/get_batch_sim_k.py
+++ b/get_batch_sim_k.py
@@ -0,0 +1,83 @@
 
				+#coding utf-8
			
 
				+import sys
			
 
				+import pandas as pd
			
 
				+import numpy as np
			
 
				+import faiss
			
 
				+import time
			
 
				+
			
 
				+def gen_i2i(index_item, embeddings,i2i):
			
 
				+    fw=open(i2i,"w")
			
 
				+    #print(i2i)
			
 
				+    start_time = time.time()
			
 
				+    #xb = embeddings
			
 
				+    xb=np.array(embeddings).astype('float32')
			
 
				+    #print(xb)
			
 
				+    #index.add(xb)
			
 
				+    dim, measure = 64, faiss.METRIC_L2  
			
 
				+    param =  'IVF100,PQ16'
			
 
				+    index = faiss.index_factory(dim, param, measure) 
			
 
				+    #print(index.is_trained)                          # 此时输出为False，因为倒排索引需要训练k-means， 
			
 
				+    index.train(xb) 
			
 
				+    end_time = time.time()
			
 
				+    print("time:", (end_time-start_time))
			
 
				+    #index=faiss.IndexFlatL2(100)
			
 
				+    #index.add(embed_matrix)
			
 
				+    #the candicate matrix is embed_matrix,but the search matrix is the same.
			
 
				+    #if the search vector is in the candicate matrix, the return idx>> the first is the search vector itself
			
 
				+    #if the search vector is not in the candicate matrix, the return idx>>the first is the index of the candicate
			
 
				+    batch = 10000
			
 
				+    num = len(embeddings)
			
 
				+    per_rounds = int(num/batch)+1
			
 
				+    #index=faiss.IndexFlatL2(64)
			
 
				+    index.add(xb)
			
 
				+    print("cost time:", (end_time-start_time))
			
 
				+    #distence_matrix,recall_list=index.search(xb, 20)
			
 
				+    #print(distence_matrix)
			
 
				+    #print(recall_list)
			
 
				+    for i in range(per_rounds):
			
 
				+        per_embedding = xb[i:(i+1)*batch]
			
 
				+        #print(per_embedding)
			
 
				+        #print(len(per_embedding))
			
 
				+        distence_matrix,recall_list=index.search(per_embedding, 20)
			
 
				+        #print("distence_matrix:", distence_matrix)
			
 
				+        #print("recall_list:", recall_list)
			
 
				+       	for idx,rec_arr in enumerate(recall_list):
			
 
				+            #print("idx:", idx)
			
 
				+            orgin_item=str(index_item[idx])
			
 
				+            #print("orgin_item:", orgin_item)
			
 
				+            #print("rec_arr:", rec_arr)
			
 
				+            recall_str=""
			
 
				+            for re_id in rec_arr[1:]:
			
 
				+                if re_id in index_item:
			
 
				+                    recall_idstr=str(index_item[re_id])
			
 
				+                    recall_str=recall_str+","+recall_idstr
			
 
				+            fw.write(orgin_item+"\t"+recall_str[1:]+"\n")
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    f = open(sys.argv[1])
			
 
				+    index = 0
			
 
				+    index_dict = {}
			
 
				+    index_arr = []
			
 
				+    while True:
			
 
				+        line = f.readline()
			
 
				+        if not line:
			
 
				+           break
			
 
				+        items = line.strip().split(" ")
			
 
				+        try:
			
 
				+            vid = int(items[0])
			
 
				+            vid_vec = eval(" ".join(items[1:]))
			
 
				+            vid_vec=np.array(vid_vec)
			
 
				+            float_arr = vid_vec.astype(np.float64).tolist()
			
 
				+            #print(float_arr)
			
 
				+            index_arr.append(float_arr)
			
 
				+            #index +=1
			
 
				+            index_dict[index] = vid
			
 
				+            index +=1
			
 
				+            #break
			
 
				+            #print(index_arr)
			
 
				+        except:
			
 
				+            #break
			
 
				+            continue
			
 
				+    f.close()
			
 
				+    #print(index_arr)
			
 
				+    gen_i2i(index_dict, index_arr, "i2i_result")
			
--- a/get_sim_k.py
+++ b/get_sim_k.py
@@ -3,7 +3,7 @@ import sys
 
				 import pandas as pd
			
 
				 import numpy as np
			
 
				 import faiss
			
 
				-
			
 
				+import time
			
 
				 
			
 
				 def gen_i2i(index_item, embeddings,i2i):
			
 
				     fw=open(i2i,"w")
			
@@ -30,25 +30,30 @@ def gen_i2i(index_item, embeddings,i2i):
 
				 if __name__ == '__main__':
			
 
				     f = open(sys.argv[1])
			
 
				     index = 0
			
 
				+    start_time = time.time()
			
 
				     index_dict = {}
			
 
				     index_arr = []
			
 
				     while True:
			
 
				         line = f.readline()
			
 
				         if not line:
			
 
				            break
			
 
				-        line = line.strip()
			
 
				+        line = line.strip().replace("[","").replace("]","")
			
 
				         #print(eval(line))
			
 
				         items = line.split(" ")
			
 
				+        if len(items)<2:
			
 
				+            continue
			
 
				         try:
			
 
				-            vid = int(items[0])
			
 
				-            vid_vec = eval(" ".join(items[1:]))
			
 
				-            index_arr.append(vid_vec)
			
 
				-            #index +=1
			
 
				-            index_dict[index] = vid
			
 
				-            index +=1
			
 
				-            #print(index_arr)
			
 
				+           vid = int(items[0])
			
 
				+           #vid_vec = items[1:]
			
 
				+           print(line.split(" "))
			
 
				+           vid_vec = eval(" ".join(items[1:]))
			
 
				+           index_arr.append(vid_vec)
			
 
				+           index_dict[index] = vid
			
 
				+           index +=1
			
 
				         except:
			
 
				-            continue
			
 
				+           continue
			
 
				     f.close()
			
 
				     print(len(index_arr))
			
 
				-    gen_i2i(index_dict, index_arr, "i2i_result")
			
 
				+    end_time = time.time()
			
 
				+    print("time:", (end_time-start_time))
			
 
				+    #gen_i2i(index_dict, index_arr, "i2i_result")
			
--- a/predict.py
+++ b/predict.py
@@ -13,7 +13,7 @@ if __name__=="__main__":
 
				         if not line:
			
 
				             break
			
 
				         items = line.strip().split(" ")
			
 
				-        if len(items)<100:
			
 
				+        if len(items)<64:
			
 
				             continue
			
 
				         arr = []
			
 
				         for w in items[1:]:
			
@@ -34,7 +34,7 @@ if __name__=="__main__":
 
				             continue
			
 
				         vid = items[0]
			
 
				         title_arr = items[1].split(" ")
			
 
				-        title_info = np.zeros(100)
			
 
				+        title_info = np.zeros(64)
			
 
				         word_len = 0
			
 
				         for word in title_arr:
			
 
				             if word in word_dict:
			
@@ -44,12 +44,13 @@ if __name__=="__main__":
 
				                  #print(word_vec)
			
 
				                  title_info = np.add(title_info, word_vec)
			
 
				                  word_len +=1
			
 
				+        #print(title_info)
			
 
				         title_info_list = []
			
 
				         if word_len<=0:
			
 
				             continue
			
 
				         for j in title_info:
			
 
				             title_info_list.append(j/word_len)
			
 
				         #print("title_info_list:", title_info_list)
			
 
				-        print(vid,title_info_list)
			
 
				+        print(vid,"\t",title_info_list)
			
 
				         
			
 
				     
			
--- a/process_video.py
+++ b/process_video.py
@@ -0,0 +1,53 @@
 
				+#coding utf-8
			
 
				+import sys
			
 
				+import pandas as pd
			
 
				+import numpy as np
			
 
				+import faiss
			
 
				+
			
 
				+
			
 
				+def gen_i2i(index_item, embeddings,i2i):
			
 
				+    fw=open(i2i,"w")
			
 
				+    #print(i2i)
			
 
				+    embed_matrix=np.array(embeddings).astype('float32')
			
 
				+    #print(embed_matrix)
			
 
				+    index=faiss.IndexFlatL2(100)
			
 
				+    index.add(embed_matrix)
			
 
				+    #the candicate matrix is embed_matrix,but the search matrix is the same.
			
 
				+    #if the search vector is in the candicate matrix, the return idx>> the first is the search vector itself
			
 
				+    #if the search vector is not in the candicate matrix, the return idx>>the first is the index of the candicate
			
 
				+    distence_matrix,recall_list=index.search(embed_matrix, 20)
			
 
				+    for idx,rec_arr in enumerate(recall_list):
			
 
				+        #print("idx:", idx)
			
 
				+        orgin_item=str(index_item[idx])
			
 
				+        recall_str=""
			
 
				+        #rec_arr=[0 6 3 8 7 1]
			
 
				+        for re_id in rec_arr[1:]:
			
 
				+            recall_idstr=str(index_item[re_id])
			
 
				+            #print(recall_idstr)
			
 
				+            recall_str=recall_str+","+recall_idstr
			
 
				+        fw.write(orgin_item+"\t"+recall_str[1:]+"\n")
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    f = open(sys.argv[1])
			
 
				+    index = 0
			
 
				+    index_dict = {}
			
 
				+    index_arr = []
			
 
				+    while True:
			
 
				+        line = f.readline()
			
 
				+        if not line:
			
 
				+           break
			
 
				+        line = line.strip()
			
 
				+        #print(line)
			
 
				+        items = line.split(" ")
			
 
				+        #print(int(items[0]))
			
 
				+        try:
			
 
				+            vid = int(items[0])
			
 
				+            print(line)
			
 
				+            #print(str(vid)+"\t"+items[1:])
			
 
				+            #print(index_arr)
			
 
				+        except:
			
 
				+            #print(int(items[0]))
			
 
				+            continue
			
 
				+    f.close()
			
 
				+    #print(len(index_arr))
			
 
				+    #gen_i2i(index_dict, index_arr, "i2i_result")
			
--- a/test_faiss.py
+++ b/test_faiss.py
@@ -0,0 +1,11 @@
 
				+import numpy as np
			
 
				+d = 64                                           # 向量维度
			
 
				+nb = 100000                                      # index向量库的数据量
			
 
				+nq = 10000                                       # 待检索query的数目
			
 
				+np.random.seed(1234)             
			
 
				+xb = np.random.random((nb, d)).astype('float32')
			
 
				+#xb[:, 0] += np.arange(nb) / 1000.                # index向量库的向量
			
 
				+xq = np.random.random((nq, d)).astype('float32')
			
 
				+#xq[:, 0] += np.arange(nq) / 1000.
			
 
				+
			
 
				+print(xb)
			
--- a/train_vec.sh
+++ b/train_vec.sh
@@ -0,0 +1,12 @@
 
				+cd /home/rec/project/git_project/OffLineRec
			
 
				+
			
 
				+#1.cut_title
			
 
				+nowday=`date  +"%Y%m%d" -d -0days`
			
 
				+nowday=20230512
			
 
				+
			
 
				+#python cut_title.py ./stopwords/ ./data/video_title_${nowday} ./data/video_cut_title_${nowday}
			
 
				+
			
 
				+#python word2vec.py 
			
 
				+
			
 
				+
			
 
				+#python predict.py ./data/word2vec_cut_title  > ./data/video_title_embedding
			
--- a/word2vec.py
+++ b/word2vec.py
@@ -15,14 +15,14 @@ if __name__=="__main__":
 
				         if  num == 1:
			
 
				             continue
			
 
				         items = line.strip().split("\t")
			
 
				-        print(items)
			
 
				+        #print(items)
			
 
				         if len(items)<2:
			
 
				             continue
			
 
				         arr.append(items[1].split(" "))
			
 
				-        print(arr)
			
 
				+        #print(arr)
			
 
				     f.close()
			
 
				-    '''model = word2vec.Word2Vec(arr, vector_size=100, min_count=1,sg=1)
			
 
				-    model.wv.save_word2vec_format('word2vec.txt',binary=False)'''
			
 
				+    model = word2vec.Word2Vec(arr, vector_size=64, min_count=2,sg=1, workers=10)
			
 
				+    model.wv.save_word2vec_format('word2vec.txt',binary=False)
			
 
				     #model.save('word2vec.model')