# coding: utf-8
import sys
import jieba 
import os

def load_stop_words(dir_path):
    """Collect stop words from the ``.txt`` files directly inside *dir_path*.

    Every line of every directory entry whose name contains ``.txt``
    contributes one entry, stripped of surrounding whitespace. The listing
    is not recursive.

    Args:
        dir_path: Directory holding the stop-word lists.

    Returns:
        A set with one string per (stripped) line read.
    """
    stop_words = set()
    for file_name in os.listdir(dir_path):
        # Containment (not a suffix test) is kept on purpose so that the
        # same set of files is picked up as before.
        if '.txt' not in file_name:
            continue
        # NOTE(review): no explicit encoding, so the locale default is used;
        # confirm the data files match it (UTF-8 is presumably intended).
        with open(os.path.join(dir_path, file_name)) as stop_file:
            stop_words.update(line.strip() for line in stop_file)
    return stop_words


def filter_tokens(tokens, stop_words):
    """Return *tokens* without single-space tokens and without stop words.

    Args:
        tokens: Iterable of token strings, in order.
        stop_words: Container of tokens to drop.

    Returns:
        A list of the surviving tokens in their original order.
    """
    return [tok for tok in tokens if tok != ' ' and tok not in stop_words]


def segment_titles(src_path, dst_path, stop_words, cut=None):
    """Tokenize the title column of *src_path* and write it to *dst_path*.

    Input lines are split on tabs; the first field is treated as an id and
    the second as the title. Lines with fewer than two fields are skipped.
    Each output line is the id, a tab, and the kept tokens joined by single
    spaces (trailing whitespace removed, so an id with no surviving tokens
    is written alone).

    Args:
        src_path: Path of the tab-separated input file.
        dst_path: Path of the output file; it is overwritten.
        stop_words: Container of tokens to drop from every title.
        cut: Callable mapping a title string to a list of tokens. Defaults
            to ``jieba.lcut``.
    """
    if cut is None:
        cut = jieba.lcut
    # ``with`` closes both files even if tokenizing or writing fails.
    with open(src_path) as src, open(dst_path, 'w') as dst:
        for raw_line in src:
            fields = raw_line.strip().split("\t")
            if len(fields) < 2:
                continue
            vid, title = fields[0], fields[1]
            kept = filter_tokens(cut(title), stop_words)
            dst.write((vid + '\t' + " ".join(kept)).strip() + "\n")


if __name__ == "__main__":
    # argv[1]: stop-word directory, argv[2]: input TSV, argv[3]: output file.
    if len(sys.argv) < 4:
        sys.exit("usage: %s STOP_WORD_DIR INPUT_TSV OUTPUT_FILE" % sys.argv[0])
    segment_titles(sys.argv[2], sys.argv[3], load_stop_words(sys.argv[1]))