# -*- coding: utf-8 -*-
"""Segment ``id<TAB>title`` records with jieba and drop stop words.

Usage:
    python SCRIPT STOP_WORDS_DIR INPUT_FILE OUTPUT_FILE

STOP_WORDS_DIR is scanned for files whose name contains ``.txt``; every
non-empty line in them is one stop word.  INPUT_FILE holds one record per
line, with at least two tab-separated fields (id, title).  For each such
record OUTPUT_FILE receives ``id<TAB>tok1 tok2 ...`` where the tokens are the
jieba segmentation of the title minus single-space tokens and stop words.
All files are read and written as UTF-8.
"""
import os
import sys


def load_stop_words(directory):
    """Return the set of stop words found in the ``.txt`` files of *directory*.

    Each line is stripped of surrounding whitespace; lines that end up empty
    are ignored.  Entries that are not regular files are skipped.
    """
    stop_words = set()
    for file_name in os.listdir(directory):
        # Substring match (not suffix match) is kept on purpose: it is the
        # selection rule this script has always used.
        if ".txt" not in file_name:
            continue
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, encoding="utf-8") as handle:
            for raw_line in handle:
                word = raw_line.strip()
                if word:
                    stop_words.add(word)
    return stop_words


def filter_tokens(tokens, stop_words):
    """Return *tokens* without single-space tokens and without stop words."""
    return [tok for tok in tokens if tok != " " and tok not in stop_words]


def segment_line(line, stop_words, cut):
    """Convert one input line into its output record.

    *cut* is the segmentation callable (``jieba.lcut`` in production); it is
    passed in so this function does not depend on jieba itself.

    Returns the record without a trailing newline, or ``None`` when the line
    has fewer than two tab-separated fields and must be skipped.
    """
    fields = line.strip().split("\t")
    if len(fields) < 2:
        return None
    vid, title = fields[0], fields[1]
    kept = filter_tokens(cut(title), stop_words)
    # strip() removes the dangling tab when every token was filtered out.
    return (vid + "\t" + " ".join(kept)).strip()


def main(argv=None):
    """Run the script; *argv* defaults to ``sys.argv[1:]``.

    Returns the process exit status: 0 on success, 2 on a usage error.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        sys.stderr.write(
            "usage: python SCRIPT STOP_WORDS_DIR INPUT_FILE OUTPUT_FILE\n"
        )
        return 2
    stop_dir, input_path, output_path = args[:3]

    # Imported here so that a usage error is reported without loading the
    # segmenter, and so the helpers above have no third-party dependency.
    import jieba

    # Load stop words before touching the output file, so a bad stop-word
    # directory does not leave an empty output behind.
    stop_words = load_stop_words(stop_dir)

    with open(input_path, encoding="utf-8") as src, \
            open(output_path, "w", encoding="utf-8") as dst:
        for line in src:
            record = segment_line(line, stop_words, jieba.lcut)
            if record is not None:
                dst.write(record + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())