1234567891011121314151617181920212223242526272829303132333435363738394041424344454647 |
# -*- coding: utf-8 -*-
- import sys
- import jieba
- import os
def load_stop_words(path):
    """Collect stop words from the ``.txt`` files found directly in *path*.

    Every directory entry whose name contains ``.txt`` is read as UTF-8
    text; each line, stripped of surrounding whitespace, becomes one stop
    word. Blank lines therefore contribute the empty string, which is
    harmless because the tokenizer output is never an empty token match
    that matters downstream.

    Args:
        path: Directory holding the stop-word lists.

    Returns:
        A set with the stop words from all matching files.
    """
    stop_words = set()
    for file_name in os.listdir(path):
        # Substring match (not a suffix match) is kept on purpose so the
        # same files are picked up as before.
        if '.txt' not in file_name:
            continue
        with open(os.path.join(path, file_name), encoding="utf-8") as stop_file:
            for file_line in stop_file:
                stop_words.add(file_line.strip())
    return stop_words


def filter_tokens(tokens, stop_words):
    """Return *tokens* without single-space tokens and stop words.

    The relative order of the remaining tokens is preserved.
    """
    return [
        token for token in tokens
        if token != ' ' and token not in stop_words
    ]


def segment_line(line, stop_words, tokenize=None):
    """Convert one ``vid<TAB>title`` record into ``vid<TAB>tok tok ...``.

    Args:
        line: Raw input line; surrounding whitespace is stripped first.
        stop_words: Tokens to drop from the segmented title.
        tokenize: Callable mapping the title to a list of tokens.
            Defaults to ``jieba.lcut``; injectable so the function can be
            exercised without jieba.

    Returns:
        The output record without a trailing newline, or ``None`` when the
        stripped line has fewer than two tab-separated fields. Fields after
        the second one are ignored.
    """
    items = line.strip().split("\t")
    if len(items) < 2:
        return None
    if tokenize is None:
        tokenize = jieba.lcut
    vid, title = items[0], items[1]
    kept = filter_tokens(tokenize(title), stop_words)
    # strip() removes the dangling tab when every token was filtered out.
    return (vid + '\t' + " ".join(kept)).strip()


def main(argv=None):
    """Segment the titles of a TSV file and write them without stop words.

    Args:
        argv: Argument vector laid out like ``sys.argv``:
            ``[prog, stop_words_dir, input_file, output_file]``.
            Defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        sys.exit("usage: %s <stop_words_dir> <input_file> <output_file>" % argv[0])

    stop_words = load_stop_words(argv[1])
    # Explicit UTF-8 keeps decoding independent of the platform locale, and
    # the context manager closes both handles even if segmentation fails.
    with open(argv[2], encoding="utf-8") as source, \
            open(argv[3], 'w', encoding="utf-8") as sink:
        for line in source:
            record = segment_line(line, stop_words)
            if record is None:
                continue
            sink.write(record + "\n")


if __name__ == "__main__":
    main()
-
|