1234567891011121314151617181920212223242526272829303132333435363738394041424344454647 |
import os
import sys

import jieba
# Chinese text is processed throughout, so never rely on the platform default
# encoding. NOTE(review): assumes the data files are UTF-8 -- confirm.
_ENCODING = "utf-8"


def _load_stop_words(dir_path):
    """Collect stop words from every ``.txt`` file directly inside *dir_path*.

    Each line of each file, stripped of surrounding whitespace, is one stop
    word. Returns the words as a set.
    """
    stop_words = set()
    for file_name in os.listdir(dir_path):
        # Substring match (not a suffix match) is kept from the original
        # script so the same files are selected as before.
        if '.txt' not in file_name:
            continue
        file_path = os.path.join(dir_path, file_name)
        # A sub-directory whose name contains ".txt" cannot be opened; skip it.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, encoding=_ENCODING) as handle:
            stop_words.update(line.strip() for line in handle)
    return stop_words


def _segment_line(line, stop_words, cut):
    """Build the output record for one ``id<TAB>title`` input line.

    Returns ``None`` when the stripped line has fewer than two tab-separated
    fields; otherwise returns ``id<TAB>tokens`` where the tokens are the
    segments of the title that are neither a single space nor a stop word,
    joined by single spaces.
    """
    items = line.strip().split("\t")
    if len(items) < 2:
        return None
    vid, title = items[0], items[1]
    kept = [token for token in cut(title)
            if token != ' ' and token not in stop_words]
    # strip() drops the trailing tab left behind when every token was removed.
    return (vid + '\t' + " ".join(kept)).strip()


def main(argv=None, cut=None):
    """Segment the titles of a tab-separated file and drop stop words.

    Args:
        argv: ``[stop_words_dir, input_path, output_path]``; defaults to
            ``sys.argv[1:]``.
        cut: callable mapping a title string to a list of tokens; defaults
            to ``jieba.lcut``.

    Returns:
        0 on success, 2 when fewer than three arguments were supplied.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        sys.stderr.write(
            "usage: {} STOP_WORDS_DIR INPUT_FILE OUTPUT_FILE\n".format(
                sys.argv[0]))
        return 2
    if cut is None:
        cut = jieba.lcut

    stop_dir, input_path, output_path = args[:3]
    stop_words = _load_stop_words(stop_dir)

    # Both files are closed even if segmentation raises part-way through.
    with open(input_path, encoding=_ENCODING) as source, \
            open(output_path, 'w', encoding=_ENCODING) as sink:
        for line in source:
            record = _segment_line(line, stop_words, cut)
            if record is not None:
                sink.write(record + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())
-
|