# -*- coding: utf-8 -*-
"""Extract TF-IDF keyword tags from tab-separated "id<TAB>title" records.

Usage:
    python <this_script> INPUT_FILE OUTPUT_FILE

Every input line is split on tabs; lines with fewer than two fields are
skipped. The first field is treated as the record id and the second as the
title. For each title the top keywords reported by jieba's TF-IDF extractor
are collected, purely numeric keywords are dropped, and a line of the form
``id<TAB>kw1,kw2,...`` is written to the output file. Records that yield no
keywords produce no output line.
"""
import os
import re
import sys

import jieba
import jieba.posseg as pseg
from jieba import analyse

# Stop-word list handed to jieba's keyword extractor (relative to the CWD).
STOP_WORDS_PATH = "all_stopword.txt"
# Number of keywords requested from the TF-IDF extractor per title.
TOP_K = 4
# Encoding used for both the input and the output file.
# NOTE(review): assumes the data files are UTF-8, as the file's coding hint
# suggests -- confirm against the real data before deploying.
FILE_ENCODING = "utf-8"


def _is_numeric(term):
    """Return True if *term* is a digit string or parses as a float."""
    if term.isdigit():
        return True
    try:
        float(term)
    except ValueError:
        return False
    return True


def _extract_tags(title):
    """Return the non-numeric TF-IDF keywords of *title* as a list of str."""
    # withWeight=True makes jieba yield (term, weight) pairs; only the term
    # is kept.
    weighted_terms = analyse.extract_tags(title, topK=TOP_K, withWeight=True)
    return [str(term) for term, _weight in weighted_terms
            if not _is_numeric(term)]


def main(argv):
    """Read records from argv[1] and write "id<TAB>tags" lines to argv[2].

    Exits with a usage message when fewer than two paths are supplied.
    """
    if len(argv) < 3:
        sys.exit("usage: %s INPUT_FILE OUTPUT_FILE" % argv[0])

    analyse.set_stop_words(STOP_WORDS_PATH)

    # Context managers guarantee both files are closed, even on error.
    with open(argv[1], encoding=FILE_ENCODING) as source, \
            open(argv[2], "w", encoding=FILE_ENCODING) as sink:
        for raw_line in source:
            fields = raw_line.strip().split("\t")
            if len(fields) < 2:
                continue
            vid, title = fields[0], fields[1]

            # The numeric filter lives in a helper so that probing a keyword
            # with float() can never overwrite the record id.
            tags = _extract_tags(title)
            if tags:
                record = str(vid) + "\t" + ",".join(tags)
                sink.write(record.strip() + "\n")


if __name__ == "__main__":
    main(sys.argv)