# coding: utf-8
import os
import re
import sys

import jieba
import jieba.posseg as pseg
from jieba import analyse
def is_numeric_token(token):
    """Return True when *token* should be discarded as a number.

    A token counts as numeric if ``str.isdigit`` accepts it or if ``float``
    can parse it (so "3.5", "-2", "1e5" and also "nan"/"inf" are numeric).
    The probe never touches any caller state.
    """
    if token.isdigit():
        return True
    try:
        float(token)
    except ValueError:
        # Not parseable as a number: keep it as a real keyword.
        return False
    return True


def select_tags(weighted_keywords):
    """Return the keyword strings from (keyword, weight) pairs, minus numbers.

    Order is preserved; the weights are ignored.
    """
    return [
        str(keyword)
        for keyword, _weight in weighted_keywords
        if not is_numeric_token(keyword)
    ]


def format_record(vid, weighted_keywords):
    """Build one output line for a record, or None if no tag survives.

    The line is the id, a tab, the comma-joined tags, and a newline.
    The id is always the one passed in: the numeric probe used while
    filtering keywords must not replace it.
    """
    tags = select_tags(weighted_keywords)
    if not tags:
        return None
    return (str(vid) + "\t" + ",".join(tags)).strip() + "\n"


def main(argv):
    """Extract up to four keywords per title and write them as tags.

    argv[1] is a tab-separated input file whose first two columns are the
    record id and its title; argv[2] is the output file. Lines with fewer
    than two columns and records with no remaining tag are skipped.
    """
    if len(argv) < 3:
        sys.exit("usage: %s <input_file> <output_file>" % argv[0])

    # Stop words are loaded before any file is opened, so a missing
    # stop-word list fails before the output file is created/truncated.
    analyse.set_stop_words("all_stopword.txt")

    # NOTE(review): files are opened with the platform default encoding, as
    # before; pass encoding="utf-8" here if the data is known to be UTF-8.
    with open(argv[1]) as src, open(argv[2], "w") as dst:
        for raw_line in src:
            fields = raw_line.strip().split("\t")
            if len(fields) < 2:
                continue
            vid, title = fields[0], fields[1]
            keywords = analyse.extract_tags(title, topK=4, withWeight=True)
            record = format_record(vid, keywords)
            if record is not None:
                dst.write(record)


if __name__ == "__main__":
    main(sys.argv)