|
@@ -0,0 +1,78 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+import sys
|
|
|
+import jieba
|
|
|
+from jieba import analyse
|
|
|
+import jieba.posseg as pseg
|
|
|
+import re
|
|
|
+
|
|
|
+import os
|
|
|
+
|
|
|
# Keyword tagging script.
#
# Usage: <script> INPUT OUTPUT
#   INPUT  - text file, one record per line: "vid<TAB>title[<TAB>...]"
#   OUTPUT - text file, one record per line: "vid<TAB>kw1,kw2,..."
#
# For every title, up to four TF-IDF keywords are extracted with jieba;
# numeric keywords are dropped, and records left without keywords are skipped.


def is_numeric_token(token):
    """Return True when *token* is a number rather than a word.

    A token counts as numeric if ``str.isdigit`` accepts it or ``float`` can
    parse it. ``float`` also accepts spellings such as "nan" and "inf", so
    those are treated as numeric as well.
    """
    if token.isdigit():
        return True
    try:
        # The parsed value is deliberately discarded: only whether the
        # token parses matters here.
        float(token)
    except ValueError:
        return False
    return True


def select_tags(weighted_terms):
    """Return the non-numeric terms of *weighted_terms*, in order.

    Args:
        weighted_terms: iterable of ``(term, weight)`` pairs; only the term
            (index 0) is used.

    Returns:
        A list of the terms, converted with ``str``, that are not numeric
        according to ``is_numeric_token``.
    """
    return [
        str(pair[0])
        for pair in weighted_terms
        if not is_numeric_token(pair[0])
    ]


def tag_lines(lines, extract_tags):
    """Yield one output record per input line that produces keywords.

    Args:
        lines: iterable of text lines shaped like "vid<TAB>title[<TAB>...]".
            Each line is stripped first; lines with fewer than two
            tab-separated fields are skipped.
        extract_tags: callable taking the title and returning an iterable of
            ``(term, weight)`` pairs.

    Yields:
        Strings of the form "vid<TAB>tag1,tag2,...\\n". Lines whose title
        yields no non-numeric keyword produce nothing.
    """
    for raw_line in lines:
        fields = raw_line.strip().split("\t")
        if len(fields) < 2:
            continue
        vid, title = fields[0], fields[1]
        tags = select_tags(extract_tags(title))
        if tags:
            yield (vid + "\t" + ",".join(tags)).strip() + "\n"


if __name__ == "__main__":
    # Stop words are loaded once and apply to every extract_tags call below.
    analyse.set_stop_words("all_stopword.txt")

    def top_keywords(title):
        """Return up to four (keyword, weight) pairs for *title*."""
        return jieba.analyse.extract_tags(title, topK=4, withWeight=True)

    # NOTE(review): both files are opened with the platform default encoding;
    # confirm that matches the encoding of the input data.
    with open(sys.argv[1]) as src, open(sys.argv[2], "w") as dst:
        dst.writelines(tag_lines(src, top_keywords))
|
|
|
+
|