3 anos atrás · eddfabd576
--- a/extract_title_tag.py
+++ b/extract_title_tag.py
@@ -0,0 +1,78 @@
 
				+# coding: utf-8
			
 
				+import sys
			
 
				+import jieba
			
 
				+from jieba import analyse
			
 
				+import jieba.posseg as pseg
			
 
				+import re
			
 
				+ 
			
 
				+import os
			
 
				+
			
 
				+if __name__=="__main__":
			
 
				+    #f1 = open(sys.argv[1])
			
 
				+    stop_words = set('')
			
 
				+    '''path = sys.argv[1]
			
 
				+    files_dir = os.listdir(path)
			
 
				+    #print(files_dir)
			
 
				+    for file_name in files_dir:
			
 
				+        if file_name.find('.txt')>-1:
			
 
				+            f1 = open(path+"/"+file_name)
			
 
				+            while True:
			
 
				+                file_line = f1.readline()
			
 
				+                if not file_line:
			
 
				+                    break
			
 
				+                file_line = file_line.strip()
			
 
				+                stop_words.add(file_line)
			
 
				+            f1.close()
			
 
				+    #print(len(stop_words))'''
			
 
				+    analyse.set_stop_words("all_stopword.txt")
			
 
				+    f = open(sys.argv[1])
			
 
				+    f3 = open(sys.argv[2], 'w')
			
 
				+    while True:
			
 
				+        line = f.readline()
			
 
				+        if not line:
			
 
				+            break
			
 
				+        line = line.strip()
			
 
				+        items = line.split("\t")
			
 
				+        if len(items)<2:
			
 
				+            continue
			
 
				+        vid = items[0]
			
 
				+        title = items[1] 
			
 
				+        #cut_info =  pseg.cut(title)
			
 
				+        #tfif_top =jieba.analyse.extract_tags(title,topK=3, withWeight=True, allowPOS=("nr","ns","n","nt","nw","nz","vn","v","a", "d", "f", "s","t", "PER", "LOC", "ORG"))
			
 
				+        tfif_top =jieba.analyse.extract_tags(title,topK=4, withWeight=True)
			
 
				+        #text_rank_top =jieba.analyse.textrank(title,topK=3,withWeight=True, allowPOS=("nr","ns","n","nt","nw","nz","vn","v","a", "d", "f", "s","t", "PER", "LOC", "ORG"))
			
 
				+        #print(title)
			
 
				+        #print(tfif_top)
			
 
				+        #print(text_rank_top)
			
 
				+        tags = []
			
 
				+        for word in tfif_top:
			
 
				+            #pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$')
			
 
				+            #result = pattern.match(word)
			
 
				+            #if result:
			
 
				+            #    continue
			
 
				+            if word[0].isdigit():
			
 
				+                continue
			
 
				+            try:
			
 
				+                vid = float(word[0])
			
 
				+                continue
			
 
				+            except:
			
 
				+                tags.append(str(word[0]))
			
 
				+                #print('%s %s' % (word[0], word[1]))
			
 
				+            #print('%s %s' % (word[0], word[1]))
			
 
				+        if len(tags)>0:
			
 
				+            #print(tags)
			
 
				+            vid_info=str(vid)+"\t"+",".join(tags)
			
 
				+            f3.write(vid_info.strip()+"\n")
			
 
				+            #print("--------------")
			
 
				+        '''cut_arr = []
			
 
				+        for cut_item in cut_info:
			
 
				+            #print("cut_item:", cut_item)
			
 
				+            if cut_item==' ':
			
 
				+                continue
			
 
				+            if cut_item in stop_words:
			
 
				+                continue
			
 
				+            cut_arr.append(cut_item)'''
			
 
				+        #vid_info = vid+'\t'+" ".join(cut_arr)
			
 
				+        #f3.write(vid_info.strip()+"\n")
			
 
				+    f3.close()
			
 
				+