Ver código fonte

extract_title_tag.py

linfan 1 ano atrás
pai
commit
eddfabd576
1 arquivos alterados com 78 adições e 0 exclusões
  1. 78 0
      extract_title_tag.py

+ 78 - 0
extract_title_tag.py

@@ -0,0 +1,78 @@
+# coding: utf-8
+import sys
+import jieba
+from jieba import analyse
+import jieba.posseg as pseg
+import re
+ 
+import os
+
+if __name__=="__main__":
+    #f1 = open(sys.argv[1])
+    stop_words = set('')
+    '''path = sys.argv[1]
+    files_dir = os.listdir(path)
+    #print(files_dir)
+    for file_name in files_dir:
+        if file_name.find('.txt')>-1:
+            f1 = open(path+"/"+file_name)
+            while True:
+                file_line = f1.readline()
+                if not file_line:
+                    break
+                file_line = file_line.strip()
+                stop_words.add(file_line)
+            f1.close()
+    #print(len(stop_words))'''
+    analyse.set_stop_words("all_stopword.txt")
+    f = open(sys.argv[1])
+    f3 = open(sys.argv[2], 'w')
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        line = line.strip()
+        items = line.split("\t")
+        if len(items)<2:
+            continue
+        vid = items[0]
+        title = items[1] 
+        #cut_info =  pseg.cut(title)
+        #tfif_top =jieba.analyse.extract_tags(title,topK=3, withWeight=True, allowPOS=("nr","ns","n","nt","nw","nz","vn","v","a", "d", "f", "s","t", "PER", "LOC", "ORG"))
+        tfif_top =jieba.analyse.extract_tags(title,topK=4, withWeight=True)
+        #text_rank_top =jieba.analyse.textrank(title,topK=3,withWeight=True, allowPOS=("nr","ns","n","nt","nw","nz","vn","v","a", "d", "f", "s","t", "PER", "LOC", "ORG"))
+        #print(title)
+        #print(tfif_top)
+        #print(text_rank_top)
+        tags = []
+        for word in tfif_top:
+            #pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$')
+            #result = pattern.match(word)
+            #if result:
+            #    continue
+            if word[0].isdigit():
+                continue
+            try:
+                vid = float(word[0])
+                continue
+            except:
+                tags.append(str(word[0]))
+                #print('%s %s' % (word[0], word[1]))
+            #print('%s %s' % (word[0], word[1]))
+        if len(tags)>0:
+            #print(tags)
+            vid_info=str(vid)+"\t"+",".join(tags)
+            f3.write(vid_info.strip()+"\n")
+            #print("--------------")
+        '''cut_arr = []
+        for cut_item in cut_info:
+            #print("cut_item:", cut_item)
+            if cut_item==' ':
+                continue
+            if cut_item in stop_words:
+                continue
+            cut_arr.append(cut_item)'''
+        #vid_info = vid+'\t'+" ".join(cut_arr)
+        #f3.write(vid_info.strip()+"\n")
+    f3.close()
+