extract_title_tag.py 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. #coding utf-8
  2. import sys
  3. import jieba
  4. from jieba import analyse
  5. import jieba.posseg as pseg
  6. import re
  7. import os
  8. if __name__=="__main__":
  9. #f1 = open(sys.argv[1])
  10. stop_words = set('')
  11. '''path = sys.argv[1]
  12. files_dir = os.listdir(path)
  13. #print(files_dir)
  14. for file_name in files_dir:
  15. if file_name.find('.txt')>-1:
  16. f1 = open(path+"/"+file_name)
  17. while True:
  18. file_line = f1.readline()
  19. if not file_line:
  20. break
  21. file_line = file_line.strip()
  22. stop_words.add(file_line)
  23. f1.close()
  24. #print(len(stop_words))'''
  25. analyse.set_stop_words("all_stopword.txt")
  26. f = open(sys.argv[1])
  27. f3 = open(sys.argv[2], 'w')
  28. while True:
  29. line = f.readline()
  30. if not line:
  31. break
  32. line = line.strip()
  33. items = line.split("\t")
  34. if len(items)<2:
  35. continue
  36. vid = items[0]
  37. title = items[1]
  38. #cut_info = pseg.cut(title)
  39. #tfif_top =jieba.analyse.extract_tags(title,topK=3, withWeight=True, allowPOS=("nr","ns","n","nt","nw","nz","vn","v","a", "d", "f", "s","t", "PER", "LOC", "ORG"))
  40. tfif_top =jieba.analyse.extract_tags(title,topK=4, withWeight=True)
  41. #text_rank_top =jieba.analyse.textrank(title,topK=3,withWeight=True, allowPOS=("nr","ns","n","nt","nw","nz","vn","v","a", "d", "f", "s","t", "PER", "LOC", "ORG"))
  42. #print(title)
  43. #print(tfif_top)
  44. #print(text_rank_top)
  45. tags = []
  46. for word in tfif_top:
  47. #pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$')
  48. #result = pattern.match(word)
  49. #if result:
  50. # continue
  51. if word[0].isdigit():
  52. continue
  53. try:
  54. vid = float(word[0])
  55. continue
  56. except:
  57. tags.append(str(word[0]))
  58. #print('%s %s' % (word[0], word[1]))
  59. #print('%s %s' % (word[0], word[1]))
  60. if len(tags)>0:
  61. #print(tags)
  62. vid_info=str(vid)+"\t"+",".join(tags)
  63. f3.write(vid_info.strip()+"\n")
  64. #print("--------------")
  65. '''cut_arr = []
  66. for cut_item in cut_info:
  67. #print("cut_item:", cut_item)
  68. if cut_item==' ':
  69. continue
  70. if cut_item in stop_words:
  71. continue
  72. cut_arr.append(cut_item)'''
  73. #vid_info = vid+'\t'+" ".join(cut_arr)
  74. #f3.write(vid_info.strip()+"\n")
  75. f3.close()