# cut_title.py
# -*- coding: utf-8 -*-
import os
import sys

import jieba
  5. if __name__=="__main__":
  6. #f1 = open(sys.argv[1])
  7. stop_words = set('')
  8. path = sys.argv[1]
  9. files_dir = os.listdir(path)
  10. #print(files_dir)
  11. for file_name in files_dir:
  12. if file_name.find('.txt')>-1:
  13. f1 = open(path+"/"+file_name)
  14. while True:
  15. file_line = f1.readline()
  16. if not file_line:
  17. break
  18. file_line = file_line.strip()
  19. stop_words.add(file_line)
  20. f1.close()
  21. #print(len(stop_words))
  22. f = open(sys.argv[2])
  23. f3 = open(sys.argv[3], 'w')
  24. while True:
  25. line = f.readline()
  26. if not line:
  27. break
  28. line = line.strip()
  29. items = line.split("\t")
  30. if len(items)<2:
  31. continue
  32. vid = items[0]
  33. title = items[1]
  34. cut_info = jieba.lcut(title)
  35. cut_arr = []
  36. for cut_item in cut_info:
  37. #print("cut_item:", cut_item)
  38. if cut_item==' ':
  39. continue
  40. if cut_item in stop_words:
  41. continue
  42. cut_arr.append(cut_item)
  43. vid_info = vid+'\t'+" ".join(cut_arr)
  44. f3.write(vid_info.strip()+"\n")
  45. f3.close()