process_data.py 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. """
  2. Process the raw data into the input format required by LightGBM.
  3. """
  4. import sys
  5. import os
  6. import json
  7. from tqdm import tqdm
  8. sys.path.append(os.getcwd())
  9. from functions import generate_label_date
  10. def generate_train_label(item, y_ori_data, cate):
  11. """
  12. 生成训练数据,用 np.array矩阵的方式返回,
  13. :return: x_train, 训练数据, y_train, 训练 label
  14. """
  15. video_id = item['video_id']
  16. dt = item['dt']
  17. userful_features = [
  18. "uid",
  19. "type",
  20. "channel",
  21. "fans",
  22. "view_count_user_30days",
  23. "share_count_user_30days",
  24. "return_count_user_30days",
  25. "rov_user",
  26. "str_user",
  27. "out_user_id",
  28. "mode",
  29. "out_play_cnt",
  30. "out_like_cnt",
  31. "out_share_cnt",
  32. "out_collection_cnt"
  33. ]
  34. item_features = [item[i] for i in userful_features]
  35. label_dt = generate_label_date(dt)
  36. label_obj = y_ori_data.get(label_dt, {}).get(video_id)
  37. if label_obj:
  38. label = int(label_obj[cate]) if label_obj[cate] else 0
  39. else:
  40. label = 0
  41. return label, item_features
  42. if __name__ == '__main__':
  43. x_path = 'data/hour_train.json'
  44. y_path = 'data/daily-label-20240101-20240320.json'
  45. with open(x_path) as f:
  46. x_data = json.loads(f.read())
  47. with open(y_path) as f:
  48. y_data = json.loads(f.read())
  49. cate_list = ['total_return', '3day_up_level', 'total_view', 'total_share']
  50. for c in cate_list:
  51. x_list = []
  52. y_list = []
  53. for video_obj in tqdm(x_data):
  54. print(video_obj)
  55. our_label, features = generate_train_label(video_obj, y_data, c)
  56. x_list.append(features)
  57. y_list.append(our_label)
  58. # print(len(y_list))
  59. with open("whole_data/x_data_{}.json".format(c), "w") as f1:
  60. f1.write(json.dumps(x_list, ensure_ascii=False))
  61. with open("whole_data/y_data_{}.json".format(c), "w") as f2:
  62. f2.write(json.dumps(y_list, ensure_ascii=False))