# ad_feature_data_sample.py
  1. import os
  2. import pandas as pd
  3. import random
  4. import datetime
  5. import time
  6. def neg_under_sample(data_dir, sample_data_dir, dt):
  7. # 欠采样,对负样本按照30%的概率进行舍弃,pos:neg ≈ 1:10
  8. data_df = pd.read_csv(f"{data_dir}/{dt}.csv")
  9. pos_df = data_df[data_df['share_status'] == 1].copy()
  10. neg_df = data_df[data_df['share_status'] == 0].copy()
  11. neg_df['rand'] = [random.uniform(0, 1) for i in range(neg_df.shape[0])]
  12. rate = 1 - pos_df.shape[0] / neg_df.shape[0]
  13. sample_neg_df = neg_df[neg_df['rand'] > rate]
  14. sample_neg_df = sample_neg_df.drop(columns=['rand'])
  15. sample_df = pd.concat([pos_df, sample_neg_df])
  16. sample_df = sample_df.sample(frac=1.0).reset_index(drop=True)
  17. # 写入csv
  18. if not os.path.exists(sample_data_dir):
  19. os.makedirs(sample_data_dir)
  20. sample_df.to_csv(f"{sample_data_dir}/{dt}.csv", index=False)
  21. print(f"pos num: {pos_df.shape[0]}")
  22. print(f"neg num: {neg_df.shape[0]}")
  23. print(f"neg_sample num: {sample_neg_df.shape[0]}")
  24. print(f"initial data num: {data_df.shape[0]}")
  25. print(f"sample data num: {sample_df.shape[0]}")
  26. if __name__ == '__main__':
  27. st_time = time.time()
  28. data_dir = './data/train_data'
  29. sample_data_dir = './data/sample_train_data'
  30. now_date = datetime.datetime.today()
  31. print(f"now_date: {datetime.datetime.strftime(now_date, '%Y%m%d')}")
  32. dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=15), '%Y%m%d')
  33. print(f"update data dt: {dt}")
  34. neg_under_sample(data_dir=data_dir, sample_data_dir=sample_data_dir, dt=dt)
  35. print(f"{time.time() - st_time}s")
  36. # for days in range(3, 19):
  37. # cur_dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=days), '%Y%m%d')
  38. # print(f"cur_dt = {cur_dt}")
  39. # neg_under_sample(data_dir=data_dir, sample_data_dir=sample_data_dir, dt=cur_dt)