|
@@ -0,0 +1,35 @@
|
|
|
+import os
|
|
|
+import pandas as pd
|
|
|
+import random
|
|
|
+import datetime
|
|
|
+import time
|
|
|
+
|
|
|
+
|
|
|
def neg_under_sample(data_dir, sample_data_dir, dt, neg_drop_prob=0.3, seed=None):
    """Under-sample negative rows of one daily CSV and write the result.

    Reads ``{data_dir}/{dt}.csv``, keeps every row whose ``share_status`` is 1,
    independently discards each row whose ``share_status`` is 0 with
    probability ``neg_drop_prob``, shuffles the surviving rows and writes them
    to ``{sample_data_dir}/{dt}.csv`` (without the index). Row counts are
    printed to stdout.

    Rows whose ``share_status`` is neither 0 nor 1 are not carried over to the
    output.

    NOTE(review): the original comment states the result is roughly
    pos:neg = 1:10; that depends on the input data and is not checked here.

    Args:
        data_dir: Directory containing the input ``{dt}.csv``.
        sample_data_dir: Output directory; created if it does not exist.
        dt: File stem of the partition to process (the caller in this file
            passes a ``YYYYMMDD`` string).
        neg_drop_prob: Probability in ``[0, 1]`` of discarding each negative
            row. Defaults to 0.3.
        seed: Optional seed making both the negative sampling and the final
            shuffle reproducible. When ``None`` the module-level ``random``
            generator and pandas' default random state are used.

    Raises:
        ValueError: If ``neg_drop_prob`` is outside ``[0, 1]``.
    """
    if not 0.0 <= neg_drop_prob <= 1.0:
        raise ValueError(
            f"neg_drop_prob must be within [0, 1], got {neg_drop_prob!r}"
        )

    # With no seed, draw from the shared module-level generator so that a
    # caller who seeded `random` globally still gets the same stream.
    rng = random if seed is None else random.Random(seed)

    data_df = pd.read_csv(os.path.join(data_dir, f"{dt}.csv"))
    pos_df = data_df[data_df['share_status'] == 1]
    neg_df = data_df[data_df['share_status'] == 0]

    # Keep a negative row when its draw exceeds the drop probability. The mask
    # lives outside the frame so an input column named 'rand' is not clobbered.
    keep_mask = pd.Series(
        [rng.random() > neg_drop_prob for _ in range(len(neg_df))],
        index=neg_df.index,
        dtype=bool,
    )
    sample_neg_df = neg_df.loc[keep_mask]

    # Shuffle so positives and negatives are interleaved in the output file.
    sample_df = pd.concat([pos_df, sample_neg_df])
    sample_df = sample_df.sample(frac=1.0, random_state=seed).reset_index(drop=True)

    # Write the CSV; exist_ok avoids a race between the check and the creation.
    os.makedirs(sample_data_dir, exist_ok=True)
    sample_df.to_csv(os.path.join(sample_data_dir, f"{dt}.csv"), index=False)

    print(f"pos num: {pos_df.shape[0]}")
    print(f"neg num: {neg_df.shape[0]}")
    print(f"neg_sample num: {sample_neg_df.shape[0]}")
    print(f"initial data num: {data_df.shape[0]}")
    print(f"sample data num: {sample_df.shape[0]}")
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Input and output directories, relative to the current working directory.
    data_dir = './data/train_data'
    sample_data_dir = './data/sample_train_data'
    # Process yesterday's file: the stem is the local date formatted as YYYYMMDD.
    yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
    dt = yesterday.strftime('%Y%m%d')
    neg_under_sample(data_dir=data_dir, sample_data_dir=sample_data_dir, dt=dt)
|