Jelajahi Sumber

update & add ad_feature_data_sample.py

liqian 1 tahun lalu
induk
melakukan
1017acf52a
2 mengubah file dengan 36 tambahan dan 4 penghapusan
  1. 35 0
      ad_feature_data_sample.py
  2. 1 4
      ad_feature_process.py

+ 35 - 0
ad_feature_data_sample.py

@@ -0,0 +1,35 @@
+import os
+import pandas as pd
+import random
+import datetime
+import time
+
+
+def neg_under_sample(data_dir, sample_data_dir, dt):  # under-sample negatives of {data_dir}/{dt}.csv and write the result to {sample_data_dir}/{dt}.csv
+    # Under-sampling: each negative sample is discarded with roughly 30% probability (original author's note: resulting pos:neg is about 1:10 -- not verifiable from this code alone)
+    data_df = pd.read_csv(f"{data_dir}/{dt}.csv")  # load the full data set for the given date string
+    neg_df = data_df[data_df['share_status'] == 0].copy()  # negatives: rows with share_status == 0
+    neg_df['rand'] = [random.uniform(0, 1) for i in range(neg_df.shape[0])]  # one uniform draw per negative row; no seed is set in this file, so results differ between runs
+    sample_neg_df = neg_df[neg_df['rand'] > 0.3]  # keep only rows whose draw exceeds 0.3 (about 70% of negatives)
+    sample_neg_df = sample_neg_df.drop(columns=['rand'])  # remove the helper column so it is not written out
+    pos_df = data_df[data_df['share_status'] == 1].copy()  # positives: rows with share_status == 1, all kept
+    sample_df = pd.concat([pos_df, sample_neg_df])  # rows whose share_status is neither 0 nor 1 are not included
+    sample_df = sample_df.sample(frac=1.0).reset_index(drop=True)  # shuffle all rows, then renumber the index from 0
+    # Write the sampled data to CSV
+    if not os.path.exists(sample_data_dir):  # create the output directory on first use
+        os.makedirs(sample_data_dir)
+    sample_df.to_csv(f"{sample_data_dir}/{dt}.csv", index=False)  # index=False: do not emit the index as a column
+    print(f"pos num: {pos_df.shape[0]}")
+    print(f"neg num: {neg_df.shape[0]}")  # negative count before sampling
+    print(f"neg_sample num: {sample_neg_df.shape[0]}")  # negative count after sampling
+    print(f"initial data num: {data_df.shape[0]}")
+    print(f"sample data num: {sample_df.shape[0]}")
+
+
+if __name__ == '__main__':  # script entry point: under-sample yesterday's training data
+    st_time = time.time()  # NOTE(review): start timestamp is assigned but never read in this file
+    data_dir = './data/train_data'  # input directory holding one {dt}.csv per day
+    sample_data_dir = './data/sample_train_data'  # output directory for the sampled CSV
+    now_date = datetime.datetime.today()  # current local date and time
+    dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=1), '%Y%m%d')  # yesterday formatted as YYYYMMDD
+    neg_under_sample(data_dir=data_dir, sample_data_dir=sample_data_dir, dt=dt)

+ 1 - 4
ad_feature_process.py

@@ -1,10 +1,7 @@
 import os.path
 import time
 import datetime
-
 import pandas as pd
-from utils import get_data_from_odps
-from odps.df import DataFrame
 from odps import ODPS
 from config import set_config
 
@@ -154,7 +151,7 @@ def daily_data_process(project, table, features, dt, app_type):
     train_data_dir = './data/train_data'
     if not os.path.exists(train_data_dir):
         os.makedirs(train_data_dir)
-    train_df.to_csv(f"{train_data_dir}/{dt}.csv")
+    train_df.to_csv(f"{train_data_dir}/{dt}.csv", index=False)
     return train_df