3 Commits 30a6a86389 ... a1db8f641b

Author SHA1 Message Date
  StrayWarrior a1db8f641b Update data_fields_v3 and inspect_features 3 days ago
  StrayWarrior bc132fd297 Update widedeep_v13_4: fix title split 1 week ago
  StrayWarrior 29c39056cf Rename widedeep_v13_3 to widedeep_v13_4 1 week ago
3 changed files with 30 additions and 10 deletions
  1. data_fields_v3.config (+3 -0)
  2. inspect_features.py (+14 -7)
  3. widedeep_v13_4.py (+13 -3)

+ 3 - 0
data_fields_v3.config

@@ -745,3 +745,6 @@ user_skuid_click_30d BIGINT
 user_skuid_conver_3d BIGINT
 user_skuid_conver_7d BIGINT
 user_skuid_conver_30d BIGINT
+is_weekday BIGINT
+day_of_the_week BIGINT
+user_conver_ad_class STRING
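
For reference, the two new calendar fields can be derived from the event timestamp at feature-generation time. A minimal sketch in Python (the epoch-seconds `ts` input and the Monday=1 numbering are assumptions, not taken from this commit):

    import datetime

    def calendar_features(ts: int):
        # ts is assumed to be an epoch timestamp in seconds (hypothetical input)
        dt = datetime.datetime.fromtimestamp(ts)
        day_of_the_week = dt.isoweekday()   # Monday=1 .. Sunday=7 (assumed convention)
        is_weekday = 1 if day_of_the_week <= 5 else 0
        return is_weekday, day_of_the_week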

+ 14 - 7
inspect_features.py

@@ -39,7 +39,8 @@ sparse_features = [
     "user_adverid_conver_3d", "user_adverid_conver_7d", "user_adverid_conver_30d",
     "user_skuid_view_3d", "user_skuid_view_7d", "user_skuid_view_30d",
     "user_skuid_click_3d", "user_skuid_click_7d", "user_skuid_click_30d",
-    "user_skuid_conver_3d", "user_skuid_conver_7d", "user_skuid_conver_30d"
+    "user_skuid_conver_3d", "user_skuid_conver_7d", "user_skuid_conver_30d",
+    "user_conver_ad_class"
 ]
 
 int_features = [
@@ -60,10 +61,10 @@ def get_data():
     dense_features = [name.strip().lower() for name in dense_features]
     feature_names = ','.join(dense_features + sparse_features)
 
-    partitions = "dt in ('20250620')"
+    partitions = "dt in ('20250709')"
     sql = f''' SELECT {feature_names},has_conversion
            FROM loghubods.ad_easyrec_train_realtime_data_v3_sampled_temp
-           WHERE {partitions} AND adverid = '598'
+           WHERE {partitions} AND adverid = '523'
     '''
            # AND ts BETWEEN unix_timestamp('2025-05-14 17:40:00') AND unix_timestamp('2025-05-14 18:00:00')
     data_query_hash = hashlib.sha1(sql.encode("utf-8")).hexdigest()[0:8]
@@ -137,7 +138,7 @@ def clear_feature(df, column):
     df[column] = zero_value
     return df
 
-def build_req(df):
+def build_req(df, save_req=None):
     feature_names = df.columns.tolist()
     batch_size = len(df)
     req = TFRequest('serving_default')
@@ -149,6 +150,9 @@ def build_req(df):
             values = [bytes(x, 'utf-8') for x in values]
         req.add_feed(name, [batch_size], tf_type, values)
     req.add_fetch('probs')
+    if save_req:
+        with open(save_req, "wb") as f:
+            f.write(req.to_string())
     return req
 
 def predict_by_batches(df, batch_size = 512):
@@ -188,8 +192,10 @@ def permutate_feature_and_predict(df):
 
 
 def clear_feature_by_prefix_and_predict(df):
-    feature_prefix_list = ["actionstatic","adid","adverid","apptype","b2","b3","b4","b5","b6","b7","b8","brand","cate1","cate2","cid","city","clickall","converall","cpa","creative","ctcvr","ctr","cvr","d1","e1","e2","ecpm","has","hour","incomeall","is","profession","region","root","timediff","title","user","vid","viewall"
-]
+    feature_prefix_list = [
+        # "actionstatic","adid","adverid","apptype","b2","b3","b4","b5","b6","b7","b8","brand","cate1","cate2","cid","city","clickall","converall","cpa","creative","ctcvr","ctr","cvr","d1","e1","e2","ecpm","has","hour","incomeall","is","profession","region","root","timediff","title","user","vid","viewall",
+        "user_conver_ad_class"
+    ]
     base_scores = client.predict(build_req(df)).response.outputs['probs'].float_val
     base_scores = np.array(base_scores)
     base_scores = base_scores / (base_scores + (1 - base_scores) / 0.04)
@@ -289,5 +295,6 @@ if __name__ == '__main__':
     # print(df[['vid', 'cid', 'adid', 'adverid', 'apptype', 'hour', 'hour_quarter', 'is_first_layer']])
     # clear_feature_and_predict(df)
     # permutate_feature_and_predict(df)
-    clear_feature_by_prefix_and_predict(df)
+    # clear_feature_by_prefix_and_predict(df)
+    # scores = client.predict(build_req(df, 'warmup_widedeep_v12.bin')).response.outputs['probs'].float_val
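
The new `save_req` hook dumps the serialized TFRequest protobuf to disk, which is useful for producing model-warmup payloads for the serving side. A usage sketch based on the commented-out line above (the single-row slice and output file name are illustrative assumptions):

    # Build a request from one row and persist it as a warmup binary (hypothetical file name).
    req = build_req(df.head(1), save_req='warmup_widedeep_v13_4.bin')
    scores = client.predict(req).response.outputs['probs'].float_val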
 

+ 13 - 3
widedeep_v13_3.py → widedeep_v13_4.py

@@ -15,6 +15,8 @@
 9. Tune embedding variables (not yet supported on the PAI platform)
 """
 
+import os
+os.environ['PROCESSOR_TEST'] = "1"
 import re
 from easy_rec.python.protos.pipeline_pb2 import EasyRecConfig
 from easy_rec.python.protos.train_pb2 import TrainConfig
@@ -51,7 +53,7 @@ sparse_features = [
     "cid", "adid", "adverid",
     "region", "city", "brand",
     "vid", "cate1", "cate2",
-    "apptype", "hour", "hour_quarter", "root_source_scene", "root_source_channel", "is_first_layer", "title_split", "user_has_conver_1y",
+    "apptype", "hour", "hour_quarter", "root_source_scene", "root_source_channel", "is_first_layer", "user_has_conver_1y",
     "user_adverid_view_3d", "user_adverid_view_7d", "user_adverid_view_30d",
     "user_adverid_click_3d", "user_adverid_click_7d", "user_adverid_click_30d",
     "user_adverid_conver_3d", "user_adverid_conver_7d", "user_adverid_conver_30d",
@@ -61,7 +63,8 @@ sparse_features = [
 ]
 tag_features = [
     "user_vid_return_tags_2h", "user_vid_return_tags_1d", "user_vid_return_tags_3d",
-    "user_vid_return_tags_7d", "user_vid_return_tags_14d"
+    "user_vid_return_tags_7d", "user_vid_return_tags_14d",
+    "user_conver_ad_class", "title_split"
 ]
 seq_features = [
     "user_cid_click_list", "user_cid_conver_list"
@@ -74,7 +77,7 @@ input_type_map = {
 }
 
 use_ev_features = [
-    "cid", "adid", "adverid", "vid"
+    # The TF build running on PAI apparently does not support this
 ]
 
 bucket_size_map = {
@@ -91,6 +94,7 @@ bucket_size_map = {
     'root_source_channel': 1000,
     'is_first_layer': 100,
     'user_has_conver_1y': 100,
+    'user_conver_ad_class': 10000
 }
 
 def create_config():
@@ -163,6 +167,8 @@ def create_config():
     for name in dense_features:
         feature_config = FeatureConfig()
         feature_config.input_names.append(name)
+        if name not in input_fields:
+            raise Exception(f"{name} not found in input fields")
         feature_config.feature_type = FeatureConfig.RawFeature
         feature_config.boundaries.extend(boundaries)
         feature_config.embedding_dim = 6
@@ -172,6 +178,8 @@ def create_config():
     for name in sparse_features:
         feature_config = FeatureConfig()
         feature_config.input_names.append(name)
+        if name not in input_fields:
+            raise Exception(f"{name} not found in input fields")
         feature_config.feature_type = FeatureConfig.IdFeature
         # Only INT64-typed features can use the embedding variable feature
         if name in use_ev_features:
@@ -187,6 +195,8 @@ def create_config():
     for name in tag_features + seq_features:
         feature_config = FeatureConfig()
         feature_config.input_names.append(name)
+        if name not in input_fields:
+            raise Exception(f"{name} not found in input fields")
         feature_config.feature_type = FeatureConfig.TagFeature
         feature_config.hash_bucket_size = bucket_size_map.get(name, 1000000)
         feature_config.embedding_dim = 6
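
The same three-line membership check is now repeated for the dense, sparse, and tag/seq feature loops; it could be factored into a single fail-fast helper. A sketch (the helper name is hypothetical, not from this commit):

    def assert_in_input_fields(name, input_fields):
        # Fail fast when a configured feature is missing from the input schema.
        if name not in input_fields:
            raise Exception(f"{name} not found in input fields")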