Browse Source

feat:修改str特征生产脚本

zhaohaipeng 1 month ago
parent
commit
4999afc8d3
2 changed files with 24 additions and 96 deletions
  1. + 12 - 48
      recommend/00_train_data_make.sh
  2. + 12 - 48
      recommend/00_train_data_make_day.sh

+ 12 - 48
recommend/00_train_data_make.sh

@@ -10,65 +10,29 @@ export SEGMENT_BASE_PATH=/dw/recommend/model/36_model_attachment/score_calibrati
 HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
 
 
-dts=('20250313' '20250314' '20250315' '20250316')
+dts=('20250317' '20250318')
 for dt in "${dts[@]}"; do
     echo "开始处理: ${dt}"
 
+    # STR负样本采样
     /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
-    --class com.aliyun.odps.spark.examples.makedata_recsys.v20250218.makedata_recsys_41_originData_20250218 \
-    --master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 10 \
-    --conf spark.yarn.executor.memoryOverhead=2G \
-    /root/zhaohp/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
-    tablePart:64 beginStr:${dt}00 endStr:${dt}11 repartition:32 \
-    savePath:/dw/recommend/model/41_recsys_origin_date \
-    table:dwd_recsys_alg_sample_all_20250212 &
-
-    /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
-    --class com.aliyun.odps.spark.examples.makedata_recsys.v20250218.makedata_recsys_41_originData_20250218 \
-    --master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 10 \
-    --conf spark.yarn.executor.memoryOverhead=2G \
-    /root/zhaohp/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
-    tablePart:64 beginStr:${dt}12 endStr:${dt}17 repartition:32 \
-    savePath:/dw/recommend/model/41_recsys_origin_date \
-    table:dwd_recsys_alg_sample_all_20250212 &
-
-    /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
-    --class com.aliyun.odps.spark.examples.makedata_recsys.v20250218.makedata_recsys_41_originData_20250218 \
-    --master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 10 \
-    --conf spark.yarn.executor.memoryOverhead=2G \
+    --class com.aliyun.odps.spark.examples.makedata_recsys.v20250218.makedata_recsys_41_str_train_data_sample_20250319 \
+    --master yarn --driver-memory 6G --executor-memory 6G --executor-cores 1 --num-executors 28 \
     /root/zhaohp/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
-    tablePart:64 beginStr:${dt}18 endStr:${dt}23 repartition:32 \
-    savePath:/dw/recommend/model/41_recsys_origin_date \
-    table:dwd_recsys_alg_sample_all_20250212 &
+    tablePart:64 beginStr:${dt}00 endStr:${dt}23 \
+    savePath:/dw/recommend/model/41_recsys_str_train_data/${dt} \
+    fuSampleRate:0.05 whatLabel:is_share repartition:64
 
-    wait 
-    echo "${dt} 原始特征生产完成"
+    echo "${dt} Str负样本采样完成"
 
     # ROS数据过滤
     /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
     --class com.aliyun.odps.spark.examples.makedata_recsys.v20250218.makedata_recsys_41_ros_train_data_20250304 \
     --master yarn --driver-memory 2G --executor-memory 5G --executor-cores 1 --num-executors 12 \
-    --conf spark.driver.maxResultSize=2g \
     /root/zhaohp/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
-    readPath:/dw/recommend/model/41_recsys_origin_date/${dt}*/* \
-    savePath:/dw/recommend/model/41_recsys_ros_train_data/${dt} \
-    whatLabel:is_share repartition:64 &
-
-    # STR负样本采样
-    /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
-    --class com.aliyun.odps.spark.examples.makedata_recsys.v20250218.makedata_recsys_41_str_train_data_20250218 \
-    --master yarn --driver-memory 4G --executor-memory 5G --executor-cores 1 --num-executors 12 \
-    --conf spark.driver.maxResultSize=2g \
-    ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
-    readPath:/dw/recommend/model/41_recsys_origin_date/${dt}*/* \
-    savePath:/dw/recommend/model/41_recsys_str_train_data/${dt} \
-    fuSampleRate:0.05 whatLabel:is_share repartition:64 &
-
-    wait
-    echo "${dt} 负样本采样完成"
-
-    # 删除原始特征,释放空间
-    ${HADOOP} fs -rm -r /dw/recommend/model/41_recsys_origin_date/${dt}*
-    echo "删除 ${dt} 的原始特征数据"
+    readPath:/dw/recommend/model/41_recsys_str_train_data/${dt}*/* \
+    savePath:/dw/recommend/model/41_recsys_ros_train_data/${dt}/ \
+    whatLabel:is_share repartition:64
 
+    echo "${dt} Ros样本过滤完成"
 done

+ 12 - 48
recommend/00_train_data_make_day.sh

@@ -11,62 +11,26 @@ HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
 
 dt="$(date -d '2 days ago' +%Y%m%d)"
 
-
 echo "开始处理: ${dt}"
 
+# STR负样本采样
 /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
---class com.aliyun.odps.spark.examples.makedata_recsys.v20250218.makedata_recsys_41_originData_20250218 \
---master yarn --driver-memory 4G --executor-memory 6G --executor-cores 1 --num-executors 13 \
---conf spark.yarn.executor.memoryOverhead=2G \
-/root/zhaohp/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
-tablePart:64 beginStr:${dt}00 endStr:${dt}11 repartition:32 \
-savePath:/dw/recommend/model/41_recsys_origin_date \
-table:dwd_recsys_alg_sample_all_20250212 &
-
-/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
---class com.aliyun.odps.spark.examples.makedata_recsys.v20250218.makedata_recsys_41_originData_20250218 \
---master yarn --driver-memory 4G --executor-memory 6G --executor-cores 1 --num-executors 13 \
---conf spark.yarn.executor.memoryOverhead=2G \
-/root/zhaohp/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
-tablePart:64 beginStr:${dt}12 endStr:${dt}17 repartition:32 \
-savePath:/dw/recommend/model/41_recsys_origin_date \
-table:dwd_recsys_alg_sample_all_20250212 &
-
-/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
---class com.aliyun.odps.spark.examples.makedata_recsys.v20250218.makedata_recsys_41_originData_20250218 \
---master yarn --driver-memory 4G --executor-memory 6G --executor-cores 1 --num-executors 13 \
---conf spark.yarn.executor.memoryOverhead=2G \
+--class com.aliyun.odps.spark.examples.makedata_recsys.v20250218.makedata_recsys_41_str_train_data_sample_20250319 \
+--master yarn --driver-memory 6G --executor-memory 6G --executor-cores 1 --num-executors 28 \
 /root/zhaohp/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
-tablePart:64 beginStr:${dt}18 endStr:${dt}23 repartition:32 \
-savePath:/dw/recommend/model/41_recsys_origin_date \
-table:dwd_recsys_alg_sample_all_20250212 &
+tablePart:64 beginStr:${dt}00 endStr:${dt}23 \
+savePath:/dw/recommend/model/41_recsys_str_train_data/${dt} \
+fuSampleRate:0.05 whatLabel:is_share repartition:64
 
-wait 
-echo "${dt} 原始特征生产完成"
+echo "${dt} Str负样本采样完成"
 
 # ROS数据过滤
 /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
 --class com.aliyun.odps.spark.examples.makedata_recsys.v20250218.makedata_recsys_41_ros_train_data_20250304 \
---master yarn --driver-memory 2G --executor-memory 5G --executor-cores 1 --num-executors 15 \
---conf spark.driver.maxResultSize=2g \
+--master yarn --driver-memory 2G --executor-memory 5G --executor-cores 1 --num-executors 12 \
 /root/zhaohp/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
-readPath:/dw/recommend/model/41_recsys_origin_date/${dt}*/* \
-savePath:/dw/recommend/model/41_recsys_ros_train_data/${dt} \
-whatLabel:is_share repartition:64 &
-
-# STR负样本采样
-/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
---class com.aliyun.odps.spark.examples.makedata_recsys.v20250218.makedata_recsys_41_str_train_data_20250218 \
---master yarn --driver-memory 4G --executor-memory 5G --executor-cores 1 --num-executors 15 \
---conf spark.driver.maxResultSize=2g \
-./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
-readPath:/dw/recommend/model/41_recsys_origin_date/${dt}*/* \
-savePath:/dw/recommend/model/41_recsys_str_train_data/${dt} \
-fuSampleRate:0.05 whatLabel:is_share repartition:64 &
-
-wait
-echo "${dt} 负样本采样完成"
+readPath:/dw/recommend/model/41_recsys_str_train_data/${dt}*/* \
+savePath:/dw/recommend/model/41_recsys_ros_train_data/${dt}/ \
+whatLabel:is_share repartition:64
 
-# 删除原始特征,释放空间
-${HADOOP} fs -rm -r /dw/recommend/model/41_recsys_origin_date/${dt}*
-echo "删除 ${dt} 的原始特征数据"
+echo "${dt} Ros样本过滤完成"