@@ -49,7 +49,7 @@ max_minute=20
 global_init() {
     # Get the current hour to decide which data partition range to use
     local current_hour="$(date +%H)"
-    if [ $current_hour -lt 08 ]; then
+    if [ $current_hour -le 05 ]; then
         train_begin_str=${today_early_1}14
         train_end_str=${today_early_1}21
         predict_begin_str=${today_early_1}22
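A side note on the hour comparison (illustrative only, not part of this patch): the `[ ... ]` test builtin parses `-lt`/`-le` operands as plain decimal integers, so zero-padded hours from `date +%H` such as `08` compare correctly here. If the check were ever rewritten with bash arithmetic, the leading zero would be read as octal and `08`/`09` would fail, so a base-10-safe variant would look like this sketch:

    # Sketch only: the same window check in bash arithmetic. The 10# prefix
    # forces the zero-padded hour from `date +%H` to be read as base-10.
    current_hour="$(date +%H)"
    if (( 10#$current_hour <= 5 )); then
        : # same branch body as in the patch above
    fi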
@@ -70,8 +70,8 @@ global_init() {
         trainBucketFeaturePath=${bucketFeatureSavePathHome}/${today}/train
         predictBucketFeaturePath=${bucketFeatureSavePathHome}/${today}/predict

-        local_model_file_path=${MODEL_HOME}/${train_end_str}.txt
-        local_change_model_file_path=${MODEL_HOME}/${train_end_str}_change.txt
+        local_model_file_path=${MODEL_HOME}/${model_name}_${train_end_str}.txt
+        local_change_model_file_path=${MODEL_HOME}/${model_name}_${train_end_str}_change.txt
         max_hour=21

     else
@@ -160,10 +160,8 @@ make_origin_data() {

 }

-# Feature bucketing: training data and prediction data go to separate directories
-make_bucket_feature() {
-    local step_start_time=$(date +%s)
-    # Training data
+# Bucket features for the training data
+make_train_bucket_feature() {
     /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
     --class com.aliyun.odps.spark.zhp.makedata_ad.makedata_ad_33_bucketData_20240717 \
     --master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
@@ -172,11 +170,10 @@ make_bucket_feature() {
     filterNames:adid_,targeting_conversion_ \
     readPath:${originDataSavePath} \
     savePath:${trainBucketFeaturePath}
+}

-    local return_code=$?
-    check_run_status $return_code $step_start_time "Spark feature bucketing job: training data bucketing"
-
-    # Prediction data
+# Bucket features for the prediction data
+make_predict_bucket_feature() {
     /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
     --class com.aliyun.odps.spark.zhp.makedata_ad.makedata_ad_33_bucketData_20240717 \
     --master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
@@ -185,9 +182,31 @@ make_bucket_feature() {
     filterNames:adid_,targeting_conversion_ \
     readPath:${originDataSavePath} \
     savePath:${predictBucketFeaturePath}
+}
+
+
+# Feature bucketing: training data and prediction data go to separate directories
+make_bucket_feature() {
+    local step_start_time=$(date +%s)
+
+    # Training data
+    make_train_bucket_feature &
+    train_bucket_pid=$!
+
+    wait $train_bucket_pid
+
+    local train_return_code=$?
+    check_run_status $train_return_code $step_start_time "Spark feature bucketing job: training data bucketing"
+
+
+    # Prediction data
+    make_predict_bucket_feature &
+    predict_bucket_pid=$!
+
+    wait $predict_bucket_pid

-    return_code=$?
-    check_run_status $return_code $step_start_time "Spark feature bucketing job: prediction data bucketing"
+    local predict_return_code=$?
+    check_run_status $predict_return_code $step_start_time "Spark feature bucketing job: prediction data bucketing"
 }

 # Model training
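As the new make_bucket_feature stands, each helper is backgrounded and then waited on immediately, so the training and prediction bucketing jobs still run one after the other. If the point of splitting them out is to overlap the two Spark jobs, a hedged sketch of what that could look like (the make_bucket_feature_parallel name is hypothetical; it assumes check_run_status only needs an exit code and a start timestamp, and that the two jobs can safely run concurrently):

    # Sketch only: launch both bucketing jobs, then collect each exit status.
    # `wait <pid>` returns the exit status of that background job.
    make_bucket_feature_parallel() {
        local step_start_time=$(date +%s)

        make_train_bucket_feature &
        local train_bucket_pid=$!
        make_predict_bucket_feature &
        local predict_bucket_pid=$!

        wait $train_bucket_pid
        check_run_status $? $step_start_time "Spark feature bucketing job: training data bucketing"

        wait $predict_bucket_pid
        check_run_status $? $step_start_time "Spark feature bucketing job: prediction data bucketing"
    }

One caveat: if check_run_status exits the script on failure, the other job would be left running in the background, so that behavior is worth confirming before overlapping them.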
@@ -199,25 +218,44 @@ model_train() {
     check_run_status $return_code $step_start_time "Model training"
 }

+
+# Compute the AUC of the online model
+calc_online_model_auc() {
+    $HADOOP fs -text ${predictBucketFeaturePath}/*/* | ${FM_HOME}/bin/fm_predict -m ${LAST_MODEL_HOME}/model_online.txt -dim 8 -core 8 -out ${PREDICT_PATH}/${model_name}_${train_end_str}_online.txt
+    online_auc=`cat ${PREDICT_PATH}/${model_name}_${train_end_str}_online.txt | /root/sunmingze/AUC/AUC`
+}
+
+calc_new_model_auc() {
+    $HADOOP fs -text ${predictBucketFeaturePath}/*/* | ${FM_HOME}/bin/fm_predict -m ${local_model_file_path} -dim 8 -core 8 -out ${PREDICT_PATH}/${model_name}_${train_end_str}_new.txt
+    new_auc=`cat ${PREDICT_PATH}/${model_name}_${train_end_str}_new.txt | /root/sunmingze/AUC/AUC`
+}
+
 # AUC comparison
 auc_compare() {
     local step5_start_time=$(date +%s)

     # 5.1 Compute the AUC of the online model
     local step_start_time=$(date +%s)
-    $HADOOP fs -text ${predictBucketFeaturePath}/*/* | ${FM_HOME}/bin/fm_predict -m ${LAST_MODEL_HOME}/model_online.txt -dim 8 -core 8 -out ${PREDICT_PATH}/${model_name}_${train_end_str}_online.txt
-    online_auc=`cat ${PREDICT_PATH}/${model_name}_${train_end_str}_online.txt | /root/sunmingze/AUC/AUC`
-
+
+    calc_online_model_auc &
+    local calc_online_model_auc_pid=$!
+
+    wait $calc_online_model_auc_pid
     local return_code=$?
     check_run_status $return_code $step_start_time "Online model AUC computation"

     # 5.2 Compute the AUC of the new model
     step_start_time=$(date +%s)
-    $HADOOP fs -text ${predictBucketFeaturePath}/*/* | ${FM_HOME}/bin/fm_predict -m ${local_model_file_path} -dim 8 -core 8 -out ${PREDICT_PATH}/${model_name}_${train_end_str}_new.txt
-    new_auc=`cat ${PREDICT_PATH}/${model_name}_${train_end_str}_new.txt | /root/sunmingze/AUC/AUC`

-    return_code=$?
-    check_run_status $return_code $step_start_time "New model AUC computation"
-
+    calc_new_model_auc &
+    local calc_new_model_auc_pid=$!
+
+    wait $calc_new_model_auc_pid
+
+    local new_return_code=$?
+    check_run_status $new_return_code $step_start_time "New model AUC computation"
+
+

     echo "AUC comparison: online model AUC: ${online_auc}, new model AUC: ${new_auc}"
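One thing worth flagging in the reworked auc_compare: calc_online_model_auc and calc_new_model_auc are launched with &, so they run in subshells, and the online_auc / new_auc assignments they make do not propagate back to the parent shell; within this diff, the closing echo would see empty values. A hedged sketch of one way to keep the background launch and still recover the scores, by re-reading the prediction files the helpers already write (illustrative only, not part of this patch):

    # Sketch only: wait for the backgrounded helper, check its exit status,
    # then recompute the score in the parent from the file the helper wrote.
    calc_online_model_auc &
    wait $!
    check_run_status $? $step_start_time "Online model AUC computation"
    online_auc=`cat ${PREDICT_PATH}/${model_name}_${train_end_str}_online.txt | /root/sunmingze/AUC/AUC`

    calc_new_model_auc &
    wait $!
    check_run_status $? $step_start_time "New model AUC computation"
    new_auc=`cat ${PREDICT_PATH}/${model_name}_${train_end_str}_new.txt | /root/sunmingze/AUC/AUC`

This runs the AUC binary a second time on each prediction file; an alternative is to have each helper write its score to a small file that auc_compare reads back after wait.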
@@ -322,11 +360,11 @@ main() {
     model_to_online_format

-    model_upload_oss
+    # model_upload_oss

-    model_local_back
+    # model_local_back

-    success_inform
+    # success_inform
 }