
feat: modify scripts

root, 8 months ago
commit cc60a4eb64

+ 2 - 2
ad/25_ad_data_make.sh

@@ -29,8 +29,8 @@ today_early_1="$(date -d '1 days ago' +%Y%m%d)"
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 tablePart:64 repartition:32 \
 beginStr:${today_early_1}00 endStr:${today_early_1}23 \
-savePath: ${TRAIN_PATH} \
-table: ${TABLE} \
+savePath:${TRAIN_PATH} \
+table:${TABLE} \
 filterHours:00,01,02,03,04,05,06,07 \
 idDefaultValue:0.1
 

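The fix above only removes the space after the colon, but that space changes how the shell passes the arguments: with it, savePath: and the expanded path reach the Spark job as two separate arguments, so no single key:value token is formed (the same applies to table:), and the job presumably expects one token per parameter. A minimal sketch of the difference, using a hypothetical count_args helper that is not part of the scripts:

count_args() { echo "$# argument(s): $*"; }

TRAIN_PATH=/dw/recommend/model/31_ad_sample_data_v4
count_args savePath: ${TRAIN_PATH}   # 2 argument(s): savePath: /dw/recommend/...
count_args savePath:${TRAIN_PATH}    # 1 argument(s): savePath:/dw/recommend/...
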
+ 45 - 0
ad/25_ad_data_make_tmp.sh

@@ -0,0 +1,45 @@
+#!/bin/sh
+
+# Ad data generation
+
+set -x 
+
+source /root/anaconda3/bin/activate py37
+
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+# Global constants
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+FM_HOME=/root/sunmingze/alphaFM
+
+TRAIN_PATH=/dw/recommend/model/31_ad_sample_data_v4
+BUCKET_FEATURE_PATH=/dw/recommend/model/33_ad_train_data_v4
+TABLE=alg_recsys_ad_sample_all
+
+today="$(date +%Y%m%d)"
+today_early_1="$(date -d '1 days ago' +%Y%m%d)"
+
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_ad.v20240718.makedata_ad_31_originData_20240718 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:2024081600 endStr:2024081823 \
+savePath:${TRAIN_PATH} \
+table:${TABLE} \
+filterHours:00,01,02,03,04,05,06,07 \
+idDefaultValue:0.1
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_ad.v20240718.makedata_ad_33_bucketData_20240718 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+beginStr:20240816 endStr:20240818 repartition:100 \
+filterNames:_4h_,_5h_,_ecpm,ecpm_,adid_,targeting_conversion_ \
+readPath:${TRAIN_PATH} \
+savePath:${BUCKET_FEATURE_PATH}
+

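The new ad/25_ad_data_make_tmp.sh pins the beginStr/endStr range to 2024081600-2024081823 instead of deriving it from today_early_1, so it reads as a one-off backfill for those partitions rather than a scheduled daily run. A hedged usage sketch, following the nohup pattern used elsewhere in this commit (the log file name is an assumption):

nohup ./ad/25_ad_data_make_tmp.sh > logs/25_ad_data_make_tmp.log 2>&1 &
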
+ 46 - 0
recommend/01_recommend_model_new_train.sh

@@ -0,0 +1,46 @@
+#!/bin/sh
+
+# Retrain the model
+
+set -x
+
+begin_date=$1
+end_date=$2
+model_name=$3
+train_dim=$4
+hdfs_path=$5
+
+
+PROJECT_HOME=/root/zhaohp/recommend-emr-dataprocess
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+MODEL_PATH=${PROJECT_HOME}/model/recommend
+
+FM_TRAIN=/root/sunmingze/alphaFM/bin/fm_train
+
+train_date=$begin_date
+
+main() {
+
+    end_date=$(date -d "$end_date +1 day" +%Y%m%d)
+
+    # Train the model incrementally, day by day
+    while [ "$train_date" != "$end_date" ]; do
+        echo "==================== Start training the $train_date model ===================="
+
+        if [ "$train_date" == "$begin_date" ]; then
+            $HADOOP fs -text ${hdfs_path}/${train_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${train_dim} -core 8
+        else
+            yesterday=$(date -d "$train_date -1 day" +%Y%m%d)
+            $HADOOP fs -text ${hdfs_path}/${train_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${train_dim} -core 8 -im ${MODEL_PATH}/${model_name}_${yesterday}.txt
+        fi
+
+        echo -e "==================== Finished training the $train_date model ====================\n\n\n\n\n\n"
+
+        train_date=$(date -d "$train_date +1 day" +%Y%m%d)
+    done
+
+}
+
+main
+
+# nohup ./recommend/01_recommend_model_new_train.sh 20240815 20240821 model_nba8_v3 1,1,8 /dw/recommend/model/43_recsys_train_data_new_table_274_sample_01/ > logs/25_recommend_model_new_train.log 2>&1 &

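With the example arguments from the comment above (20240815 through 20240821, model_nba8_v3, dim 1,1,8), the while loop issues one fm_train call per day: the first day trains from scratch, and every later day warm-starts from the previous day's model via -im. A sketch of the first two iterations, with hdfs_path standing for the fifth argument:

# 20240815: no initial model
$HADOOP fs -text ${hdfs_path}/20240815/* | ${FM_TRAIN} -m ${MODEL_PATH}/model_nba8_v3_20240815.txt -dim 1,1,8 -core 8
# 20240816: warm-start from the 20240815 model
$HADOOP fs -text ${hdfs_path}/20240816/* | ${FM_TRAIN} -m ${MODEL_PATH}/model_nba8_v3_20240816.txt -dim 1,1,8 -core 8 -im ${MODEL_PATH}/model_nba8_v3_20240815.txt
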
+ 14 - 0
recommend/03_predict.sh

@@ -0,0 +1,14 @@
+#!/bin/sh
+set -e
+set -x
+
+day=$1
+train_path=$2
+model_name=$3
+output_file=$4
+bias=$5
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+$HADOOP fs -text ${train_path}/${day}/* | /root/sunmingze/alphaFM/bin/fm_predict -m model/$model_name -dim ${bias} -core 8 -out predict/${output_file}_$day.txt
+cat predict/${output_file}_$day.txt | /root/sunmingze/AUC/AUC
+
+

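recommend/03_predict.sh takes five positional arguments (day, train_path, model_name, output_file, bias): it streams the day's samples from HDFS through fm_predict with the model from model/, writes the scores to predict/, and pipes them into the AUC tool. A hedged usage sketch; the concrete values are assumptions chosen to line up with the model naming in 01_recommend_model_new_train.sh, and whether the -dim value should be the full 1,1,8 triple or a single number depends on the alphaFM build:

./recommend/03_predict.sh 20240822 \
    /dw/recommend/model/43_recsys_train_data_new_table_274_sample_01 \
    model_nba8_v3_20240821.txt predict_nba8_v3 1,1,8
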
+ 4 - 5
recommend/data_new_table.sh

@@ -13,8 +13,8 @@ export JAVA_HOME=/usr/lib/jvm/java-1.8.0
 # Raw data table name
 table='alg_recsys_sample_all_v2'
 # Partition configuration: recommendation data is produced with a one-day lag, so at 00:00 on day 5 the hour 00-23 data of day 3 is used to build the new-model data
-begin_early_2_Str=20240803
-end_early_2_Str=20240803
+begin_early_2_Str="$(date -d '2 days ago' +%Y%m%d)"
+end_early_2_Str="$(date -d '2 days ago' +%Y%m%d)"
 beginHhStr=00
 endHhStr=23
 max_hour=05
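The hardcoded 20240803 dates above become a rolling two-days-ago window, so the script no longer has to be edited before each run. For example, executed on 20240805, the new expression resolves to exactly the value that was hardcoded before:

date -d '2 days ago' +%Y%m%d   # prints 20240803 when run on 20240805
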
@@ -23,9 +23,7 @@ max_minute=00
 # Source data path
 originDataPath=/dw/recommend/model/41_recsys_sample_data_new_table/
 # Feature bucketing
-bucketDataPath=/dw/recommend/model/43_recsys_train_data_new_table/
-
-
+bucketDataPath=/dw/recommend/model/43_recsys_train_data_new_table
 # hadoop
 HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
 
@@ -59,6 +57,7 @@ beginStr:${begin_early_2_Str}16 endStr:${end_early_2_Str}23 \
 savePath:${originDataPath} \
 table:${table} &
 
+
 wait
 if [ $? -ne 0 ]; then
    echo "Spark raw sample generation job failed"