jch преди 2 месеца
родител
ревизия
fa65a2d2a2
променени са 5 файла, в които са добавени 195 реда и са изтрити 0 реда
  1. 44 0
      scripts/make_data.sh
  2. 50 0
      scripts/nor/nor_predict.sh
  3. 32 0
      scripts/nor/nor_sample.sh
  4. 51 0
      scripts/nor/nor_train.sh
  5. 18 0
      scripts/readme.txt

+ 44 - 0
scripts/make_data.sh

@@ -0,0 +1,44 @@
#!/bin/bash
# make_data.sh — step 1 of the pipeline: generate raw feature data from the
# ODPS sample table via Spark on YARN.
# Usage: make_data.sh run   (the literal argument "run" is a safety latch so
# the job cannot be launched by accident).

# Require exactly one argument and it must be "run"; otherwise abort.
run_mode=""
if (( $# == 1 )); then
        run_mode=$1
else
        exit 1   # was `exit -1`: exit statuses must be in 0-255
fi

if [[ "$run_mode" != "run" ]]; then
        exit 1
fi

set -x
export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
export PATH=$SPARK_HOME/bin:$PATH
export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
export JAVA_HOME=/usr/lib/jvm/java-1.8.0

# params
# NOTE(review): the original ran `date -d "-2 $days day"` with $days never
# set anywhere, which GNU date happened to parse as "-2 day" (two days ago).
# Made that intent explicit instead of relying on an unset variable.
data_date=$(date +%Y%m%d -d "-2 day")
start_date=${data_date}
end_date=${data_date}
start_hour=00
end_hour=23
sampleRate=0.01
table=dwd_recsys_alg_sample_all_20250212
savePath=/dw/recommend/model/82_origin_data/

# step 1: generate raw data from ${table}
echo "$(date +%Y-%m-%d_%H-%M-%S)----------step1------------开始根据${table}生产原始数据"

# Submit the Spark job. All key:value pairs after the jar are application
# arguments parsed by the driver class, not spark-submit options.
"$SPARK_HOME/bin/spark-class2" org.apache.spark.deploy.SparkSubmit \
--class com.aliyun.odps.spark.examples.makedata_recsys_r_rate.makedata_recsys_82_originData_20250221 \
--master yarn --driver-memory 3G --executor-memory 4G --executor-cores 1 --num-executors 16 \
/mnt/disk1/jch/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-jar-with-dependencies.jar \
table:"${table}" tablePart:64 \
beginStr:"${start_date}${start_hour}" endStr:"${end_date}${end_hour}" \
whatLabel:is_return_n_noself \
fuSampleRate:"${sampleRate}" \
repartition:8 \
savePath:"${savePath}"

+ 50 - 0
scripts/nor/nor_predict.sh

@@ -0,0 +1,50 @@
#!/bin/bash
# nor_predict.sh — evaluate the trained NOR xgboost model on held-out days.
# The feature file must be the same one used for training (see readme.txt).
# Shebang fixed: the script uses bashisms ([[ ]], C-style for), so it must
# run under bash, not /bin/sh.
set -x

export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
export JAVA_HOME=/usr/lib/jvm/java-1.8.0

# params
FEATURE_FILE=20250303_recsys_nor_name.txt
BASE_TRAIN_DATA_PATH=/dw/recommend/model/82_recsys_nor_train_data
PREDICT_RESULT_PATH=/dw/recommend/model/82_recsys_nor_predict_data
MODEL_SAVE_PATH=/dw/recommend/model/82_recsys_nor_model/model_xgb

# Build a comma-separated list of per-day HDFS dirs from start_date through
# end_date (inclusive). YYYYMMDD compares correctly as an integer, so -le
# gives chronological ordering; the fixed 22-iteration loop just bounds the
# scan window.
start_date=20250301
end_date=20250301
test_data_path=""
for ((i = 0; i <= 21; i++)); do
  data_date=$(date -d "$start_date $i day" +"%Y%m%d")
  if [ "$data_date" -le "$end_date" ]; then
    one_day_data_path="${BASE_TRAIN_DATA_PATH}/${data_date}"
    if [[ -z "$test_data_path" ]]; then
      test_data_path=$one_day_data_path
    else
      test_data_path="$test_data_path,$one_day_data_path"
    fi
  fi
done

# Spark conf keys are case-sensitive:
#  - was spark.yarn.executor.memoryoverhead (silently ignored) ->
#    spark.yarn.executor.memoryOverhead
#  - was spark.debug.maxToStringFields -> spark.sql.debug.maxToStringFields
#    (matches nor_train.sh)
/opt/apps/SPARK3/spark-3.3.1-hadoop3.2-1.0.5/bin/spark-class org.apache.spark.deploy.SparkSubmit \
--class com.tzld.piaoquan.recommend.model.pred_recsys_61_xgb_nor_hdfsfile_20241209 \
--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
--conf spark.yarn.executor.memoryOverhead=1024 \
--conf spark.shuffle.service.enabled=true \
--conf spark.shuffle.service.port=7337 \
--conf spark.shuffle.consolidateFiles=true \
--conf spark.shuffle.manager=sort \
--conf spark.storage.memoryFraction=0.4 \
--conf spark.shuffle.memoryFraction=0.5 \
--conf spark.default.parallelism=200 \
--conf spark.sql.debug.maxToStringFields=100 \
/mnt/disk1/jch/recommend-model/recommend-model-produce/target/recommend-model-produce-jar-with-dependencies.jar \
labelLogType:0 \
labelLogBase:1.5 \
featureFile:"${FEATURE_FILE}" \
testPath:"${test_data_path}" \
savePath:"${PREDICT_RESULT_PATH}" \
modelPath:"${MODEL_SAVE_PATH}"

+ 32 - 0
scripts/nor/nor_sample.sh

@@ -0,0 +1,32 @@
#!/bin/sh
# nor_sample.sh — generate NOR training samples (label return_n_uv_noself)
# from the raw data produced by make_data.sh. The feature-name and
# feature-bucket files must match the ones used at training time.
set -x

export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
export PATH=$SPARK_HOME/bin:$PATH
export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
export JAVA_HOME=/usr/lib/jvm/java-1.8.0

# params
# NOTE(review): the original ran `date -d "-2 $days day"` with $days never
# set, which GNU date parsed as "-2 day" (two days ago); made that explicit.
data_date=$(date +%Y%m%d -d "-2 day")
start_date=${data_date}
end_date=${data_date}
sampleRate=-1   # -1 = no down-sampling (presumably; driver-defined — verify)
readPath=/dw/recommend/model/82_origin_data/
savePath=/dw/recommend/model/82_recsys_nor_train_data/

echo "$(date) nor sample"

"$SPARK_HOME/bin/spark-class2" org.apache.spark.deploy.SparkSubmit \
--class com.aliyun.odps.spark.examples.makedata_recsys_r_rate.makedata_recsys_82_nor_sample_20250221 \
--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
/mnt/disk1/jch/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-jar-with-dependencies.jar \
readPath:"${readPath}" \
beginStr:"${start_date}" endStr:"${end_date}" \
whatApps:0,3,4,21,17 \
whatLabel:return_n_uv_noself \
fuSampleRate:"${sampleRate}" \
notUseBucket:1 \
featureName:20250303_recsys_nor_name.txt \
featureBucket:20250303_recsys_nor_bucket.txt \
repartition:8 \
savePath:"${savePath}"

+ 51 - 0
scripts/nor/nor_train.sh

@@ -0,0 +1,51 @@
#!/bin/bash
# nor_train.sh — train the NOR xgboost model on the sampled training data.
# The feature file must match the one used by nor_predict.sh.
# Shebang fixed: the script uses bashisms ([[ ]], C-style for), so it must
# run under bash, not /bin/sh.
set -x

export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
export JAVA_HOME=/usr/lib/jvm/java-1.8.0

# params
FEATURE_FILE=20250303_recsys_nor_name.txt
BASE_TRAIN_DATA_PATH=/dw/recommend/model/82_recsys_nor_train_data
PREDICT_RESULT_PATH=/dw/recommend/model/82_recsys_nor_predict_data
MODEL_SAVE_PATH=/dw/recommend/model/82_recsys_nor_model/model_xgb

# Build a comma-separated list of per-day HDFS dirs from start_date through
# end_date (inclusive). YYYYMMDD compares correctly as an integer, so -le
# gives chronological ordering; the 22-iteration loop bounds the scan window.
start_date=20250221
end_date=20250228
train_data_path=""
for ((i = 0; i <= 21; i++)); do
  data_date=$(date -d "$start_date $i day" +"%Y%m%d")
  if [ "$data_date" -le "$end_date" ]; then
    one_day_data_path="${BASE_TRAIN_DATA_PATH}/${data_date}"
    if [[ -z "$train_data_path" ]]; then
      train_data_path=$one_day_data_path
    else
      train_data_path="$train_data_path,$one_day_data_path"
    fi
  fi
done

## ******* train *******
# Spark conf keys are case-sensitive: was spark.yarn.executor.memoryoverhead
# (silently ignored) -> spark.yarn.executor.memoryOverhead.
/opt/apps/SPARK3/spark-3.3.1-hadoop3.2-1.0.5/bin/spark-class org.apache.spark.deploy.SparkSubmit \
--class com.tzld.piaoquan.recommend.model.train_recsys_61_xgb_nor_20241209 \
--master yarn --driver-memory 4G --executor-memory 10G --executor-cores 1 --num-executors 32 \
--conf spark.yarn.executor.memoryOverhead=2048 \
--conf spark.shuffle.service.enabled=true \
--conf spark.shuffle.service.port=7337 \
--conf spark.shuffle.consolidateFiles=true \
--conf spark.shuffle.manager=sort \
--conf spark.storage.memoryFraction=0.4 \
--conf spark.shuffle.memoryFraction=0.5 \
--conf spark.default.parallelism=200 \
--conf spark.sql.debug.maxToStringFields=100 \
/mnt/disk1/jch/recommend-model/recommend-model-produce/target/recommend-model-produce-jar-with-dependencies.jar \
featureFile:"${FEATURE_FILE}" \
trainPath:"${train_data_path}" \
savePath:"${PREDICT_RESULT_PATH}" \
modelPath:"${MODEL_SAVE_PATH}" \
labelLogType:0 \
labelLogBase:1.5 \
eta:0.06 gamma:0.0 max_depth:5 num_round:1000 num_worker:32 repartition:20

+ 18 - 0
scripts/readme.txt

@@ -0,0 +1,18 @@
+最新使用代码为: 82
+
+1. make_data.sh
+	生成特征数据
+
+2. nor
+	1. nor_sample.sh
+		生成(return_n_uv_noself 或 is_return_n_noself)样本, (注意需要抽取的特征文件和特征分桶文件)
+
+	2. nor_train.sh
+		使用xgboost训练nor模型, 注意使用的特征文件(训练测试需保持一致)
+		模型训练git: https://git.yishihui.com/algorithm/recommend-model.git
+
+	3. nor_predict.sh
+		评估nor模型, 注意使用的特征文件(训练测试需保持一致)
+
+	4. 下载模型
+		从hdfs上下载模型(训练配置的路径)