
Recommendation scripts

jch 1 week ago
parent
commit
d0a0d08586
31 changed files with 1570 additions and 62 deletions
  1. rec_scripts/README.md (+36 -0)
  2. rec_scripts/make_data.sh (+25 -21)
  3. rec_scripts/nor/nor_pipline.sh (+35 -0)
  4. rec_scripts/nor/nor_predict.sh (+17 -7)
  5. rec_scripts/nor/nor_sample.sh (+18 -8)
  6. rec_scripts/nor/nor_train.sh (+19 -8)
  7. rec_scripts/rov/sample/feat/feat_20251027.txt (+0 -0)
  8. rec_scripts/rov/sample/scripts/feat_stat.sh (+33 -0)
  9. rec_scripts/rov/sample/scripts/rov_batch.sh (+51 -0)
  10. rec_scripts/rov/sample/scripts/rov_batch_t1.sh (+51 -0)
  11. rec_scripts/rov/sample/scripts/v1/rov_sample_v1.sh (+43 -0)
  12. rec_scripts/rov/sample/scripts/v1/train_stat_v1.sh (+32 -0)
  13. rec_scripts/rov/sample/scripts/v1/v1_pipline.sh (+31 -0)
  14. rec_scripts/rov/sample/scripts/v1/v1_pipline_t1.sh (+37 -0)
  15. rec_scripts/rov/sample/src/get_fm_feature_name_v1.py (+172 -0)
  16. rec_scripts/rov/sample/vocab/vocab_v1_20251027.txt (+0 -0)
  17. rec_scripts/rov/train/data/filter_feature.csv (+312 -0)
  18. rec_scripts/rov/train/scripts/download_data.sh (+33 -0)
  19. rec_scripts/rov/train/scripts/pipline.sh (+114 -0)
  20. rec_scripts/rov/train/scripts/remove_data.sh (+21 -0)
  21. rec_scripts/rov/train/scripts/run.sh (+25 -0)
  22. rec_scripts/rov/train/scripts/train_model_data.sh (+79 -0)
  23. rec_scripts/rov/train/scripts/update_model.sh (+88 -0)
  24. rec_scripts/rov/train/src/tools/cal_auc.py (+27 -0)
  25. rec_scripts/rov/train/src/tools/offline2online_fm.py (+33 -0)
  26. rec_scripts/rov/train/src/tools/rec_monitor_push.py (+89 -0)
  27. rec_scripts/rov/train/src/tools/update2oss.py (+44 -0)
  28. rec_scripts/run.sh (+36 -0)
  29. rec_scripts/run_t0.sh (+31 -0)
  30. rec_scripts/run_t1.sh (+38 -0)
  31. scripts/readme.txt (+0 -18)

+ 36 - 0
rec_scripts/README.md

@@ -0,0 +1,36 @@
+# 1. Overview
+- The str+ model (rov) and the ros- model (nor) use the same underlying data, both produced by make_data.sh; the str+ model is updated daily, the ros- model is updated manually (currently not updated).
+- Before use, replace the jar file path & alpha_fm path in the scripts with your own paths.
+- [Data generation code]()
+- [ros- model training code](https://git.yishihui.com/algorithm/recommend-model/src/feature/jch)
+
+# 2. make_data
+- run.sh generates the t+2 underlying data (sample table: dwd_recsys_alg_sample_all_20250212) & the rov t+2 training samples
+- run_t0.sh generates the t+1 underlying data (hours 0-16, dwd_recsys_alg_sample_all_20250905)
+- run_t1.sh generates the t+1 underlying data (hours 17-21; hour 22/23 data is not used, dwd_recsys_alg_sample_all_20250905) & the rov t+1 training samples
+
+# 3. str+ model (rov)
+## 3.1 sample
+### 3.1.1 feat (stores feature-frequency statistics of the base data)
+### 3.1.2 vocab (stores the feature list used when generating str+ model sample data)
+### 3.1.3 scripts
+- feat_stat.sh counts feature frequencies in the base data
+- rov_batch.sh generates t+2 rov sample data
+- rov_batch_t1.sh generates t+1 rov sample data and counts feature frequencies in the sample data
+- v1 (version) v1_pipline.sh (generates t+2 samples), v1_pipline_t1.sh (generates t+1 samples and counts feature frequencies), rov_sample_v1.sh (generates samples), train_stat_v1.sh (counts feature frequencies)
+## 3.2 train
+### 3.2.1 data (stores sample data & features)
+### 3.2.2 logs (log files)
+### 3.2.3 src (Python-related scripts)
+### 3.2.4 scripts (training-related scripts)
+### 3.2.5 model (stores trained models)
+- run.sh triggers the job
+- pipline.sh training workflow
+- download_data.sh downloads sample data
+- train_model_data.sh trains the model
+- update_model.sh validates the model & uploads it to OSS
+- remove_data.sh removes old sample data
+### src
+
+# 4. ros- model (nor)
+- python src/preprocess/eval_result.py --input_file test_result.csv
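
For orientation (editorial note, not part of the commit): based on the argument handling in the scripts below, the daily flow described in this README is driven by single "run"-guarded entry points. A minimal invocation sketch, assuming the repository is checked out so that rec_scripts/ is the working root:

    # t+2 base data + rov t+2 samples
    sh +x rec_scripts/run.sh run
    # t+1 base data (hours 0-16 / 17-21)
    sh +x rec_scripts/run_t0.sh run
    sh +x rec_scripts/run_t1.sh run
    # str+ (rov) model training, validation and upload
    sh +x rec_scripts/rov/train/scripts/run.sh run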

+ 25 - 21
scripts/make_data.sh → rec_scripts/make_data.sh

@@ -1,16 +1,23 @@
 #!/bin/bash
 
-run_mode=""
-if(($#==1))
+start_date=""
+end_date=""
+start_hour=""
+end_hour=""
+table=""
+if(($#==5))
 then
-        run_mode=$1
+    start_date=$1
+    end_date=$2
+    start_hour=$3
+    end_hour=$4
+    table=$5
 else
-        exit -1
-fi
-
-if [[ "$run_mode" != "run" ]]
-then
-        exit -1
+    start_date=$(date +%Y%m%d -d "-1 $days day")
+    end_date=$start_date
+    start_hour=00
+    end_hour=23
+    table=dwd_recsys_alg_sample_all_20250212
 fi
 
 set -x
@@ -20,25 +27,22 @@ export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
 export JAVA_HOME=/usr/lib/jvm/java-1.8.0
 
 # params
-data_date=$(date +%Y%m%d -d "-2 $days day")
-start_date=${data_date}
-end_date=${data_date}
-start_hour=00
-end_hour=23
-sampleRate=0.01
-table=dwd_recsys_alg_sample_all_20250212
-savePath=/dw/recommend/model/82_origin_data/
+sampleRate=0.036
+label=is_share
+savePath=/dw/recommend/model/83_origin_data/
 
 # 1 generate raw data
 echo "$(date +%Y-%m-%d_%H-%M-%S)----------step1------------开始根据${table}生产原始数据"
 
 /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
---class com.aliyun.odps.spark.examples.makedata_recsys_r_rate.makedata_recsys_82_originData_20250221 \
---master yarn --driver-memory 3G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+--class com.aliyun.odps.spark.examples.makedata_recsys_r_rate.makedata_recsys_83_originData_20250317 \
+--master yarn --driver-memory 6G --executor-memory 10G --executor-cores 1 --num-executors 16 \
+--conf spark.yarn.executor.memoryoverhead=2048 \
 /mnt/disk1/jch/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-jar-with-dependencies.jar \
-table:${table} tablePart:64 \
+table:${table} tablePart:96 \
 beginStr:${start_date}${start_hour} endStr:${end_date}${end_hour} \
-whatLabel:is_return_n_noself \
+whatPages:"详情后沉浸页,回流后沉浸页&内页feed,首页feed,详情页,回流页" \
+whatLabel:${label} \
 fuSampleRate:${sampleRate} \
 repartition:8 \
 savePath:${savePath} \
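
A minimal usage sketch for the reworked argument handling above (dates are illustrative; the five-argument form is what run_t0.sh/run_t1.sh use, the zero-argument form falls back to the previous day, hours 00-23, and table dwd_recsys_alg_sample_all_20250212):

    # explicit: start_date end_date start_hour end_hour table
    sh +x rec_scripts/make_data.sh 20251026 20251026 00 23 dwd_recsys_alg_sample_all_20250212
    # defaults
    sh +x rec_scripts/make_data.sh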

+ 35 - 0
rec_scripts/nor/nor_pipline.sh

@@ -0,0 +1,35 @@
+#!/bin/bash
+
+abs_path=$(cd `dirname $0`; pwd)
+run_mode=""
+if(($#==1))
+then
+  run_mode=$1
+else
+  exit -1
+fi
+
+if [[ "$run_mode" != "run" ]]
+then
+  exit -1
+fi
+
+# 1. nor sample
+sample_sh="${abs_path}/nor_sample.sh"
+echo `date` "sh +x $sample_sh"
+sh +x $sample_sh &
+wait
+sleep 60s
+
+# 2. nor train
+train_sh="${abs_path}/nor_train.sh"
+echo `date` "sh +x $train_sh"
+sh +x $train_sh &
+wait
+sleep 60s
+
+# 3. nor predict
+predict_sh="${abs_path}/nor_predict.sh"
+echo `date` "sh +x $predict_sh"
+sh +x $predict_sh &
+wait

+ 17 - 7
scripts/nor/nor_predict.sh → rec_scripts/nor/nor_predict.sh

@@ -1,17 +1,27 @@
 #!/bin/sh
 set -x
 
+start_date=""
+end_date=""
+if(($#==2))
+then
+    start_date=$1
+    end_date=$2
+else
+    start_date=$(date +%Y%m%d -d "-2 $days day")
+    end_date=$start_date
+fi
+
+# env
 export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
 export JAVA_HOME=/usr/lib/jvm/java-1.8.0
 
 # params
-FEATURE_FILE=20250303_recsys_nor_name.txt
-BASE_TRAIN_DATA_PATH=/dw/recommend/model/82_recsys_nor_train_data
-PREDICT_RESULT_PATH=/dw/recommend/model/82_recsys_nor_predict_data
-MODEL_SAVE_PATH=/dw/recommend/model/82_recsys_nor_model/model_xgb
+FEATURE_FILE=20250627_recsys_nor_name.txt
+BASE_TRAIN_DATA_PATH=/dw/recommend/model/83_recsys_nor_train_data
+PREDICT_RESULT_PATH=/dw/recommend/model/83_recsys_nor_predict_data
+MODEL_SAVE_PATH=/dw/recommend/model/83_recsys_nor_model/model_xgb
 
-start_date=20250301
-end_date=20250301
 test_data_path=""
 for((i=0; i<=21; i++))
 do
@@ -31,7 +41,7 @@ done
 
 /opt/apps/SPARK3/spark-3.3.1-hadoop3.2-1.0.5/bin/spark-class org.apache.spark.deploy.SparkSubmit \
 --class com.tzld.piaoquan.recommend.model.pred_recsys_61_xgb_nor_hdfsfile_20241209 \
---master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 4 \
 --conf spark.yarn.executor.memoryoverhead=1024 \
 --conf spark.shuffle.service.enabled=true \
 --conf spark.shuffle.service.port=7337 \

+ 18 - 8
scripts/nor/nor_sample.sh → rec_scripts/nor/nor_sample.sh

@@ -1,32 +1,42 @@
 #!/bin/sh
 set -x
 
+start_date=""
+end_date=""
+if(($#==2))
+then
+    start_date=$1
+    end_date=$2
+else
+    start_date=$(date +%Y%m%d -d "-2 $days day")
+    end_date=${start_date}
+fi
+
+# env
 export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
 export PATH=$SPARK_HOME/bin:$PATH
 export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
 export JAVA_HOME=/usr/lib/jvm/java-1.8.0
 
 # params
-data_date=$(date +%Y%m%d -d "-2 $days day")
-start_date=${data_date}
-end_date=${data_date}
 sampleRate=-1
-readPath=/dw/recommend/model/82_origin_data/
-savePath=/dw/recommend/model/82_recsys_nor_train_data/
+readPath=/dw/recommend/model/83_origin_data/
+savePath=/dw/recommend/model/83_recsys_nor_train_data/
 
 echo `date` "nor sample"
 
 /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
---class com.aliyun.odps.spark.examples.makedata_recsys_r_rate.makedata_recsys_82_nor_sample_20250221 \
+--class com.aliyun.odps.spark.examples.makedata_recsys_r_rate.makedata_recsys_86_nor_sample_20250627 \
 --master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
 /mnt/disk1/jch/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-jar-with-dependencies.jar \
 readPath:${readPath} \
 beginStr:${start_date} endStr:${end_date} \
-whatApps:0,3,4,21,17 \
+whatApps:0,4,2,32,31,21,29,27,26,28,34,3,36,6,17,35 \
 whatLabel:return_n_uv_noself \
+whatPages:"详情后沉浸页,回流后沉浸页&内页feed,首页feed,详情页" \
 fuSampleRate:${sampleRate} \
 notUseBucket:1 \
-featureName:20250303_recsys_nor_name.txt \
+featureName:20250627_recsys_nor_name.txt \
 featureBucket:20250303_recsys_nor_bucket.txt \
 repartition:8 \
 savePath:${savePath} \

+ 19 - 8
scripts/nor/nor_train.sh → rec_scripts/nor/nor_train.sh

@@ -1,17 +1,27 @@
 #!/bin/sh
 set -x
 
+start_date=""
+end_date=""
+if(($#==2))
+then
+    start_date=$1
+    end_date=$2
+else
+    start_date=$(date +%Y%m%d -d "-8 $days day")
+    end_date=$(date +%Y%m%d -d "-2 $days day")
+fi
+
+# env
 export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
 export JAVA_HOME=/usr/lib/jvm/java-1.8.0
 
 # params
-FEATURE_FILE=20250303_recsys_nor_name.txt
-BASE_TRAIN_DATA_PATH=/dw/recommend/model/82_recsys_nor_train_data
-PREDICT_RESULT_PATH=/dw/recommend/model/82_recsys_nor_predict_data
-MODEL_SAVE_PATH=/dw/recommend/model/82_recsys_nor_model/model_xgb
+FEATURE_FILE=20250627_recsys_nor_name.txt
+BASE_TRAIN_DATA_PATH=/dw/recommend/model/83_recsys_nor_train_data
+PREDICT_RESULT_PATH=/dw/recommend/model/83_recsys_nor_predict_data
+MODEL_SAVE_PATH=/dw/recommend/model/83_recsys_nor_model/model_xgb
 
-start_date=20250221
-end_date=20250228
 train_data_path=""
 for((i=0; i<=21; i++))
 do
@@ -29,9 +39,10 @@ do
 done
 
 ## ******* train *******
+workers=32
 /opt/apps/SPARK3/spark-3.3.1-hadoop3.2-1.0.5/bin/spark-class org.apache.spark.deploy.SparkSubmit \
 --class com.tzld.piaoquan.recommend.model.train_recsys_61_xgb_nor_20241209 \
---master yarn --driver-memory 4G --executor-memory 10G --executor-cores 1 --num-executors 32 \
+--master yarn --driver-memory 4G --executor-memory 10G --executor-cores 1 --num-executors ${workers} \
 --conf spark.yarn.executor.memoryoverhead=2048 \
 --conf spark.shuffle.service.enabled=true \
 --conf spark.shuffle.service.port=7337 \
@@ -48,4 +59,4 @@ savePath:${PREDICT_RESULT_PATH} \
 modelPath:${MODEL_SAVE_PATH} \
 labelLogType:0 \
 labelLogBase:1.5 \
-eta:0.06 gamma:0.0 max_depth:5 num_round:1000 num_worker:32 repartition:20
+eta:0.06 gamma:0.0 max_depth:5 num_round:1000 num_worker:${workers} repartition:20

+ 0 - 0
rec_scripts/rov/sample/feat/feat_20251027.txt


+ 33 - 0
rec_scripts/rov/sample/scripts/feat_stat.sh

@@ -0,0 +1,33 @@
+#!/bin/sh
+set -x
+
+start_date=""
+end_date=""
+if(($#==2))
+then
+  start_date=$1
+  end_date=$2
+else
+  start_date=$(date +%Y%m%d -d "-15 $days day")
+  end_date=$(date +%Y%m%d -d "-2 $days day")
+fi
+
+# env
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+# param
+sampleRate=-1
+readPath=/dw/recommend/model/83_origin_data/
+savePath=/dw/recommend/model/83_recsys_feature/
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys_r_rate.feature_stat \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 24 \
+/mnt/disk1/jch/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-jar-with-dependencies.jar \
+readPath:${readPath} \
+beginStr:${start_date} endStr:${end_date} \
+repartition:8 \
+savePath:${savePath}

+ 51 - 0
rec_scripts/rov/sample/scripts/rov_batch.sh

@@ -0,0 +1,51 @@
+#!/bin/bash
+
+abs_path=$(cd `dirname $0`; pwd)
+run_mode=""
+data_date=$(date +%Y%m%d -d "-2 $days day")
+if(($#==1))
+then
+  run_mode=$1
+elif(($#==2))
+then
+  run_mode=$1
+  data_date=$2
+else
+  exit -1
+fi
+
+if [[ "$run_mode" != "run" ]]
+then
+  exit -1
+fi
+
+# env
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+# 1. feat stat
+stat_sh="${abs_path}/feat_stat.sh"
+start_date=$(date -d "$data_date -13 day" +"%Y%m%d")
+end_date=$data_date
+echo `date` "sh +x $stat_sh $start_date $end_date"
+sh +x $stat_sh $start_date $end_date &
+wait
+sleep 30s
+
+# 2. feat file
+hadoop_bin=/opt/apps/HADOOP-COMMON/hadoop-3.2.1-1.2.7-alinux3/bin/hadoop
+feat_file="${abs_path}/../feat/feat_${end_date}.txt"
+feat_hdfs_dir="/dw/recommend/model/83_recsys_feature/"
+echo `date` "$hadoop_bin fs -text $feat_hdfs_dir/$end_date/part* > $feat_file"
+$hadoop_bin fs -text $feat_hdfs_dir/$end_date/part* > $feat_file &
+wait
+sleep 30s
+
+# 3. v1
+v1_sh="${abs_path}/v1/v1_pipline.sh"
+echo `date` "sh +x $v1_sh $feat_file"
+sh +x $v1_sh $feat_file &
+wait
+sleep 30s
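
A usage sketch for the batch driver above (the optional second argument overrides the default data date of two days ago; the date shown is illustrative):

    sh +x rec_scripts/rov/sample/scripts/rov_batch.sh run
    sh +x rec_scripts/rov/sample/scripts/rov_batch.sh run 20251027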

+ 51 - 0
rec_scripts/rov/sample/scripts/rov_batch_t1.sh

@@ -0,0 +1,51 @@
+#!/bin/bash
+
+abs_path=$(cd `dirname $0`; pwd)
+run_mode=""
+data_date=$(date +%Y%m%d -d "-1 $days day")
+if(($#==1))
+then
+  run_mode=$1
+elif(($#==2))
+then
+  run_mode=$1
+  data_date=$2
+else
+  exit -1
+fi
+
+if [[ "$run_mode" != "run" ]]
+then
+  exit -1
+fi
+
+# env
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+# 1. feat stat
+stat_sh="${abs_path}/feat_stat.sh"
+start_date=$(date -d "$data_date -13 day" +"%Y%m%d")
+end_date=$data_date
+echo `date` "sh +x $stat_sh $start_date $end_date"
+sh +x $stat_sh $start_date $end_date &
+wait
+sleep 30s
+
+# 2. feat file
+hadoop_bin=/opt/apps/HADOOP-COMMON/hadoop-3.2.1-1.2.7-alinux3/bin/hadoop
+feat_file="${abs_path}/../feat/feat_${end_date}.txt"
+feat_hdfs_dir="/dw/recommend/model/83_recsys_feature/"
+echo `date` "$hadoop_bin fs -text $feat_hdfs_dir/$end_date/part* > $feat_file"
+$hadoop_bin fs -text $feat_hdfs_dir/$end_date/part* > $feat_file &
+wait
+sleep 30s
+
+# v1
+v1_sh="${abs_path}/v1/v1_pipline_t1.sh"
+echo `date` "sh +x $v1_sh $feat_file $data_date"
+sh +x $v1_sh $feat_file $data_date &
+wait
+sleep 30s

+ 43 - 0
rec_scripts/rov/sample/scripts/v1/rov_sample_v1.sh

@@ -0,0 +1,43 @@
+#!/bin/sh
+set -x
+
+start_date=""
+end_date=""
+feature_file=""
+if(($#==3))
+then
+  start_date=$1
+  end_date=$2
+  feature_file=$3
+else
+  exit -1
+fi
+
+# env
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+# param
+sampleRate=2
+readPath=/dw/recommend/model/83_origin_data/
+savePath=/dw/recommend/model/831_recsys_rov_train_data/
+
+echo `date` "rov sample"
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys_r_rate.makedata_recsys_86_fm_sample_20250627 \
+--master yarn --driver-memory 2G --executor-memory 6G --executor-cores 1 --num-executors 24 \
+--files ${feature_file} \
+/mnt/disk1/jch/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-jar-with-dependencies.jar \
+readPath:${readPath} \
+beginStr:${start_date} endStr:${end_date} \
+whatApps:0,4,32,31,21,29,27,26,28,34,3,36,6,17,35 \
+whatPages:"详情后沉浸页,回流后沉浸页&内页feed,首页feed,详情页" \
+whatLabel:is_return_noself \
+fuSampleRate:${sampleRate} \
+notUseBucket:1 \
+featureName:${feature_file} \
+featureBucket:20241209_recsys_rov_bucket.txt \
+repartition:64 \
+savePath:${savePath} \

+ 32 - 0
rec_scripts/rov/sample/scripts/v1/train_stat_v1.sh

@@ -0,0 +1,32 @@
+#!/bin/sh
+set -x
+
+start_date=""
+end_date=""
+if(($#==2))
+then
+  start_date=$1
+  end_date=$2
+else
+  start_date=$(date +%Y%m%d -d "-15 $days day")
+  end_date=$(date +%Y%m%d -d "-2 $days day")
+fi
+
+# env
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+# param
+readPath=/dw/recommend/model/831_recsys_rov_train_data/
+savePath=/dw/recommend/model/831_recsys_analysis_data/
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys_r_rate.train_feature_stat \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 24 \
+/mnt/disk1/jch/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-jar-with-dependencies.jar \
+readPath:${readPath} \
+beginStr:${start_date} endStr:${end_date} \
+repartition:1 \
+savePath:${savePath}

+ 31 - 0
rec_scripts/rov/sample/scripts/v1/v1_pipline.sh

@@ -0,0 +1,31 @@
+#!/bin/bash
+
+abs_path=$(cd `dirname $0`; pwd)
+feat_file=""
+end_date=$(date +%Y%m%d -d "-2 $days day")
+if(($#==1))
+then
+  feat_file=$1
+elif(($#==2))
+then
+  feat_file=$1
+  end_date=$2
+else
+  exit -1
+fi
+
+# 1. vocab
+vocab_py="${abs_path}/../../src/get_fm_feature_name_v1.py"
+vocab_file="${abs_path}/../../vocab/vocab_v1_${end_date}.txt"
+echo `date` "python3 $vocab_py --all $feat_file --output $vocab_file"
+python3 $vocab_py --all $feat_file --output $vocab_file &
+wait
+sleep 30s
+
+# 2. sample
+sample_sh="${abs_path}/rov_sample_v1.sh"
+echo `date` "sh +x $sample_sh $end_date $end_date $vocab_file"
+sh +x $sample_sh $end_date $end_date $vocab_file &
+wait
+sleep 30s
+

+ 37 - 0
rec_scripts/rov/sample/scripts/v1/v1_pipline_t1.sh

@@ -0,0 +1,37 @@
+#!/bin/bash
+
+abs_path=$(cd `dirname $0`; pwd)
+feat_file=""
+end_date=$(date +%Y%m%d -d "-2 $days day")
+if(($#==1))
+then
+  feat_file=$1
+elif(($#==2))
+then
+  feat_file=$1
+  end_date=$2
+else
+  exit -1
+fi
+
+# 1. vocab
+vocab_py="${abs_path}/../../src/get_fm_feature_name_v1.py"
+vocab_file="${abs_path}/../../vocab/vocab_v1_${end_date}.txt"
+echo `date` "python3 $vocab_py --all $feat_file --output $vocab_file"
+python3 $vocab_py --all $feat_file --output $vocab_file &
+wait
+sleep 30s
+
+# 2. sample
+sample_sh="${abs_path}/rov_sample_v1.sh"
+echo `date` "sh +x $sample_sh $end_date $end_date $vocab_file"
+sh +x $sample_sh $end_date $end_date $vocab_file &
+wait
+sleep 30s
+
+# 3. stat
+train_stat_sh="${abs_path}/train_stat_v1.sh"
+start_date=$(date -d "$end_date -13 day" +"%Y%m%d")
+echo `date` "sh +x $train_stat_sh $start_date $end_date"
+sh +x $train_stat_sh $start_date $end_date &
+wait

+ 172 - 0
rec_scripts/rov/sample/src/get_fm_feature_name_v1.py

@@ -0,0 +1,172 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+import argparse
+
+min_cnt = 500
+prefix_set = {
+    "app",
+    "hot",
+    "week",
+    "hour",
+    "province",
+    "city",
+    "model",
+    "brand",
+    "system",
+    "b0_1h",
+    "b0_3h",
+    "b0_6h",
+    "b0_12h",
+    "b1_1h",
+    "b1_3h",
+    "b1_6h",
+    "b1_24h",
+    "b1_72h",
+    "b1_168h",
+    "b2_1h",
+    "b2_3h",
+    "b2_6h",
+    "b2_24h",
+    "b3_3h",
+    "b3_6h",
+    "b3_24h",
+    # "b3_168h",
+    "b4_1h",
+    "b4_3h",
+    "b4_6h",
+    "b4_12h",
+    "b4_24h",
+    "b4_72h",
+    "b5_1h",
+    "b5_3h",
+    "b5_6h",
+    "b5_12h",
+    "b5_24h",
+    "b5_72h",
+    # "b5_168h",
+    # "b6_1h",
+    # "b6_24h",
+    "b7_1h",
+    "b7_3h",
+    "b7_6h",
+    "b7_12h",
+    "b7_24h",
+    # "b7_168h",
+    "b8_1h",
+    "b8_3h",
+    "b8_24h",
+    "b9_1h",
+    "b9_3h",
+    "b9_24h",
+    # "b10_1h",
+    "b10_12h",
+    # "b11_12h",
+    # "b11_168h",
+    "b13_1h",
+    "b13_3h",
+    "b13_6h",
+    "b13_12h",
+    "b13_24h",
+    "b13_72h",
+    "b14_1h",
+    "b14_2h",
+    "b14_3h",
+    "b14_6h",
+    "b14_12h",
+    "b15_1h",
+    "b15_2h",
+    "b15_3h",
+    "b15_6h",
+    "b15_12h",
+    "c1_72h",
+    "c1_168h",
+    "c5_tags_1d",
+    "c5_tags_3d",
+    "c5_tags_7d",
+    "c6_tags_1d",
+    "c6_tags_3d",
+    "c6_tags_7d",
+    "c7_share",
+    "c7_return",
+    "c8_share",
+    "c8_return",
+    "c9",
+    "c9_mss",
+    "c9_mrs",
+    "c9_lss",
+    "c9_lrs",
+    "c9_lr1s",
+    "c9_l1s",
+    "c9_l2s",
+    "c9_c1s",
+    "c9_c2s",
+    "d1",
+    "d2",
+    "d3",
+    "h",
+    "r",
+    "hr_sim",
+    "user_channel",
+    "user_level",
+    "page",
+    "e1"}
+
+
+def load_feature_name(input_file):
+    feat_dict = dict()
+    with open(input_file) as in_fp:
+        for line in in_fp:
+            pair = line.strip().split('\t')
+            if 2 != len(pair):
+                continue
+            feat_dict[pair[0]] = int(pair[1])
+    return feat_dict
+
+
+def get_new_feature_name(all_dict):
+    new_dict = dict()
+    for (feat_name, feat_cnt) in all_dict.items():
+        if 'hour' == feat_name:
+            continue
+        if 'matchnum' in feat_name:
+            continue
+        if 'c4_' in feat_name:
+            continue
+        if 'c7_' in feat_name:
+            continue
+        if 'c8_' in feat_name:
+            continue
+        if 'rank' in feat_name:
+            continue
+        if '@rov' in feat_name and '_#' not in feat_name:
+            continue
+        if 'ros' in feat_name and 'ros_one' not in feat_name and '_#' not in feat_name:
+            continue
+        prefix = feat_name.split('@')[0]
+        if prefix in prefix_set and feat_cnt >= min_cnt:
+            new_dict[feat_name] = feat_cnt
+    return new_dict
+
+
+def save_feature_name(feat_dict, output_file):
+    with open(output_file, 'w') as out_fp:
+        for (k, v) in sorted(feat_dict.items(), key=lambda item: item[0]):
+            line = '%s\n' % k
+            out_fp.write(line)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--all', required=True, type=str, help='all feature file')
+    parser.add_argument('--output', required=True, type=str, help='output feature file')
+    args = parser.parse_args()
+    print('\n\n')
+    print(args)
+    print('\n\n')
+
+    all_feature_dict = load_feature_name(args.all)
+
+    new_feature_dict = get_new_feature_name(all_feature_dict)
+
+    save_feature_name(new_feature_dict, args.output)

+ 0 - 0
rec_scripts/rov/sample/vocab/vocab_v1_20251027.txt


+ 312 - 0
rec_scripts/rov/train/data/filter_feature.csv

@@ -0,0 +1,312 @@
+b13_12h@exp
+b13_12h@is_return_1
+b13_12h@is_share
+b13_12h@return_n_uv
+b13_12h@ros_#
+b13_12h@ros_minus_#
+b13_12h@ros_n_#
+b13_12h@ros_one
+b13_12h@rovn_#
+b13_12h@share_cnt
+b13_12h@str
+b13_12h@str_plus
+b13_6h@exp
+b13_6h@is_return_1
+b13_6h@is_share
+b13_6h@return_n_uv
+b13_6h@ros_#
+b13_6h@ros_minus_#
+b13_6h@ros_n_#
+b13_6h@ros_one
+b13_6h@rovn_#
+b13_6h@share_cnt
+b13_6h@str
+b13_6h@str_plus
+b13_72h@exp
+b13_72h@is_return_1
+b13_72h@is_share
+b13_72h@return_n_uv
+b13_72h@ros_#
+b13_72h@ros_minus_#
+b13_72h@ros_n_#
+b13_72h@ros_one
+b13_72h@rovn_#
+b13_72h@share_cnt
+b13_72h@str
+b13_72h@str_plus
+b14_12h@exp
+b14_12h@is_return_1
+b14_12h@is_share
+b14_12h@return_1_uv
+b14_12h@return_n_uv
+b14_12h@ros_#
+b14_12h@ros1_#
+b14_12h@ros_minus_#
+b14_12h@ros_minus1_#
+b14_12h@ros_n_#
+b14_12h@ros_n1_#
+b14_12h@ros_one
+b14_12h@rovn_#
+b14_12h@rovn1_#
+b14_12h@share_cnt
+b14_12h@str
+b14_12h@str_plus
+b14_1h@exp
+b14_1h@is_return_1
+b14_1h@is_share
+b14_1h@return_1_uv
+b14_1h@return_n_uv
+b14_1h@ros_#
+b14_1h@ros1_#
+b14_1h@ros_minus_#
+b14_1h@ros_minus1_#
+b14_1h@ros_n_#
+b14_1h@ros_n1_#
+b14_1h@ros_one
+b14_1h@rovn_#
+b14_1h@rovn1_#
+b14_1h@share_cnt
+b14_1h@str
+b14_1h@str_plus
+b14_2h@exp
+b14_2h@is_return_1
+b14_2h@is_share
+b14_2h@return_1_uv
+b14_2h@return_n_uv
+b14_2h@ros_#
+b14_2h@ros1_#
+b14_2h@ros_minus_#
+b14_2h@ros_minus1_#
+b14_2h@ros_n_#
+b14_2h@ros_n1_#
+b14_2h@ros_one
+b14_2h@rovn_#
+b14_2h@rovn1_#
+b14_2h@share_cnt
+b14_2h@str
+b14_2h@str_plus
+b14_3h@exp
+b14_3h@is_return_1
+b14_3h@is_share
+b14_3h@return_1_uv
+b14_3h@return_n_uv
+b14_3h@ros_#
+b14_3h@ros1_#
+b14_3h@ros_minus_#
+b14_3h@ros_minus1_#
+b14_3h@ros_n_#
+b14_3h@ros_n1_#
+b14_3h@ros_one
+b14_3h@rovn_#
+b14_3h@rovn1_#
+b14_3h@share_cnt
+b14_3h@str
+b14_3h@str_plus
+b14_6h@exp
+b14_6h@is_return_1
+b14_6h@is_share
+b14_6h@return_1_uv
+b14_6h@return_n_uv
+b14_6h@ros_#
+b14_6h@ros1_#
+b14_6h@ros_minus_#
+b14_6h@ros_minus1_#
+b14_6h@ros_n_#
+b14_6h@ros_n1_#
+b14_6h@ros_one
+b14_6h@rovn_#
+b14_6h@rovn1_#
+b14_6h@share_cnt
+b14_6h@str
+b14_6h@str_plus
+b15_12h@exp
+b15_12h@is_return_1
+b15_12h@is_share
+b15_12h@return_1_uv
+b15_12h@return_n_uv
+b15_12h@ros_#
+b15_12h@ros1_#
+b15_12h@ros_minus_#
+b15_12h@ros_minus1_#
+b15_12h@ros_n_#
+b15_12h@ros_n1_#
+b15_12h@ros_one
+b15_12h@rovn_#
+b15_12h@rovn1_#
+b15_12h@share_cnt
+b15_12h@str
+b15_12h@str_plus
+b15_1h@exp
+b15_1h@is_return_1
+b15_1h@is_share
+b15_1h@return_1_uv
+b15_1h@return_n_uv
+b15_1h@ros_#
+b15_1h@ros1_#
+b15_1h@ros_minus_#
+b15_1h@ros_minus1_#
+b15_1h@ros_n_#
+b15_1h@ros_n1_#
+b15_1h@ros_one
+b15_1h@rovn_#
+b15_1h@rovn1_#
+b15_1h@share_cnt
+b15_1h@str
+b15_1h@str_plus
+b15_2h@exp
+b15_2h@is_return_1
+b15_2h@is_share
+b15_2h@return_1_uv
+b15_2h@return_n_uv
+b15_2h@ros_#
+b15_2h@ros1_#
+b15_2h@ros_minus_#
+b15_2h@ros_minus1_#
+b15_2h@ros_n_#
+b15_2h@ros_n1_#
+b15_2h@ros_one
+b15_2h@rovn_#
+b15_2h@rovn1_#
+b15_2h@share_cnt
+b15_2h@str
+b15_2h@str_plus
+b15_3h@exp
+b15_3h@is_return_1
+b15_3h@is_share
+b15_3h@return_1_uv
+b15_3h@return_n_uv
+b15_3h@ros_#
+b15_3h@ros1_#
+b15_3h@ros_minus_#
+b15_3h@ros_minus1_#
+b15_3h@ros_n_#
+b15_3h@ros_n1_#
+b15_3h@ros_one
+b15_3h@rovn_#
+b15_3h@rovn1_#
+b15_3h@share_cnt
+b15_3h@str
+b15_3h@str_plus
+b15_6h@exp
+b15_6h@is_return_1
+b15_6h@is_share
+b15_6h@return_1_uv
+b15_6h@return_n_uv
+b15_6h@ros_#
+b15_6h@ros1_#
+b15_6h@ros_minus_#
+b15_6h@ros_minus1_#
+b15_6h@ros_n_#
+b15_6h@ros_n1_#
+b15_6h@ros_one
+b15_6h@rovn_#
+b15_6h@rovn1_#
+b15_6h@share_cnt
+b15_6h@str
+b15_6h@str_plus
+b2_6h@exp
+b2_6h@is_return_1
+b2_6h@is_share
+b2_6h@return_n_uv
+b2_6h@ros_#
+b2_6h@ros_minus_#
+b2_6h@ros_n_#
+b2_6h@ros_one
+b2_6h@rovn_#
+b2_6h@share_cnt
+b2_6h@str
+b2_6h@str_plus
+b3_3h@exp
+b3_3h@is_return_1
+b3_3h@is_share
+b3_3h@return_n_uv
+b3_3h@ros_#
+b3_3h@ros_minus_#
+b3_3h@ros_n_#
+b3_3h@ros_one
+b3_3h@rovn_#
+b3_3h@share_cnt
+b3_3h@str
+b3_3h@str_plus
+b3_6h@exp
+b3_6h@is_return_1
+b3_6h@is_share
+b3_6h@return_n_uv
+b3_6h@ros_#
+b3_6h@ros_minus_#
+b3_6h@ros_n_#
+b3_6h@ros_one
+b3_6h@rovn_#
+b3_6h@share_cnt
+b3_6h@str
+b3_6h@str_plus
+b4_12h@exp
+b4_12h@is_return_1
+b4_12h@is_share
+b4_12h@return_n_uv
+b4_12h@ros_#
+b4_12h@ros_minus_#
+b4_12h@ros_n_#
+b4_12h@ros_one
+b4_12h@rovn_#
+b4_12h@share_cnt
+b4_12h@str
+b4_12h@str_plus
+b4_24h@exp
+b4_24h@is_return_1
+b4_24h@is_share
+b4_24h@return_n_uv
+b4_24h@ros_#
+b4_24h@ros_minus_#
+b4_24h@ros_n_#
+b4_24h@ros_one
+b4_24h@rovn_#
+b4_24h@share_cnt
+b4_24h@str
+b4_24h@str_plus
+b4_72h@exp
+b4_72h@is_return_1
+b4_72h@is_share
+b4_72h@return_n_uv
+b4_72h@ros_#
+b4_72h@ros_minus_#
+b4_72h@ros_n_#
+b4_72h@ros_one
+b4_72h@rovn_#
+b4_72h@share_cnt
+b4_72h@str
+b4_72h@str_plus
+b7_12h@exp
+b7_12h@is_return_1
+b7_12h@is_share
+b7_12h@return_n_uv
+b7_12h@ros_#
+b7_12h@ros_minus_#
+b7_12h@ros_n_#
+b7_12h@ros_one
+b7_12h@rovn_#
+b7_12h@share_cnt
+b7_12h@str
+b7_12h@str_plus
+b7_6h@exp
+b7_6h@is_return_1
+b7_6h@is_share
+b7_6h@return_n_uv
+b7_6h@ros_#
+b7_6h@ros_minus_#
+b7_6h@ros_n_#
+b7_6h@ros_one
+b7_6h@rovn_#
+b7_6h@share_cnt
+b7_6h@str
+b7_6h@str_plus
+c9_c1s@ros_#
+c9_c1s@ros_minus_#
+c9_c2s@ros_#
+c9_c2s@ros_minus_#
+c9_l1s@ros_#
+c9_l1s@ros_minus_#
+c9_l2s@ros_#
+c9_l2s@ros_minus_#
+c9@ros_#
+c9@ros_minus_#

+ 33 - 0
rec_scripts/rov/train/scripts/download_data.sh

@@ -0,0 +1,33 @@
+#!/bin/bash
+
+abs_path=$(cd `dirname $0`; pwd)
+start_date=""
+end_date=""
+if(($#==2))
+then
+    start_date=$1
+    end_date=$2
+else
+    start_date=$(date +%Y%m%d -d "-2 $days day")
+    end_date=$start_date
+fi
+
+partition=64
+hdfs_url="hdfs://192.168.141.208:9000"
+hdfs_path="/dw/recommend/model/831_recsys_rov_train_data"
+local_path="${abs_path}/../data"
+for((i=0; i<=21; i++))
+do
+  data_date=$(date -d "$start_date $i day" +"%Y%m%d")
+  if [ "$data_date" -le "$end_date" ]
+  then
+    for((j=0; j<$partition; j++))
+    do
+      hdfs_file=$(printf "%s/%s/part-%05d.gz" $hdfs_path $data_date $j)
+      local_file=$(printf "%s/%s_%05d.csv" $local_path $data_date $j)
+      echo `date` " hdfs dfs -fs $hdfs_url -text $hdfs_file > $local_file"
+      hdfs dfs -fs $hdfs_url -text $hdfs_file > $local_file
+      sleep 1s
+    done
+  fi
+done

+ 114 - 0
rec_scripts/rov/train/scripts/pipline.sh

@@ -0,0 +1,114 @@
+#!/bin/bash
+
+abs_path=$(cd `dirname $0`; pwd)
+hdfs_url="hdfs://192.168.141.208:9000"
+hdfs_path="/dw/recommend/model/831_recsys_rov_train_data"
+hdfs_feature_path="/dw/recommend/model/831_recsys_analysis_data"
+data_path="${abs_path}/../data"
+model_path="${abs_path}/../model"
+monitor_py="${abs_path}/../src/tools/rec_monitor_push.py"
+model_name="推荐模型str+_v1"
+
+run_mode=""
+if(($#==1))
+then
+    run_mode=$1
+else
+    exit -1
+fi
+
+if [[ "$run_mode" != "run" ]]
+then
+    exit -1
+fi
+
+# 0. check data
+try_times=10
+wait_time=300s
+data_size_threshold=25000
+feature_size_threshold=10000
+data_date=$(date +%Y%m%d -d "-1 $days day")
+before_data_date=$(date +%Y%m%d -d "-2 $days day")
+for((i=0; i<=$try_times; i++))
+do
+  hdfs_file=$(printf "%s/%s/part-00063.gz" $hdfs_path $data_date)
+  hdfs dfs -fs $hdfs_url -test -e $hdfs_file
+  if [ $? -ne 0 ]
+  then
+    if [ $i -ge $try_times ]
+    then
+      echo `date` "$hdfs_file is not exist"
+      exit -1
+    else
+      echo `date` "check $i, $hdfs_file is not exist, sleep $wait_time"
+      sleep $wait_time
+    fi
+  else
+    echo `date` "$hdfs_file is exist"
+    data_size=$(hdfs dfs -fs $hdfs_url -text $hdfs_file | wc -l | awk '{print $1}')
+    if [ $data_size -le $data_size_threshold ]
+    then
+      level=error
+      msg=" ${model_name}更新失败, $hdfs_file $data_size <= $data_size_threshold"
+      #python3 $monitor_py --level "$level" --model "$model_name" --msg "$msg"
+      echo `date` "$msg"
+      exit -1
+    fi
+    break
+  fi
+done
+
+# 0.1 download feature
+hdfs_feature_file=$(printf "%s/%s/part-00000.gz" $hdfs_feature_path $data_date)
+hdfs dfs -fs $hdfs_url -test -e $hdfs_feature_file
+if [ $? -ne 0 ]
+then
+  echo `date` "$hdfs_feature_file is not exist"
+  exit -1
+else
+  local_feature_file="${data_path}/feature_${data_date}.csv"
+  hdfs dfs -fs $hdfs_url -text $hdfs_feature_file | grep -v -E '@wh@|@unknown|e1@|b8_6h|b8_12h|b9_6h|b9_12h|c9_.*cate|c9_lr1s' > $local_feature_file &
+  wait
+  sleep 30s
+  feature_size=$(wc -l $local_feature_file | awk '{print $1}')
+  if [ $feature_size -le $feature_size_threshold ]
+  then
+    echo `date` "$local_feature_file, $feature_size <= $feature_size_threshold"
+    exit -1
+  fi
+fi
+
+# 1. download data
+down_sh="${abs_path}/download_data.sh"
+echo `date` "sh +x $down_sh $before_data_date $data_date"
+sh +x $down_sh $before_data_date $data_date &
+wait
+sleep 30s
+
+# 2. train model
+train_sh="${abs_path}/train_model_data.sh"
+sub_model_path="${model_path}/${data_date}"
+if [ ! -d $sub_model_path ]
+then
+  mkdir $sub_model_path
+fi
+train_start_date=$(date -d "$data_date -13 day" +"%Y%m%d")
+train_end_date=$data_date
+save_model_file="${sub_model_path}/model_fm_for_recsys_v1_str.txt"
+echo `date` "sh +x $train_sh $train_start_date $train_end_date $save_model_file"
+sh +x $train_sh $train_start_date $train_end_date $save_model_file $local_feature_file &
+wait
+sleep 30s
+
+# 3. update model
+update_sh="${abs_path}/update_model.sh"
+echo `date` "sh +x $update_sh $save_model_file $model_name"
+sh +x $update_sh $save_model_file $model_name &
+wait
+sleep 30s
+
+# 4. remove data
+remove_sh="${abs_path}/remove_data.sh"
+sh +x $remove_sh
+wait
+

+ 21 - 0
rec_scripts/rov/train/scripts/remove_data.sh

@@ -0,0 +1,21 @@
+#!/bin/bash
+
+abs_path=$(cd `dirname $0`; pwd)
+sample_path="${abs_path}/../data"
+data_date=""
+if(($#==1))
+then
+    data_date=$1
+else
+    data_date=$(date +%Y%m%d -d "-17 $days day")
+fi
+
+# remove
+test_file="${sample_path}/${data_date}_00000.csv"
+if [ ! -f $test_file ]
+then
+    echo `date` "${test_file} is not exist"
+else
+    echo `date` "rm -f ${sample_path}/${data_date}_000*.csv"
+    rm -f ${sample_path}/${data_date}_000*.csv
+fi

+ 25 - 0
rec_scripts/rov/train/scripts/run.sh

@@ -0,0 +1,25 @@
+#!/bin/bash
+
+run_mode=""
+if(($#==1))
+then
+  run_mode=$1
+else
+  exit -1
+fi
+
+if [[ "$run_mode" != "run" ]]
+then
+  exit -1
+fi
+
+dd=$(date +%Y%m%d)
+
+# 0. abs path
+abs_path=$(cd `dirname $0`; pwd)
+log_file="${abs_path}/../logs/${dd}.log"
+
+# 1. pipline
+pipline_sh="${abs_path}/pipline.sh"
+echo `date` "sh +x $pipline_sh run >> $log_file"
+sh +x $pipline_sh run >> $log_file 2>&1 &

+ 79 - 0
rec_scripts/rov/train/scripts/train_model_data.sh

@@ -0,0 +1,79 @@
+#!/bin/bash
+
+abs_path=$(cd `dirname $0`; pwd)
+sample_path="${abs_path}/../data"
+model_path="${abs_path}/../model"
+
+start_date=20250721
+end_date=20250727
+model_file="${model_path}/str_fm.txt"
+feature_file="${sample_path}/feature.csv"
+if(($#==4))
+then
+    start_date=$1
+    end_date=$2
+    model_file=$3
+    feature_file=$4
+else
+    start_date=$(date +%Y%m%d -d "-15 $days day")
+    end_date=$(date +%Y%m%d -d "-2 $days day")
+fi
+
+sample_file_list=""
+for((i=0; i<=21; i++))
+do
+  data_date=$(date -d "$start_date $i day" +"%Y%m%d")
+  if [ "$data_date" -le "$end_date" ]
+  then
+    tmp_path=$sample_path
+    test_file="${tmp_path}/${data_date}_00000.csv"
+    if [ ! -f $test_file ]
+    then
+      tmp_path="/data2/jch/project/return_of_view_v6/data"
+      test_file="${tmp_path}/${data_date}_00000.csv"
+    fi
+
+    if [ -f $test_file ]
+    then
+      sample_file="${tmp_path}/${data_date}_00*.csv"
+      if [[ -z $sample_file_list ]]
+      then
+        sample_file_list=$sample_file
+      else
+        sample_file_list="$sample_file_list $sample_file"
+      fi
+    fi
+  fi
+done
+
+# filter feature
+filter_feature="${abs_path}/../data/filter_feature.csv"
+if [ -f $filter_feature ]
+then
+    local_feature="${feature_file}.tmp"
+    awk '{if(NR==FNR){a[$1]=1}else{if(!($1 in a)){print}}}' $filter_feature $feature_file > $local_feature
+    wait
+    sleep 20s
+    feature_file=$local_feature
+fi
+
+# model name
+model_prefix=`echo $model_file | awk -F".txt" '{print $1}'`
+off_model_file="${model_prefix}_offline.txt"
+
+# train
+#trainTools="/data2/jch/package/alphaFM/bin/fm_train"
+trainTools="/data2/jch/package/onlyTest/alphaFM/bin/fm_train"
+trainParam="-dim 1,1,8 -core 24"
+
+echo `date` "cat $sample_file_list | $trainTools -m $off_model_file $trainParam"
+cat $sample_file_list | $trainTools -m $off_model_file $trainParam -feature $feature_file -min_feature_cnt 200 &
+wait
+sleep 30s
+
+# transform model
+trans_py="${abs_path}/../src/tools/offline2online_fm.py"
+echo `date` "python3 $trans_py --input $off_model_file --output $model_file"
+python3 $trans_py --input $off_model_file --output $model_file &
+wait
+sleep 30s
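
A minimal invocation sketch for the training step above (normally driven by pipline.sh; dates and paths are illustrative). Note that -feature and -min_feature_cnt are passed to a local alphaFM build under /data2/jch/package/onlyTest/alphaFM, so they may not exist in a stock alphaFM fm_train:

    sh +x rec_scripts/rov/train/scripts/train_model_data.sh \
      20251014 20251027 \
      model/20251027/model_fm_for_recsys_v1_str.txt \
      data/feature_20251027.csv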

+ 88 - 0
rec_scripts/rov/train/scripts/update_model.sh

@@ -0,0 +1,88 @@
+#!/bin/bash
+
+abs_path=$(cd `dirname $0`; pwd)
+sample_path="${abs_path}/../data"
+model_path="${abs_path}/../model"
+predict_tools="/data2/jch/package/alphaFM/bin/fm_predict"
+predict_param="-core 16"
+update_py="${abs_path}/../src/tools/update2oss.py"
+monitor_py="${abs_path}/../src/tools/rec_monitor_push.py"
+
+auc_threshold=0.05
+online_model_file=""
+push_model_name="推荐模型str+"
+if(($#==2))
+then
+  online_model_file=$1
+  push_model_name=$2
+else
+  exit -1
+fi
+
+# model size
+model_size=$(wc -l $online_model_file| awk '{print $1}')
+if [ $model_size -le 10000 ]
+then
+  echo "$online_model_file lines <= 10000"
+  exit -1
+fi
+
+# name & date
+model_name=$(echo $online_model_file | awk -F"/" '{print $NF}' | awk -F".txt" '{print $1}')
+new_model_date=$(echo $online_model_file | awk -F"/" '{print $(NF-1)}')
+old_model_date=$(date -d "$new_model_date -1 day" +"%Y%m%d")
+
+# offline model file
+new_off_model_file="${model_path}/${new_model_date}/${model_name}_offline.txt"
+old_off_model_file="${model_path}/${old_model_date}/${model_name}_offline.txt"
+
+# eval model
+check_file="${sample_path}/${new_model_date}_00063.csv"
+if [ -f $check_file ]
+then
+  eval_file_list="${sample_path}/${new_model_date}_00*.csv"
+
+  # eval new
+  new_predict_file="${sample_path}/new_predict.txt"
+  echo `date` "cat $eval_file_list | $predict_tools -m $new_off_model_file $predict_param -out $new_predict_file"
+  echo
+  cat $eval_file_list | $predict_tools -m $new_off_model_file $predict_param -out $new_predict_file &
+  wait
+  sleep 30s
+
+  # eval old
+  old_predict_file="${sample_path}/old_predict.txt"
+  echo `date` "cat $eval_file_list | $predict_tools -m $old_off_model_file $predict_param -out $old_predict_file"
+  echo
+  cat $eval_file_list | $predict_tools -m $old_off_model_file $predict_param -out $old_predict_file &
+  wait
+  sleep 30s
+
+  # compare auc
+  auc_tools="${abs_path}/../src/tools/cal_auc.py"
+  new_auc=$(python3 $auc_tools --input $new_predict_file)
+  old_auc=$(python3 $auc_tools --input $old_predict_file)
+  diff_auc=$(echo $new_auc $old_auc | awk '{print $1-$2}')
+  flag=$(echo $diff_auc $auc_threshold | awk '{if($1>-$2&&$1<$2){print 1}else{print 0}}')
+  if(($flag==1))
+  then
+    # upload the model
+    echo `date` "python3.6 $update_py --local $online_model_file"
+    python3.6 $update_py --local $online_model_file &
+    wait
+    # send notification
+    level=info
+    msg=" ${push_model_name}更新完成"
+    msg+="\n\t - 新模型AUC: ${new_auc}"
+    msg+="\n\t - 老模型AUC: ${old_auc}"
+  else
+    echo "$diff_auc $auc_threshold"
+    # 发送通知
+    level=error
+    msg=" ${push_model_name}更新失败, ${new_model_date}_auc差异为: ${diff_auc}, 大于$auc_threshold"
+    msg+="\n\t - 新模型AUC: ${new_auc}"
+    msg+="\n\t - 老模型AUC: ${old_auc}"
+  fi
+  echo `date` "python3 $monitor_py --level $level --model $push_model_name --msg $msg"
+  python3 $monitor_py --level "$level" --model "$push_model_name" --msg "$msg"
+fi
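
For intuition on the guard above (hypothetical AUC values): with auc_threshold=0.05, new_auc=0.7312 and old_auc=0.7298 give diff_auc=0.0014, which lies inside (-0.05, 0.05), so flag=1 and the new model is uploaded with an info notification; a new_auc of 0.6704 would give diff_auc=-0.0594, flag=0, and the error notification instead.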

+ 27 - 0
rec_scripts/rov/train/src/tools/cal_auc.py

@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+# coding=utf-8
+import argparse
+
+from sklearn.metrics import roc_auc_score
+
+
+def evaluate(evaluate_file):
+    true_y = []
+    pred_y = []
+    with open(evaluate_file) as input_fp:
+        for line in input_fp:
+            cells = line.strip().split(' ')
+            if len(cells) < 2:
+                return False
+
+            true_y.append(float(cells[0]))
+            pred_y.append(float(cells[-1]))
+    auc = roc_auc_score(true_y, pred_y)
+    print(auc)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input', required=True, type=str, help='input file')
+    args = parser.parse_args()
+    evaluate(args.input)
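
A usage sketch matching how update_model.sh above calls this script: the input is the fm_predict output, the first space-separated field of each line is taken as the true label and the last as the predicted score (the layout fm_predict is assumed to produce), and the script prints a single AUC value:

    python3 rec_scripts/rov/train/src/tools/cal_auc.py --input new_predict.txt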

+ 33 - 0
rec_scripts/rov/train/src/tools/offline2online_fm.py

@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+import argparse
+
+
+def format_model(input_file, output_file):
+    out_fp = open(output_file, 'w')
+    with open(input_file) as in_fp:
+        index = 0
+        for line in in_fp:
+            index += 1
+            line = line.strip()
+            cells = line.split(' ')
+            if index == 1:
+                cells = cells[:2]
+            else:
+                cells = cells[:10]
+            new_line = '%s\n' % '\t'.join(cells)
+            out_fp.write(new_line)
+    out_fp.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input', required=True, type=str, help='input file')
+    parser.add_argument('--output', required=True, type=str, help='output file')
+    args = parser.parse_args()
+    print('\n\n')
+    print(args)
+    print('\n\n')
+
+    format_model(args.input, args.output)
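
A usage sketch for the conversion above (file names follow pipline.sh/train_model_data.sh). The truncation keeps 2 fields of the first (bias) line and 10 fields of every other line, re-joined with tabs, which would match a "-dim 1,1,8" alphaFM model (feature name, w, and an 8-dimensional v vector) with the optimizer-state columns dropped; that reading of the alphaFM offline format is an assumption:

    python3 rec_scripts/rov/train/src/tools/offline2online_fm.py \
      --input model_fm_for_recsys_v1_str_offline.txt \
      --output model_fm_for_recsys_v1_str.txt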

+ 89 - 0
rec_scripts/rov/train/src/tools/rec_monitor_push.py

@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+# coding=utf-8
+import argparse
+import json
+from datetime import datetime
+
+import requests
+
+server_robot = {
+    'webhook': 'https://open.feishu.cn/open-apis/bot/v2/hook/c28a6fea-2323-4d09-94fb-943e5ccdb61b',
+}
+
+level_header_template_map = {
+    "info": "turquoise",
+    "error": "red",
+    "warn": "yellow"
+}
+
+level_header_title_suffix_map = {
+    "info": "自动更新通知",
+    "error": "自动更新告警",
+    "warn": "自动更新告警"
+}
+
+level_task_status_map = {
+    "info": "任务执行成功",
+    "error": "任务执行失败",
+    "warn": "任务执行失败",
+}
+
+
+def send_card_msg_to_feishu(webhook, card_json):
+    """发送消息到飞书"""
+    headers = {'Content-Type': 'application/json'}
+    payload_message = {
+        "msg_type": "interactive",
+        "card": card_json
+    }
+    print(f"推送飞书消息内容: {json.dumps(payload_message)}")
+    response = requests.request('POST', url=webhook, headers=headers, data=json.dumps(payload_message))
+    print(response.text)
+
+
+def _monitor(level, model, msg: str):
+    """消息推送"""
+    now = datetime.now()
+    msg = msg.replace("\\n", "\n").replace("\\t", "\t")
+    mgs_text = f"- 当前时间: {now.strftime('%Y-%m-%d %H:%M:%S')}" \
+               f"\n- 任务状态: {level_task_status_map[level]}" \
+               f"\n- 任务描述: {msg}"
+    card_json = {
+        "schema": "2.0",
+        "header": {
+            "title": {
+                "tag": "plain_text",
+                "content": model + level_header_title_suffix_map[level]
+            },
+            "template": level_header_template_map[level]
+        },
+        "body": {
+            "elements": [
+                {
+                    "tag": "markdown",
+                    "content": mgs_text,
+                    "text_align": "left",
+                    "text_size": "normal",
+                    "element_id": "overview"
+                }
+            ]
+        }
+    }
+    send_card_msg_to_feishu(
+        webhook=server_robot.get('webhook'),
+        card_json=card_json
+    )
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='告警Utils')
+    parser.add_argument('--level', type=str, help='通知级别, info, warn, error', required=True)
+    parser.add_argument('--model', type=str, help='消息', required=True)
+    parser.add_argument('--msg', type=str, help='消息', required=True)
+    args = parser.parse_args()
+
+    _monitor(
+        level=args.level,
+        model=args.model,
+        msg=args.msg
+    )

+ 44 - 0
rec_scripts/rov/train/src/tools/update2oss.py

@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+import argparse
+import os
+
+import oss2
+from oss2.credentials import EnvironmentVariableCredentialsProvider
+
+os.environ['OSS_ACCESS_KEY_ID'] = 'LTAIP6x1l3DXfSxm'
+os.environ['OSS_ACCESS_KEY_SECRET'] = 'KbTaM9ars4OX3PMS6Xm7rtxGr1FLon'
+endpoint = 'https://oss-cn-hangzhou.aliyuncs.com'
+bucket_name = 'art-recommend'
+region = 'cn-hangzhou'
+oss_sub_path = 'zhangbo'
+
+
+def get_bucket():
+    auth = oss2.ProviderAuthV4(EnvironmentVariableCredentialsProvider())
+    return oss2.Bucket(auth, endpoint, bucket_name, region=region)
+
+
+def get_filename(file_path):
+    return file_path[file_path.rfind('/') + 1:]
+
+
+def update(local_file):
+    bucket = get_bucket()
+    filename = get_filename(local_file)
+    oss_dest_file = '%s/%s' % (oss_sub_path, filename)
+    bucket.put_object_from_file(oss_dest_file, local_file)
+    print('%s -> %s' % (local_file, oss_dest_file))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--local', required=True, type=str, help='local file')
+    args = parser.parse_args()
+    print('\n\n')
+    print(args)
+    print('\n\n')
+
+    if os.path.exists(args.local):
+        update(args.local)

+ 36 - 0
rec_scripts/run.sh

@@ -0,0 +1,36 @@
+#!/bin/bash
+
+run_mode=""
+if(($#==1))
+then
+  run_mode=$1
+else
+  exit -1
+fi
+
+if [[ "$run_mode" != "run" ]]
+then
+  exit -1
+fi
+
+dd=$(date +%Y%m%d)
+
+
+# 0. abs path
+abs_path=$(cd `dirname $0`; pwd)
+log_file="${abs_path}/logs/${dd}.log"
+
+
+# 1. make data
+make_sh="${abs_path}/make_data.sh"
+echo `date` "sh +x $make_sh >> $log_file"
+sh +x $make_sh >> $log_file 2>&1 &
+wait
+sleep 30s
+
+
+# 2. rov batch
+rov_batch_sh="${abs_path}/rov/sample/scripts/rov_batch.sh"
+echo `date` "sh +x $rov_batch_sh run"
+sh +x $rov_batch_sh run >> $log_file 2>&1 &
+wait

+ 31 - 0
rec_scripts/run_t0.sh

@@ -0,0 +1,31 @@
+#!/bin/bash
+
+run_mode=""
+if(($#==1))
+then
+  run_mode=$1
+else
+  exit -1
+fi
+
+if [[ "$run_mode" != "run" ]]
+then
+  exit -1
+fi
+
+dd=$(date +%Y%m%d)
+start_date=$dd
+end_date=$start_date
+start_hour=00
+end_hour=23
+
+# 0. abs path
+abs_path=$(cd `dirname $0`; pwd)
+log_file="${abs_path}/logs/${dd}_t0.log"
+
+# 1. make data
+table=dwd_recsys_alg_sample_all_20250905
+make_sh="${abs_path}/make_data.sh"
+echo `date` "sh +x $make_sh $start_date $end_date $start_hour $end_hour $table" >> $log_file
+sh +x $make_sh $start_date $end_date $start_hour $end_hour $table >> $log_file 2>&1 &
+wait

+ 38 - 0
rec_scripts/run_t1.sh

@@ -0,0 +1,38 @@
+#!/bin/bash
+
+run_mode=""
+if(($#==1))
+then
+  run_mode=$1
+else
+  exit -1
+fi
+
+if [[ "$run_mode" != "run" ]]
+then
+  exit -1
+fi
+
+dd=$(date +%Y%m%d)
+start_date=$(date +%Y%m%d -d "-1 $days day")
+end_date=$start_date
+start_hour=17
+end_hour=23
+
+# 0. abs path
+abs_path=$(cd `dirname $0`; pwd)
+log_file="${abs_path}/logs/${dd}_t1.log"
+
+# 1. make data
+table=dwd_recsys_alg_sample_all_20250905
+make_sh="${abs_path}/make_data.sh"
+echo `date` "sh +x $make_sh $start_date $end_date $start_hour $end_hour $table" >> $log_file
+sh +x $make_sh $start_date $end_date $start_hour $end_hour $table >> $log_file 2>&1 &
+wait
+sleep 30s
+
+# 2. rov batch
+rov_batch_sh="${abs_path}/rov/sample/scripts/rov_batch_t1.sh"
+echo `date` "sh +x $rov_batch_sh run" >> $log_file
+sh +x $rov_batch_sh run >> $log_file 2>&1 &
+wait

+ 0 - 18
scripts/readme.txt

@@ -1,18 +0,0 @@
-The latest code in use is: 82
-
-1. make_data.sh
-	generates feature data
-
-2. nor
-	1. nor_sample.sh
-		generates (return_n_uv_noself / is_return_n_noself) samples (mind the feature file and feature-bucket file that need to be extracted)
-
-	2. nor_train.sh
-		trains the nor model with xgboost; the feature file used must stay consistent between training and testing
-		model training git: https://git.yishihui.com/algorithm/recommend-model.git
-
-	3. nor_predict.sh
-		evaluates the nor model; the feature file used must stay consistent between training and testing
-
-	4. download the model
-		download the model from hdfs (the path configured for training)