
Automated recommendation-model update: feature-bucketing data production

Joe 9 months ago
parent
commit
6af6fd6d9a

+ 11 - 10
qiaojialiang/handle_rov.sh

@@ -6,11 +6,11 @@ set -ex
 # source data table name
 #table='alg_recsys_sample_all'
 table='alg_recsys_sample_all_test'
-# partition range to process
-beginStr="$(date -d '1 days ago' +%Y%m%d)"
-endStr="$(date -d '1 days ago' +%Y%m%d)"
-beginHhStr=08
-endHhStr=08
+# Partition range to process. Recommendation data lands with a one-day lag, so a run at 00:00 on the 5th uses the 3rd's hour 00-23 data to produce the new model data.
+begin_early_2_Str="$(date -d '2 days ago' +%Y%m%d)"
+end_early_2_Str="$(date -d '2 days ago' +%Y%m%d)"
+beginHhStr=00
+endHhStr=23
 # absolute HDFS output paths for each stage
 originDataPath=/dw/recommend/model/13_sample_data/
 valueDataPath=/dw/recommend/model/14_feature_data/
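
With the renamed variables, both dates shift back two days while the hour bounds now span the whole day. A minimal sketch of how the partition strings resolve, assuming a run at 00:00 on 20240505:

    $ date -d '2 days ago' +%Y%m%d     # evaluated on 20240505
    20240503
    # so the downstream steps see:
    #   begin partition: 2024050300
    #   end partition:   2024050323
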
@@ -19,9 +19,10 @@ bucketDataPath=/dw/recommend/model/16_train_data/
 # 0 Wait until the upstream table has finished producing, giving up at 12:00 at the latest
 # shellcheck disable=SC2039
 source /root/anaconda3/bin/activate py37
-echo "----------step1------------开始校验是否生产完数据,分区信息:beginStr:${beginStr}${beginHhStr},endStr:${endStr}${endHhStr}"
+# shellcheck disable=SC2154
+echo "----------step1------------开始校验是否生产完数据,分区信息:begin_early_2_Str:${begin_early_2_Str}${beginHhStr},end_early_2_Str:${end_early_2_Str}${endHhStr}"
 while true; do
-  python_return_code=$(python /root/joe/recommend-emr-dataprocess/qiaojialiang/checkHiveDataUtil.py --table ${table} --beginStr ${beginStr}${beginHhStr} --endStr ${endStr}${endHhStr})
+  python_return_code=$(python /root/joe/recommend-emr-dataprocess/qiaojialiang/checkHiveDataUtil.py --table ${table} --begin_early_2_Str ${begin_early_2_Str}${beginHhStr} --end_early_2_Str ${end_early_2_Str}${endHhStr})
   echo "python 返回值:${python_return_code}"
   if [ $python_return_code -eq 0 ]; then
     echo "Python程序返回0,校验存在数据,退出循环。"
@@ -45,7 +46,7 @@ echo "----------step2------------start producing raw data from ${table}"
 --master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
 ../target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 tablePart:64 repartition:32 \
-beginStr:${beginStr}${beginHhStr} endStr:${endStr}${endHhStr} \
+beginStr:${begin_early_2_Str}${beginHhStr} endStr:${end_early_2_Str}${endHhStr} \
 savePath:${originDataPath} \
 table:${table}
 if [ $? -ne 0 ]; then
@@ -64,7 +65,7 @@ echo "----------step3------------start feature-value joining"
 ../target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 readPath:${originDataPath} \
 savePath:${valueDataPath} \
-beginStr:${beginStr} endStr:${endStr} repartition:1000
+beginStr:${begin_early_2_Str} endStr:${end_early_2_Str} repartition:1000
 if [ $? -ne 0 ]; then
    echo "Spark特征值拼接处理任务执行失败"
    exit 1
@@ -80,7 +81,7 @@ echo "----------step4------------start producing re-scoring feature data from the feature buckets"
 ../target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 readPath:${valueDataPath} \
 savePath:${bucketDataPath} \
-beginStr:${beginStr} endStr:${endStr} repartition:1000
+beginStr:${begin_early_2_Str} endStr:${end_early_2_Str} repartition:1000
 if [ $? -ne 0 ]; then
    echo "Spark特征分桶处理任务执行失败"
    exit 1
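
Nothing in this commit shows how the script is triggered; the comment's "00:00 on the 5th" example implies a midnight schedule. A hypothetical crontab entry consistent with that (the schedule and log path are assumptions):

    # assumed scheduling, not part of this commit
    0 0 * * * bash /root/joe/recommend-emr-dataprocess/qiaojialiang/handle_rov.sh >> /root/joe/handle_rov.log 2>&1
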

+ 2 - 2
qiaojialiang/demo01.py → qiaojialiang/test/demo01.py

@@ -12,8 +12,8 @@ n1 = 1
 n2 = 5
 
 # paths to the shell scripts
-script1_path = "./script1.sh"
-script2_path = "./script2.sh"
+script1_path = "script1.sh"
+script2_path = "script2.sh"
 
 # open the log file for writing
 with open(log_file, 'w') as f:
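
Whether dropping the ./ prefix is safe depends on how demo01.py launches the scripts, which this hunk does not show. A quick shell illustration of the lookup rules, assuming the new qiaojialiang/test/ location:

    cd qiaojialiang/test
    bash script1.sh    # interpreter operand: resolved against the current directory
    ./script1.sh       # explicit relative path: resolved against the current directory
    script1.sh         # bare name: exec-style lookup searches PATH, not the current directory
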

+ 0 - 0
qiaojialiang/script1.sh → qiaojialiang/test/script1.sh


+ 0 - 0
qiaojialiang/script2.sh → qiaojialiang/test/script2.sh