
Merge branch 'master' into feature/zhangbo_makedata_v2

zhangbo 4 months ago
parent
commit
adbdd80996
72 changed files with 6863 additions and 30 deletions
  1. .gitignore (+45 -0)
  2. ad/00_common.sh (+16 -0)
  3. ad/01_ad_model_update.sh (+417 -0)
  4. ad/02_ad_model_update_test.sh (+21 -0)
  5. ad/21_ad_model_add_dt_train_predict_auc.sh (+71 -0)
  6. ad/22_ad_model_predict_auc.sh (+60 -0)
  7. ad/23_ad_model_batch_calc_cid_score_avg.sh (+29 -0)
  8. ad/24_supplementary_data.sh (+99 -0)
  9. ad/25_xgb_make_data_origin_bucket.sh (+87 -0)
  10. ad/30_delete_timer_file.sh (+75 -0)
  11. ad/ad_monitor_util.py (+141 -0)
  12. ad/ad_utils.py (+64 -0)
  13. ad/holidays.txt (+53 -0)
  14. ad/model_predict_analyse.py (+198 -0)
  15. pom.xml (+6 -2)
  16. recommend/01_recommend_model_new_train.sh (+46 -0)
  17. recommend/02_train_go.sh (+52 -0)
  18. recommend/03_predict.sh (+14 -0)
  19. recommend/20_vid_avg_score.sh (+67 -0)
  20. recommend/21_make_data_new_table.sh (+89 -0)
  21. recommend/22_supplementary_data_new_table.sh (+78 -0)
  22. spark-examples.iml (+8 -0)
  23. src/main/java/examples/sparksql/SparkAdCTRSampleTester.java (+62 -0)
  24. src/main/java/examples/sparksql/SparkAdCVRSampleLoader.java (+99 -0)
  25. src/main/java/examples/sparksql/SparkAdCVRSampleTester.java (+59 -0)
  26. src/main/java/examples/sparksql/SparkAdFeaToRedisHourLoader.java (+95 -0)
  27. src/main/java/examples/utils/AdUtil.java (+67 -0)
  28. src/main/java/examples/utils/DateTimeUtil.java (+22 -0)
  29. src/main/resources/20240718_ad_bucket_517.txt (+8 -0)
  30. src/main/resources/20240718_ad_bucket_688.txt (+9 -0)
  31. src/main/resources/20240718_ad_feature_name.txt (+689 -0)
  32. src/main/resources/20240718_ad_feature_name_517.txt (+518 -0)
  33. src/main/resources/weight_ad_feature_name.txt (+1 -0)
  34. src/main/scala/com/aliyun/odps/spark/ad/xgboost/v20240808/XGBoostTrain.scala (+131 -0)
  35. src/main/scala/com/aliyun/odps/spark/examples/SparkPi.scala (+16 -16)
  36. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_31_originData_20240620.scala (+1 -1)
  37. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_32_bucket_20240622.scala (+1 -1)
  38. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_33_bucketDataPrint_20240628.scala (+1 -1)
  39. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_33_bucketData_20240622.scala (+2 -2)
  40. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_31_originData_20240718.scala (+431 -0)
  41. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_32_bucket_20240718.scala (+105 -0)
  42. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketDataPrint_20240718.scala (+429 -0)
  43. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240718.scala (+128 -0)
  44. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240718_sample.scala (+135 -0)
  45. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240726.scala (+158 -0)
  46. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240729.scala (+152 -0)
  47. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240729_copy_zheng.scala (+181 -0)
  48. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240729_reduce_feature.scala (+129 -0)
  49. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_default_value_20240718.scala (+140 -0)
  50. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_34_statistics_20241111.scala (+24 -0)
  51. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/xgb/makedata_31_bucketDataPrint_20240821.scala (+549 -0)
  52. src/main/scala/com/aliyun/odps/spark/examples/makedata_qiao/makedata_13_originData_20240705.scala (+278 -0)
  53. src/main/scala/com/aliyun/odps/spark/examples/makedata_qiao/makedata_14_valueData_20240705.scala (+91 -0)
  54. src/main/scala/com/aliyun/odps/spark/examples/makedata_qiao/makedata_16_bucketData_20240705.scala (+127 -0)
  55. src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_43_bucketData_20240709.scala (+1 -1)
  56. src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_43_bucketData_20240709_vid.scala (+141 -0)
  57. src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_43_bucketData_fu_sample_20240709.scala (+136 -0)
  58. src/main/scala/com/aliyun/odps/spark/examples/myUtils/ParamUtils.scala (+1 -1)
  59. src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-广告 (+4 -4)
  60. src/main/scala/com/tzld/recommend/recall/algo/CollaborativeFilteringAlgo.scala (+5 -0)
  61. zhangbo/01_train.sh (+0 -0)
  62. zhangbo/02_train_go.sh (+0 -0)
  63. zhangbo/03_predict.sh (+0 -0)
  64. zhangbo/04_upload.sh (+0 -0)
  65. zhangbo/05_update_everyday_2model.sh (+0 -0)
  66. zhangbo/05_update_everyday_str.sh (+0 -0)
  67. zhangbo/06_update_everyday_feature.sh (+0 -0)
  68. zhangbo/50_delete_hdfs.sh (+0 -0)
  69. zhangbo/train.sh (+0 -0)
  70. zhangbo/up.sh (+0 -0)
  71. zhangbo/up2.sh (+0 -0)
  72. zhangbo/utils.py (+1 -1)

+ 45 - 0
.gitignore

@@ -0,0 +1,45 @@
+HELP.md
+target/
+!.mvn/wrapper/maven-wrapper.jar
+!**/src/main/**/target/
+!**/src/test/**/target/
+
+### STS ###
+.apt_generated
+.classpath
+.factorypath
+.project
+.settings
+.springBeans
+.sts4-cache
+
+### IntelliJ IDEA ###
+.idea
+*.iws
+*.iml
+*.ipr
+
+### NetBeans ###
+/nbproject/private/
+/nbbuild/
+/dist/
+/nbdist/
+/.nb-gradle/
+build/
+!**/src/main/**/build/
+!**/src/test/**/build/
+
+### VS Code ###
+.vscode/
+
+apollo-cache-dir
+sentinel
+weblog
+xxl-job
+
+.DS_Store
+logs
+
+model
+predict
+.idea

+ 16 - 0
ad/00_common.sh

@@ -0,0 +1,16 @@
+#!/bin/sh
+
+is_not_holidays() {
+    if [ -z "$1" ]; then
+        echo "0"
+        return
+    fi
+    
+    path=$(dirname $0)
+
+    if grep -w "$1" "${path}/holidays.txt" > /dev/null; then
+        echo "0"
+    else
+        echo "1"
+    fi 
+}
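Note: a minimal usage sketch of is_not_holidays (the date below is a placeholder; the function echoes "1" for dates not listed in ad/holidays.txt and "0" otherwise, which is how ad/01_ad_model_update.sh consumes it):

    source ./ad/00_common.sh
    dt=20241001    # placeholder date
    if [ "$(is_not_holidays ${dt})" -eq 1 ]; then
        echo "${dt} is not a holiday, keep it as a training partition"
    else
        echo "${dt} is a holiday, skip it"
    fi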

+ 417 - 0
ad/01_ad_model_update.sh

@@ -0,0 +1,417 @@
+#!/bin/sh
+set -x
+
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+sh_path=$(cd $(dirname $0); pwd)
+source ${sh_path}/00_common.sh
+
+source /root/anaconda3/bin/activate py37
+
+
+# 全局常量
+LOG_PREFIX=广告模型训练任务
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+TRAIN_PATH=/dw/recommend/model/31_ad_sample_data_v4
+BUCKET_FEATURE_PATH=/dw/recommend/model/33_ad_train_data_v4
+TABLE=alg_recsys_ad_sample_all
+# 特征文件名
+feature_file=20240703_ad_feature_name.txt
+# 模型本地临时保存路径
+model_local_home=/root/zhaohp/XGB/
+
+# 模型HDFS保存路径,测试时修改为其他路径,避免影响线上
+MODEL_PATH=/dw/recommend/model/35_ad_model
+# 预测结果保存路径,测试时修改为其他路径,避免影响线上
+PREDICT_RESULT_SAVE_PATH=/dw/recommend/model/34_ad_predict_data
+# 模型OSS保存路径,测试时修改为其他路径,避免影响线上
+MODEL_OSS_PATH=oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/
+# 线上模型名,测试时修改为其他模型名,避免影响线上
+model_name=model_xgb_351_1000_v2
+# 线上校准文件名
+OSS_CALIBRATION_FILE_NAME=model_xgb_351_1000_v2_calibration
+# 用于存放一些临时的文件
+PREDICT_CACHE_PATH=/root/zhaohp/XGB/predict_cache
+
+
+# 本地保存HDFS模型路径文件,测试时修改为其他模型名,避免影响线上
+model_path_file=${model_local_home}/online_model_path.txt
+# 获取当前是星期几,1表示星期一
+current_day_of_week="$(date +"%u")"
+
+# 任务开始时间
+start_time=$(date +%s)
+# 前一天
+today_early_1="$(date -d '1 days ago' +%Y%m%d)"
+# 线上模型在HDFS中的路径
+online_model_path=`cat ${model_path_file}`
+# 训练用的数据路径
+train_data_path=""
+# 评估用的数据路径
+predict_date_path=""
+#评估结果保存路径
+new_model_predict_result_path=""
+# 模型保存路径
+model_save_path=""
+# 评测结果保存路径,后续需要根据此文件评估是否要更新模型
+predict_analyse_file_path=""
+# 校准文件保存路径
+calibration_file_path=""
+
+# 保存模型评估的分析结果
+old_incr_rate_avg=0
+new_incr_rate_avg=0
+# Top10的详情
+top10_msg=""
+# AUC值
+old_auc=0
+new_auc=0
+
+declare -A real_score_map
+declare -A old_score_map
+declare -A new_score_map
+
+# 校验命令的退出码
+check_run_status() {
+    local status=$1
+    local step_start_time=$2
+    local step_name=$3
+    local msg=$4
+
+    local step_end_time=$(date +%s)
+    local step_elapsed=$(($step_end_time - $step_start_time))
+
+    if [[ -n "${old_auc}" && "${old_auc}" != "0" ]]; then
+      msg+="\n\t - 老模型AUC: ${old_auc}"
+    fi
+    if [[ -n "${new_auc}" && "${new_auc}" != "0" ]]; then
+      msg+="\n\t - 新模型AUC: ${new_auc}"
+    fi
+
+
+    if [ ${status} -ne 0 ]; then
+        echo "${LOG_PREFIX} -- ${step_name}失败: 耗时 ${step_elapsed}"
+        local elapsed=$(($step_end_time - $start_time))
+        /root/anaconda3/bin/python ${sh_path}/ad_monitor_util.py --level error --msg "${msg}" --start "${start_time}" --elapsed "${elapsed}" --top10 "${top10_msg}"
+        exit 1
+    else
+        echo "${LOG_PREFIX} -- ${step_name}成功: 耗时 ${step_elapsed}"
+    fi
+}
+
+send_success_upload_msg(){ 
+  # 发送更新成功通知
+  local msg=" 广告模型文件更新完成"
+  msg+="\n\t - 老模型AUC: ${old_auc}"
+  msg+="\n\t - 新模型AUC: ${new_auc}"
+  msg+="\n\t - 老模型Top10差异平均值: ${old_incr_rate_avg}"
+  msg+="\n\t - 新模型Top10差异平均值: ${new_incr_rate_avg}"
+  msg+="\n\t - 模型在HDFS中的路径: ${model_save_path}"
+  msg+="\n\t - 模型上传OSS中的路径: ${MODEL_OSS_PATH}/${model_name}.tar.gz"
+
+  local step_end_time=$(date +%s)
+  local elapsed=$((${step_end_time} - ${start_time}))
+
+  /root/anaconda3/bin/python ${sh_path}/ad_monitor_util.py --level info --msg "${msg}" --start "${start_time}" --elapsed "${elapsed}" --top10 "${top10_msg}"
+}
+
+init() {
+  
+  declare -a date_keys=()
+  local count=1
+  local current_data="$(date -d '2 days ago' +%Y%m%d)"
+  # 循环获取前 n 天的非节日日期
+  while [[ ${count} -le 7 ]]; do
+    date_key=$(date -d "${current_data}" +%Y%m%d)
+    # 判断是否是节日,并拼接训练数据路径
+    if [ $(is_not_holidays ${date_key}) -eq 1 ]; then
+
+      # 将 date_key 放入数组
+      date_keys+=("${date_key}")
+
+      if [[ -z ${train_data_path} ]]; then
+        train_data_path="${BUCKET_FEATURE_PATH}/${date_key}"
+      else
+        train_data_path="${BUCKET_FEATURE_PATH}/${date_key},${train_data_path}"
+      fi 
+      count=$((count + 1))
+    else
+      echo "日期: ${date_key}是节日,跳过"
+    fi
+    current_data=$(date -d "${current_data} -1 day" +%Y%m%d)
+  done
+
+  last_index=$((${#date_keys[@]} - 1))
+  train_first_day=${date_keys[$last_index]}
+  train_last_day=${date_keys[0]}
+
+  model_save_path=${MODEL_PATH}/${model_name}_${train_first_day: -4}_${train_last_day: -4}
+  predict_date_path=${BUCKET_FEATURE_PATH}/${today_early_1}
+  new_model_predict_result_path=${PREDICT_RESULT_SAVE_PATH}/${today_early_1}_351_1000_${train_first_day: -4}_${train_last_day: -4}
+  online_model_predict_result_path=${PREDICT_RESULT_SAVE_PATH}/${today_early_1}_351_1000_${online_model_path: -9}
+  predict_analyse_file_path=${model_local_home}/predict_analyse_file/${today_early_1}_351_1000_analyse.txt
+  calibration_file_path=${model_local_home}/${OSS_CALIBRATION_FILE_NAME}.txt
+
+  echo "init param train_data_path: ${train_data_path}"
+  echo "init param predict_date_path: ${predict_date_path}"
+  echo "init param new_model_predict_result_path: ${new_model_predict_result_path}"
+  echo "init param online_model_predict_result_path: ${online_model_predict_result_path}"
+  echo "init param model_save_path: ${model_save_path}"
+  echo "init param online_model_path: ${online_model_path}"
+  echo "init param feature_file: ${feature_file}"
+  echo "init param model_name: ${model_name}"
+  echo "init param model_local_home: ${model_local_home}"
+  echo "init param model_oss_path: ${MODEL_OSS_PATH}"
+  echo "init param predict_analyse_file_path: ${predict_analyse_file_path}"
+  echo "init param calibration_file_path: ${calibration_file_path}"
+  echo "init param current_day_of_week: ${current_day_of_week}"
+
+  echo "当前Python环境安装的Python版本: $(python --version)"
+  echo "当前Python环境安装的三方包: $(python -m pip list)"
+}
+
+# 校验大数据任务是否执行完成
+check_ad_hive() {
+  local step_start_time=$(date +%s)
+  local max_hour=05
+  local max_minute=30
+  local elapsed=0
+  while true; do
+      local python_return_code=$(python ${sh_path}/ad_utils.py --excute_program check_ad_origin_hive --partition ${today_early_1} --hh 23)
+
+      elapsed=$(($(date +%s) - ${step_start_time}))
+      if [ "${python_return_code}" -eq 0 ]; then
+          break
+      fi
+      echo "Python程序返回非0值,等待五分钟后再次调用。"
+      sleep 300
+      local current_hour=$(date +%H)
+      local current_minute=$(date +%M)
+      if (( ${current_hour} > ${max_hour} || ( ${current_hour} == ${max_hour} && ${current_minute} >= ${max_minute} ) )); then
+          local msg="大数据数据生产校验失败, 分区: ${today_early_1}"
+          echo -e "${LOG_PREFIX} -- 大数据数据生产校验 -- ${msg}: 耗时 ${elapsed}"
+          /root/anaconda3/bin/python ${sh_path}/ad_monitor_util.py --level error --msg "${msg}" --start "${start_time}" --elapsed "${elapsed}"
+          exit 1
+      fi
+  done
+  echo "${LOG_PREFIX} -- 大数据数据生产校验 -- 大数据数据生产校验通过: 耗时 ${elapsed}"
+}
+
+origin_data() {
+  (
+    source ${sh_path}/25_xgb_make_data_origin_bucket.sh
+    make_origin_data
+  )
+}
+
+bucket_feature() {
+  (
+    source ${sh_path}/25_xgb_make_data_origin_bucket.sh
+    make_bucket_feature
+  )
+}
+
+xgb_train() {
+  local step_start_time=$(date +%s)
+
+  /opt/apps/SPARK3/spark-3.3.1-hadoop3.2-1.0.5/bin/spark-class org.apache.spark.deploy.SparkSubmit \
+  --class com.tzld.piaoquan.recommend.model.train_01_xgb_ad_20240808 \
+  --master yarn --driver-memory 6G --executor-memory 10G --executor-cores 1 --num-executors 31 \
+  --conf spark.yarn.executor.memoryoverhead=2048 \
+  --conf spark.shuffle.service.enabled=true \
+  --conf spark.shuffle.service.port=7337 \
+  --conf spark.shuffle.consolidateFiles=true \
+  --conf spark.shuffle.manager=sort \
+  --conf spark.storage.memoryFraction=0.4 \
+  --conf spark.shuffle.memoryFraction=0.5 \
+  --conf spark.default.parallelism=200 \
+  /root/zhangbo/recommend-model/recommend-model-produce/target/recommend-model-produce-jar-with-dependencies.jar \
+  featureFile:20240703_ad_feature_name.txt \
+  trainPath:${train_data_path} \
+  testPath:${predict_date_path} \
+  savePath:${new_model_predict_result_path} \
+  modelPath:${model_save_path} \
+  eta:0.01 gamma:0.0 max_depth:5 num_round:1000 num_worker:30 repartition:20
+
+  local return_code=$?
+  check_run_status ${return_code} ${step_start_time} "XGB模型训练任务" "XGB模型训练失败"
+}
+
+calc_model_predict() {
+  local count=0
+  local max_line=10
+  local old_total_diff=0
+  local new_total_diff=0
+  top10_msg="| CID  | 老模型相对真实CTCVR的变化 | 新模型相对真实CTCVR的变化 |"
+  top10_msg+=" \n| ---- | --------- | -------- |"
+  while read -r line && [ ${count} -lt ${max_line} ]; do
+
+      # 跳过包含 "cid" 的表头行,只处理数据行
+      if [[ "${line}" == *"cid"* ]]; then
+          continue
+      fi
+
+      read -a numbers <<< "${line}"
+
+      # 分数分别保存
+      real_score_map[${numbers[0]}]=${numbers[3]}
+      old_score_map[${numbers[0]}]=${numbers[6]}
+      new_score_map[${numbers[0]}]=${numbers[7]}
+
+      # 拼接Top10详情的飞书消息
+      top10_msg="${top10_msg} \n| ${numbers[0]} | ${numbers[6]} | ${numbers[7]} | "
+
+      # 计算top10相对误差绝对值的均值
+      old_abs_score=$( echo "${numbers[6]} * ((${numbers[6]} >= 0) * 2 - 1)" | bc -l )
+      new_abs_score=$( echo "${numbers[7]} * ((${numbers[7]} >= 0) * 2 - 1)" | bc -l )
+
+      old_total_diff=$( echo "${old_total_diff} + ${old_abs_score}" | bc -l )
+      new_total_diff=$( echo "${new_total_diff} + ${new_abs_score}" | bc -l )
+
+      count=$((${count} + 1))
+
+  done < "${predict_analyse_file_path}"
+
+  local return_code=$?
+  check_run_status ${return_code} ${step_start_time} "计算Top10差异" "计算Top10差异异常"
+
+  old_incr_rate_avg=$( echo "scale=6; ${old_total_diff} / ${count}" | bc -l )
+  check_run_status $? ${step_start_time} "计算老模型Top10差异" "计算老模型Top10差异异常"
+
+
+  new_incr_rate_avg=$( echo "scale=6; ${new_total_diff} / ${count}" | bc -l )
+  check_run_status $? ${step_start_time} "计算新模型Top10差异" "计算新模型Top10差异异常"
+
+  echo "老模型Top10差异平均值: ${old_incr_rate_avg}"
+  echo "新模型Top10差异平均值: ${new_incr_rate_avg}"
+  echo "新老模型分数对比: "
+  for cid in "${!new_score_map[@]}"; do
+    echo "\t CID: $cid, 老模型分数: ${old_score_map[$cid]}, 新模型分数: ${new_score_map[$cid]}"
+  done
+}
+
+calc_auc() {
+  old_auc=`cat ${PREDICT_CACHE_PATH}/old_1.txt | /root/sunmingze/AUC/AUC`
+  new_auc=`cat ${PREDICT_CACHE_PATH}/new_1.txt | /root/sunmingze/AUC/AUC`
+}
+
+model_predict() {
+
+  # 线上模型评估最新的数据
+  local step_start_time=$(date +%s)
+  /opt/apps/SPARK3/spark-3.3.1-hadoop3.2-1.0.5/bin/spark-class org.apache.spark.deploy.SparkSubmit \
+  --class com.tzld.piaoquan.recommend.model.pred_01_xgb_ad_hdfsfile_20240813 \
+  --master yarn --driver-memory 1G --executor-memory 3G --executor-cores 1 --num-executors 30 \
+  --conf spark.yarn.executor.memoryoverhead=1024 \
+  --conf spark.shuffle.service.enabled=true \
+  --conf spark.shuffle.service.port=7337 \
+  --conf spark.shuffle.consolidateFiles=true \
+  --conf spark.shuffle.manager=sort \
+  --conf spark.storage.memoryFraction=0.4 \
+  --conf spark.shuffle.memoryFraction=0.5 \
+  --conf spark.default.parallelism=200 \
+  /root/zhangbo/recommend-model/recommend-model-produce/target/recommend-model-produce-jar-with-dependencies.jar \
+  featureFile:20240703_ad_feature_name.txt \
+  testPath:${predict_date_path} \
+  savePath:${online_model_predict_result_path} \
+  modelPath:${online_model_path}
+
+  local return_code=$?
+  check_run_status ${return_code} ${step_start_time} "线上模型评估${predict_date_path: -8}的数据" "线上模型评估${predict_date_path: -8}的数据失败"
+
+  # 结果分析
+  local python_return_code=$(python ${sh_path}/model_predict_analyse.py -op ${online_model_predict_result_path} -np ${new_model_predict_result_path} -af ${predict_analyse_file_path} -cf ${calibration_file_path})
+  check_run_status ${python_return_code} ${step_start_time} "分析线上模型评估${predict_date_path: -8}的数据" "分析线上模型评估${predict_date_path: -8}的数据失败"
+
+  calc_model_predict
+
+  calc_auc
+
+  if (( $(echo "${new_incr_rate_avg} > 0.100000" | bc -l ) ));then 
+    echo "线上模型评估${predict_date_path: -8}的数据,绝对误差大于0.1,请检查"
+    check_run_status 1 ${step_start_time} "${predict_date_path: -8}的数据,绝对误差大于0.1" "线上模型评估${predict_date_path: -8}的数据,绝对误差大于0.1,请检查"
+    exit 1
+  fi 
+
+
+  # 对比两个模型的差异
+  score_diff=$( echo "${new_incr_rate_avg} - ${old_incr_rate_avg}" | bc -l )
+  if (( $(echo "${score_diff} > 0.050000" | bc -l ) ));then 
+    echo "两个模型评估${predict_date_path: -8}的数据,两个模型分数差异为: ${score_diff}, 大于0.05, 请检查"
+    check_run_status 1 ${step_start_time} "两个模型评估${predict_date_path: -8}的数据" "两个模型评估${predict_date_path: -8}的数据,两个模型分数差异为: ${score_diff}, 大于0.05"
+    exit 1
+  fi 
+
+}
+
+model_upload_oss() {
+  local step_start_time=$(date +%s)
+
+  (
+    cd ${model_local_home}
+
+    ${HADOOP} fs -get ${model_save_path} ${model_name}
+    if [ ! -d ${model_name} ]; then
+      echo "从HDFS下载模型失败"
+      check_run_status 1 ${step_start_time} "HDFS下载模型任务" "HDFS下载模型失败" 
+      exit 1 
+    fi
+
+    tar -czvf ${model_name}.tar.gz -C ${model_name} .
+
+    rm -rf ${model_name}.tar.gz.crc
+
+    # 从OSS中移除模型文件和校准文件
+    ${HADOOP} fs -rm -r -skipTrash ${MODEL_OSS_PATH}/${model_name}.tar.gz ${MODEL_OSS_PATH}/${OSS_CALIBRATION_FILE_NAME}.txt
+    
+    # 将模型文件和校准文件推送到OSS上
+    ${HADOOP} fs -put ${model_name}.tar.gz ${OSS_CALIBRATION_FILE_NAME}.txt ${MODEL_OSS_PATH}
+    local return_code=$?
+    check_run_status ${return_code} ${step_start_time} "模型上传OSS任务" "模型上传OSS失败"
+
+    echo ${model_save_path} > ${model_path_file}
+
+    # 清理本地临时文件
+    rm -f ./${model_name}.tar.gz
+    rm -rf ./${model_name}
+    rm -rf ${OSS_CALIBRATION_FILE_NAME}.txt
+  )
+
+  local return_code=$?
+  check_run_status ${return_code} ${step_start_time} "模型上传OSS任务" "模型上传OSS失败"
+
+  local step_end_time=$(date +%s)
+  local elapsed=$((${step_end_time} - ${start_time}))
+  echo -e "${LOG_PREFIX} -- 模型更新完成 -- 模型更新成功: 耗时 ${elapsed}"
+  
+  send_success_upload_msg
+}
+
+# 主方法
+main() {
+  init
+
+  check_ad_hive
+
+  origin_data
+
+  bucket_feature
+
+  if [ "${current_day_of_week}" -eq 1 ] || [ "${current_day_of_week}" -eq 3 ] || [ "${current_day_of_week}" -eq 5 ]; then
+    echo "当前是周一,周三或周五,开始训练并更新模型"
+    
+    xgb_train
+
+    model_predict
+
+    model_upload_oss
+  else
+    echo "当前是周一,周三或周五,不更新模型"
+  fi 
+
+}
+
+
+main
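Note: a hedged scheduling sketch for this update script (the cron time and log path are assumptions, not taken from this commit; the comment style follows the cron line in recommend/21_make_data_new_table.sh):

    # 0 1 * * * cd /root/zhaohp/recommend-emr-dataprocess && /bin/sh ./ad/01_ad_model_update.sh > logs/ad/01_ad_model_update/$(date +\%Y\%m\%d\%H\%M).log 2>&1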

+ 21 - 0
ad/02_ad_model_update_test.sh

@@ -0,0 +1,21 @@
+#!/bin/sh
+set -x
+
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+export PREDICT_CACHE_PATH=/root/zhaohp/XGB/test/predict_cache/
+export SEGMENT_BASE_PATH=/root/zhaohp/XGB/test/predict_analyse_file/
+
+
+sh_path=$(cd $(dirname $0); pwd)
+source ${sh_path}/00_common.sh
+
+online_model_predict_result_path=/dw/recommend/model/34_ad_predict_data/20241110_351_1000_1031_1106
+new_model_predict_result_path=/dw/recommend/model/34_ad_predict_data/20241110_351_1000_1103_1109
+predict_analyse_file_path=/root/zhaohp/XGB/test/predict_analyse_file/20241110_351_1000_analyse.txt
+calibration_file_path=/root/zhaohp/XGB/test/model_xgb_351_1000_v2_calibration.txt
+
+
+python_return_code=$(python ${sh_path}/model_predict_analyse.py -op ${online_model_predict_result_path} -np ${new_model_predict_result_path} -af ${predict_analyse_file_path} -cf ${calibration_file_path})
+echo "${python_return_code}"

+ 71 - 0
ad/21_ad_model_add_dt_train_predict_auc.sh

@@ -0,0 +1,71 @@
+#!/bin/sh
+
+# 指定基础模型,模型增量训练,预测,计算AUC脚本
+
+set -x
+
+begin_date=$1
+end_date=$2
+model_name=$3
+train_dim=$4
+predict_dim=$5
+
+PROJECT_HOME=/root/zhaohp/recommend-emr-dataprocess
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+HDFS_TRAIN_DATE_PATH=/dw/recommend/model/33_ad_train_data_v4
+MODEL_PATH=${PROJECT_HOME}/model
+PREDICT_PATH=${PROJECT_HOME}/predict
+
+FM_TRAIN=/root/sunmingze/alphaFM/bin/fm_train
+FM_PREDICT=/root/sunmingze/alphaFM/bin/fm_predict
+
+train_date=$begin_date
+
+# 计算模型的AUC,从训练日期的后一天到参数的end_date
+predict_auc() {
+    echo -e "\t==================== 开始预测 $train_date 模型 ===================="
+
+    predict_date=$(date -d "$train_date +1 day" +%Y%m%d)
+    predict_end_date=$(date -d "$end_date +1 day" +%Y%m%d)
+    while [ "$predict_date" != "$predict_end_date" ]; do
+
+        $HADOOP fs -text ${HDFS_TRAIN_DATE_PATH}/${predict_date}/* | ${FM_PREDICT} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${predict_dim} -core 8 -out ${PREDICT_PATH}/${model_name}_${train_date}_${predict_date}.txt
+        auc=`cat ${PREDICT_PATH}/${model_name}_${train_date}_${predict_date}.txt | /root/sunmingze/AUC/AUC`
+
+        echo "模型训练日期: ${train_date}, 模型预测日期: ${predict_date}, AUC: ${auc}, 模型路径: ${MODEL_PATH}/${model_name}_${train_date}.txt"
+
+        predict_date=$(date -d "$predict_date +1 day" +%Y%m%d)
+
+    done
+
+    echo -e "\n\t==================== 预测 $train_date 模型结束 ===================="
+
+}
+main() {
+
+    # 增量训练模型
+    while [ "$train_date" != "$end_date" ]; do
+        echo "==================== 开始训练 $train_date 模型 ===================="
+
+        # 模型训练
+        yesterday=$(date -d "$train_date -1 day" +%Y%m%d)
+
+        input_model=${MODEL_PATH}/${model_name}_${yesterday}.txt
+        if [ ! -e "${input_model}" ]; then
+            echo "输入模型: ${input_model} 不存在,退出"
+            exit 1
+        fi
+
+        $HADOOP fs -text ${HDFS_TRAIN_DATE_PATH}/${train_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${train_dim} -core 8 -im ${input_model}
+
+        predict_auc
+
+        train_date=$(date -d "$train_date +1 day" +%Y%m%d)
+
+        echo "==================== 训练 $train_date 模型结束 ===================="
+        echo -e "\n\n\n\n\n\n"
+    done
+
+}
+
+main
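Note: a hedged invocation sketch in the style of the nohup comment in ad/22_ad_model_predict_auc.sh (dates, model name and dim values are placeholders; positional arguments are begin_date, end_date, model_name, train_dim, predict_dim):

    # nohup ./ad/21_ad_model_add_dt_train_predict_auc.sh 20240712 20240717 model_bkb8_v4 1,1,8 8 > logs/21_ad_model_add_dt_train_predict_auc.log 2>&1 &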

+ 60 - 0
ad/22_ad_model_predict_auc.sh

@@ -0,0 +1,60 @@
+#!/bin/sh
+
+# 训练新模型,并使用后面的数据计算AUC,评估模型效果
+
+set -x
+
+begin_date=$1
+end_date=$2
+model_name=$3
+predict_dim=$4
+
+PROJECT_HOME=/root/zhaohp/20240723
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+HDFS_TRAIN_DATE_PATH=/dw/recommend/model/33_ad_train_data_v4_idn1
+MODEL_PATH=${PROJECT_HOME}/model
+PREDICT_PATH=${PROJECT_HOME}/predict
+
+FM_TRAIN=/root/sunmingze/alphaFM/bin/fm_train
+FM_PREDICT=/root/sunmingze/alphaFM/bin/fm_predict
+
+train_date=$begin_date
+
+# 计算模型的AUC,从训练日期的后一天到参数的end_date
+predict_auc() {
+    echo -e "\t==================== 开始预测 $train_date 模型 ===================="
+
+    predict_date=$(date -d "$train_date +1 day" +%Y%m%d)
+    predict_end_date=$(date -d "$end_date +1 day" +%Y%m%d)
+    while [ "$predict_date" != "$predict_end_date" ]; do
+
+        $HADOOP fs -text ${HDFS_TRAIN_DATE_PATH}/${predict_date}/* | ${FM_PREDICT} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${predict_dim} -core 8 -out ${PREDICT_PATH}/${model_name}_${train_date}.txt
+        auc=`cat ${PREDICT_PATH}/${model_name}_${train_date}.txt | /root/sunmingze/AUC/AUC`
+
+        echo "模型训练日期: ${train_date}, 模型预测日期: ${predict_date}, AUC: ${auc}, 模型路径: ${MODEL_PATH}/${model_name}_${train_date}.txt"
+
+        predict_date=$(date -d "$predict_date +1 day" +%Y%m%d)
+
+    done
+
+    echo -e "\n\t==================== 预测 $train_date 模型结束 ===================="
+
+}
+main() {
+
+    # 增量训练模型
+    while [ "$train_date" != "$end_date" ]; do
+        echo "==================== 开始训练 $train_date 模型 ===================="
+
+        predict_auc
+
+        echo -e "==================== 训练 $train_date 模型结束 ==================== \n\n\n\n\n\n"
+        train_date=$(date -d "$train_date +1 day" +%Y%m%d)
+    done
+
+}
+
+main
+
+
+# nohup ./22_ad_model_predict_auc.sh 20240712 20240717 model_bkb8_v4_idn1 8  > logs/22_ad_model_predict_auc.log 2>&1 &

+ 29 - 0
ad/23_ad_model_batch_calc_cid_score_avg.sh

@@ -0,0 +1,29 @@
+#!/bin/sh
+
+# 计算模型对某天,某个CID的打分情况,输出平均值
+
+set -x
+
+cids=$1
+model=$2
+hdfs_path=$3
+bias=$4
+
+MODEL_PATH=/root/zhaohp/recommend-emr-dataprocess/model/ad
+PREDICT_PATH=/root/zhaohp/recommend-emr-dataprocess/predict/ad
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+FM_HOME=/root/sunmingze/alphaFM
+
+# 将cids中的逗号分隔列表拆分为数组
+IFS=',' read -ra cid_array <<< "$cids"
+
+for cid in "${cid_array[@]}"; do
+    # 对每个CID执行打分计算并输出平均值
+    $HADOOP fs -text ${hdfs_path}/* | grep "cid_${cid}" | ${FM_HOME}/bin/fm_predict -m ${MODEL_PATH}/${model}.txt -dim ${bias} -core 8 -out ${PREDICT_PATH}/${model}_${cid}.txt
+
+    score_avg=`awk '{ sum += $2; count++ } END { if (count > 0) print sum / count }' ${PREDICT_PATH}/${model}_${cid}.txt`
+
+    echo -e "CID- ${cid} -平均分计算结果: ${score_avg} \n\t模型: ${MODEL_PATH}/${model} \n\tHDFS数据路径: ${hdfs_path} \n\t"
+done
+
+# nohup ./ad/23_ad_model_batch_calc_cid_score_avg.sh 3024,2966,2670,3163,3595,3594,3364,3365,3593,3363,3180,1910,2660,3478,3431,3772,3060,3178,3056,3771,3208,3041,2910,3690,1626,3318,3357,3628,3766,3770,3763,3769,3768,3541,3534,2806,3755,3760,3319,3758,3746,3759,3747,3754,3767,3745,3756,3437,3608,3527,3691,3197,3361,3362,3212,3344,3343,3346,3345,3612,3540,3526,3611,3761,3617,3762,3618,3616,3623,3765,3624,3764,3198,3542,3353,2374,3200 model_bkb8_v55_20240804 /dw/recommend/model/33_ad_train_data_v4/20240806 8 > logs/model_bkb8_v55_20240804_cid_06_12.log 2>&1 &

+ 99 - 0
ad/24_supplementary_data.sh

@@ -0,0 +1,99 @@
+#!/bin/sh
+set -x
+
+# 广告补数据脚本,修改{today_early_1}补单天的数据
+
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+sh_path=$(cd $(dirname $0); pwd)
+source ${sh_path}/00_common.sh
+
+source /root/anaconda3/bin/activate py37
+
+
+# 全局常量
+LOG_PREFIX=广告模型训练任务
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+TRAIN_PATH=/dw/recommend/model/31_ad_sample_data_v4
+BUCKET_FEATURE_PATH=/dw/recommend/model/33_ad_train_data_v4
+TABLE=alg_recsys_ad_sample_all
+
+# 任务开始时间
+start_time=$(date +%s)
+# 前一天
+today_early_1="$(date -d '1 days ago' +%Y%m%d)"
+
+# 校验命令的退出码
+check_run_status() {
+    local status=$1
+    local step_start_time=$2
+    local step_name=$3
+    local msg=$4
+
+    local step_end_time=$(date +%s)
+    local step_elapsed=$((${step_end_time} - ${step_start_time}))
+
+    if [ ${status} -ne 0 ]; then
+        echo "${LOG_PREFIX} -- ${step_name}失败: 耗时 ${step_elapsed}"
+        local elapsed=$((${step_end_time} - ${start_time}))
+        /root/anaconda3/bin/python ${sh_path}/ad_monitor_util.py --level error --msg "${msg}" --start "${start_time}" --elapsed "${elapsed}" --top10 "${top10_msg}"
+        exit 1
+    else
+        echo "${LOG_PREFIX} -- ${step_name}成功: 耗时 ${step_elapsed}"
+    fi
+}
+
+# 校验大数据任务是否执行完成
+check_ad_hive() {
+  local step_start_time=$(date +%s)
+  local max_hour=05
+  local max_minute=30
+  local elapsed=0
+  while true; do
+      local python_return_code=$(python ${sh_path}/ad_utils.py --excute_program check_ad_origin_hive --partition ${today_early_1} --hh 23)
+
+      elapsed=$(($(date +%s) - ${step_start_time}))
+      if [ "${python_return_code}" -eq 0 ]; then
+          break
+      fi
+      echo "Python程序返回非0值,等待五分钟后再次调用。"
+      sleep 300
+      local current_hour=$(date +%H)
+      local current_minute=$(date +%M)
+      if (( ${current_hour} > ${max_hour} || ( ${current_hour} == ${max_hour} && ${current_minute} >= ${max_minute} ) )); then
+          local msg="大数据数据生产校验失败, 分区: ${today_early_1}"
+          echo -e "${LOG_PREFIX} -- 大数据数据生产校验 -- ${msg}: 耗时 ${elapsed}"
+          /root/anaconda3/bin/python ${sh_path}/ad_monitor_util.py --level error --msg "${msg}" --start "${start_time}" --elapsed "${elapsed}"
+          exit 1
+      fi
+  done
+  echo "${LOG_PREFIX} -- 大数据数据生产校验 -- 大数据数据生产校验通过: 耗时 $elapsed"
+}
+
+origin_data() {
+  (
+    source ${sh_path}/25_xgb_make_data_origin_bucket.sh
+    make_origin_data
+  )
+}
+
+bucket_feature() {
+  (
+    source ${sh_path}/25_xgb_make_data_origin_bucket.sh
+    make_bucket_feature
+  )
+}
+
+# 主方法
+main() {
+  check_ad_hive
+
+  origin_data
+
+  bucket_feature
+}
+
+
+main

+ 87 - 0
ad/25_xgb_make_data_origin_bucket.sh

@@ -0,0 +1,87 @@
+#!/bin/sh
+set -x
+
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+
+
+sh_path=$(dirname $0)
+source ${sh_path}/00_common.sh
+
+source /root/anaconda3/bin/activate py37
+
+make_origin_data() {
+  
+  local step_start_time=$(date +%s)
+
+  /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+  --class com.aliyun.odps.spark.examples.makedata_ad.v20240718.makedata_ad_31_originData_20240718 \
+  --master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+  ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+  tablePart:64 repartition:32 \
+  beginStr:${today_early_1}00 endStr:${today_early_1}12 \
+  savePath:${TRAIN_PATH} \
+  table:${TABLE} \
+  filterHours:00,01,02,03,04,05,06,07 \
+  idDefaultValue:0.1 &
+  local task1=$!
+
+  /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+  --class com.aliyun.odps.spark.examples.makedata_ad.v20240718.makedata_ad_31_originData_20240718 \
+  --master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+  ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+  tablePart:64 repartition:32 \
+  beginStr:${today_early_1}13 endStr:${today_early_1}18 \
+  savePath:${TRAIN_PATH} \
+  table:${TABLE} \
+  filterHours:00,01,02,03,04,05,06,07 \
+  idDefaultValue:0.1 &
+  local task2=$!
+
+  /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+  --class com.aliyun.odps.spark.examples.makedata_ad.v20240718.makedata_ad_31_originData_20240718 \
+  --master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+  ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+  tablePart:64 repartition:32 \
+  beginStr:${today_early_1}19 endStr:${today_early_1}23 \
+  savePath:${TRAIN_PATH} \
+  table:${TABLE} \
+  filterHours:00,01,02,03,04,05,06,07 \
+  idDefaultValue:0.1 &
+  local task3=$!
+
+  wait ${task1}
+  local task1_return_code=$?
+
+  wait ${task2}
+  local task2_return_code=$?
+
+  wait ${task3}
+  local task3_return_code=$?
+
+
+  check_run_status ${task1_return_code} ${step_start_time} "spark原始样本生产任务: 生产00~12数据异常"
+  check_run_status ${task2_return_code} ${step_start_time} "spark原始样本生产任务: 生产13~18数据异常"
+  check_run_status ${task3_return_code} ${step_start_time} "spark原始样本生产任务: 生产19~23数据异常"
+}
+
+
+
+make_bucket_feature() {
+
+  local step_start_time=$(date +%s)
+  
+  /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+  --class com.aliyun.odps.spark.examples.makedata_ad.v20240718.makedata_ad_33_bucketData_20240718 \
+  --master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+  ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+  beginStr:${today_early_1} endStr:${today_early_1} repartition:100 \
+  filterNames:_4h_,_5h_,adid_,targeting_conversion_ \
+  readPath:${TRAIN_PATH} \
+  savePath:${BUCKET_FEATURE_PATH}
+
+  local return_code=$?
+  check_run_status ${return_code} ${step_start_time} "spark特征分桶任务"
+}

+ 75 - 0
ad/30_delete_timer_file.sh

@@ -0,0 +1,75 @@
+#!/bin/sh
+
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+
+PREDICT_HOME=/root/zhaohp/recommend-emr-dataprocess/predict
+origin_data_hdfs_dir=/dw/recommend/model/31_ad_sample_data_v3_auto
+bucket_feature_hdfs_dir=/dw/recommend/model/33_ad_train_data_v3_auto
+
+
+# 删除五天之前的预测结果文件
+delete_predict_5d_ago() {
+
+    echo "=========== 开始删除五天前的预测结果文件 $(date "+%Y-%m-%d %H:%M:%d") ==========="
+
+    tmp_file_name=./files_to_delete.txt
+
+    # 查询五天前的预测结果文件,并保存到临时文件
+    find "$PREDICT_HOME" -type f -mtime +5 > "${tmp_file_name}"
+
+    # 逐行读取临时文件中的路径并删除文件
+    while IFS= read -r file; do
+        echo "Deleting: $file"
+        rm -f "$file"
+    done < "${tmp_file_name}"
+
+    # 删除临时文件
+    rm -f "${tmp_file_name}"
+
+    echo "=========== 删除五天前的预测结果文件结束 $(date "+%Y-%m-%d %H:%M:%d") ==========="
+}
+
+# 删除HDFS中的目录
+delete_hdfs_path() {
+    if [ "$#" -ne 2 ]; then
+        echo "Usage: delete_path <early> <path>"
+        return 1
+    fi
+
+    early=$1
+    path=$2
+
+    echo "=========== $(date "+%Y-%m-%d %H:%M:%d") 开始删除目录 ${path}下 ${early}天前的文件  ==========="
+
+    EARLY_DAYS_AGO=$(date -d "${early} days ago" +%Y-%m-%d)
+
+    $HADOOP fs -ls $path | grep '^d' | while read line;
+    do
+        dir=$(echo $line | awk '{print $8}')
+        modified_date=$(echo $line | awk '{print $6}')
+        echo "${line}"
+        if [[ "${modified_date}" < "${EARLY_DAYS_AGO}" ]]; then
+            echo "Deleting: ${dir}"
+            $HADOOP fs -rm -r -skipTrash ${dir}
+        fi
+
+    done
+
+    echo "=========== $(date "+%Y-%m-%d %H:%M:%d") 删除目录 ${path}下 ${early}天前的文件结束  ==========="
+
+}
+
+
+main() {
+    # 删除五天前的预测结果文件
+    delete_predict_5d_ago
+    # 删除七天之前的HDFS中的特征原始数据
+    delete_hdfs_path 7 $origin_data_hdfs_dir
+    # 删除七天之前的HDFS中的特征分桶数据
+    delete_hdfs_path 7 $bucket_feature_hdfs_dir
+}
+
+
+main
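Note: a hedged scheduling sketch for this cleanup script (cron time and log path are assumptions, not taken from this commit):

    # 0 9 * * * cd /root/zhaohp/recommend-emr-dataprocess && /bin/sh ./ad/30_delete_timer_file.sh > logs/ad/30_delete_timer_file/$(date +\%Y\%m\%d\%H\%M).log 2>&1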

+ 141 - 0
ad/ad_monitor_util.py

@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+import argparse
+import json
+
+import pytz
+import requests
+
+from datetime import datetime
+
+server_robot = {
+    'webhook': 'https://open.feishu.cn/open-apis/bot/v2/hook/926982f5-e7af-40f5-81fd-27d8f42718e4',
+}
+
+level_header_template_map = {
+    "info": "turquoise",
+    "error": "red",
+    "warn": "yellow"
+}
+
+level_header_title_content_map = {
+    "info": "广告模型自动更新通知",
+    "error": "广告模型自动更新告警",
+    "warn": "广告模型自动更新告警"
+}
+
+level_task_status_map = {
+    "info": "任务执行成功",
+    "error": "任务执行失败",
+    "warn": "任务执行失败",
+}
+
+
+def send_card_msg_to_feishu(webhook, card_json):
+    """发送消息到飞书"""
+    headers = {'Content-Type': 'application/json'}
+    payload_message = {
+        "msg_type": "interactive",
+        "card": card_json
+    }
+    print(f"推送飞书消息内容: {json.dumps(payload_message)}")
+    response = requests.request('POST', url=webhook, headers=headers, data=json.dumps(payload_message))
+    print(response.text)
+
+
+def timestamp_format(timestamp: str) -> str:
+    try:
+        return (datetime.utcfromtimestamp(int(timestamp))
+                .replace(tzinfo=pytz.UTC)
+                .astimezone(pytz.timezone('Asia/Shanghai'))
+                .strftime('%Y-%m-%d %H:%M:%S')
+                )
+    except ValueError as e:
+        return timestamp
+
+
+def seconds_convert(seconds):
+    hours = seconds // 3600
+    minutes = (seconds % 3600) // 60
+    seconds = seconds % 60
+    return f"{hours}小时 {minutes}分钟 {seconds}秒"
+
+
+def _monitor(level, msg: str, start, elapsed, top10):
+    """消息推送"""
+    """消息推送"""
+    now = datetime.now()
+    msg = msg.replace("\\n", "\n").replace("\\t", "\t")
+    mgs_text = f"- 当前时间: {now.strftime('%Y-%m-%d %H:%M:%S')}" \
+               f"\n- 任务开始时间: {timestamp_format(start)}" \
+               f"\n- 任务状态: {level_task_status_map[level]}" \
+               f"\n- 任务耗时: {seconds_convert(elapsed)}" \
+               f"\n- 任务描述: {msg}"
+    card_json = {
+        "schema": "2.0",
+        "header": {
+            "title": {
+                "tag": "plain_text",
+                "content": level_header_title_content_map[level]
+            },
+            "template": level_header_template_map[level]
+        },
+        "body": {
+            "elements": [
+                {
+                    "tag": "markdown",
+                    "content": mgs_text,
+                    "text_align": "left",
+                    "text_size": "normal",
+                    "element_id": "overview"
+                }
+            ]
+        }
+    }
+    if top10 is not None and len(top10) > 0:
+        collapsible_panel = {
+            "tag": "collapsible_panel",
+            "header": {
+                "title": {
+                    "tag": "markdown",
+                    "content": "**Top10差异详情**"
+                },
+                "vertical_align": "center",
+                "padding": "4px 0px 4px 8px"
+            },
+            "border": {
+                "color": "grey",
+                "corner_radius": "5px"
+            },
+            "element_id": "detail",
+            "elements": [
+                {
+                    "tag": "markdown",
+                    "content": top10.replace("\\n", "\n").replace("\\t", "\t"),
+                    "element_id": "Top10CID"
+                }
+            ]
+        }
+        card_json['body']['elements'].append(collapsible_panel)
+
+    send_card_msg_to_feishu(
+        webhook=server_robot.get('webhook'),
+        card_json=card_json
+    )
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='告警Utils')
+    parser.add_argument('--level', type=str, help='通知级别, info, warn, error', required=True)
+    parser.add_argument('--msg', type=str, help='消息', required=True)
+    parser.add_argument('--start', type=str, help='任务开始时间', required=True)
+    parser.add_argument('--elapsed', type=int, help='任务耗时【秒】', required=True)
+    parser.add_argument("--top10", type=str, help='Top10打分详情', required=False)
+    args = parser.parse_args()
+
+    _monitor(
+        level=args.level,
+        msg=args.msg,
+        start=args.start,
+        elapsed=args.elapsed,
+        top10=args.top10
+    )
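Note: a minimal invocation sketch of this monitor, mirroring the call in ad/01_ad_model_update.sh (the message, elapsed value and top10 text are placeholders):

    start=$(date +%s)
    /root/anaconda3/bin/python ./ad/ad_monitor_util.py \
        --level info \
        --msg "广告模型文件更新完成" \
        --start "${start}" \
        --elapsed 3600 \
        --top10 "| CID | 老模型相对真实CTCVR的变化 | 新模型相对真实CTCVR的变化 |"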

+ 64 - 0
ad/ad_utils.py

@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+from odps import ODPS
+import argparse
+
+ODPS_CONFIG = {
+    'ENDPOINT': 'http://service.cn.maxcompute.aliyun.com/api',
+    'ACCESSID': 'LTAIWYUujJAm7CbH',
+    'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
+}
+
+
+def check_data_hh(project, table, partition, hh) -> int:
+    """检查数据是否准备好,输出数据条数"""
+    odps = ODPS(
+        access_id=ODPS_CONFIG['ACCESSID'],
+        secret_access_key=ODPS_CONFIG['ACCESSKEY'],
+        project=project,
+        endpoint=ODPS_CONFIG['ENDPOINT'],
+        connect_timeout=3000,
+        read_timeout=500000,
+        pool_maxsize=1000,
+        pool_connections=1000
+    )
+    try:
+        t = odps.get_table(name=table)
+        check_res = t.exist_partition(partition_spec=f'dt={partition},hh={hh}')
+        if check_res:
+            sql = f'select * from {project}.{table} where dt = {partition}'
+            with odps.execute_sql(sql=sql).open_reader() as reader:
+                data_count = reader.count
+        else:
+            data_count = 0
+    except Exception as e:
+        print("error:" + str(e))
+        data_count = 0
+    return data_count
+
+
+def check_ad_origin_hive(args):
+    project = "loghubods"
+    table = "alg_recsys_ad_sample_all"
+    partition = args.partition
+    hh = args.hh
+    count = check_data_hh(project, table, partition, hh)
+    if count == 0:
+        print("1")
+        exit(1)
+    else:
+        print("0")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='脚本utils')
+    parser.add_argument('--excute_program', type=str, help='执行程序')
+    parser.add_argument('--partition', type=str, help='表分区')
+    parser.add_argument('--hh', type=str, help='小时级分区时的小时')
+    parser.add_argument('--project', type=str, help='表空间')
+    parser.add_argument('--table', type=str, help='表名')
+    args = parser.parse_args()
+    if args.excute_program == "check_ad_origin_hive":
+        check_ad_origin_hive(args)
+    else:
+        print("无合法参数,验证失败。")
+        exit(999)
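Note: a minimal invocation sketch, mirroring the call in ad/01_ad_model_update.sh (the partition is a placeholder; the script prints "0" when the dt/hh partition exists and has rows, otherwise prints "1" and exits non-zero):

    ret=$(python ./ad/ad_utils.py --excute_program check_ad_origin_hive --partition 20240801 --hh 23)
    if [ "${ret}" -eq 0 ]; then
        echo "partition data is ready"
    fi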

+ 53 - 0
ad/holidays.txt

@@ -0,0 +1,53 @@
+国庆节
+20241001
+2024-10-01
+重阳节
+20241011
+2024-10-11
+样本有问题
+20241112
+2024-11-12
+20241113
+2024-11-13
+圣诞节
+20241225
+2024-12-25
+元旦
+20250101
+2025-01-01
+春节
+20250129
+2025-01-29
+元宵节
+20250215
+2025-02-15
+妇女节
+20250308
+2025-03-08
+劳动节
+20250501
+2025-05-01
+青年节
+20250504
+2025-05-04
+端午节
+20250531
+2025-05-31
+儿童节
+20250601
+2025-06-01
+建党节
+20250701
+2025-07-01
+建军节
+20250801
+2025-08-01
+七夕节
+20250829
+2025-08-29
+2025国庆节
+20251001
+2025-10-01
+中秋节
+20251006
+2025-10-06

+ 198 - 0
ad/model_predict_analyse.py

@@ -0,0 +1,198 @@
+import argparse
+import gzip
+import os.path
+from collections import OrderedDict
+
+import pandas as pd
+from hdfs import InsecureClient
+
+client = InsecureClient("http://master-1-1.c-7f31a3eea195cb73.cn-hangzhou.emr.aliyuncs.com:9870", user="spark")
+
+SEGMENT_BASE_PATH = os.environ.get("SEGMENT_BASE_PATH", "/dw/recommend/model/36_score_calibration_file")
+PREDICT_CACHE_PATH = os.environ.get("PREDICT_CACHE_PATH", "/root/zhaohp/XGB/predict_cache")
+
+
+def read_predict_from_local_txt(txt_file) -> list:
+    result = []
+    with open(txt_file, "r") as f:
+        for line in f.readlines():
+            sp = line.replace("\n", "").split("\t")
+            if len(sp) == 4:
+                label = int(sp[0])
+                cid = sp[3].split("_")[0]
+                score = float(sp[2].replace("[", "").replace("]", "").split(",")[1])
+                result.append({
+                    "label": label,
+                    "cid": cid,
+                    "score": score
+                })
+    return result
+
+
+def read_predict_from_hdfs(hdfs_path: str) -> list:
+    if not hdfs_path.endswith("/"):
+        hdfs_path += "/"
+    result = []
+    for file in client.list(hdfs_path):
+        with client.read(hdfs_path + file) as reader:
+            with gzip.GzipFile(fileobj=reader, mode="rb") as gz_file:
+                for line in gz_file.read().decode("utf-8").split("\n"):
+                    split = line.split("\t")
+                    if len(split) == 4:
+                        cid = split[3].split("_")[0]
+                        label = int(split[0])
+                        score = float(split[2].replace("[", "").replace("]", "").split(",")[1])
+                        result.append({
+                            "cid": cid,
+                            "label": label,
+                            "score": score
+                        })
+
+    return result
+
+
+def _segment_v1(scores, step):
+    bins = []
+    for i in range(0, len(scores), int((len(scores) / step))):
+        if i == 0:
+            bins.append(0)
+        else:
+            bins.append(scores[i])
+    bins.append(1)
+    return list(OrderedDict.fromkeys(bins))
+
+
+def segment_calc_diff_rate_by_score(df: pd.DataFrame, segment_file_path: str, step=100) -> [pd.DataFrame, pd.DataFrame]:
+    sored_df = df.sort_values(by=['score'])
+    # 评估分数分段
+    scores = sored_df['score'].values
+
+    bins = _segment_v1(scores, step)
+
+    # 等分分桶
+    # split_indices = np.array_split(np.arange(len(scores)), step)
+    # bins = [scores[index[0]] for index in split_indices] + [scores[split_indices[-1][-1]]]
+
+    sored_df['score_segment'] = pd.cut(sored_df['score'], bins=bins)
+
+    # 计算分段内分数的差异
+    group_df = sored_df.groupby("score_segment", observed=True).agg(
+        segment_label_sum=('label', 'sum'),
+        segment_label_cnt=('label', 'count'),
+        segment_score_avg=('score', 'mean'),
+    ).reset_index()
+    group_df['segment_true_score'] = group_df['segment_label_sum'] / group_df['segment_label_cnt']
+    group_df['segment_diff_rate'] = (group_df['segment_score_avg'] / group_df['segment_true_score'] - 1).mask(group_df['segment_true_score'] == 0, 0)
+
+    # 完整的分段文件保存
+    csv_data = group_df.to_csv(sep="\t", index=False)
+    with client.write(segment_file_path, encoding='utf-8', overwrite=True) as writer:
+        writer.write(csv_data)
+
+    filtered_df = group_df[(abs(group_df['segment_diff_rate']) >= 0.2) & (group_df['segment_label_cnt'] >= 1000)]
+    filtered_df = filtered_df[['score_segment', 'segment_diff_rate']]
+    # 每条曝光数据添加对应分数的diff
+    merged_df = pd.merge(sored_df, filtered_df, on="score_segment", how="left")
+
+    merged_df['segment_diff_rate'] = merged_df['segment_diff_rate'].fillna(0)
+    return merged_df, filtered_df
+
+
+def read_and_calibration_predict(predict_path: str, step=100) -> [pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    读取评估结果,并进行校准
+    """
+    # 本地调试使用
+    # predicts = read_predict_from_local_txt(predict_path)
+    predicts = read_predict_from_hdfs(predict_path)
+    df = pd.DataFrame(predicts)
+
+    # 模型分分段计算与真实ctcvr的diff_rate
+    predict_basename = os.path.basename(predict_path)
+    if predict_basename.endswith("/"):
+        predict_basename = predict_basename[:-1]
+    df, segment_df = segment_calc_diff_rate_by_score(df, segment_file_path=f"{SEGMENT_BASE_PATH}/{predict_basename}.txt", step=100)
+
+    # 生成校准后的分数
+    df['score_2'] = df['score'] / (1 + df['segment_diff_rate'])
+
+    # 按CID统计真实ctcvr和校准前后的平均模型分
+    grouped_df = df.groupby("cid").agg(
+        view=('cid', 'size'),
+        conv=('label', 'sum'),
+        score_avg=('score', lambda x: round(x.mean(), 6)),
+        score_2_avg=('score_2', lambda x: round(x.mean(), 6)),
+    ).reset_index()
+    grouped_df['true_ctcvr'] = grouped_df['conv'] / grouped_df['view']
+
+    return df, grouped_df, segment_df
+
+
+def predict_local_save_for_auc(old_df: pd.DataFrame, new_df: pd.DataFrame):
+    """
+    本地保存一份评估结果, 计算AUC使用
+    """
+    d = {"old": old_df, "new": new_df}
+    for key in d:
+        df = d[key][['label', "score"]]
+        df.to_csv(f"{PREDICT_CACHE_PATH}/{key}_1.txt", sep="\t", index=False, header=False)
+        df = d[key][['label', "score_2"]]
+        df.to_csv(f"{PREDICT_CACHE_PATH}/{key}_2.txt", sep="\t", index=False, header=False)
+
+
+def _main(old_predict_path: str, new_predict_path: str, calibration_file: str, analyse_file: str):
+    old_df, old_group_df, old_segment_df = read_and_calibration_predict(old_predict_path)
+    new_df, new_group_df, new_segment_df = read_and_calibration_predict(new_predict_path)
+
+    predict_local_save_for_auc(old_df, new_df)
+
+    # 分段文件保存, 此处保留的最后使用的分段文件,不是所有的分段
+    new_segment_df.to_csv(calibration_file, sep='\t', index=False, header=False)
+
+    # 字段重命名,和列过滤
+    old_group_df.rename(columns={'score_avg': 'old_score_avg', 'score_2_avg': 'old_score_2_avg'}, inplace=True)
+    new_group_df.rename(columns={'score_avg': 'new_score_avg', 'score_2_avg': 'new_score_2_avg'}, inplace=True)
+    old_group_df = old_group_df[['cid', 'view', 'conv', 'true_ctcvr', 'old_score_avg', 'old_score_2_avg']]
+    new_group_df = new_group_df[['cid', 'new_score_avg', 'new_score_2_avg']]
+
+    merged = pd.merge(old_group_df, new_group_df, on='cid', how='left')
+
+    # 计算与真实ctcvr的差异值
+    merged["(new-true)/true"] = (merged['new_score_avg'] / merged['true_ctcvr'] - 1).mask(merged['true_ctcvr'] == 0, 0)
+    merged["(old-true)/true"] = (merged['old_score_avg'] / merged['true_ctcvr'] - 1).mask(merged['true_ctcvr'] == 0, 0)
+
+    # 计算校准后的模型分与ctcvr的差异值
+    merged["(new2-true)/true"] = (merged['new_score_2_avg'] / merged['true_ctcvr'] - 1).mask(merged['true_ctcvr'] == 0, 0)
+    merged["(old2-true)/true"] = (merged['old_score_2_avg'] / merged['true_ctcvr'] - 1).mask(merged['true_ctcvr'] == 0, 0)
+
+    # 按照曝光排序,写入本地文件
+    merged = merged.sort_values(by=['view'], ascending=False)
+    merged = merged[[
+        'cid', 'view', "conv", "true_ctcvr",
+        "old_score_avg", "new_score_avg", "(old-true)/true", "(new-true)/true",
+        "old_score_2_avg", "new_score_2_avg", "(old2-true)/true", "(new2-true)/true",
+    ]]
+
+    # 根据文件名保存不同的格式
+    if analyse_file.endswith(".csv"):
+        merged.to_csv(analyse_file, index=False)
+    else:
+        with open(analyse_file, "w") as writer:
+            writer.write(merged.to_string(index=False))
+    print("0")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="model_predict_analyse.py")
+    parser.add_argument("-op", "--old_predict_path", required=True, help="老模型评估结果")
+    parser.add_argument("-np", "--new_predict_path", required=True, help="新模型评估结果")
+    parser.add_argument("-af", "--analyse_file", required=True, help="最后计算结果的保存路径")
+    parser.add_argument("-cf", "--calibration_file", required=True, help="线上使用的segment文件保存路径")
+    args = parser.parse_args()
+
+    _main(
+        old_predict_path=args.old_predict_path,
+        new_predict_path=args.new_predict_path,
+        calibration_file=args.calibration_file,
+        analyse_file=args.analyse_file
+    )
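Note: a minimal invocation sketch, mirroring the calls in ad/01_ad_model_update.sh and ad/02_ad_model_update_test.sh (all paths are placeholders):

    python ./ad/model_predict_analyse.py \
        -op /dw/recommend/model/34_ad_predict_data/20241110_351_1000_1031_1106 \
        -np /dw/recommend/model/34_ad_predict_data/20241110_351_1000_1103_1109 \
        -af ./predict_analyse_file/20241110_351_1000_analyse.txt \
        -cf ./model_xgb_351_1000_v2_calibration.txt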

+ 6 - 2
pom.xml

@@ -26,7 +26,7 @@
     <properties>
         <spark.version>2.3.0</spark.version>
         <cupid.sdk.version>3.3.8-public</cupid.sdk.version>
-        <scala.version>2.11.8</scala.version>
+        <scala.version>2.11.12</scala.version>
         <scala.binary.version>2.11</scala.binary.version>
         <java.version>1.8</java.version>
         <maven.compiler.source>${java.version}</maven.compiler.source>
@@ -176,7 +176,11 @@
             <artifactId>lombok</artifactId>
             <version>1.18.24</version>
         </dependency>
-
+        <dependency>
+            <groupId>ml.dmlc</groupId>
+            <artifactId>xgboost4j-spark_2.11</artifactId>
+            <version>1.1.2</version>
+        </dependency>
     </dependencies>
 
     <build>
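Note: the shell scripts in this commit submit ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar, so the module needs to be packaged before they run; a hedged build sketch (working directory and flags are assumptions, not taken from this commit):

    cd /root/zhaohp/recommend-emr-dataprocess && mvn clean package -DskipTests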

+ 46 - 0
recommend/01_recommend_model_new_train.sh

@@ -0,0 +1,46 @@
+#!/bin/sh
+
+# 重新训练模型
+
+set -x
+
+begin_date=$1
+end_date=$2
+model_name=$3
+train_dim=$4
+hdfs_path=$5
+
+
+PROJECT_HOME=/root/zhaohp/recommend-emr-dataprocess
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+MODEL_PATH=${PROJECT_HOME}/model/recommend
+
+FM_TRAIN=/root/sunmingze/alphaFM/bin/fm_train
+
+train_date=$begin_date
+
+main() {
+
+    end_date=$(date -d "$end_date +1 day" +%Y%m%d)
+
+    # 增量训练模型
+    while [ "$train_date" != "$end_date" ]; do
+        echo "==================== 开始训练 $train_date 模型 ===================="
+
+        if [ "$train_date" == "$begin_date" ]; then
+            $HADOOP fs -text ${hdfs_path}/${train_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${train_dim} -core 8
+        else
+            yesterday=$(date -d "$train_date -1 day" +%Y%m%d)
+            $HADOOP fs -text ${hdfs_path}/${train_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${train_dim} -core 8 -im ${MODEL_PATH}/${model_name}_${yesterday}.txt
+        fi
+
+        echo -e "==================== 训练 $train_date 模型结束 ====================\n\n\n\n\n\n"
+
+        train_date=$(date -d "$train_date +1 day" +%Y%m%d)
+    done
+
+}
+
+main
+
+# nohup ./recommend/01_recommend_model_new_train.sh 20240815 20240821 model_nba8_v3 1,1,8 /dw/recommend/model/43_recsys_train_data_new_table_274_sample_01/ > logs/25_recommend_model_new_train.log 2>&1 &

+ 52 - 0
recommend/02_train_go.sh

@@ -0,0 +1,52 @@
+#!/bin/sh
+
+# 训练新模型,并使用后面的数据计算AUC,评估模型效果
+
+set -x
+
+begin_date=$1
+end_date=$2
+model_name=$3
+train_dim=$4
+hdfs_path=$5
+
+PROJECT_HOME=/root/zhaohp/recommend-emr-dataprocess/
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+MODEL_PATH=${PROJECT_HOME}/model/recommend/
+PREDICT_PATH=${PROJECT_HOME}/predict/recommend/
+
+FM_TRAIN=/root/sunmingze/alphaFM/bin/fm_train
+
+
+train_date=$begin_date
+train_end_time=$(date -d "$end_date +1 day" +%Y%m%d)
+
+main() {
+
+    # 增量训练模型
+
+    while [ "$train_date" != "$train_end_time" ]; do
+        echo "==================== 开始训练 $train_date 模型 ===================="
+
+        if [ "$train_date" == 20240801 ]; then
+            echo -e "\t\t 无效的数据分区: $train_date, 跳过"
+        elif [ "$train_date" == "$begin_date" ]; then
+            $HADOOP fs -text ${hdfs_path}/${train_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${train_dim} -core 8
+        else
+            if [ "$train_date" == 20240801 ]; then
+                yesterday=$(date -d "$train_date -2 day" +%Y%m%d)
+            else
+                yesterday=$(date -d "$train_date -1 day" +%Y%m%d)
+            fi
+            $HADOOP fs -text ${hdfs_path}/${train_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${train_dim} -core 8 -im ${MODEL_PATH}/${model_name}_${yesterday}.txt
+        fi
+
+        echo -e "==================== 训练 $train_date 模型结束 ====================\n\n\n\n\n\n"
+
+        train_date=$(date -d "$train_date +1 day" +%Y%m%d)
+    done
+
+}
+
+main
+

+ 14 - 0
recommend/03_predict.sh

@@ -0,0 +1,14 @@
+#!/bin/sh
+set -e
+set -x
+
+day=$1
+train_path=$2
+model_name=$3
+output_file=$4
+bias=$5
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+$HADOOP fs -text ${train_path}/${day}/* | /root/sunmingze/alphaFM/bin/fm_predict -m model/$model_name -dim ${bias} -core 8 -out predict/${output_file}_$day.txt
+cat predict/${output_file}_$day.txt | /root/sunmingze/AUC/AUC
+
+

+ 67 - 0
recommend/20_vid_avg_score.sh

@@ -0,0 +1,67 @@
+#!/bin/sh
+
+# 计算不同VID的平均分
+
+set -x
+
+predict_date=$1
+model_name=$2
+predict_dim=$3
+
+PROJECT_HOME=/root/zhaohp/recommend-emr-dataprocess/
+MODEL_PATH=${PROJECT_HOME}/model/20240805/
+PREDICT_PATH=${PROJECT_HOME}/predict/recommend/
+TXT_PATH=/mnt/disk1/20240729
+
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+FM_PREDICT=/root/sunmingze/alphaFM/bin/fm_predict
+
+vids=(22895200 22751457 14146727 22847440 22927926 22858609 22974689 22563167 22959023 22970515 22946931 22994781 20720060 22979110)
+
+
+restore_score() {
+    for(( i = 0; i < ${#vids[@]}; i++)) do
+        vid=${vids[i]}
+        score_avg=$(awk '{
+            score = $2
+            new_score = ( 0.1 * score ) / ( 1 - 0.9 * score)
+            sum += new_score
+            count++
+        } END {
+            if ( count > 0 ){
+                print sum / count
+            } else {
+                print "NaN"
+            }
+        }' ${PREDICT_PATH}/${model_name}_${predict_date}_${vid}.txt)
+        echo -e "VID: ${vid} 平均分计算结果: ${score_avg} \n\t数据路径: ${PREDICT_PATH}/${model_name}_${predict_date}_${vid}.txt"
+    done
+}
+
+main() {
+    for(( i = 0; i < ${#vids[@]}; i++)) do
+        vid=${vids[i]}
+        cat ${TXT_PATH}/${predict_date}.txt | \
+        awk -v vid="$vid" -F'\t' '{
+            if ($2 == vid) {
+                split($0, fields, "\t");
+                OFS="\t";
+                line="";
+                for (i=1; i<= length(fields); i++){ 
+                    if (i != 2) {
+                        line = (line ? line "\t" : "") fields[i];
+                    }
+                }
+                print line
+            }
+        }' | \
+        ${FM_PREDICT} -m ${MODEL_PATH}/${model_name}.txt -dim ${predict_dim} -core 8 -out ${PREDICT_PATH}/${model_name}_${predict_date}_${vid}.txt
+        score_avg=`awk '{ sum += $2; count++ } END { if (count > 0) print sum / count }' ${PREDICT_PATH}/${model_name}_${predict_date}_${vid}.txt`
+        echo -e "VID: ${vid} 平均分计算结果: ${score_avg} \n\t模型路径: ${MODEL_PATH}/${model_name}.txt \n\t评估数据路径: ${TXT_PATH}/${predict_date}.txt"
+    done
+}
+
+main
+
+
+# nohup ./recommend/20_vid_avg_score.sh 20240729 model_recommend_v3_sample_01_20240728 8 > logs/20_vid_model_recommend_v3_20240728.log 2>&1 &
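restore_score (defined above but not invoked by main) undoes negative down-sampling before averaging: if q is the score of a model trained with negatives kept at rate w, the calibrated probability is p = w*q / (1 - q + w*q), and with w = 0.1 this is exactly the (0.1*score)/(1-0.9*score) in the awk block; the 0.1 appears to correspond to the fuSampleRate:0.1 used when building the training data. A minimal Scala sketch of the same correction:

    object ScoreCalibration {
      // Undo negative down-sampling: q = score from a model trained with negatives kept at rate w.
      def restore(q: Double, w: Double = 0.1): Double = w * q / (1 - q + w * q)

      def main(args: Array[String]): Unit = {
        println(restore(0.5))       // 0.1*0.5 / (1 - 0.9*0.5) = 0.0909...
        println(restore(0.5, 0.2))  // hypothetical alternative sampling rate
      }
    }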

+ 89 - 0
recommend/21_make_data_new_table.sh

@@ -0,0 +1,89 @@
+#!/bin/sh
+set -x
+
+
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+source /root/anaconda3/bin/activate py37
+
+# Name of the source ODPS table
+table='alg_recsys_sample_all_v2'
+# Partition window: recommend data is produced with a one-day lag, so the run on the 5th uses hours 00-23 of the 3rd to build the new-model data
+begin_early_2_Str="$(date -d '2 days ago' +%Y%m%d)"
+end_early_2_Str="$(date -d '2 days ago' +%Y%m%d)"
+beginHhStr=00
+endHhStr=23
+max_hour=05
+max_minute=00
+# Absolute HDFS paths produced by each step
+# Raw sample data
+originDataPath=/dw/recommend/model/41_recsys_sample_data_new_table/
+# Bucketized feature data
+bucketDataPath=/dw/recommend/model/43_recsys_train_data_new_table/
+# hadoop
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+
+# 1. Produce the raw samples
+echo "$(date +%Y-%m-%d_%H-%M-%S)----------step1------------开始根据${table}生产原始数据"
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_41_originData_20240709 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:${begin_early_2_Str}00 endStr:${end_early_2_Str}09 \
+savePath:${originDataPath} \
+table:${table} &
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_41_originData_20240709 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:${begin_early_2_Str}10 endStr:${end_early_2_Str}15 \
+savePath:${originDataPath} \
+table:${table} &
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_41_originData_20240709 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:${begin_early_2_Str}16 endStr:${end_early_2_Str}23 \
+savePath:${originDataPath} \
+table:${table} &
+
+
+# A bare `wait` always exits 0, so collect each background job's exit status instead
+fail=0
+for pid in $(jobs -p); do
+    wait "$pid" || fail=1
+done
+if [ $fail -ne 0 ]; then
+   echo "Spark原始样本生产任务执行失败"
+   exit 1
+else
+   echo "spark原始样本生产执行成功"
+fi
+
+# Feature sampling and bucketing
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_43_bucketData_fu_sample_20240709 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:/dw/recommend/model/41_recsys_sample_data_new_table \
+savePath:/dw/recommend/model/43_recsys_train_data_new_table_274_sample_01 \
+beginStr:${begin_early_2_Str} endStr:${end_early_2_Str} repartition:500 \
+filterNames:ROS fuSampleRate:0.1 \
+fileName:20240609_bucket_314.txt \
+whatLabel:is_return whatApps:0,3,4,21,17
+
+if [ $? -ne 0 ]; then
+   echo "Spark特征分桶任务执行失败"
+   exit 1
+else
+   echo "spark特征分桶任务执行成功"
+fi
+
+
+# Cron schedule
+# 0 11 * * * cd /root/zhaohp/recommend-emr-dataprocess && /bin/sh ./recommend/21_make_data_new_table.sh > logs/recommend/21_make_data_new_table/$(date +\%Y\%m\%d\%H\%M).log 2>&1
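The three spark-submit calls above split one day's 24 hourly partitions into the ranges 00-09, 10-15 and 16-23 so they run in parallel; each job expands its beginStr/endStr (yyyyMMddHH) into dt=...,hh=... ODPS partition specs, presumably via a helper like the MyDateUtils.getDateHourRange used by the ad jobs in this repo. A minimal standalone sketch of that expansion, for illustration only:

    import java.time.LocalDate
    import java.time.format.DateTimeFormatter
    import scala.collection.mutable.ArrayBuffer

    object HourPartitionSketch {
      private val dayFmt = DateTimeFormatter.ofPattern("yyyyMMdd")

      // Expand e.g. ("2024110800", "2024110809") into dt=20241108,hh=00 ... dt=20241108,hh=09
      def partitions(beginStr: String, endStr: String): Seq[String] = {
        var cur = LocalDate.parse(beginStr.take(8), dayFmt).atTime(beginStr.drop(8).toInt, 0)
        val end = LocalDate.parse(endStr.take(8), dayFmt).atTime(endStr.drop(8).toInt, 0)
        val out = ArrayBuffer[String]()
        while (!cur.isAfter(end)) {
          out += f"dt=${cur.format(dayFmt)},hh=${cur.getHour}%02d"
          cur = cur.plusHours(1)
        }
        out
      }

      def main(args: Array[String]): Unit =
        partitions("2024110800", "2024110803").foreach(println)
    }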

+ 78 - 0
recommend/22_supplementary_data_new_table.sh

@@ -0,0 +1,78 @@
+#!/bin/sh
+set -x
+
+
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+source /root/anaconda3/bin/activate py37
+
+# Name of the source ODPS table
+table='alg_recsys_sample_all_v2'
+# Absolute HDFS paths produced by each step
+# Raw sample data
+originDataPath=/dw/recommend/model/41_recsys_sample_data_new_table/
+# Bucketized feature data
+bucketDataPath=/dw/recommend/model/43_recsys_train_data_new_table/
+# hadoop
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+
+# 1. Produce the raw samples
+echo "$(date +%Y-%m-%d_%H-%M-%S)----------step1------------开始根据${table}生产原始数据"
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_41_originData_20240709 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:2024110800 endStr:2024110808 \
+savePath:${originDataPath} \
+table:${table} &
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_41_originData_20240709 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:2024110809 endStr:2024110816 \
+savePath:${originDataPath} \
+table:${table} &
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_41_originData_20240709 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:2024110817 endStr:2024110823 \
+savePath:${originDataPath} \
+table:${table} &
+
+# A bare `wait` always exits 0, so collect each background job's exit status instead
+fail=0
+for pid in $(jobs -p); do
+    wait "$pid" || fail=1
+done
+if [ $fail -ne 0 ]; then
+   echo "Spark原始样本生产任务执行失败"
+   exit 1
+else
+   echo "spark原始样本生产执行成功"
+fi
+
+# Feature sampling and bucketing
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_43_bucketData_fu_sample_20240709 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:/dw/recommend/model/41_recsys_sample_data_new_table \
+savePath:/dw/recommend/model/43_recsys_train_data_new_table_274_sample_01 \
+beginStr:20241108 endStr:20241108 repartition:500 \
+filterNames:ROS fuSampleRate:0.1 \
+fileName:20240609_bucket_314.txt \
+whatLabel:is_return whatApps:0,3,4,21,17
+
+if [ $? -ne 0 ]; then
+   echo "Spark特征分桶任务执行失败"
+   exit 1
+else
+   echo "spark特征分桶任务执行成功"
+fi
+

+ 8 - 0
spark-examples.iml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module version="4">
+  <component name="FacetManager">
+    <facet type="Python" name="Python">
+      <configuration sdkName="Python 3.12 (recommend-emr-dataprocess)" />
+    </facet>
+  </component>
+</module>

+ 62 - 0
src/main/java/examples/sparksql/SparkAdCTRSampleTester.java

@@ -0,0 +1,62 @@
+package examples.sparksql;
+
+import com.aliyun.odps.TableSchema;
+import com.aliyun.odps.data.Record;
+import org.apache.spark.SparkConf;
+import org.apache.spark.aliyun.odps.OdpsOps;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.Function2;
+
+import java.util.ArrayList;
+
+
+public class SparkAdCTRSampleTester {
+
+    public static void main(String[] args) {
+
+        String partition = args[0];
+        String accessId = "LTAIWYUujJAm7CbH";
+        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+        String odpsUrl = "http://service.odps.aliyun.com/api";
+        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+        String project = "loghubods";
+        String table = "alg_ad_view_sample";
+        String hdfsPath = "/dw/recommend/model/ad_ctr_samples_test/" + partition;
+
+        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+        System.out.println("Read odps table...");
+
+        JavaRDD<Record> readData = odpsOps.readTableWithJava(project, table, partition, new RecordsToSamples(), Integer.valueOf(30));
+        readData.filter(row -> row.get("type") != null)
+                .filter(row -> row.get("lrsample") != null)
+                .map(line -> singleParse2(line))
+                .saveAsTextFile(hdfsPath);
+    }
+
+
+    static class RecordsToSamples implements Function2<Record, TableSchema, Record> {
+        @Override
+        public Record call(Record record, TableSchema schema) throws Exception {
+            return record;
+        }
+    }
+
+
+    // Processing logic for a single log record
+    public static String singleParse2(Record record) {
+        // Parse the record
+        String label = record.getString("adclick_ornot");
+        if (label == null || label.equals("1")) {
+            label = "0";
+        } else {
+            label = "1";
+        }
+        String samples = record.getString("lrsample").replaceAll("\\\\t","\t");
+        return label + "\t" +  samples;
+    }
+
+
+}
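For reference, singleParse2 maps adclick_ornot (null or "1" is treated as negative) to a 0/1 label and converts the literal "\t" escapes stored in lrsample back into real tabs. A minimal Scala sketch of the same transformation, with a made-up lrsample value:

    object CtrSampleParseSketch {
      // Mirror of singleParse2: (adclick_ornot, lrsample) -> "label<TAB>feat<TAB>feat..."
      def parse(adClickOrNot: String, lrSample: String): String = {
        val label = if (adClickOrNot == null || adClickOrNot == "1") "0" else "1"
        label + "\t" + lrSample.replaceAll("\\\\t", "\t")
      }

      def main(args: Array[String]): Unit = {
        // hypothetical lrsample value as stored in the ODPS table (tabs escaped as the two characters '\' 't')
        println(parse("0", """123:1\t456:1\t789:1"""))
        // => 1<TAB>123:1<TAB>456:1<TAB>789:1
      }
    }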

+ 99 - 0
src/main/java/examples/sparksql/SparkAdCVRSampleLoader.java

@@ -0,0 +1,99 @@
+//package examples.sparksql;
+//
+//import com.aliyun.odps.TableSchema;
+//import com.aliyun.odps.data.Record;
+//import com.tzld.piaoquan.recommend.feature.domain.ad.base.*;
+//import com.tzld.piaoquan.recommend.feature.domain.ad.feature.VlogAdCtrLRFeatureExtractor;
+//import com.tzld.piaoquan.recommend.feature.model.sample.BaseFeature;
+//import com.tzld.piaoquan.recommend.feature.model.sample.GroupedFeature;
+//import com.tzld.piaoquan.recommend.feature.model.sample.LRSamples;
+//import examples.dataloader.AdSampleConstructor;
+//import org.apache.spark.SparkConf;
+//import org.apache.spark.aliyun.odps.OdpsOps;
+//import org.apache.spark.api.java.JavaRDD;
+//import org.apache.spark.api.java.JavaSparkContext;
+//import org.apache.spark.api.java.function.Function2;
+//
+//import java.util.ArrayList;
+//
+//
+//public class SparkAdCVRSampleLoader {
+//
+//    public static void main(String[] args) {
+//
+//        String partition = args[0];
+//        String accessId = "LTAIWYUujJAm7CbH";
+//        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+//        String odpsUrl = "http://service.odps.aliyun.com/api";
+//        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+//        String project = "loghubods";
+//        String table = "alg_ad_view_sample";
+//        String hdfsPath = "/dw/recommend/model/ad_cvr_samples/" + partition;
+//
+//        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+//        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+//        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+//        System.out.println("Read odps table...");
+//
+//        JavaRDD<Record> readData = odpsOps.readTableWithJava(project, table, partition, new RecordsToSamples(), Integer.valueOf(30));
+//        readData.filter(row -> row.getString("adclick_ornot").equals("0")).map(line -> singleParse(line)).saveAsTextFile(hdfsPath);
+//    }
+//
+//
+//    static class RecordsToSamples implements Function2<Record, TableSchema, Record> {
+//        @Override
+//        public Record call(Record record, TableSchema schema) throws Exception {
+//            return record;
+//        }
+//    }
+//
+//
+//    // 单条日志处理逻辑
+//    public static String singleParse(Record record) {
+//        // 数据解析
+//        String label = record.getString("adinvert_ornot");
+//        if (label == null || label.equals("1")) {
+//            label = "0";
+//        } else {
+//            label = "1";
+//        }
+//
+//
+//        // 从sql的 record中 初始化对象内容
+//        AdRequestContext requestContext = AdSampleConstructor.constructRequestContext(record);
+//        UserAdFeature userFeature = AdSampleConstructor.constructUserFeature(record);
+//        AdItemFeature itemFeature = AdSampleConstructor.constructItemFeature(record);
+//
+//        // 转化成bytes
+//        AdRequestContextBytesFeature adRequestContextBytesFeature = new AdRequestContextBytesFeature(requestContext);
+//        UserAdBytesFeature userBytesFeature = new UserAdBytesFeature(userFeature);
+//        AdItemBytesFeature adItemBytesFeature = new AdItemBytesFeature(itemFeature);
+//
+//        // 特征抽取
+//        VlogAdCtrLRFeatureExtractor bytesFeatureExtractor;
+//        bytesFeatureExtractor = new VlogAdCtrLRFeatureExtractor();
+//
+//        LRSamples lrSamples = bytesFeatureExtractor.single(userBytesFeature, adItemBytesFeature, adRequestContextBytesFeature);
+//
+//        return parseSamplesToString2(label, lrSamples);
+//    }
+//
+//
+//    // 构建样本的字符串
+//    public static String parseSamplesToString2(String label, LRSamples lrSamples) {
+//        ArrayList<String> featureList = new ArrayList<String>();
+//        for (int i = 0; i < lrSamples.getFeaturesCount(); i++) {
+//            GroupedFeature groupedFeature = lrSamples.getFeatures(i);
+//            if (groupedFeature != null && groupedFeature.getFeaturesCount() != 0) {
+//                for (int j = 0; j < groupedFeature.getFeaturesCount(); j++) {
+//                    BaseFeature baseFeature = groupedFeature.getFeatures(j);
+//                    if (baseFeature != null) {
+//                        featureList.add(String.valueOf(baseFeature.getIdentifier()) + ":1");
+//                    }
+//                }
+//            }
+//        }
+//        return label + "\t" + String.join("\t", featureList);
+//    }
+//
+//}

+ 59 - 0
src/main/java/examples/sparksql/SparkAdCVRSampleTester.java

@@ -0,0 +1,59 @@
+package examples.sparksql;
+
+import com.aliyun.odps.TableSchema;
+import com.aliyun.odps.data.Record;
+import org.apache.spark.SparkConf;
+import org.apache.spark.aliyun.odps.OdpsOps;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.Function2;
+
+import java.util.ArrayList;
+
+
+public class SparkAdCVRSampleTester {
+
+    public static void main(String[] args) {
+
+        String partition = args[0];
+        String accessId = "LTAIWYUujJAm7CbH";
+        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+        String odpsUrl = "http://service.odps.aliyun.com/api";
+        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+        String project = "loghubods";
+        String table = "alg_ad_view_sample";
+        String hdfsPath = "/dw/recommend/model/ad_cvr_samples_test/" + partition;
+
+        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+        System.out.println("Read odps table...");
+
+        JavaRDD<Record> readData = odpsOps.readTableWithJava(project, table, partition, new RecordsToSamples(), Integer.valueOf(30));
+        readData.filter(row -> row.get("type") != null)
+                .filter(row -> row.get("lrsample") != null)
+                .filter(row -> row.getString("adclick_ornot").equals("0"))
+                .map(line -> singleParse(line))
+                .saveAsTextFile(hdfsPath);
+    }
+
+
+    static class RecordsToSamples implements Function2<Record, TableSchema, Record> {
+        @Override
+        public Record call(Record record, TableSchema schema) throws Exception {
+            return record;
+        }
+    }
+
+    public static String singleParse(Record record) {
+        // Parse the record
+        String label = record.getString("adinvert_ornot");
+        if (label == null || label.equals("1")) {
+            label = "0";
+        } else {
+            label = "1";
+        }
+        String samples = record.getString("lrsample").replaceAll("\\\\t","\t");
+        return label + "\t" + samples;
+    }
+}

+ 95 - 0
src/main/java/examples/sparksql/SparkAdFeaToRedisHourLoader.java

@@ -0,0 +1,95 @@
+//package examples.sparksql;
+//
+//import com.aliyun.odps.TableSchema;
+//import com.aliyun.odps.data.Record;
+//import com.tzld.piaoquan.recommend.feature.domain.ad.base.AdItemFeature;
+//import com.tzld.piaoquan.recommend.feature.domain.ad.base.UserAdFeature;
+//import examples.dataloader.AdRedisFeatureConstructor;
+//import org.apache.spark.SparkConf;
+//import org.apache.spark.aliyun.odps.OdpsOps;
+//import org.apache.spark.api.java.JavaRDD;
+//import org.apache.spark.api.java.JavaSparkContext;
+//import org.apache.spark.api.java.function.Function2;
+//import org.springframework.data.redis.connection.RedisStandaloneConfiguration;
+//import org.springframework.data.redis.connection.jedis.JedisConnectionFactory;
+//import org.springframework.data.redis.core.RedisTemplate;
+//import org.springframework.data.redis.serializer.StringRedisSerializer;
+//
+//import java.util.ArrayList;
+//import java.util.HashMap;
+//import java.util.List;
+//import java.util.Map;
+//
+//
+//public class SparkAdFeaToRedisHourLoader {
+//
+//    private static final String adKeyFormat = "ad:%s";
+//
+//
+//    public static RedisTemplate<String, String> buildRedisTemplate() {
+//        RedisStandaloneConfiguration rsc = new RedisStandaloneConfiguration();
+//        rsc.setPort(6379);
+//        rsc.setPassword("Wqsd@2019");
+//        rsc.setHostName("r-bp1pi8wyv6lzvgjy5z.redis.rds.aliyuncs.com");
+//        RedisTemplate<String, String> template = new RedisTemplate<>();
+//        JedisConnectionFactory fac = new JedisConnectionFactory(rsc);
+//        fac.afterPropertiesSet();
+//        template.setDefaultSerializer(new StringRedisSerializer());
+//        template.setConnectionFactory(fac);
+//        template.afterPropertiesSet();
+//        return template;
+//    }
+//
+//
+//    public static void loadFeatureToRedis(RedisTemplate<String, String> redisTemplate, List<String> line) {
+//        Map<String, String> redisFormat = new HashMap<String, String>();
+//        String key = line.get(0);
+//        String value = line.get(1);
+//        redisFormat.put(key, value);
+//        redisTemplate.opsForValue().multiSet(redisFormat);
+//    }
+//
+//
+//    static class RecordsToAdRedisKV implements Function2<Record, TableSchema, List<String>> {
+//        @Override
+//        public List<String> call(Record record, TableSchema schema) throws Exception {
+//            AdItemFeature adItemFeature = AdRedisFeatureConstructor.constructItemFeature(record);
+//            // ad feature 中的key以creativeID拼接
+//            String key = String.format(adKeyFormat, adItemFeature.getCreativeId());
+//            String value = adItemFeature.getValue();
+//            List<String> kv = new ArrayList<String>();
+//            kv.add(key);
+//            kv.add(value);
+//            return kv;
+//        }
+//    }
+//
+//
+//
+//    public static void main(String[] args) {
+//
+//        String partition = args[0];
+//        String accessId = "LTAIWYUujJAm7CbH";
+//        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+//        String odpsUrl = "http://service.odps.aliyun.com/api";
+//        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+//        String project = "loghubods";
+//        String tableAdInfo = "alg_ad_item_info";
+//
+//        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+//        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+//        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+//        System.out.println("Read odps table...");
+//
+//
+//        // load Ad features
+//        JavaRDD<List<String>> readAdData = odpsOps.readTableWithJava(project, tableAdInfo, partition, new RecordsToAdRedisKV(), Integer.valueOf(10));
+//        readAdData.foreachPartition(
+//                rowIterator -> {
+//                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
+//                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
+//                }
+//        );
+//    }
+//
+//}

+ 67 - 0
src/main/java/examples/utils/AdUtil.java

@@ -0,0 +1,67 @@
+package examples.utils;
+
+import com.alibaba.fastjson.JSON;
+import com.alibaba.fastjson.JSONObject;
+import com.aliyun.odps.data.Record;
+import org.apache.commons.collections4.MapUtils;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class AdUtil {
+
+    public static final String IS_API_FLAG = "1";
+
+    /**
+     * Replace the keys of the online metaFeature JSON with the column names used in the data-warehouse tables, for troubleshooting
+     */
+    public static JSONObject keyReplace(JSONObject featureJson) {
+        JSONObject newJson = new JSONObject();
+        Map<String, String> keyMap = new HashMap<String, String>() {{
+            put("alg_cid_feature_basic_info", "b1_feature");
+            put("alg_cid_feature_adver_action", "b2_feature");
+            put("alg_cid_feature_cid_action", "b3_feature");
+            put("alg_cid_feature_region_action", "b4_feature");
+            put("alg_cid_feature_app_action", "b5_feature");
+            put("alg_cid_feature_week_action", "b6_feature");
+            put("alg_cid_feature_hour_action", "b7_feature");
+            put("alg_cid_feature_brand_action", "b8_feature");
+            put("alg_cid_feature_weChatVersion_action", "b9_feature");
+            put("alg_mid_feature_ad_action", "c1_feature");
+            put("alg_cid_feature_vid_cf", "d1_feature");
+            put("alg_cid_feature_vid_cf_rank", "d2_feature");
+            put("alg_vid_feature_basic_info", "d3_feature");
+            put("alg_mid_feature_return_tags", "e1_feature");
+            put("alg_mid_feature_share_tags", "e2_feature");
+        }};
+
+        for (Map.Entry<String, Object> entry : featureJson.entrySet()) {
+            String key = keyMap.getOrDefault(entry.getKey(), entry.getKey());
+            newJson.put(key, entry.getValue());
+        }
+
+        return newJson;
+    }
+
+    /**
+     * Whether the record has API postback attribution; used to filter samples when producing training data
+     */
+    public static boolean isApi(Record record) {
+
+        if (record.isNull("extend_alg")) {
+            return false;
+        }
+
+        JSONObject extendAlgJson = JSON.parseObject(record.getString("extend_alg"));
+        if (MapUtils.isEmpty(extendAlgJson)) {
+            return false;
+        }
+        if (extendAlgJson.containsKey("extinfo")) {
+            return IS_API_FLAG.equals(extendAlgJson.getJSONObject("extinfo").getString("isApi"));
+        }
+        if (extendAlgJson.containsKey("is_api")) {
+            return IS_API_FLAG.equals(extendAlgJson.getString("is_api"));
+        }
+        return true;
+    }
+}
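A small usage sketch of keyReplace, mapping the online metaFeature keys onto the warehouse column names (the JSON payload below is made up for illustration; isApi takes an ODPS Record and is not shown here):

    import com.alibaba.fastjson.JSON
    import examples.utils.AdUtil

    object AdUtilUsageSketch {
      def main(args: Array[String]): Unit = {
        val online = JSON.parseObject(
          """{"alg_cid_feature_basic_info":{"adid":"123"},"alg_mid_feature_ad_action":{"ad_click_1d":"2"}}""")
        // keys become b1_feature and c1_feature; unknown keys pass through unchanged
        println(AdUtil.keyReplace(online))
      }
    }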

+ 22 - 0
src/main/java/examples/utils/DateTimeUtil.java

@@ -0,0 +1,22 @@
+package examples.utils;
+
+import java.time.Instant;
+import java.time.LocalDateTime;
+import java.time.ZoneId;
+
+public class DateTimeUtil {
+
+    public static int getHourByTimestamp(long timestamp) {
+        return LocalDateTime
+                .ofInstant(Instant.ofEpochSecond(timestamp), ZoneId.systemDefault())
+                .getHour();
+    }
+
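+    // Returns the ISO-8601 day of week (1 = Monday ... 7 = Sunday) for an epoch-second timestamp.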
+    public static int getDayOrWeekByTimestamp(long timestamp) {
+        return LocalDateTime
+                .ofInstant(Instant.ofEpochSecond(timestamp), ZoneId.systemDefault())
+                .getDayOfWeek()
+                .getValue();
+    }
+}
+

File diff suppressed because it is too large
+ 8 - 0
src/main/resources/20240718_ad_bucket_517.txt


File diff suppressed because it is too large
+ 9 - 0
src/main/resources/20240718_ad_bucket_688.txt


+ 689 - 0
src/main/resources/20240718_ad_feature_name.txt

@@ -0,0 +1,689 @@
+cpa
+b2_1h_ctr
+b2_1h_ctcvr
+b2_1h_cvr
+b2_1h_conver
+b2_1h_ecpm
+b2_1h_click
+b2_1h_conver*log(view)
+b2_1h_conver*ctcvr
+b2_2h_ctr
+b2_2h_ctcvr
+b2_2h_cvr
+b2_2h_conver
+b2_2h_ecpm
+b2_2h_click
+b2_2h_conver*log(view)
+b2_2h_conver*ctcvr
+b2_3h_ctr
+b2_3h_ctcvr
+b2_3h_cvr
+b2_3h_conver
+b2_3h_ecpm
+b2_3h_click
+b2_3h_conver*log(view)
+b2_3h_conver*ctcvr
+b2_4h_ctr
+b2_4h_ctcvr
+b2_4h_cvr
+b2_4h_conver
+b2_4h_ecpm
+b2_4h_click
+b2_4h_conver*log(view)
+b2_4h_conver*ctcvr
+b2_5h_ctr
+b2_5h_ctcvr
+b2_5h_cvr
+b2_5h_conver
+b2_5h_ecpm
+b2_5h_click
+b2_5h_conver*log(view)
+b2_5h_conver*ctcvr
+b2_6h_ctr
+b2_6h_ctcvr
+b2_6h_cvr
+b2_6h_conver
+b2_6h_ecpm
+b2_6h_click
+b2_6h_conver*log(view)
+b2_6h_conver*ctcvr
+b2_12h_ctr
+b2_12h_ctcvr
+b2_12h_cvr
+b2_12h_conver
+b2_12h_ecpm
+b2_12h_click
+b2_12h_conver*log(view)
+b2_12h_conver*ctcvr
+b2_1d_ctr
+b2_1d_ctcvr
+b2_1d_cvr
+b2_1d_conver
+b2_1d_ecpm
+b2_1d_click
+b2_1d_conver*log(view)
+b2_1d_conver*ctcvr
+b2_3d_ctr
+b2_3d_ctcvr
+b2_3d_cvr
+b2_3d_conver
+b2_3d_ecpm
+b2_3d_click
+b2_3d_conver*log(view)
+b2_3d_conver*ctcvr
+b2_7d_ctr
+b2_7d_ctcvr
+b2_7d_cvr
+b2_7d_conver
+b2_7d_ecpm
+b2_7d_click
+b2_7d_conver*log(view)
+b2_7d_conver*ctcvr
+b2_today_ctr
+b2_today_ctcvr
+b2_today_cvr
+b2_today_conver
+b2_today_ecpm
+b2_today_click
+b2_today_conver*log(view)
+b2_today_conver*ctcvr
+b2_yesterday_ctr
+b2_yesterday_ctcvr
+b2_yesterday_cvr
+b2_yesterday_conver
+b2_yesterday_ecpm
+b2_yesterday_click
+b2_yesterday_conver*log(view)
+b2_yesterday_conver*ctcvr
+b3_1h_ctr
+b3_1h_ctcvr
+b3_1h_cvr
+b3_1h_conver
+b3_1h_ecpm
+b3_1h_click
+b3_1h_conver*log(view)
+b3_1h_conver*ctcvr
+b3_2h_ctr
+b3_2h_ctcvr
+b3_2h_cvr
+b3_2h_conver
+b3_2h_ecpm
+b3_2h_click
+b3_2h_conver*log(view)
+b3_2h_conver*ctcvr
+b3_3h_ctr
+b3_3h_ctcvr
+b3_3h_cvr
+b3_3h_conver
+b3_3h_ecpm
+b3_3h_click
+b3_3h_conver*log(view)
+b3_3h_conver*ctcvr
+b3_4h_ctr
+b3_4h_ctcvr
+b3_4h_cvr
+b3_4h_conver
+b3_4h_ecpm
+b3_4h_click
+b3_4h_conver*log(view)
+b3_4h_conver*ctcvr
+b3_5h_ctr
+b3_5h_ctcvr
+b3_5h_cvr
+b3_5h_conver
+b3_5h_ecpm
+b3_5h_click
+b3_5h_conver*log(view)
+b3_5h_conver*ctcvr
+b3_6h_ctr
+b3_6h_ctcvr
+b3_6h_cvr
+b3_6h_conver
+b3_6h_ecpm
+b3_6h_click
+b3_6h_conver*log(view)
+b3_6h_conver*ctcvr
+b3_12h_ctr
+b3_12h_ctcvr
+b3_12h_cvr
+b3_12h_conver
+b3_12h_ecpm
+b3_12h_click
+b3_12h_conver*log(view)
+b3_12h_conver*ctcvr
+b3_1d_ctr
+b3_1d_ctcvr
+b3_1d_cvr
+b3_1d_conver
+b3_1d_ecpm
+b3_1d_click
+b3_1d_conver*log(view)
+b3_1d_conver*ctcvr
+b3_3d_ctr
+b3_3d_ctcvr
+b3_3d_cvr
+b3_3d_conver
+b3_3d_ecpm
+b3_3d_click
+b3_3d_conver*log(view)
+b3_3d_conver*ctcvr
+b3_7d_ctr
+b3_7d_ctcvr
+b3_7d_cvr
+b3_7d_conver
+b3_7d_ecpm
+b3_7d_click
+b3_7d_conver*log(view)
+b3_7d_conver*ctcvr
+b3_today_ctr
+b3_today_ctcvr
+b3_today_cvr
+b3_today_conver
+b3_today_ecpm
+b3_today_click
+b3_today_conver*log(view)
+b3_today_conver*ctcvr
+b3_yesterday_ctr
+b3_yesterday_ctcvr
+b3_yesterday_cvr
+b3_yesterday_conver
+b3_yesterday_ecpm
+b3_yesterday_click
+b3_yesterday_conver*log(view)
+b3_yesterday_conver*ctcvr
+b4_1h_ctr
+b4_1h_ctcvr
+b4_1h_cvr
+b4_1h_conver
+b4_1h_ecpm
+b4_1h_click
+b4_1h_conver*log(view)
+b4_1h_conver*ctcvr
+b4_2h_ctr
+b4_2h_ctcvr
+b4_2h_cvr
+b4_2h_conver
+b4_2h_ecpm
+b4_2h_click
+b4_2h_conver*log(view)
+b4_2h_conver*ctcvr
+b4_3h_ctr
+b4_3h_ctcvr
+b4_3h_cvr
+b4_3h_conver
+b4_3h_ecpm
+b4_3h_click
+b4_3h_conver*log(view)
+b4_3h_conver*ctcvr
+b4_4h_ctr
+b4_4h_ctcvr
+b4_4h_cvr
+b4_4h_conver
+b4_4h_ecpm
+b4_4h_click
+b4_4h_conver*log(view)
+b4_4h_conver*ctcvr
+b4_5h_ctr
+b4_5h_ctcvr
+b4_5h_cvr
+b4_5h_conver
+b4_5h_ecpm
+b4_5h_click
+b4_5h_conver*log(view)
+b4_5h_conver*ctcvr
+b4_6h_ctr
+b4_6h_ctcvr
+b4_6h_cvr
+b4_6h_conver
+b4_6h_ecpm
+b4_6h_click
+b4_6h_conver*log(view)
+b4_6h_conver*ctcvr
+b4_12h_ctr
+b4_12h_ctcvr
+b4_12h_cvr
+b4_12h_conver
+b4_12h_ecpm
+b4_12h_click
+b4_12h_conver*log(view)
+b4_12h_conver*ctcvr
+b4_1d_ctr
+b4_1d_ctcvr
+b4_1d_cvr
+b4_1d_conver
+b4_1d_ecpm
+b4_1d_click
+b4_1d_conver*log(view)
+b4_1d_conver*ctcvr
+b4_3d_ctr
+b4_3d_ctcvr
+b4_3d_cvr
+b4_3d_conver
+b4_3d_ecpm
+b4_3d_click
+b4_3d_conver*log(view)
+b4_3d_conver*ctcvr
+b4_7d_ctr
+b4_7d_ctcvr
+b4_7d_cvr
+b4_7d_conver
+b4_7d_ecpm
+b4_7d_click
+b4_7d_conver*log(view)
+b4_7d_conver*ctcvr
+b4_today_ctr
+b4_today_ctcvr
+b4_today_cvr
+b4_today_conver
+b4_today_ecpm
+b4_today_click
+b4_today_conver*log(view)
+b4_today_conver*ctcvr
+b4_yesterday_ctr
+b4_yesterday_ctcvr
+b4_yesterday_cvr
+b4_yesterday_conver
+b4_yesterday_ecpm
+b4_yesterday_click
+b4_yesterday_conver*log(view)
+b4_yesterday_conver*ctcvr
+b5_1h_ctr
+b5_1h_ctcvr
+b5_1h_cvr
+b5_1h_conver
+b5_1h_ecpm
+b5_1h_click
+b5_1h_conver*log(view)
+b5_1h_conver*ctcvr
+b5_2h_ctr
+b5_2h_ctcvr
+b5_2h_cvr
+b5_2h_conver
+b5_2h_ecpm
+b5_2h_click
+b5_2h_conver*log(view)
+b5_2h_conver*ctcvr
+b5_3h_ctr
+b5_3h_ctcvr
+b5_3h_cvr
+b5_3h_conver
+b5_3h_ecpm
+b5_3h_click
+b5_3h_conver*log(view)
+b5_3h_conver*ctcvr
+b5_4h_ctr
+b5_4h_ctcvr
+b5_4h_cvr
+b5_4h_conver
+b5_4h_ecpm
+b5_4h_click
+b5_4h_conver*log(view)
+b5_4h_conver*ctcvr
+b5_5h_ctr
+b5_5h_ctcvr
+b5_5h_cvr
+b5_5h_conver
+b5_5h_ecpm
+b5_5h_click
+b5_5h_conver*log(view)
+b5_5h_conver*ctcvr
+b5_6h_ctr
+b5_6h_ctcvr
+b5_6h_cvr
+b5_6h_conver
+b5_6h_ecpm
+b5_6h_click
+b5_6h_conver*log(view)
+b5_6h_conver*ctcvr
+b5_12h_ctr
+b5_12h_ctcvr
+b5_12h_cvr
+b5_12h_conver
+b5_12h_ecpm
+b5_12h_click
+b5_12h_conver*log(view)
+b5_12h_conver*ctcvr
+b5_1d_ctr
+b5_1d_ctcvr
+b5_1d_cvr
+b5_1d_conver
+b5_1d_ecpm
+b5_1d_click
+b5_1d_conver*log(view)
+b5_1d_conver*ctcvr
+b5_3d_ctr
+b5_3d_ctcvr
+b5_3d_cvr
+b5_3d_conver
+b5_3d_ecpm
+b5_3d_click
+b5_3d_conver*log(view)
+b5_3d_conver*ctcvr
+b5_7d_ctr
+b5_7d_ctcvr
+b5_7d_cvr
+b5_7d_conver
+b5_7d_ecpm
+b5_7d_click
+b5_7d_conver*log(view)
+b5_7d_conver*ctcvr
+b5_today_ctr
+b5_today_ctcvr
+b5_today_cvr
+b5_today_conver
+b5_today_ecpm
+b5_today_click
+b5_today_conver*log(view)
+b5_today_conver*ctcvr
+b5_yesterday_ctr
+b5_yesterday_ctcvr
+b5_yesterday_cvr
+b5_yesterday_conver
+b5_yesterday_ecpm
+b5_yesterday_click
+b5_yesterday_conver*log(view)
+b5_yesterday_conver*ctcvr
+b8_1h_ctr
+b8_1h_ctcvr
+b8_1h_cvr
+b8_1h_conver
+b8_1h_ecpm
+b8_1h_click
+b8_1h_conver*log(view)
+b8_1h_conver*ctcvr
+b8_2h_ctr
+b8_2h_ctcvr
+b8_2h_cvr
+b8_2h_conver
+b8_2h_ecpm
+b8_2h_click
+b8_2h_conver*log(view)
+b8_2h_conver*ctcvr
+b8_3h_ctr
+b8_3h_ctcvr
+b8_3h_cvr
+b8_3h_conver
+b8_3h_ecpm
+b8_3h_click
+b8_3h_conver*log(view)
+b8_3h_conver*ctcvr
+b8_4h_ctr
+b8_4h_ctcvr
+b8_4h_cvr
+b8_4h_conver
+b8_4h_ecpm
+b8_4h_click
+b8_4h_conver*log(view)
+b8_4h_conver*ctcvr
+b8_5h_ctr
+b8_5h_ctcvr
+b8_5h_cvr
+b8_5h_conver
+b8_5h_ecpm
+b8_5h_click
+b8_5h_conver*log(view)
+b8_5h_conver*ctcvr
+b8_6h_ctr
+b8_6h_ctcvr
+b8_6h_cvr
+b8_6h_conver
+b8_6h_ecpm
+b8_6h_click
+b8_6h_conver*log(view)
+b8_6h_conver*ctcvr
+b8_12h_ctr
+b8_12h_ctcvr
+b8_12h_cvr
+b8_12h_conver
+b8_12h_ecpm
+b8_12h_click
+b8_12h_conver*log(view)
+b8_12h_conver*ctcvr
+b8_1d_ctr
+b8_1d_ctcvr
+b8_1d_cvr
+b8_1d_conver
+b8_1d_ecpm
+b8_1d_click
+b8_1d_conver*log(view)
+b8_1d_conver*ctcvr
+b8_3d_ctr
+b8_3d_ctcvr
+b8_3d_cvr
+b8_3d_conver
+b8_3d_ecpm
+b8_3d_click
+b8_3d_conver*log(view)
+b8_3d_conver*ctcvr
+b8_7d_ctr
+b8_7d_ctcvr
+b8_7d_cvr
+b8_7d_conver
+b8_7d_ecpm
+b8_7d_click
+b8_7d_conver*log(view)
+b8_7d_conver*ctcvr
+b8_today_ctr
+b8_today_ctcvr
+b8_today_cvr
+b8_today_conver
+b8_today_ecpm
+b8_today_click
+b8_today_conver*log(view)
+b8_today_conver*ctcvr
+b8_yesterday_ctr
+b8_yesterday_ctcvr
+b8_yesterday_cvr
+b8_yesterday_conver
+b8_yesterday_ecpm
+b8_yesterday_click
+b8_yesterday_conver*log(view)
+b8_yesterday_conver*ctcvr
+b9_1h_ctr
+b9_1h_ctcvr
+b9_1h_cvr
+b9_1h_conver
+b9_1h_ecpm
+b9_1h_click
+b9_1h_conver*log(view)
+b9_1h_conver*ctcvr
+b9_2h_ctr
+b9_2h_ctcvr
+b9_2h_cvr
+b9_2h_conver
+b9_2h_ecpm
+b9_2h_click
+b9_2h_conver*log(view)
+b9_2h_conver*ctcvr
+b9_3h_ctr
+b9_3h_ctcvr
+b9_3h_cvr
+b9_3h_conver
+b9_3h_ecpm
+b9_3h_click
+b9_3h_conver*log(view)
+b9_3h_conver*ctcvr
+b9_4h_ctr
+b9_4h_ctcvr
+b9_4h_cvr
+b9_4h_conver
+b9_4h_ecpm
+b9_4h_click
+b9_4h_conver*log(view)
+b9_4h_conver*ctcvr
+b9_5h_ctr
+b9_5h_ctcvr
+b9_5h_cvr
+b9_5h_conver
+b9_5h_ecpm
+b9_5h_click
+b9_5h_conver*log(view)
+b9_5h_conver*ctcvr
+b9_6h_ctr
+b9_6h_ctcvr
+b9_6h_cvr
+b9_6h_conver
+b9_6h_ecpm
+b9_6h_click
+b9_6h_conver*log(view)
+b9_6h_conver*ctcvr
+b9_12h_ctr
+b9_12h_ctcvr
+b9_12h_cvr
+b9_12h_conver
+b9_12h_ecpm
+b9_12h_click
+b9_12h_conver*log(view)
+b9_12h_conver*ctcvr
+b9_1d_ctr
+b9_1d_ctcvr
+b9_1d_cvr
+b9_1d_conver
+b9_1d_ecpm
+b9_1d_click
+b9_1d_conver*log(view)
+b9_1d_conver*ctcvr
+b9_3d_ctr
+b9_3d_ctcvr
+b9_3d_cvr
+b9_3d_conver
+b9_3d_ecpm
+b9_3d_click
+b9_3d_conver*log(view)
+b9_3d_conver*ctcvr
+b9_7d_ctr
+b9_7d_ctcvr
+b9_7d_cvr
+b9_7d_conver
+b9_7d_ecpm
+b9_7d_click
+b9_7d_conver*log(view)
+b9_7d_conver*ctcvr
+b9_today_ctr
+b9_today_ctcvr
+b9_today_cvr
+b9_today_conver
+b9_today_ecpm
+b9_today_click
+b9_today_conver*log(view)
+b9_today_conver*ctcvr
+b9_yesterday_ctr
+b9_yesterday_ctcvr
+b9_yesterday_cvr
+b9_yesterday_conver
+b9_yesterday_ecpm
+b9_yesterday_click
+b9_yesterday_conver*log(view)
+b9_yesterday_conver*ctcvr
+b6_7d_ctr
+b6_7d_ctcvr
+b6_7d_cvr
+b6_7d_conver
+b6_7d_ecpm
+b6_7d_click
+b6_7d_conver*log(view)
+b6_7d_conver*ctcvr
+b6_14d_ctr
+b6_14d_ctcvr
+b6_14d_cvr
+b6_14d_conver
+b6_14d_ecpm
+b6_14d_click
+b6_14d_conver*log(view)
+b6_14d_conver*ctcvr
+b7_7d_ctr
+b7_7d_ctcvr
+b7_7d_cvr
+b7_7d_conver
+b7_7d_ecpm
+b7_7d_click
+b7_7d_conver*log(view)
+b7_7d_conver*ctcvr
+b7_14d_ctr
+b7_14d_ctcvr
+b7_14d_cvr
+b7_14d_conver
+b7_14d_ecpm
+b7_14d_click
+b7_14d_conver*log(view)
+b7_14d_conver*ctcvr
+viewAll
+clickAll
+converAll
+incomeAll
+ctr_all
+ctcvr_all
+cvr_all
+ecpm_all
+timediff_view
+timediff_click
+timediff_conver
+actionstatic_view
+actionstatic_click
+actionstatic_conver
+actionstatic_income
+actionstatic_ctr
+actionstatic_ctcvr
+actionstatic_cvr
+e1_tags_3d_matchnum
+e1_tags_3d_maxscore
+e1_tags_3d_avgscore
+e1_tags_7d_matchnum
+e1_tags_7d_maxscore
+e1_tags_7d_avgscore
+e1_tags_14d_matchnum
+e1_tags_14d_maxscore
+e1_tags_14d_avgscore
+e2_tags_3d_matchnum
+e2_tags_3d_maxscore
+e2_tags_3d_avgscore
+e2_tags_7d_matchnum
+e2_tags_7d_maxscore
+e2_tags_7d_avgscore
+e2_tags_14d_matchnum
+e2_tags_14d_maxscore
+e2_tags_14d_avgscore
+d1_feature_3h_ctr
+d1_feature_3h_ctcvr
+d1_feature_3h_cvr
+d1_feature_3h_conver
+d1_feature_3h_ecpm
+d1_feature_6h_ctr
+d1_feature_6h_ctcvr
+d1_feature_6h_cvr
+d1_feature_6h_conver
+d1_feature_6h_ecpm
+d1_feature_12h_ctr
+d1_feature_12h_ctcvr
+d1_feature_12h_cvr
+d1_feature_12h_conver
+d1_feature_12h_ecpm
+d1_feature_1d_ctr
+d1_feature_1d_ctcvr
+d1_feature_1d_cvr
+d1_feature_1d_conver
+d1_feature_1d_ecpm
+d1_feature_3d_ctr
+d1_feature_3d_ctcvr
+d1_feature_3d_cvr
+d1_feature_3d_conver
+d1_feature_3d_ecpm
+d1_feature_7d_ctr
+d1_feature_7d_ctcvr
+d1_feature_7d_cvr
+d1_feature_7d_conver
+d1_feature_7d_ecpm
+vid_rank_ctr_1d
+vid_rank_ctr_3d
+vid_rank_ctr_7d
+vid_rank_ctr_14d
+vid_rank_ctcvr_1d
+vid_rank_ctcvr_3d
+vid_rank_ctcvr_7d
+vid_rank_ctcvr_14d
+vid_rank_ecpm_1d
+vid_rank_ecpm_3d
+vid_rank_ecpm_7d
+vid_rank_ecpm_14d
+ctitle_vtitle_similarity
+weight

+ 518 - 0
src/main/resources/20240718_ad_feature_name_517.txt

@@ -0,0 +1,518 @@
+cpa
+b2_1h_ctr
+b2_1h_ctcvr
+b2_1h_cvr
+b2_1h_conver
+b2_1h_click
+b2_1h_conver*log(view)
+b2_1h_conver*ctcvr
+b2_2h_ctr
+b2_2h_ctcvr
+b2_2h_cvr
+b2_2h_conver
+b2_2h_click
+b2_2h_conver*log(view)
+b2_2h_conver*ctcvr
+b2_3h_ctr
+b2_3h_ctcvr
+b2_3h_cvr
+b2_3h_conver
+b2_3h_click
+b2_3h_conver*log(view)
+b2_3h_conver*ctcvr
+b2_6h_ctr
+b2_6h_ctcvr
+b2_6h_cvr
+b2_6h_conver
+b2_6h_click
+b2_6h_conver*log(view)
+b2_6h_conver*ctcvr
+b2_12h_ctr
+b2_12h_ctcvr
+b2_12h_cvr
+b2_12h_conver
+b2_12h_click
+b2_12h_conver*log(view)
+b2_12h_conver*ctcvr
+b2_1d_ctr
+b2_1d_ctcvr
+b2_1d_cvr
+b2_1d_conver
+b2_1d_click
+b2_1d_conver*log(view)
+b2_1d_conver*ctcvr
+b2_3d_ctr
+b2_3d_ctcvr
+b2_3d_cvr
+b2_3d_conver
+b2_3d_click
+b2_3d_conver*log(view)
+b2_3d_conver*ctcvr
+b2_7d_ctr
+b2_7d_ctcvr
+b2_7d_cvr
+b2_7d_conver
+b2_7d_click
+b2_7d_conver*log(view)
+b2_7d_conver*ctcvr
+b2_yesterday_ctr
+b2_yesterday_ctcvr
+b2_yesterday_cvr
+b2_yesterday_conver
+b2_yesterday_click
+b2_yesterday_conver*log(view)
+b2_yesterday_conver*ctcvr
+b2_today_ctr
+b2_today_ctcvr
+b2_today_cvr
+b2_today_conver
+b2_today_click
+b2_today_conver*log(view)
+b2_today_conver*ctcvr
+b3_1h_ctr
+b3_1h_ctcvr
+b3_1h_cvr
+b3_1h_conver
+b3_1h_click
+b3_1h_conver*log(view)
+b3_1h_conver*ctcvr
+b3_2h_ctr
+b3_2h_ctcvr
+b3_2h_cvr
+b3_2h_conver
+b3_2h_click
+b3_2h_conver*log(view)
+b3_2h_conver*ctcvr
+b3_3h_ctr
+b3_3h_ctcvr
+b3_3h_cvr
+b3_3h_conver
+b3_3h_click
+b3_3h_conver*log(view)
+b3_3h_conver*ctcvr
+b3_6h_ctr
+b3_6h_ctcvr
+b3_6h_cvr
+b3_6h_conver
+b3_6h_click
+b3_6h_conver*log(view)
+b3_6h_conver*ctcvr
+b3_12h_ctr
+b3_12h_ctcvr
+b3_12h_cvr
+b3_12h_conver
+b3_12h_click
+b3_12h_conver*log(view)
+b3_12h_conver*ctcvr
+b3_1d_ctr
+b3_1d_ctcvr
+b3_1d_cvr
+b3_1d_conver
+b3_1d_click
+b3_1d_conver*log(view)
+b3_1d_conver*ctcvr
+b3_3d_ctr
+b3_3d_ctcvr
+b3_3d_cvr
+b3_3d_conver
+b3_3d_click
+b3_3d_conver*log(view)
+b3_3d_conver*ctcvr
+b3_7d_ctr
+b3_7d_ctcvr
+b3_7d_cvr
+b3_7d_conver
+b3_7d_click
+b3_7d_conver*log(view)
+b3_7d_conver*ctcvr
+b3_yesterday_ctr
+b3_yesterday_ctcvr
+b3_yesterday_cvr
+b3_yesterday_conver
+b3_yesterday_click
+b3_yesterday_conver*log(view)
+b3_yesterday_conver*ctcvr
+b3_today_ctr
+b3_today_ctcvr
+b3_today_cvr
+b3_today_conver
+b3_today_click
+b3_today_conver*log(view)
+b3_today_conver*ctcvr
+b4_1h_ctr
+b4_1h_ctcvr
+b4_1h_cvr
+b4_1h_conver
+b4_1h_click
+b4_1h_conver*log(view)
+b4_1h_conver*ctcvr
+b4_2h_ctr
+b4_2h_ctcvr
+b4_2h_cvr
+b4_2h_conver
+b4_2h_click
+b4_2h_conver*log(view)
+b4_2h_conver*ctcvr
+b4_3h_ctr
+b4_3h_ctcvr
+b4_3h_cvr
+b4_3h_conver
+b4_3h_click
+b4_3h_conver*log(view)
+b4_3h_conver*ctcvr
+b4_6h_ctr
+b4_6h_ctcvr
+b4_6h_cvr
+b4_6h_conver
+b4_6h_click
+b4_6h_conver*log(view)
+b4_6h_conver*ctcvr
+b4_12h_ctr
+b4_12h_ctcvr
+b4_12h_cvr
+b4_12h_conver
+b4_12h_click
+b4_12h_conver*log(view)
+b4_12h_conver*ctcvr
+b4_1d_ctr
+b4_1d_ctcvr
+b4_1d_cvr
+b4_1d_conver
+b4_1d_click
+b4_1d_conver*log(view)
+b4_1d_conver*ctcvr
+b4_3d_ctr
+b4_3d_ctcvr
+b4_3d_cvr
+b4_3d_conver
+b4_3d_click
+b4_3d_conver*log(view)
+b4_3d_conver*ctcvr
+b4_7d_ctr
+b4_7d_ctcvr
+b4_7d_cvr
+b4_7d_conver
+b4_7d_click
+b4_7d_conver*log(view)
+b4_7d_conver*ctcvr
+b4_yesterday_ctr
+b4_yesterday_ctcvr
+b4_yesterday_cvr
+b4_yesterday_conver
+b4_yesterday_click
+b4_yesterday_conver*log(view)
+b4_yesterday_conver*ctcvr
+b4_today_ctr
+b4_today_ctcvr
+b4_today_cvr
+b4_today_conver
+b4_today_click
+b4_today_conver*log(view)
+b4_today_conver*ctcvr
+b5_1h_ctr
+b5_1h_ctcvr
+b5_1h_cvr
+b5_1h_conver
+b5_1h_click
+b5_1h_conver*log(view)
+b5_1h_conver*ctcvr
+b5_2h_ctr
+b5_2h_ctcvr
+b5_2h_cvr
+b5_2h_conver
+b5_2h_click
+b5_2h_conver*log(view)
+b5_2h_conver*ctcvr
+b5_3h_ctr
+b5_3h_ctcvr
+b5_3h_cvr
+b5_3h_conver
+b5_3h_click
+b5_3h_conver*log(view)
+b5_3h_conver*ctcvr
+b5_6h_ctr
+b5_6h_ctcvr
+b5_6h_cvr
+b5_6h_conver
+b5_6h_click
+b5_6h_conver*log(view)
+b5_6h_conver*ctcvr
+b5_12h_ctr
+b5_12h_ctcvr
+b5_12h_cvr
+b5_12h_conver
+b5_12h_click
+b5_12h_conver*log(view)
+b5_12h_conver*ctcvr
+b5_1d_ctr
+b5_1d_ctcvr
+b5_1d_cvr
+b5_1d_conver
+b5_1d_click
+b5_1d_conver*log(view)
+b5_1d_conver*ctcvr
+b5_3d_ctr
+b5_3d_ctcvr
+b5_3d_cvr
+b5_3d_conver
+b5_3d_click
+b5_3d_conver*log(view)
+b5_3d_conver*ctcvr
+b5_7d_ctr
+b5_7d_ctcvr
+b5_7d_cvr
+b5_7d_conver
+b5_7d_click
+b5_7d_conver*log(view)
+b5_7d_conver*ctcvr
+b5_yesterday_ctr
+b5_yesterday_ctcvr
+b5_yesterday_cvr
+b5_yesterday_conver
+b5_yesterday_click
+b5_yesterday_conver*log(view)
+b5_yesterday_conver*ctcvr
+b5_today_ctr
+b5_today_ctcvr
+b5_today_cvr
+b5_today_conver
+b5_today_click
+b5_today_conver*log(view)
+b5_today_conver*ctcvr
+b8_1h_ctr
+b8_1h_ctcvr
+b8_1h_cvr
+b8_1h_conver
+b8_1h_click
+b8_1h_conver*log(view)
+b8_1h_conver*ctcvr
+b8_2h_ctr
+b8_2h_ctcvr
+b8_2h_cvr
+b8_2h_conver
+b8_2h_click
+b8_2h_conver*log(view)
+b8_2h_conver*ctcvr
+b8_3h_ctr
+b8_3h_ctcvr
+b8_3h_cvr
+b8_3h_conver
+b8_3h_click
+b8_3h_conver*log(view)
+b8_3h_conver*ctcvr
+b8_6h_ctr
+b8_6h_ctcvr
+b8_6h_cvr
+b8_6h_conver
+b8_6h_click
+b8_6h_conver*log(view)
+b8_6h_conver*ctcvr
+b8_12h_ctr
+b8_12h_ctcvr
+b8_12h_cvr
+b8_12h_conver
+b8_12h_click
+b8_12h_conver*log(view)
+b8_12h_conver*ctcvr
+b8_1d_ctr
+b8_1d_ctcvr
+b8_1d_cvr
+b8_1d_conver
+b8_1d_click
+b8_1d_conver*log(view)
+b8_1d_conver*ctcvr
+b8_3d_ctr
+b8_3d_ctcvr
+b8_3d_cvr
+b8_3d_conver
+b8_3d_click
+b8_3d_conver*log(view)
+b8_3d_conver*ctcvr
+b8_7d_ctr
+b8_7d_ctcvr
+b8_7d_cvr
+b8_7d_conver
+b8_7d_click
+b8_7d_conver*log(view)
+b8_7d_conver*ctcvr
+b8_yesterday_ctr
+b8_yesterday_ctcvr
+b8_yesterday_cvr
+b8_yesterday_conver
+b8_yesterday_click
+b8_yesterday_conver*log(view)
+b8_yesterday_conver*ctcvr
+b8_today_ctr
+b8_today_ctcvr
+b8_today_cvr
+b8_today_conver
+b8_today_click
+b8_today_conver*log(view)
+b8_today_conver*ctcvr
+b9_1h_ctr
+b9_1h_ctcvr
+b9_1h_cvr
+b9_1h_conver
+b9_1h_click
+b9_1h_conver*log(view)
+b9_1h_conver*ctcvr
+b9_2h_ctr
+b9_2h_ctcvr
+b9_2h_cvr
+b9_2h_conver
+b9_2h_click
+b9_2h_conver*log(view)
+b9_2h_conver*ctcvr
+b9_3h_ctr
+b9_3h_ctcvr
+b9_3h_cvr
+b9_3h_conver
+b9_3h_click
+b9_3h_conver*log(view)
+b9_3h_conver*ctcvr
+b9_6h_ctr
+b9_6h_ctcvr
+b9_6h_cvr
+b9_6h_conver
+b9_6h_click
+b9_6h_conver*log(view)
+b9_6h_conver*ctcvr
+b9_12h_ctr
+b9_12h_ctcvr
+b9_12h_cvr
+b9_12h_conver
+b9_12h_click
+b9_12h_conver*log(view)
+b9_12h_conver*ctcvr
+b9_1d_ctr
+b9_1d_ctcvr
+b9_1d_cvr
+b9_1d_conver
+b9_1d_click
+b9_1d_conver*log(view)
+b9_1d_conver*ctcvr
+b9_3d_ctr
+b9_3d_ctcvr
+b9_3d_cvr
+b9_3d_conver
+b9_3d_click
+b9_3d_conver*log(view)
+b9_3d_conver*ctcvr
+b9_7d_ctr
+b9_7d_ctcvr
+b9_7d_cvr
+b9_7d_conver
+b9_7d_click
+b9_7d_conver*log(view)
+b9_7d_conver*ctcvr
+b9_yesterday_ctr
+b9_yesterday_ctcvr
+b9_yesterday_cvr
+b9_yesterday_conver
+b9_yesterday_click
+b9_yesterday_conver*log(view)
+b9_yesterday_conver*ctcvr
+b9_today_ctr
+b9_today_ctcvr
+b9_today_cvr
+b9_today_conver
+b9_today_click
+b9_today_conver*log(view)
+b9_today_conver*ctcvr
+b6_7d_ctr
+b6_7d_ctcvr
+b6_7d_cvr
+b6_7d_conver
+b6_7d_click
+b6_7d_conver*log(view)
+b6_7d_conver*ctcvr
+b6_14d_ctr
+b6_14d_ctcvr
+b6_14d_cvr
+b6_14d_conver
+b6_14d_click
+b6_14d_conver*log(view)
+b6_14d_conver*ctcvr
+b7_7d_ctr
+b7_7d_ctcvr
+b7_7d_cvr
+b7_7d_conver
+b7_7d_click
+b7_7d_conver*log(view)
+b7_7d_conver*ctcvr
+b7_14d_ctr
+b7_14d_ctcvr
+b7_14d_cvr
+b7_14d_conver
+b7_14d_click
+b7_14d_conver*log(view)
+b7_14d_conver*ctcvr
+viewAll
+clickAll
+converAll
+incomeAll
+ctr_all
+ctcvr_all
+cvr_all
+timediff_view
+timediff_click
+timediff_conver
+actionstatic_view
+actionstatic_click
+actionstatic_conver
+actionstatic_income
+actionstatic_ctr
+actionstatic_ctcvr
+actionstatic_cvr
+e1_tags_3d_matchnum
+e1_tags_3d_maxscore
+e1_tags_3d_avgscore
+e1_tags_7d_matchnum
+e1_tags_7d_maxscore
+e1_tags_7d_avgscore
+e1_tags_14d_matchnum
+e1_tags_14d_maxscore
+e1_tags_14d_avgscore
+e2_tags_3d_matchnum
+e2_tags_3d_maxscore
+e2_tags_3d_avgscore
+e2_tags_7d_matchnum
+e2_tags_7d_maxscore
+e2_tags_7d_avgscore
+e2_tags_14d_matchnum
+e2_tags_14d_maxscore
+e2_tags_14d_avgscore
+d1_feature_3h_ctr
+d1_feature_3h_ctcvr
+d1_feature_3h_cvr
+d1_feature_3h_conver
+d1_feature_6h_ctr
+d1_feature_6h_ctcvr
+d1_feature_6h_cvr
+d1_feature_6h_conver
+d1_feature_12h_ctr
+d1_feature_12h_ctcvr
+d1_feature_12h_cvr
+d1_feature_12h_conver
+d1_feature_1d_ctr
+d1_feature_1d_ctcvr
+d1_feature_1d_cvr
+d1_feature_1d_conver
+d1_feature_3d_ctr
+d1_feature_3d_ctcvr
+d1_feature_3d_cvr
+d1_feature_3d_conver
+d1_feature_7d_ctr
+d1_feature_7d_ctcvr
+d1_feature_7d_cvr
+d1_feature_7d_conver
+vid_rank_ctr_1d
+vid_rank_ctr_3d
+vid_rank_ctr_7d
+vid_rank_ctr_14d
+vid_rank_ctcvr_1d
+vid_rank_ctcvr_3d
+vid_rank_ctcvr_7d
+vid_rank_ctcvr_14d
+ctitle_vtitle_similarity
+weight

+ 1 - 0
src/main/resources/weight_ad_feature_name.txt

@@ -0,0 +1 @@
+weight

+ 131 - 0
src/main/scala/com/aliyun/odps/spark/ad/xgboost/v20240808/XGBoostTrain.scala

@@ -0,0 +1,131 @@
+package com.aliyun.odps.spark.ad.xgboost.v20240808
+
+import com.aliyun.odps.spark.examples.myUtils.ParamUtils
+import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
+import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.math.NumberUtils
+import org.apache.spark.ml.feature.VectorAssembler
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.types.{DataTypes, StructField}
+import org.apache.spark.sql.{Dataset, Row, SparkSession}
+
+import java.net.URL
+import java.time.LocalDateTime
+import java.time.format.DateTimeFormatter
+import scala.io.Source
+
+object XGBoostTrain {
+  def main(args: Array[String]): Unit = {
+    try {
+
+      val param = ParamUtils.parseArgs(args)
+
+      val dt = LocalDateTime.now.format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss"))
+
+      val spark = SparkSession.builder()
+        .appName("XGBoostTrain:" + dt)
+        .getOrCreate()
+      val sc = spark.sparkContext
+
+      val loader = getClass.getClassLoader
+
+      val readPath = param.getOrElse("trainReadPath", "")
+      val predictReadPath = param.getOrElse("predictReadPath", "")
+      val filterNameSet = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+      val featureNameFile = param.getOrElse("featureNameFile", "20240718_ad_feature_name.txt")
+
+      val featureNameContent = readFile(loader.getResource(featureNameFile))
+
+      val featureNameList: List[String] = featureNameContent.split("\n")
+        .map(r => r.replace(" ", "").replaceAll("\n", ""))
+        .filter(r => r.nonEmpty)
+        .filter(r => !containsAny(filterNameSet, r))
+        .toList
+
+      val rowRDD = dataMap(sc.textFile(readPath), featureNameList)
+
+      println(s"rowRDD count ${rowRDD.count()}")
+
+      val fields: Array[StructField] = Array(
+        DataTypes.createStructField("label", DataTypes.IntegerType, true)
+      ) ++ featureNameList.map(f => DataTypes.createStructField(f, DataTypes.DoubleType, true))
+
+      val trainDataSet: Dataset[Row] = spark.createDataFrame(rowRDD, DataTypes.createStructType(fields))
+
+      val vectorAssembler = new VectorAssembler().setInputCols(featureNameList.toArray).setOutputCol("features")
+
+      val xgbInput = vectorAssembler.transform(trainDataSet).select("features", "label")
+      xgbInput.show()
+
+      // Build the XGBoostClassifier
+      val xgbClassifier = new XGBoostClassifier()
+        .setEta(0.01f)
+        .setMissing(0.0f)
+        .setMaxDepth(5)
+        .setNumRound(1000)
+        .setSubsample(0.8)
+        .setColsampleBytree(0.8)
+        .setScalePosWeight(1)
+        .setObjective("binary:logistic")
+        .setEvalMetric("auc")
+        .setFeaturesCol("features")
+        .setLabelCol("label")
+        .setNthread(1)
+        .setNumWorkers(22)
+
+      // Train the model
+      val model = xgbClassifier.fit(xgbInput)
+
+
+
+    }
+    catch {
+      case e: Throwable => e.printStackTrace()
+    }
+  }
+
+  private def readFile(filePath: URL): String = {
+    var source: Option[Source] = None
+    try {
+      source = Some(Source.fromURL(filePath))
+      return source.get.getLines().mkString("\n")
+    }
+    catch {
+      case e: Exception => println("文件读取异常: " + e.toString)
+    }
+    finally {
+      source.foreach(_.close())
+    }
+    ""
+  }
+
+  private def containsAny(list: Iterable[String], s: String): Boolean = {
+    for (item <- list) {
+      if (s.contains(item)) {
+        return true
+      }
+    }
+    false
+  }
+
+  private def dataMap(data: RDD[String], featureNameList: List[String]): RDD[Row] = {
+    data.map(r => {
+      val line: Array[String] = StringUtils.split(r, "\t")
+      val label: Int = NumberUtils.toInt(line(0))
+
+      val map: Map[String, Double] = line.drop(1).map { entry =>
+        val Array(key, value) = entry.split(":")
+        key -> NumberUtils.toDouble(value, 0.0)
+      }.toMap
+
+      val v: Array[Any] = Array.ofDim[Any](featureNameList.length + 1)
+      v(0) = label
+
+      for (index <- featureNameList.indices) {
+        v(index + 1) = map.getOrElse(featureNameList(index), 0.0)
+      }
+
+      Row.fromSeq(v)
+    })
+  }
+}
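The commit trains the model but never uses predictReadPath or persists the result. A hedged sketch of what could follow fit() inside the try block, reusing the file's own dataMap, fields and vectorAssembler; the modelSavePath parameter and the evaluator choice are assumptions rather than part of this commit, it needs import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator, and ML persistence requires an xgboost4j-spark version that supports it:

      // Score the held-out set read from predictReadPath and report AUC.
      val predictRowRDD = dataMap(sc.textFile(predictReadPath), featureNameList)
      val predictDataSet = spark.createDataFrame(predictRowRDD, DataTypes.createStructType(fields))
      val predictInput = vectorAssembler.transform(predictDataSet).select("features", "label")
      val predictions = model.transform(predictInput)

      val auc = new BinaryClassificationEvaluator()
        .setLabelCol("label")
        .setRawPredictionCol("rawPrediction")
        .setMetricName("areaUnderROC")
        .evaluate(predictions)
      println(s"predict data auc: $auc")

      // Persist the fitted model so it can be reloaded for batch scoring (hypothetical save path parameter).
      model.write.overwrite().save(param.getOrElse("modelSavePath", "/dw/recommend/model/ad_xgb_model/" + dt))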

+ 16 - 16
src/main/scala/com/aliyun/odps/spark/examples/SparkPi.scala

@@ -1,20 +1,20 @@
 /**
-  * Licensed to the Apache Software Foundation (ASF) under one
-  * or more contributor license agreements.  See the NOTICE file
-  * distributed with this work for additional information
-  * regarding copyright ownership.  The ASF licenses this file
-  * to you under the Apache License, Version 2.0 (the
-  * "License"); you may not use this file except in compliance
-  * with the License.  You may obtain a copy of the License at
-  * <p>
-  * http://www.apache.org/licenses/LICENSE-2.0
-  * <p>
-  * Unless required by applicable law or agreed to in writing, software
-  * distributed under the License is distributed on an "AS IS" BASIS,
-  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  * See the License for the specific language governing permissions and
-  * limitations under the License.
-  */
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 package com.aliyun.odps.spark.examples
 

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_31_originData_20240620.scala → src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_31_originData_20240620.scala

@@ -1,4 +1,4 @@
-package com.aliyun.odps.spark.examples.makedata_ad
+package com.aliyun.odps.spark.examples.makedata_ad.v20240620
 
 import com.alibaba.fastjson.{JSON, JSONObject}
 import com.aliyun.odps.TableSchema

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_32_bucket_20240622.scala → src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_32_bucket_20240622.scala

@@ -1,4 +1,4 @@
-package com.aliyun.odps.spark.examples.makedata_ad
+package com.aliyun.odps.spark.examples.makedata_ad.v20240620
 
 import com.alibaba.fastjson.JSON
 import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils}

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_33_bucketDataPrint_20240628.scala → src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_33_bucketDataPrint_20240628.scala

@@ -1,4 +1,4 @@
-package com.aliyun.odps.spark.examples.makedata_ad
+package com.aliyun.odps.spark.examples.makedata_ad.v20240620
 
 import com.alibaba.fastjson.{JSON, JSONObject}
 import com.aliyun.odps.TableSchema

+ 2 - 2
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_33_bucketData_20240622.scala → src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_33_bucketData_20240622.scala

@@ -1,4 +1,4 @@
-package com.aliyun.odps.spark.examples.makedata_ad
+package com.aliyun.odps.spark.examples.makedata_ad.v20240620
 
 import com.alibaba.fastjson.JSON
 import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
@@ -51,7 +51,7 @@ object makedata_ad_33_bucketData_20240622 {
     val beginStr = param.getOrElse("beginStr", "20240620")
     val endStr = param.getOrElse("endStr", "20240620")
     val repartition = param.getOrElse("repartition", "100").toInt
-    val filterNames = param.getOrElse("filterNames", "").split(",").toSet
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
     val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
 
     val dateRange = MyDateUtils.getDateRange(beginStr, endStr)

+ 431 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_31_originData_20240718.scala

@@ -0,0 +1,431 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.RankExtractorFeature_20240530
+import examples.utils.DateTimeUtil
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import org.xm.Similarity
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+
+/*
+   20240608 feature extraction
+ */
+
+object makedata_ad_31_originData_20240718 {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1. Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val beginStr = param.getOrElse("beginStr", "2024062008")
+    val endStr = param.getOrElse("endStr", "2024062023")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/31_ad_sample_data/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_ad_sample_all")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterHours = param.getOrElse("filterHours", "00,01,02,03,04,05,06,07").split(",").toSet
+    val idDefaultValue = param.getOrElse("idDefaultValue", "1.0").toDouble
+    // 2. Read the ODPS table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3. Loop over the partitions and produce the data
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    for (dt_hh <- timeRange) {
+      val dt = dt_hh.substring(0, 8)
+      val hh = dt_hh.substring(8, 10)
+      val partition = s"dt=$dt,hh=$hh"
+      if (filterHours.nonEmpty && filterHours.contains(hh)) {
+        println("skip partition: " + partition)
+      } else {
+        println("start processing partition: " + partition)
+        val odpsData = odpsOps.readTable(project = project,
+            table = table,
+            partition = partition,
+            transfer = func,
+            numPartition = tablePart)
+          .filter(record => {
+            val extendAlg: JSONObject = if (record.isNull("extend_alg")) new JSONObject() else
+              JSON.parseObject(record.getString("extend_alg"))
+            val isApi = extendAlg.getString("is_api")
+            "1".equals(isApi)
+          })
+          .map(record => {
+
+            val ts = record.getString("ts").toInt
+            val cid = record.getString("cid")
+            val apptype = record.getString("apptype")
+            val extend: JSONObject = if (record.isNull("extend")) new JSONObject() else
+              JSON.parseObject(record.getString("extend"))
+
+            val featureMap = new JSONObject()
+
+            val b1: JSONObject = if (record.isNull("b1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b1_feature"))
+            val b2: JSONObject = if (record.isNull("b2_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b2_feature"))
+            val b3: JSONObject = if (record.isNull("b3_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b3_feature"))
+            val b4: JSONObject = if (record.isNull("b4_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b4_feature"))
+            val b5: JSONObject = if (record.isNull("b5_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b5_feature"))
+            val b6: JSONObject = if (record.isNull("b6_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b6_feature"))
+            val b7: JSONObject = if (record.isNull("b7_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b7_feature"))
+            val b8: JSONObject = if (record.isNull("b8_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b8_feature"))
+            val b9: JSONObject = if (record.isNull("b9_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b9_feature"))
+
+
+            featureMap.put("cid_" + cid, idDefaultValue)
+            if (b1.containsKey("adid") && b1.getString("adid").nonEmpty) {
+              featureMap.put("adid_" + b1.getString("adid"), idDefaultValue)
+            }
+            if (b1.containsKey("adverid") && b1.getString("adverid").nonEmpty) {
+              featureMap.put("adverid_" + b1.getString("adverid"), idDefaultValue)
+            }
+            if (b1.containsKey("targeting_conversion") && b1.getString("targeting_conversion").nonEmpty) {
+              featureMap.put("targeting_conversion_" + b1.getString("targeting_conversion"), idDefaultValue)
+            }
+
+            val hour = DateTimeUtil.getHourByTimestamp(ts)
+            featureMap.put("hour_" + hour, idDefaultValue)
+
+            val dayOfWeek = DateTimeUtil.getDayOrWeekByTimestamp(ts)
+            featureMap.put("dayofweek_" + dayOfWeek, idDefaultValue);
+
+            featureMap.put("apptype_" + apptype, idDefaultValue);
+
+            if (extend.containsKey("abcode") && extend.getString("abcode").nonEmpty) {
+              featureMap.put("abcode_" + extend.getString("abcode"), idDefaultValue)
+            }
+
+
+            if (b1.containsKey("cpa")) {
+              featureMap.put("cpa", b1.getString("cpa").toDouble)
+            }
+            if (b1.containsKey("weight") && b1.getString("weight").nonEmpty) {
+              featureMap.put("weight", b1.getString("weight").toDouble)
+            }
+
+            for ((bn, prefix1) <- List(
+              (b2, "b2"), (b3, "b3"), (b4, "b4"), (b5, "b5"), (b8, "b8"), (b9, "b9")
+            )) {
+              for (prefix2 <- List(
+                "1h", "2h", "3h", "4h", "5h", "6h", "12h", "1d", "3d", "7d", "today", "yesterday"
+              )) {
+                val view = if (bn.isEmpty) 0D else bn.getIntValue("ad_view_" + prefix2).toDouble
+                val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
+                val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
+                val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
+                val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+                val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+                val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+                val f4 = conver
+                val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver", f4)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
+
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "click", click)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*log(view)", conver * RankExtractorFeature_20240530.calLog(view))
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*ctcvr", conver * f2)
+              }
+            }
+
+            for ((bn, prefix1) <- List(
+              (b6, "b6"), (b7, "b7")
+            )) {
+              for (prefix2 <- List(
+                "7d", "14d"
+              )) {
+                val view = if (bn.isEmpty) 0D else bn.getIntValue("ad_view_" + prefix2).toDouble
+                val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
+                val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
+                val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
+                val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+                val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+                val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+                val f4 = conver
+                val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver", f4)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
+
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "click", click)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*log(view)", conver * RankExtractorFeature_20240530.calLog(view))
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*ctcvr", conver * f2)
+              }
+            }
+
+            val c1: JSONObject = if (record.isNull("c1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("c1_feature"))
+
+            val midActionList = if (c1.containsKey("action") && c1.getString("action").nonEmpty) {
+              c1.getString("action").split(",").map(r => {
+                val rList = r.split(":")
+                (rList(0), (rList(1).toInt, rList(2).toInt, rList(3).toInt, rList(4).toInt, rList(5)))
+              }).sortBy(-_._2._1).toList
+            } else {
+              new ArrayBuffer[(String, (Int, Int, Int, Int, String))]().toList
+            }
+            // user-level (u) features
+            val viewAll = midActionList.size.toDouble
+            val clickAll = midActionList.map(_._2._2).sum.toDouble
+            val converAll = midActionList.map(_._2._3).sum.toDouble
+            val incomeAll = midActionList.map(_._2._4).sum.toDouble
+            featureMap.put("viewAll", viewAll)
+            featureMap.put("clickAll", clickAll)
+            featureMap.put("converAll", converAll)
+            featureMap.put("incomeAll", incomeAll)
+            featureMap.put("ctr_all", RankExtractorFeature_20240530.calDiv(clickAll, viewAll))
+            featureMap.put("ctcvr_all", RankExtractorFeature_20240530.calDiv(converAll, viewAll))
+            featureMap.put("cvr_all", RankExtractorFeature_20240530.calDiv(converAll, clickAll))
+            featureMap.put("ecpm_all", RankExtractorFeature_20240530.calDiv(incomeAll * 1000, viewAll))
+
+            // user-to-ad (ui) features
+            val midTimeDiff = scala.collection.mutable.Map[String, Double]()
+            midActionList.foreach {
+              case (cid, (ts_history, click, conver, income, title)) =>
+                if (!midTimeDiff.contains("timediff_view_" + cid)) {
+                  midTimeDiff.put("timediff_view_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+                }
+                if (!midTimeDiff.contains("timediff_click_" + cid) && click > 0) {
+                  midTimeDiff.put("timediff_click_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+                }
+                if (!midTimeDiff.contains("timediff_conver_" + cid) && conver > 0) {
+                  midTimeDiff.put("timediff_conver_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+                }
+            }
+
+            val midActionStatic = scala.collection.mutable.Map[String, Double]()
+            midActionList.foreach {
+              case (cid, (ts_history, click, conver, income, title)) =>
+                midActionStatic.put("actionstatic_view_" + cid, 1.0 + midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0))
+                midActionStatic.put("actionstatic_click_" + cid, click + midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0))
+                midActionStatic.put("actionstatic_conver_" + cid, conver + midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0))
+                midActionStatic.put("actionstatic_income_" + cid, income + midActionStatic.getOrDefault("actionstatic_income_" + cid, 0.0))
+            }
+
+            if (midTimeDiff.contains("timediff_view_" + cid)) {
+              featureMap.put("timediff_view", midTimeDiff.getOrDefault("timediff_view_" + cid, 0.0))
+            }
+            if (midTimeDiff.contains("timediff_click_" + cid)) {
+              featureMap.put("timediff_click", midTimeDiff.getOrDefault("timediff_click_" + cid, 0.0))
+            }
+            if (midTimeDiff.contains("timediff_conver_" + cid)) {
+              featureMap.put("timediff_conver", midTimeDiff.getOrDefault("timediff_conver_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_view_" + cid)) {
+              featureMap.put("actionstatic_view", midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_click_" + cid)) {
+              featureMap.put("actionstatic_click", midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_conver_" + cid)) {
+              featureMap.put("actionstatic_conver", midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_income_" + cid)) {
+              featureMap.put("actionstatic_income", midActionStatic.getOrDefault("actionstatic_income_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("actionstatic_click_" + cid)) {
+              featureMap.put("actionstatic_ctr", RankExtractorFeature_20240530.calDiv(
+                midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0),
+                midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
+              ))
+            }
+            if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("actionstatic_conver_" + cid)) {
+              featureMap.put("actionstatic_ctcvr", RankExtractorFeature_20240530.calDiv(
+                midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0),
+                midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
+              ))
+            }
+            if (midActionStatic.contains("actionstatic_conver_" + cid) && midActionStatic.contains("actionstatic_click_" + cid)) {
+              featureMap.put("actionstatic_cvr", RankExtractorFeature_20240530.calDiv(
+                midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0),
+                midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0)
+              ))
+            }
+
+            val e1: JSONObject = if (record.isNull("e1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("e1_feature"))
+            val e2: JSONObject = if (record.isNull("e2_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("e2_feature"))
+            val title = b1.getOrDefault("cidtitle", "").toString
+            if (title.nonEmpty) {
+              for ((en, prefix1) <- List((e1, "e1"), (e2, "e2"))) {
+                for (prefix2 <- List("tags_3d", "tags_7d", "tags_14d")) {
+                  if (en.nonEmpty && en.containsKey(prefix2) && en.getString(prefix2).nonEmpty) {
+                    val (f1, f2, f3, f4) = funcC34567ForTags(en.getString(prefix2), title)
+                    featureMap.put(prefix1 + "_" + prefix2 + "_matchnum", f1)
+                    featureMap.put(prefix1 + "_" + prefix2 + "_maxscore", f3)
+                    featureMap.put(prefix1 + "_" + prefix2 + "_avgscore", f4)
+
+                  }
+                }
+              }
+            }
+
+            val d1: JSONObject = if (record.isNull("d1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("d1_feature"))
+            val d2: JSONObject = if (record.isNull("d2_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("d2_feature"))
+            val d3: JSONObject = if (record.isNull("d3_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("d3_feature"))
+
+            if (d1.nonEmpty) {
+              for (prefix <- List("3h", "6h", "12h", "1d", "3d", "7d")) {
+                val view = if (!d1.containsKey("ad_view_" + prefix)) 0D else d1.getIntValue("ad_view_" + prefix).toDouble
+                val click = if (!d1.containsKey("ad_click_" + prefix)) 0D else d1.getIntValue("ad_click_" + prefix).toDouble
+                val conver = if (!d1.containsKey("ad_conversion_" + prefix)) 0D else d1.getIntValue("ad_conversion_" + prefix).toDouble
+                val income = if (!d1.containsKey("ad_income_" + prefix)) 0D else d1.getIntValue("ad_income_" + prefix).toDouble
+                val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+                val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+                val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+                val f4 = conver
+                val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "ctr", f1)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "ctcvr", f2)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "cvr", f3)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "conver", f4)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "ecpm", f5)
+              }
+            }
+
+            val vidRankMaps = scala.collection.mutable.Map[String, scala.collection.immutable.Map[String, Double]]()
+            if (d2.nonEmpty) {
+              d2.foreach(r => {
+                val key = r._1
+                val value = d2.getString(key).split(",").map(r => {
+                  val rList = r.split(":")
+                  (rList(0), rList(2).toDouble)
+                }).toMap
+                vidRankMaps.put(key, value)
+              })
+            }
+            for (prefix1 <- List("ctr", "ctcvr", "ecpm")) {
+              for (prefix2 <- List("1d", "3d", "7d", "14d")) {
+                if (vidRankMaps.contains(prefix1 + "_" + prefix2)) {
+                  val rank = vidRankMaps(prefix1 + "_" + prefix2).getOrDefault(cid, 0.0)
+                  if (rank >= 1.0) {
+                    featureMap.put("vid_rank_" + prefix1 + "_" + prefix2, 1.0 / rank)
+                  }
+                }
+              }
+            }
+
+            if (d3.nonEmpty) {
+              val vTitle = d3.getString("title")
+              val score = Similarity.conceptSimilarity(title, vTitle)
+              featureMap.put("ctitle_vtitle_similarity", score);
+            }
+
+            /*
+            Ads
+              sparse: cid adid adverid targeting_conversion
+
+              cpa --> 1 feature
+              per adverid: 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr conver ecpm --> 30 features
+              per cid: 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              geo // per cid: 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              app // per cid: 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              phone brand // per cid: 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              OS: no data
+              week // per cid: 7d 14d x ctr ctcvr cvr ecpm conver --> 10 features
+              hour // per cid: 7d 14d x ctr ctcvr cvr ecpm conver --> 10 features
+
+            User
+              title tags of the user's historical clicks/conversions over 3d 7d 14d vs. the cid title: match count / max score / avg score --> 18 features
+              user's 14d views / clicks / conversions / income, plus ctr cvr ctcvr ecpm --> 8 features
+
+              user-to-cid (ui) features --> 10 features
+                1 / time since the user last viewed this cid
+                1 / time since the user last clicked this cid
+                1 / time since the user last converted on this cid
+                how many times the user viewed this cid
+                how many times the user clicked this cid
+                how many times the user converted on this cid
+                how much the user spent on this cid
+                the user's ctr, ctcvr and cvr on this cid
+
+            Video
+              sim-score-1/-2 between title and cid: no data
+              vid // per cid: 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              vid // per cid: 1d 3d 7d 14d x rank of ctr ctcvr ecpm, taken as reciprocal --> 12 features
+
+             */
+
+
+            // 4 Build the label info.
+            val labels = new JSONObject
+            for (labelKey <- List("ad_is_click", "ad_is_conversion")) {
+              if (!record.isNull(labelKey)) {
+                labels.put(labelKey, record.getString(labelKey))
+              }
+            }
+            // 5 Build the log key header.
+            val mid = record.getString("mid")
+            val headvideoid = record.getString("headvideoid")
+            val logKey = (apptype, mid, cid, ts, headvideoid).productIterator.mkString(",")
+            val labelKey = labels.toString()
+            val featureKey = featureMap.toString()
+            // 6 Join the fields and save.
+            logKey + "\t" + labelKey + "\t" + featureKey
+          })
+
+        // 4 Save data to HDFS
+        val savePartition = dt + hh
+        val hdfsPath = savePath + "/" + savePartition
+        if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+          println("Deleting path and writing data to: " + hdfsPath)
+          MyHdfsUtils.delete_hdfs_path(hdfsPath)
+          odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        } else {
+          println("Invalid path, not writing: " + hdfsPath)
+        }
+      }
+
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
+    // match count, matched words, max semantic similarity score, avg semantic similarity score
+    val tagsList = tags.split(",")
+    var d1 = 0.0
+    val d2 = new ArrayBuffer[String]()
+    var d3 = 0.0
+    var d4 = 0.0
+    for (tag <- tagsList) {
+      if (title.contains(tag)) {
+        d1 = d1 + 1.0
+        d2.add(tag)
+      }
+      val score = Similarity.conceptSimilarity(tag, title)
+      d3 = if (score > d3) score else d3
+      d4 = d4 + score
+    }
+    d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
+    (d1, d2.mkString(","), d3, d4)
+  }
+}
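Every b*-window in the job above expands into the same eight derived features built from guarded ratios: ctr = click/view, ctcvr = conver/view, cvr = conver/click, ecpm = 1000*income/view, plus the raw conver and click counts and two interaction terms. The following is a minimal, self-contained sketch of that expansion; calDiv and calLog from RankExtractorFeature_20240530 are replaced here by local stand-ins (a zero-denominator guard and a natural log clamped at 1), which is an assumption about their behavior rather than the project's actual implementation.

object RatioFeatureSketch {
  // Stand-in for the assumed zero-denominator guard in calDiv.
  def calDiv(a: Double, b: Double): Double = if (b == 0.0) 0.0 else a / b

  // One b*-window expands into the eight derived features used in the job.
  def windowFeatures(prefix: String, view: Double, click: Double,
                     conver: Double, income: Double): Map[String, Double] = {
    val ctr = calDiv(click, view)
    val ctcvr = calDiv(conver, view)
    val cvr = calDiv(conver, click)
    val ecpm = calDiv(income * 1000, view)
    Map(
      s"${prefix}_ctr" -> ctr,
      s"${prefix}_ctcvr" -> ctcvr,
      s"${prefix}_cvr" -> cvr,
      s"${prefix}_conver" -> conver,
      s"${prefix}_ecpm" -> ecpm,
      s"${prefix}_click" -> click,
      // calLog is approximated by a natural log clamped at 1 (assumption).
      s"${prefix}_conver*log(view)" -> conver * math.log(view max 1.0),
      s"${prefix}_conver*ctcvr" -> conver * ctcvr
    )
  }

  def main(args: Array[String]): Unit = {
    // Example: 1000 views, 30 clicks, 3 conversions, income 12 in the b2 1d window.
    windowFeatures("b2_1d", 1000, 30, 3, 12).foreach(println)
  }
}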

+ 105 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_32_bucket_20240718.scala

@@ -0,0 +1,105 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+
+ */
+
+object makedata_ad_32_bucket_20240718 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/20240620*")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/32_bucket_data/")
+    val fileName = param.getOrElse("fileName", "20240620_100")
+    val sampleRate = param.getOrElse("sampleRate", "1.0").toDouble
+    val bucketNum = param.getOrElse("bucketNum", "100").toInt
+    val featureNameFile = param.getOrElse("featureNameFile", "20240718_ad_feature_name.txt");
+
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource(featureNameFile)
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+
+
+
+    val data = sc.textFile(readPath)
+    println("Malformed row count: " + data.filter(r=>r.split("\t").length != 3).count())
+    val data1 = data.map(r => {
+      val rList = r.split("\t")
+      val jsons = JSON.parseObject(rList(2))
+      val doubles = scala.collection.mutable.Map[String, Double]()
+      jsons.foreach(r =>{
+        doubles.put(r._1, jsons.getDoubleValue(r._1))
+      })
+      doubles
+    }).sample(false, sampleRate ).repartition(20)
+
+    val result = new ArrayBuffer[String]()
+
+    for (i <- contentList.indices){
+      println("Feature: " + contentList(i))
+      val data2 = data1.map(r => r.getOrDefault(contentList(i), 0D)).filter(_ > 1E-8).collect().sorted
+      val len = data2.length
+      if (len == 0){
+        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + "0")
+      }else{
+        val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // ensure each bucket gets at least one element
+        val buffers = new ArrayBuffer[Double]()
+
+        var lastBucketValue = data2(0) // previous split point
+        for (j <- 0 until len by oneBucketNum) {
+          val d = data2(j)
+          if (j > 0 && d != lastBucketValue) {
+            // keep the current value as a split point only if it differs from the previous one
+            buffers += d
+          }
+          lastBucketValue = d // update the previous split point
+        }
+
+        // the last bucket must end at the array's final element
+        if (!buffers.contains(data2.last)) {
+          buffers += data2.last
+        }
+        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
+      }
+    }
+    val data3 = sc.parallelize(result)
+
+
+    // 4 Save data to HDFS
+    val hdfsPath = savePath + "/" + fileName
+    if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+      println("Deleting path and writing data to: " + hdfsPath)
+      MyHdfsUtils.delete_hdfs_path(hdfsPath)
+      data3.repartition(1).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+    } else {
+      println("Invalid path, not writing: " + hdfsPath)
+    }
+  }
+}
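For each feature, the job above collects the sorted non-zero values and picks at most bucketNum - 1 distinct split points at a fixed stride of (len - 1) / (bucketNum - 1) + 1, so the resulting buckets are roughly equal-frequency. A stand-alone sketch of that boundary selection with the same stepping rule (plain Scala, no Spark):

import scala.collection.mutable.ArrayBuffer

object BucketBoundarySketch {
  // Pick up to bucketNum - 1 distinct split points from sorted non-zero values,
  // stepping so that each bucket holds roughly the same number of samples.
  def boundaries(sortedValues: Array[Double], bucketNum: Int): Array[Double] = {
    val len = sortedValues.length
    if (len == 0) return Array.empty
    val step = (len - 1) / (bucketNum - 1) + 1
    val buffers = new ArrayBuffer[Double]()
    var last = sortedValues(0)
    for (j <- 0 until len by step) {
      val d = sortedValues(j)
      if (j > 0 && d != last) buffers += d
      last = d
    }
    // the last bucket must end at the array's final element
    if (!buffers.contains(sortedValues.last)) buffers += sortedValues.last
    buffers.toArray
  }

  def main(args: Array[String]): Unit = {
    val values = Array(0.1, 0.2, 0.2, 0.3, 0.5, 0.8, 1.3, 2.1, 3.4, 5.5)
    println(boundaries(values, 4).mkString(",")) // -> 0.5,3.4,5.5
  }
}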

File diff suppressed because it is too large
+ 429 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketDataPrint_20240718.scala


+ 128 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240718.scala

@@ -0,0 +1,128 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+
+ */
+
+object makedata_ad_33_bucketData_20240718 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_688.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r =>{
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("Processing date: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val jsons = JSON.parseObject(rList(2))
+        val features = scala.collection.mutable.Map[String, Double]()
+        jsons.foreach(r => {
+          features.put(r._1, jsons.getDoubleValue(r._1))
+        })
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12", "13").contains(apptype)
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach{
+            case (label, features) =>
+              val featuresBucket = features.map{
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty){
+                    filterNames.foreach(r=> if (!ifFilter && name.contains(r)) {ifFilter = true} )
+                  }
+                  if (ifFilter){
+                    ""
+                  }else{
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+      })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting path and writing data to: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, not writing: " + hdfsPath)
+      }
+    }
+
+
+
+  }
+}
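Applying the bucket file goes the other way: a raw feature value is mapped to (insert position + 1) / bucketNum among that feature's boundaries. ExtractorUtils.findInsertPosition is assumed here to return the 0-based insertion index into the ascending boundary array; the sketch re-implements that assumption with a binary search so the mapping can be checked in isolation.

object BucketizeSketch {
  // Assumption: ExtractorUtils.findInsertPosition returns the index at which `score`
  // would be inserted into the ascending `buckets` array (0-based).
  def findInsertPosition(buckets: Array[Double], score: Double): Int = {
    val idx = java.util.Arrays.binarySearch(buckets, score)
    if (idx >= 0) idx else -(idx + 1)
  }

  // Map a raw score into (position + 1) / bucketNum, as done in the job above.
  def bucketize(buckets: Array[Double], bucketNum: Double, score: Double): Double =
    1.0 / bucketNum * (findInsertPosition(buckets, score).toDouble + 1.0)

  def main(args: Array[String]): Unit = {
    val buckets = Array(0.5, 3.4, 5.5)
    println(bucketize(buckets, 100.0, 2.0)) // -> 0.02 (falls past the first boundary)
  }
}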

+ 135 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240718_sample.scala

@@ -0,0 +1,135 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+import scala.util.Random
+
+/*
+
+ */
+
+object makedata_ad_33_bucketData_20240718_sample {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
+    val sampleRate = param.getOrElse("sampleRate", "0.1").toDouble
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_688.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("Processing date: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r => {
+          val rList = r.split("\t")
+          val logKey = rList(0)
+          val labelKey = rList(1)
+          val jsons = JSON.parseObject(rList(2))
+          val features = scala.collection.mutable.Map[String, Double]()
+          jsons.foreach(r => {
+            features.put(r._1, jsons.getDoubleValue(r._1))
+          })
+          (logKey, labelKey, features)
+        })
+        .filter {
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12", "13").contains(apptype)
+        }.filter {
+          case (logKey, labelKey, features) =>
+            new Random().nextDouble() < sampleRate
+        }
+        .map {
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach {
+            case (label, features) =>
+              val featuresBucket = features.map {
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty) {
+                    filterNames.foreach(r => if (!ifFilter && name.contains(r)) {
+                      ifFilter = true
+                    })
+                  }
+                  if (ifFilter) {
+                    ""
+                  } else {
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+        })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting path and writing data to: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, not writing: " + hdfsPath)
+      }
+    }
+
+
+  }
+}
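The _sample variant only adds a row-level Bernoulli filter: each record survives with probability sampleRate, decided by a fresh Random per record. Spark's built-in RDD.sample expresses the same semantics and makes the result reproducible through a seed; a sketch under that assumption (the helper name and default seed are illustrative, not part of the job):

import org.apache.spark.rdd.RDD

object SampleSketch {
  // Bernoulli row sampling: each record is kept independently with probability sampleRate,
  // matching the `new Random().nextDouble() < sampleRate` filter, but seeded for reproducibility.
  def bernoulliSample[T](rdd: RDD[T], sampleRate: Double, seed: Long = 42L): RDD[T] =
    rdd.sample(withReplacement = false, fraction = sampleRate, seed = seed)
}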

+ 158 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240726.scala

@@ -0,0 +1,158 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+
+/*
+
+ */
+
+object makedata_ad_33_bucketData_20240726 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
+    val featureNameFile = param.getOrElse("featureNameFile", "20240718_ad_feature_name_517.txt");
+
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_517.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+    val resourceUrl = loader.getResource(featureNameFile)
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+
+    println()
+    println()
+    println()
+    println(content)
+    val contentList = content.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty).toList
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("Processing date: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r => {
+          val rList = r.split("\t")
+          val logKey = rList(0)
+          val labelKey = rList(1)
+          val jsons = JSON.parseObject(rList(2))
+          val features = scala.collection.mutable.Map[String, Double]()
+          jsons.foreach(r => {
+            features.put(r._1, jsons.getDoubleValue(r._1))
+          })
+
+          for (name <- contentList) {
+            if (!features.contains(name)) {
+              features.put(name, 0)
+            }
+          }
+
+          (logKey, labelKey, features)
+        })
+        .filter {
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12", "13").contains(apptype)
+        }
+        .map {
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach {
+            case (label, features) =>
+              val featuresBucket = features.map {
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty) {
+                    filterNames.foreach(r => if (!ifFilter && name.contains(r)) {
+                      ifFilter = true
+                    })
+                  }
+                  if (ifFilter) {
+                    ""
+                  } else {
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 0.01 + (1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0))
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      name + ":" + "0.01"
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+        })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting path and writing data to: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, not writing: " + hdfsPath)
+      }
+    }
+
+
+  }
+}
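Relative to the 20240718 job, this 20240726 variant back-fills every name from featureNameFile with 0 so the emitted vector is dense over that list, then shifts bucketized scores by 0.01 and writes 0.01 for near-zero scores instead of dropping them, keeping a present-but-zero feature distinguishable from an absent one. A sketch of the changed scoring rule for features that have bucket boundaries (features without boundaries still pass their raw score through, as in the code above); findInsertPosition carries the same assumption as in the earlier sketch:

object OffsetBucketizeSketch {
  // Same insertion-position assumption as in the earlier bucketize sketch.
  def findInsertPosition(buckets: Array[Double], score: Double): Int = {
    val idx = java.util.Arrays.binarySearch(buckets, score)
    if (idx >= 0) idx else -(idx + 1)
  }

  // 20240726 rule for a feature with bucket boundaries: bucketized scores get a 0.01 offset,
  // and near-zero scores are emitted as 0.01 rather than dropped.
  def score20240726(buckets: Array[Double], bucketNum: Double, score: Double): Double =
    if (score > 1e-8)
      0.01 + 1.0 / bucketNum * (findInsertPosition(buckets, score).toDouble + 1.0)
    else
      0.01
}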

+ 152 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240729.scala

@@ -0,0 +1,152 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+import scala.util.Random
+
+/*
+
+ */
+
+object makedata_ad_33_bucketData_20240729 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_688.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    val cidCountMap = scala.collection.mutable.Map[String, Int]()
+    for (date <- dateRange) {
+      println("Processing date: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r => {
+          val rList = r.split("\t")
+          val logKey = rList(0)
+          val labelKey = rList(1)
+          val jsons = JSON.parseObject(rList(2))
+          val features = scala.collection.mutable.Map[String, Double]()
+          jsons.foreach(r => {
+            features.put(r._1, jsons.getDoubleValue(r._1))
+          })
+          (logKey, labelKey, features)
+        })
+        .filter {
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12", "13").contains(apptype)
+        }.filter {
+          case (logKey, labelKey, features) =>
+            var key = ""
+            for (elem <- features) {
+              if (elem._1.contains("cid_")) {
+                key = elem._1
+              }
+            }
+
+            if (key.equals("cid_3319")) {
+              true
+            } else if (key.equals("cid_3024")) {
+              // create a Random instance
+              val rand = new Random()
+
+              // draw a random double in [0, 1)
+              val randomDouble = rand.nextDouble()
+
+              randomDouble < 0.01
+            } else {
+              false
+            }
+        }.map {
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach {
+            case (label, features) =>
+              val featuresBucket = features.map {
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty) {
+                    filterNames.foreach(r => if (!ifFilter && name.contains(r)) {
+                      ifFilter = true
+                    })
+                  }
+                  if (ifFilter) {
+                    ""
+                  } else {
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+        })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting path and writing data to: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, not writing: " + hdfsPath)
+      }
+    }
+
+
+  }
+}
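The 20240729 job narrows the training set to two creatives: rows whose sparse cid key is cid_3319 are all kept, rows for cid_3024 are downsampled to roughly 1%, and everything else is dropped. A sketch of that predicate over the parsed feature map; it takes the first key containing "cid_" (the loop above keeps the last match), and the two cids and the 0.01 rate are copied from the code:

import scala.util.Random

object CidFilterSketch {
  // Decide whether to keep a row, given its sparse feature keys.
  def keepRow(features: collection.Map[String, Double], rand: Random = new Random()): Boolean = {
    val cidKey = features.keys.find(_.contains("cid_")).getOrElse("")
    cidKey match {
      case "cid_3319" => true                      // keep every row for this creative
      case "cid_3024" => rand.nextDouble() < 0.01  // keep ~1% of rows for this creative
      case _          => false                     // drop all other creatives
    }
  }
}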

+ 181 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240729_copy_zheng.scala

@@ -0,0 +1,181 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+import scala.util.Random
+
+/*
+
+ */
+
+object makedata_ad_33_bucketData_20240729_copy_zheng {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_688.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    val cidCountMap = scala.collection.mutable.Map[String, Int]()
+    for (date <- dateRange) {
+      println("Processing date: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r => {
+          val rList = r.split("\t")
+          val logKey = rList(0)
+          val labelKey = rList(1)
+          val jsons = JSON.parseObject(rList(2))
+          val features = scala.collection.mutable.Map[String, Double]()
+          jsons.foreach(r => {
+            features.put(r._1, jsons.getDoubleValue(r._1))
+          })
+          (logKey, labelKey, features)
+        })
+        .filter {
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12", "13").contains(apptype)
+        }.filter {
+          case (logKey, labelKey, features) =>
+            var key = ""
+            for (elem <- features) {
+              if (elem._1.contains("cid_")) {
+                key = elem._1
+              }
+            }
+
+            if (key.equals("cid_3319")) {
+              true
+            } else if (key.equals("cid_3024")) {
+              // create a Random instance
+              val rand = new Random()
+
+              // draw a random double in [0, 1)
+              val randomDouble = rand.nextDouble()
+
+              randomDouble < 0.01
+            } else {
+              false
+            }
+        }.flatMap {
+          case (logKey, labelKey, features) =>
+            var key = ""
+            for (elem <- features) {
+              if (elem._1.contains("cid_")) {
+                key = elem._1
+              }
+            }
+            if (key.equals("cid_3319")) {
+              val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+              if (!label.equals("0")) {
+                Seq(
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features)
+                )
+              } else {
+                Seq((logKey, labelKey, features))
+              }
+            } else {
+              Seq((logKey, labelKey, features))
+            }
+        }.map {
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach {
+            case (label, features) =>
+              val featuresBucket = features.map {
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty) {
+                    filterNames.foreach(r => if (!ifFilter && name.contains(r)) {
+                      ifFilter = true
+                    })
+                  }
+                  if (ifFilter) {
+                    ""
+                  } else {
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+        })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting path and writing data to: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, not writing: " + hdfsPath)
+      }
+    }
+
+
+  }
+}
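The _copy_zheng variant layers positive oversampling on top of the cid filter: a cid_3319 row whose label is non-zero is emitted ten times by the flatMap, while negatives and other rows pass through once. A compact sketch of that replication (the helper name is illustrative; Seq.fill replaces the ten hand-written tuples):

object OversampleSketch {
  // Replicate a positive cid_3319 row `copies` times inside a flatMap;
  // everything else is emitted exactly once.
  def oversample[T](row: T, isCid3319: Boolean, isPositive: Boolean, copies: Int = 10): Seq[T] =
    if (isCid3319 && isPositive) Seq.fill(copies)(row) else Seq(row)
}

Inside the job it would sit in the flatMap, e.g. oversample((logKey, labelKey, features), isCid3319, isPositive), with the two flags computed from the cid key and the parsed label as above.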

+ 129 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240729_reduce_feature.scala

@@ -0,0 +1,129 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+
+/*
+
+ */
+
+object makedata_ad_33_bucketData_20240729_reduce_feature {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val retainNames = param.getOrElse("retainNames", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_688.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    val cidCountMap = scala.collection.mutable.Map[String, Int]()
+    for (date <- dateRange) {
+      println("Processing date: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r => {
+          val rList = r.split("\t")
+          val logKey = rList(0)
+          val labelKey = rList(1)
+          val jsons = JSON.parseObject(rList(2))
+          val features = scala.collection.mutable.Map[String, Double]()
+          jsons.foreach(r => {
+            features.put(r._1, jsons.getDoubleValue(r._1))
+          })
+          (logKey, labelKey, features)
+        })
+        .filter {
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12", "13").contains(apptype)
+        }.map {
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach {
+            case (label, features) =>
+              val featuresBucket = features.map {
+                case (name, score) =>
+                  var isRetain = false
+                  if (retainNames.nonEmpty) {
+                    retainNames.foreach(r => if (!isRetain && name.contains(r)) {
+                      isRetain = true
+                    })
+                  }
+                  if (isRetain) {
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
+                  } else {
+                    ""
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+        })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting path and writing data to: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, not writing: " + hdfsPath)
+      }
+    }
+
+
+  }
+}
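The _reduce_feature variant inverts the name test used by the other jobs: instead of dropping features whose name contains one of filterNames, it keeps only features whose name contains one of retainNames. Both tests are plain substring matches over the feature name; a sketch of the two predicates side by side (object and method names are illustrative):

object FeatureNameFilterSketch {
  // Blacklist used by most jobs above: drop a feature whose name contains any filter term.
  def dropByFilter(name: String, filterNames: Set[String]): Boolean =
    filterNames.exists(name.contains)

  // Whitelist used by _reduce_feature: keep a feature only if its name contains a retain term.
  def keepByRetain(name: String, retainNames: Set[String]): Boolean =
    retainNames.exists(name.contains)
}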

+ 140 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_default_value_20240718.scala

@@ -0,0 +1,140 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+
+ */
+
+object makedata_ad_33_bucketData_default_value_20240718 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_688.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r =>{
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
+    val modifyFeatureName= param.getOrElse("modifyName", "").split(",").filter(_.nonEmpty).toSet
+    val defaultValue= param.getOrElse("defaultValue", "0.01")
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("Processing date: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val jsons = JSON.parseObject(rList(2))
+        val features = scala.collection.mutable.Map[String, Double]()
+        jsons.foreach(r => {
+          features.put(r._1, jsons.getDoubleValue(r._1))
+        })
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12", "13").contains(apptype)
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach{
+            case (label, features) =>
+              val featuresBucket = features.map{
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty){
+                    filterNames.foreach(r=> if (!ifFilter && name.contains(r)) {ifFilter = true} )
+                  }
+                  if (ifFilter){
+                    ""
+                  }else{
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        var isModify = false
+                        if (modifyFeatureName.nonEmpty) {
+                          modifyFeatureName.foreach(r => if (!isModify && name.startsWith(r)) {
+                            isModify = true
+                          })
+                        }
+                        if (isModify) {
+                          name + ":" + defaultValue
+                        } else {
+                          name + ":" + score.toString
+                        }
+                      }
+                    } else {
+                      ""
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+      })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+
+
+
+  }
+}
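To make the inputs above easier to follow, a hedged sketch of how one bucket-file line and the modifyName/defaultValue options interact; the line content and feature values are invented, only the tab/comma layout is taken from the parsing code above:

object DefaultValueSketch {
  def main(args: Array[String]): Unit = {
    // One made-up line in the style of 20240718_ad_bucket_688.txt: name \t bucketsNum \t boundaries
    val line = "b1_3h_ctr\t4\t0.1,0.5,2.0"
    val cols = line.split("\t")
    val bucketsMap = Map(cols(0) -> (cols(1).toDouble, cols(2).split(",").map(_.toDouble)))

    val modifyFeatureName = Set("cid_")  // --modifyName cid_
    val defaultValue = "0.01"            // --defaultValue 0.01

    // A sparse id feature with no bucket entry but a matching prefix is written with the default value.
    val (name, score) = ("cid_12345", 1.0)
    val encoded =
      if (bucketsMap.contains(name)) name + ":bucketed"   // would be bucketized as in the job above
      else if (modifyFeatureName.exists(p => name.startsWith(p))) name + ":" + defaultValue
      else name + ":" + score
    println(encoded)                     // cid_12345:0.01
  }
}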

+ 24 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_34_statistics_20241111.scala

@@ -0,0 +1,24 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.aliyun.odps.spark.examples.myUtils.ParamUtils
+import org.apache.spark.sql.SparkSession
+
+/**
+ * Attachment (report) generation.
+ * <br>
+ * 1. Aggregate by CID: total exposures, conversions, and related metrics.
+ */
+object makedata_ad_34_statistics_20241111 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+    val loader = getClass.getClassLoader
+
+    val param = ParamUtils.parseArgs(args)
+
+  }
+}
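The object above is still an empty skeleton; below is a purely hypothetical sketch of the CID-level rollup its comment describes, with invented sample rows (the real input table and column set are not shown in this diff):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, count, lit, sum}

object CidStatisticsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("CidStatisticsSketch").master("local[1]").getOrCreate()
    import spark.implicits._

    // Invented sample rows: (cid, ad_is_click, ad_is_conversion)
    val samples = Seq(("c1", 1, 0), ("c1", 0, 1), ("c2", 1, 1))
      .toDF("cid", "ad_is_click", "ad_is_conversion")

    // Per-CID exposure, click and conversion totals.
    val stats = samples.groupBy(col("cid")).agg(
      count(lit(1)).as("exposure"),
      sum(col("ad_is_click")).as("clicks"),
      sum(col("ad_is_conversion")).as("conversions"))

    stats.show(false)
    spark.stop()
  }
}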

+ 549 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/xgb/makedata_31_bucketDataPrint_20240821.scala

@@ -0,0 +1,549 @@
+package com.aliyun.odps.spark.examples.makedata_ad.xgb
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.{ExtractorUtils, RankExtractorFeature_20240530}
+import examples.utils.DateTimeUtil
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import org.xm.Similarity
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+
+object makedata_31_bucketDataPrint_20240821 {
+  def main(args: Array[String]): Unit = {
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val beginStr = param.getOrElse("beginStr", "2024061500")
+    val endStr = param.getOrElse("endStr", "2024061523")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_for_check")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_ad_sample_all")
+    val repartition = param.getOrElse("repartition", "32").toInt
+    val readDate = param.getOrElse("readDate", "20240615")
+    val featureNameFile = param.getOrElse("featureName", "20240718_ad_feature_name_517.txt")
+    val featureBucketFile = param.getOrElse("featureBucketFile", "20240718_ad_bucket_517.txt");
+    val filterHours = param.getOrElse("filterHours", "00,01,02,03,04,05,06,07").split(",").toSet
+    val idDefaultValue = param.getOrElse("idDefaultValue", "1.0").toDouble
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val featureNameUrl = loader.getResource(featureNameFile)
+    val content =
+      if (featureNameUrl != null) {
+        val content = Source.fromURL(featureNameUrl).getLines().mkString("\n")
+        Source.fromURL(featureNameUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val featureNameList = content.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty).toList
+    val contentList_br = sc.broadcast(featureNameList)
+
+    val resourceUrlBucket = loader.getResource(featureBucketFile)
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+    // 2 Read ODPS table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Loop over time partitions and produce data
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    for (dt_hh <- timeRange) {
+      val dt = dt_hh.substring(0, 8)
+      val hh = dt_hh.substring(8, 10)
+      val partition = s"dt=$dt,hh=$hh"
+      if (filterHours.nonEmpty && filterHours.contains(hh)) {
+        println("不执行partiton:" + partition)
+      } else {
+        println("开始执行partiton:" + partition)
+        val odpsData = odpsOps.readTable(project = project,
+            table = table,
+            partition = partition,
+            transfer = func,
+            numPartition = tablePart)
+          .map(record => {
+
+
+            val ts = record.getString("ts").toInt
+            val cid = record.getString("cid")
+            val apptype = record.getString("apptype")
+            val extend: JSONObject = if (record.isNull("extend")) new JSONObject() else
+              JSON.parseObject(record.getString("extend"))
+
+
+            val featureMap = new JSONObject()
+
+            val b1: JSONObject = if (record.isNull("b1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b1_feature"))
+            val b2: JSONObject = if (record.isNull("b2_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b2_feature"))
+            val b3: JSONObject = if (record.isNull("b3_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b3_feature"))
+            val b4: JSONObject = if (record.isNull("b4_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b4_feature"))
+            val b5: JSONObject = if (record.isNull("b5_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b5_feature"))
+            val b6: JSONObject = if (record.isNull("b6_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b6_feature"))
+            val b7: JSONObject = if (record.isNull("b7_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b7_feature"))
+            val b8: JSONObject = if (record.isNull("b8_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b8_feature"))
+            val b9: JSONObject = if (record.isNull("b9_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b9_feature"))
+
+
+            featureMap.put("cid_" + cid, idDefaultValue)
+            if (b1.containsKey("adid") && b1.getString("adid").nonEmpty) {
+              featureMap.put("adid_" + b1.getString("adid"), idDefaultValue)
+            }
+            if (b1.containsKey("adverid") && b1.getString("adverid").nonEmpty) {
+              featureMap.put("adverid_" + b1.getString("adverid"), idDefaultValue)
+            }
+            if (b1.containsKey("targeting_conversion") && b1.getString("targeting_conversion").nonEmpty) {
+              featureMap.put("targeting_conversion_" + b1.getString("targeting_conversion"), idDefaultValue)
+            }
+
+            val hour = DateTimeUtil.getHourByTimestamp(ts)
+            featureMap.put("hour_" + hour, idDefaultValue)
+
+            val dayOfWeek = DateTimeUtil.getDayOrWeekByTimestamp(ts)
+            featureMap.put("dayofweek_" + dayOfWeek, idDefaultValue);
+
+            featureMap.put("apptype_" + apptype, idDefaultValue);
+
+            if (extend.containsKey("abcode") && extend.getString("abcode").nonEmpty) {
+              featureMap.put("abcode_" + extend.getString("abcode"), idDefaultValue)
+            }
+
+
+            if (b1.containsKey("cpa")) {
+              featureMap.put("cpa", b1.getString("cpa").toDouble)
+            }
+            if (b1.containsKey("weight") && b1.getString("weight").nonEmpty) {
+              featureMap.put("weight", b1.getString("weight").toDouble)
+            }
+
+            for ((bn, prefix1) <- List(
+              (b2, "b2"), (b3, "b3"), (b4, "b4"), (b5, "b5"), (b8, "b8"), (b9, "b9")
+            )) {
+              for (prefix2 <- List(
+                "1h", "2h", "3h", "4h", "5h", "6h", "12h", "1d", "3d", "7d", "today", "yesterday"
+              )) {
+                val view = if (bn.isEmpty) 0D else bn.getIntValue("ad_view_" + prefix2).toDouble
+                val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
+                val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
+                val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
+                val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+                val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+                val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+                val f4 = conver
+                val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver", f4)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
+
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "click", click)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*log(view)", conver * RankExtractorFeature_20240530.calLog(view))
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*ctcvr", conver * f2)
+              }
+            }
+
+            for ((bn, prefix1) <- List(
+              (b6, "b6"), (b7, "b7")
+            )) {
+              for (prefix2 <- List(
+                "7d", "14d"
+              )) {
+                val view = if (bn.isEmpty) 0D else bn.getIntValue("ad_view_" + prefix2).toDouble
+                val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
+                val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
+                val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
+                val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+                val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+                val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+                val f4 = conver
+                val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver", f4)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
+
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "click", click)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*log(view)", conver * RankExtractorFeature_20240530.calLog(view))
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*ctcvr", conver * f2)
+              }
+            }
+
+            val c1: JSONObject = if (record.isNull("c1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("c1_feature"))
+
+            val midActionList = if (c1.containsKey("action") && c1.getString("action").nonEmpty) {
+              c1.getString("action").split(",").map(r => {
+                val rList = r.split(":")
+                (rList(0), (rList(1).toInt, rList(2).toInt, rList(3).toInt, rList(4).toInt, rList(5)))
+              }).sortBy(-_._2._1).toList
+            } else {
+              new ArrayBuffer[(String, (Int, Int, Int, Int, String))]().toList
+            }
+            // user-level (u) features
+            val viewAll = midActionList.size.toDouble
+            val clickAll = midActionList.map(_._2._2).sum.toDouble
+            val converAll = midActionList.map(_._2._3).sum.toDouble
+            val incomeAll = midActionList.map(_._2._4).sum.toDouble
+            featureMap.put("viewAll", viewAll)
+            featureMap.put("clickAll", clickAll)
+            featureMap.put("converAll", converAll)
+            featureMap.put("incomeAll", incomeAll)
+            featureMap.put("ctr_all", RankExtractorFeature_20240530.calDiv(clickAll, viewAll))
+            featureMap.put("ctcvr_all", RankExtractorFeature_20240530.calDiv(converAll, viewAll))
+            featureMap.put("cvr_all", RankExtractorFeature_20240530.calDiv(clickAll, converAll))
+            featureMap.put("ecpm_all", RankExtractorFeature_20240530.calDiv(incomeAll * 1000, viewAll))
+
+            // user-to-cid (ui) features
+            val midTimeDiff = scala.collection.mutable.Map[String, Double]()
+            midActionList.foreach {
+              case (cid, (ts_history, click, conver, income, title)) =>
+                if (!midTimeDiff.contains("timediff_view_" + cid)) {
+                  midTimeDiff.put("timediff_view_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+                }
+                if (!midTimeDiff.contains("timediff_click_" + cid) && click > 0) {
+                  midTimeDiff.put("timediff_click_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+                }
+                if (!midTimeDiff.contains("timediff_conver_" + cid) && conver > 0) {
+                  midTimeDiff.put("timediff_conver_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+                }
+            }
+
+            val midActionStatic = scala.collection.mutable.Map[String, Double]()
+            midActionList.foreach {
+              case (cid, (ts_history, click, conver, income, title)) =>
+                midActionStatic.put("actionstatic_view_" + cid, 1.0 + midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0))
+                midActionStatic.put("actionstatic_click_" + cid, click + midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0))
+                midActionStatic.put("actionstatic_conver_" + cid, conver + midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0))
+                midActionStatic.put("actionstatic_income_" + cid, income + midActionStatic.getOrDefault("actionstatic_income_" + cid, 0.0))
+            }
+
+            if (midTimeDiff.contains("timediff_view_" + cid)) {
+              featureMap.put("timediff_view", midTimeDiff.getOrDefault("timediff_view_" + cid, 0.0))
+            }
+            if (midTimeDiff.contains("timediff_click_" + cid)) {
+              featureMap.put("timediff_click", midTimeDiff.getOrDefault("timediff_click_" + cid, 0.0))
+            }
+            if (midTimeDiff.contains("timediff_conver_" + cid)) {
+              featureMap.put("timediff_conver", midTimeDiff.getOrDefault("timediff_conver_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_view_" + cid)) {
+              featureMap.put("actionstatic_view", midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_click_" + cid)) {
+              featureMap.put("actionstatic_click", midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_conver_" + cid)) {
+              featureMap.put("actionstatic_conver", midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_income_" + cid)) {
+              featureMap.put("actionstatic_income", midActionStatic.getOrDefault("actionstatic_income_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("actionstatic_click_" + cid)) {
+              featureMap.put("actionstatic_ctr", RankExtractorFeature_20240530.calDiv(
+                midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0),
+                midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
+              ))
+            }
+            if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("actionstatic_conver_" + cid)) {
+              featureMap.put("actionstatic_ctcvr", RankExtractorFeature_20240530.calDiv(
+                midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0),
+                midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
+              ))
+            }
+            if (midActionStatic.contains("actionstatic_conver_" + cid) && midActionStatic.contains("actionstatic_click_" + cid)) {
+              featureMap.put("actionstatic_cvr", RankExtractorFeature_20240530.calDiv(
+                midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0),
+                midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0)
+              ))
+            }
+
+            val e1: JSONObject = if (record.isNull("e1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("e1_feature"))
+            val e2: JSONObject = if (record.isNull("e2_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("e2_feature"))
+            val title = b1.getOrDefault("cidtitle", "").toString
+            if (title.nonEmpty) {
+              for ((en, prefix1) <- List((e1, "e1"), (e2, "e2"))) {
+                for (prefix2 <- List("tags_3d", "tags_7d", "tags_14d")) {
+                  if (en.nonEmpty && en.containsKey(prefix2) && en.getString(prefix2).nonEmpty) {
+                    val (f1, f2, f3, f4) = funcC34567ForTags(en.getString(prefix2), title)
+                    featureMap.put(prefix1 + "_" + prefix2 + "_matchnum", f1)
+                    featureMap.put(prefix1 + "_" + prefix2 + "_maxscore", f3)
+                    featureMap.put(prefix1 + "_" + prefix2 + "_avgscore", f4)
+
+                  }
+                }
+              }
+            }
+
+            val d1: JSONObject = if (record.isNull("d1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("d1_feature"))
+            val d2: JSONObject = if (record.isNull("d2_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("d2_feature"))
+            val d3: JSONObject = if (record.isNull("d3_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("d3_feature"))
+
+            if (d1.nonEmpty) {
+              for (prefix <- List("3h", "6h", "12h", "1d", "3d", "7d")) {
+                val view = if (!d1.containsKey("ad_view_" + prefix)) 0D else d1.getIntValue("ad_view_" + prefix).toDouble
+                val click = if (!d1.containsKey("ad_click_" + prefix)) 0D else d1.getIntValue("ad_click_" + prefix).toDouble
+                val conver = if (!d1.containsKey("ad_conversion_" + prefix)) 0D else d1.getIntValue("ad_conversion_" + prefix).toDouble
+                val income = if (!d1.containsKey("ad_income_" + prefix)) 0D else d1.getIntValue("ad_income_" + prefix).toDouble
+                val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+                val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+                val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+                val f4 = conver
+                val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "ctr", f1)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "ctcvr", f2)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "cvr", f3)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "conver", f4)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "ecpm", f5)
+              }
+            }
+
+            val vidRankMaps = scala.collection.mutable.Map[String, scala.collection.immutable.Map[String, Double]]()
+            if (d2.nonEmpty) {
+              d2.foreach(r => {
+                val key = r._1
+                val value = d2.getString(key).split(",").map(r => {
+                  val rList = r.split(":")
+                  (rList(0), rList(2).toDouble)
+                }).toMap
+                vidRankMaps.put(key, value)
+              })
+            }
+            for (prefix1 <- List("ctr", "ctcvr", "ecpm")) {
+              for (prefix2 <- List("1d", "3d", "7d", "14d")) {
+                if (vidRankMaps.contains(prefix1 + "_" + prefix2)) {
+                  val rank = vidRankMaps(prefix1 + "_" + prefix2).getOrDefault(cid, 0.0)
+                  if (rank >= 1.0) {
+                    featureMap.put("vid_rank_" + prefix1 + "_" + prefix2, 1.0 / rank)
+                  }
+                }
+              }
+            }
+
+            if (d3.nonEmpty) {
+              val vTitle = d3.getString("title")
+              val score = Similarity.conceptSimilarity(title, vTitle)
+              featureMap.put("ctitle_vtitle_similarity", score);
+            }
+
+            /*
+            Ad
+              sparse: cid adid adverid targeting_conversion
+
+              cpa --> 1 feature
+              per adverid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr conver ecpm --> 30 features
+              per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              geo // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              app // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              phone brand // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              OS: no data
+              week // per cid, 7d 14d x ctr ctcvr cvr ecpm conver --> 10 features
+              hour // per cid, 7d 14d x ctr ctcvr cvr ecpm conver --> 10 features
+
+            User
+              title tags the user clicked / converted on; 3d 7d 14d; matched against the cid title; match count / max score / avg score --> 18 features
+              user history over 14d: views / clicks / conversions / income; ctr cvr ctcvr ecpm --> 8 features
+
+              user-to-cid ui features --> 10 features
+                1 / time gap since the user last viewed this cid
+                1 / time gap since the user last clicked this cid
+                1 / time gap since the user last converted on this cid
+                how many times the user viewed this cid
+                how many times the user clicked this cid
+                how many times the user converted on this cid
+                how much the user spent on this cid
+                the user's ctr ctcvr cvr on this cid
+
+            Video
+              sim-score-1/-2 between title and cid: no data
+              vid // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              vid // per cid, 1d 3d 7d 14d x reciprocal rank of ctr ctcvr ecpm --> 12 features
+
+             */
+
+
+            //4 Build the label info.
+            val labels = new JSONObject
+            for (labelKey <- List("ad_is_click", "ad_is_conversion")) {
+              if (!record.isNull(labelKey)) {
+                labels.put(labelKey, record.getString(labelKey))
+              }
+            }
+            //5 Build the log key header.
+            val mid = record.getString("mid")
+            val allfeature = if (record.isNull("allfeaturemap")) new JSONObject() else
+              JSON.parseObject(record.getString("allfeaturemap"))
+
+            val headvideoid = record.getString("headvideoid")
+            // val logKey = (apptype, mid, cid, ts, headvideoid).productIterator.mkString(",")
+            val labelKey = labels.toString()
+            val label = record.getString("ad_is_conversion")
+            //6 Assemble the record for saving.
+            (apptype, mid, cid, ts, headvideoid, label, allfeature, featureMap)
+          }).filter {
+            case (apptype, mid, cid, ts, headvideoid, label, allfeature, featureMap) =>
+              !(allfeature.isEmpty || allfeature.containsKey("weight_sum") || allfeature.contains("weight"))
+          }.mapPartitions(row => {
+            val result = new ArrayBuffer[String]()
+            val bucketsMap = bucketsMap_br.value
+            row.foreach {
+              case (apptype, mid, cid, ts, headvideoid, label, allfeature, featureMap) =>
+                val offlineFeatureMap = featureMap.map(r => {
+                  val score = r._2.toString.toDouble
+                  val name = r._1
+                  if (score > 1E-8) {
+                    if (bucketsMap.contains(name)) {
+                      val (bucketsNum, buckets) = bucketsMap(name)
+                      val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                      name + ":" + scoreNew.toString
+                    } else {
+                      name + ":" + score.toString
+                    }
+                  } else {
+                    ""
+                  }
+                }).filter(_.nonEmpty)
+                result.add(
+                  (apptype, mid, cid, ts, headvideoid, label, allfeature.toString(), offlineFeatureMap.iterator.mkString(",")).productIterator.mkString("\t")
+                )
+            }
+            result.iterator
+          })
+
+        // 4 Save data to HDFS
+        val savePartition = dt + hh
+        val hdfsPath = savePath + "/" + savePartition
+        if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+          println("删除路径并开始数据写入:" + hdfsPath)
+          MyHdfsUtils.delete_hdfs_path(hdfsPath)
+          odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        } else {
+          println("路径不合法,无法写入:" + hdfsPath)
+        }
+      }
+    }
+
+
+    val data2 = sc.textFile(savePath + "/" + readDate + "*").mapPartitions(row => {
+      val result = new ArrayBuffer[(String, List[String], List[String])]()
+      // experiment 680, 517 features
+      row.foreach(r => {
+        val rList = r.split("\t")
+        val label = rList(5).toString
+        val allFeatureMap = JSON.parseObject(rList(6)).toMap.map(r => (r._1, r._2.toString))
+        val offlineFeature = rList(7).split(",").map(r => (r.split(":")(0), r.split(":")(1))).toMap
+
+        val offlineFeatureList = allFeatureMap.map {
+          case (key, value) =>
+            key + ":" + value
+        }.filter(_.nonEmpty).toList
+
+        val b8FeatureSet = Set("b8_3h_ctr", "b8_3h_ctcvr", "b8_3h_cvr", "b8_3h_conver", "b8_3h_ecpm", "b8_3h_click", "b8_3h_conver*log(view)", "b8_3h_conver*ctcvr", "b8_6h_ctr", "b8_6h_ctcvr", "b8_6h_cvr", "b8_6h_conver", "b8_6h_ecpm", "b8_6h_click", "b8_6h_conver*log(view)", "b8_6h_conver*ctcvr", "b8_12h_ctr", "b8_12h_ctcvr", "b8_12h_cvr", "b8_12h_conver", "b8_12h_ecpm", "b8_12h_click", "b8_12h_conver*log(view)", "b8_12h_conver*ctcvr", "b8_1d_ctr", "b8_1d_ctcvr", "b8_1d_cvr", "b8_1d_conver", "b8_1d_ecpm", "b8_1d_click", "b8_1d_conver*log(view)", "b8_1d_conver*ctcvr", "b8_3d_ctr", "b8_3d_ctcvr", "b8_3d_cvr", "b8_3d_conver", "b8_3d_ecpm", "b8_3d_click", "b8_3d_conver*log(view)", "b8_3d_conver*ctcvr", "b8_7d_ctr", "b8_7d_ctcvr", "b8_7d_cvr", "b8_7d_conver", "b8_7d_ecpm", "b8_7d_click", "b8_7d_conver*log(view)", "b8_7d_conver*ctcvr")
+        val b8AllFeatureMap = new JSONObject()
+        for (elem <- allFeatureMap) {
+          b8AllFeatureMap.put(elem._1, elem._2)
+        }
+        for (elem <- b8FeatureSet) {
+          if (!b8AllFeatureMap.containsKey(elem) && offlineFeature.contains(elem)) {
+            b8AllFeatureMap.put(elem, offlineFeature(elem))
+          }
+        }
+        val b8AllFeature = b8AllFeatureMap.map {
+          case (key, value) =>
+            key + ":" + value
+        }.filter(_.nonEmpty).toList
+
+
+
+        result.add((label, offlineFeatureList, b8AllFeature))
+      })
+
+      result.iterator
+    })
+
+    val offlineSave = "/dw/recommend/model/33_for_check_all/" + readDate
+    if (offlineSave.nonEmpty && offlineSave.startsWith("/dw/recommend/model/")) {
+      println("删除路径并开始数据写入:" + offlineSave)
+      MyHdfsUtils.delete_hdfs_path(offlineSave)
+      data2.map(r => r._1 + "\t" + r._2.mkString("\t")).saveAsTextFile(offlineSave, classOf[GzipCodec])
+    } else {
+      println("路径不合法,无法写入:" + offlineSave)
+    }
+
+    val allFeatureV1 = "/dw/recommend/model/33_for_check_all_b8/" + readDate
+    if (allFeatureV1.nonEmpty && allFeatureV1.startsWith("/dw/recommend/model/")) {
+      println("删除路径并开始数据写入:" + allFeatureV1)
+      MyHdfsUtils.delete_hdfs_path(allFeatureV1)
+      data2.map(r => r._1 + "\t" + r._3.mkString("\t")).saveAsTextFile(allFeatureV1, classOf[GzipCodec])
+    } else {
+      println("路径不合法,无法写入:" + allFeatureV1)
+    }
+
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
+    // match count, matched tags, max semantic similarity score, average semantic similarity score
+    val tagsList = tags.split(",")
+    var d1 = 0.0
+    val d2 = new ArrayBuffer[String]()
+    var d3 = 0.0
+    var d4 = 0.0
+    for (tag <- tagsList) {
+      if (title.contains(tag)) {
+        d1 = d1 + 1.0
+        d2.add(tag)
+      }
+      val score = Similarity.conceptSimilarity(tag, title)
+      d3 = if (score > d3) score else d3
+      d4 = d4 + score
+    }
+    d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
+    (d1, d2.mkString(","), d3, d4)
+  }
+}
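A small, hedged illustration of the tab-separated check record assembled above (all field values are invented; only the column order comes from the code):

object CheckRowSketch {
  def main(args: Array[String]): Unit = {
    // apptype, mid, cid, ts, headvideoid, label, allfeaturemap JSON, offline features joined by ","
    val apptype = "4"
    val mid = "mid_demo"
    val cid = "1001"
    val ts = 1724200000
    val headvideoid = "v123"
    val label = "0"
    val allfeature = """{"cpa":12.0}"""
    val offlineFeatureMap = Seq("cid_1001:1.0", "b2_1h_ctr:0.75")
    val row = (apptype, mid, cid, ts, headvideoid, label, allfeature, offlineFeatureMap.mkString(","))
      .productIterator.mkString("\t")
    println(row)
  }
}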

+ 278 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_qiao/makedata_13_originData_20240705.scala

@@ -0,0 +1,278 @@
+package com.aliyun.odps.spark.examples.makedata_qiao
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.RankExtractorFeature_20240530
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import org.xm.Similarity
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+/*
+   20240608 feature extraction
+ */
+
+object makedata_13_originData_20240705 {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val beginStr = param.getOrElse("beginStr", "2023010100")
+    val endStr = param.getOrElse("endStr", "2023010123")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/13_sample_data/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "XXXX")
+    val repartition = param.getOrElse("repartition", "100").toInt
+
+    // 2 Read ODPS table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Loop over time partitions and produce data
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    for (dt_hh <- timeRange) {
+      val dt = dt_hh.substring(0, 8)
+      val hh = dt_hh.substring(8, 10)
+      val partition = s"dt=$dt,hh=$hh"
+      println("开始执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .map(record => {
+
+          val featureMap = new JSONObject()
+
+          // a. video features
+          val b1: JSONObject = if (record.isNull("b1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b1_feature"))
+          val b2: JSONObject = if (record.isNull("b2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b2_feature"))
+          val b3: JSONObject = if (record.isNull("b3_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b3_feature"))
+          val b6: JSONObject = if (record.isNull("b6_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b6_feature"))
+          val b7: JSONObject = if (record.isNull("b7_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b7_feature"))
+
+          val b8: JSONObject = if (record.isNull("b8_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b8_feature"))
+          val b9: JSONObject = if (record.isNull("b9_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b9_feature"))
+          val b10: JSONObject = if (record.isNull("b10_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b10_feature"))
+          val b11: JSONObject = if (record.isNull("b11_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b11_feature"))
+          val b12: JSONObject = if (record.isNull("b12_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b12_feature"))
+          val b13: JSONObject = if (record.isNull("b13_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b13_feature"))
+          val b17: JSONObject = if (record.isNull("b17_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b17_feature"))
+          val b18: JSONObject = if (record.isNull("b18_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b18_feature"))
+          val b19: JSONObject = if (record.isNull("b19_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b19_feature"))
+
+
+          val origin_data = List(
+            (b1, b2, b3, "b123"), (b1, b6, b7, "b167"),
+            (b8, b9, b10, "b8910"), (b11, b12, b13, "b111213"),
+            (b17, b18, b19, "b171819")
+          )
+          for ((b_1, b_2, b_3, prefix1) <- origin_data){
+            for (prefix2 <- List(
+              "1h", "2h", "3h", "4h", "12h", "1d", "3d", "7d"
+            )){
+              val exp = if (b_1.isEmpty) 0D else b_1.getIntValue("exp_pv_" + prefix2).toDouble
+              val share = if (b_2.isEmpty) 0D else b_2.getIntValue("share_pv_" + prefix2).toDouble
+              val returns = if (b_3.isEmpty) 0D else b_3.getIntValue("return_uv_" + prefix2).toDouble
+              val f1 = RankExtractorFeature_20240530.calDiv(share, exp)
+              val f2 = RankExtractorFeature_20240530.calLog(share)
+              val f3 = RankExtractorFeature_20240530.calDiv(returns, exp)
+              val f4 = RankExtractorFeature_20240530.calLog(returns)
+              val f5 = f3 * f4
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "STR", f1)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "log(share)", f2)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ROV", f3)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "log(return)", f4)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ROV*log(return)", f5)
+            }
+          }
+
+          val video_info: JSONObject = if (record.isNull("t_v_info_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("t_v_info_feature"))
+          featureMap.put("total_time", if (video_info.containsKey("total_time")) video_info.getIntValue("total_time").toDouble else 0D)
+          featureMap.put("bit_rate", if (video_info.containsKey("bit_rate")) video_info.getIntValue("bit_rate").toDouble else 0D)
+
+          val c1: JSONObject = if (record.isNull("c1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("c1_feature"))
+          if (c1.nonEmpty){
+            featureMap.put("playcnt_6h", if (c1.containsKey("playcnt_6h")) c1.getIntValue("playcnt_6h").toDouble else 0D)
+            featureMap.put("playcnt_1d", if (c1.containsKey("playcnt_1d")) c1.getIntValue("playcnt_1d").toDouble else 0D)
+            featureMap.put("playcnt_3d", if (c1.containsKey("playcnt_3d")) c1.getIntValue("playcnt_3d").toDouble else 0D)
+            featureMap.put("playcnt_7d", if (c1.containsKey("playcnt_7d")) c1.getIntValue("playcnt_7d").toDouble else 0D)
+          }
+          val c2: JSONObject = if (record.isNull("c2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("c2_feature"))
+          if (c2.nonEmpty){
+            featureMap.put("share_pv_12h", if (c2.containsKey("share_pv_12h")) c2.getIntValue("share_pv_12h").toDouble else 0D)
+            featureMap.put("share_pv_1d", if (c2.containsKey("share_pv_1d")) c2.getIntValue("share_pv_1d").toDouble else 0D)
+            featureMap.put("share_pv_3d", if (c2.containsKey("share_pv_3d")) c2.getIntValue("share_pv_3d").toDouble else 0D)
+            featureMap.put("share_pv_7d", if (c2.containsKey("share_pv_7d")) c2.getIntValue("share_pv_7d").toDouble else 0D)
+            featureMap.put("return_uv_12h", if (c2.containsKey("return_uv_12h")) c2.getIntValue("return_uv_12h").toDouble else 0D)
+            featureMap.put("return_uv_1d", if (c2.containsKey("return_uv_1d")) c2.getIntValue("return_uv_1d").toDouble else 0D)
+            featureMap.put("return_uv_3d", if (c2.containsKey("return_uv_3d")) c2.getIntValue("return_uv_3d").toDouble else 0D)
+            featureMap.put("return_uv_7d", if (c2.containsKey("return_uv_7d")) c2.getIntValue("return_uv_7d").toDouble else 0D)
+          }
+
+          val title = if (video_info.containsKey("title")) video_info.getString("title") else ""
+          if (!title.equals("")){
+            for (key_feature <- List("c3_feature", "c4_feature", "c5_feature", "c6_feature", "c7_feature")){
+              val c34567: JSONObject = if (record.isNull(key_feature)) new JSONObject() else
+                JSON.parseObject(record.getString(key_feature))
+              for (key_time <- List("tags_1d", "tags_3d", "tags_7d")) {
+                val tags = if (c34567.containsKey(key_time)) c34567.getString(key_time) else ""
+                if (!tags.equals("")){
+                  val (f1, f2, f3, f4) = funcC34567ForTags(tags, title)
+                  featureMap.put(key_feature + "_" + key_time + "_matchnum", f1)
+                  featureMap.put(key_feature + "_" + key_time + "_maxscore", f3)
+                  featureMap.put(key_feature + "_" + key_time + "_avgscore", f4)
+                }
+              }
+            }
+          }
+
+          val vid = if (record.isNull("vid")) "" else record.getString("vid")
+          if (!vid.equals("")){
+            for (key_feature <- List("c8_feature", "c9_feature")){
+              val c89: JSONObject = if (record.isNull(key_feature)) new JSONObject() else
+                JSON.parseObject(record.getString(key_feature))
+              for (key_action <- List("share", "return")){
+                  val cfListStr = if (c89.containsKey(key_action)) c89.getString(key_action) else ""
+                  if (!cfListStr.equals("")){
+                    val cfMap = cfListStr.split(",").map(r =>{
+                      val rList = r.split(":")
+                      (rList(0), (rList(1), rList(2), rList(3)))
+                    }).toMap
+                    if (cfMap.contains(vid)){
+                      val (score, num, rank) = cfMap(vid)
+                      featureMap.put(key_feature + "_" + key_action + "_score", score.toDouble)
+                      featureMap.put(key_feature + "_" + key_action + "_num", num.toDouble)
+                      featureMap.put(key_feature + "_" + key_action + "_rank", 1.0 / rank.toDouble)
+                    }
+                  }
+              }
+            }
+          }
+
+          val d1: JSONObject = if (record.isNull("d1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("d1_feature"))
+          if (d1.nonEmpty){
+            featureMap.put("d1_exp", if (d1.containsKey("exp")) d1.getString("exp").toDouble else 0D)
+            featureMap.put("d1_return_n", if (d1.containsKey("return_n")) d1.getString("return_n").toDouble else 0D)
+            featureMap.put("d1_rovn", if (d1.containsKey("rovn")) d1.getString("rovn").toDouble else 0D)
+          }
+
+
+          /*
+
+
+          Video:
+          exposure uses pv, share uses pv, return uses uv --> 1h 2h 3h 4h 12h 1d 3d 7d
+          STR log(share) ROV log(return) ROV*log(return)
+          40 feature combinations
+          overall, overall-exposure counterpart, recommend non-cold-start root, recommend cold-start root, per-province root
+          200 feature values
+
+          Video:
+          duration, bit rate
+
+          User:
+          play count --> 6h 1d 3d 7d --> 4 features
+          share pv / return uv brought back --> 12h 1d 3d 7d --> 8 features
+          User + vid title:
+          played / returned / shared points, cumulative shares, cumulative returns --> 1d 3d 7d --> match count, max semantic similarity, avg semantic similarity --> 45 features
+          User + vid CF:
+          based on share behavior / return behavior --> "share CF" + "return-click CF": similarity score, similar count, reciprocal of similarity rank --> 12 features
+
+          Head video:
+          exposure, return, ROVn --> 3 features
+
+          Context:
+          hour, weekday, apptype, city, province, pagesource, device model
+           */
+
+
+
+          //4 Build the label info.
+          val labels = new JSONObject
+          for (labelKey <- List(
+            "is_play", "is_share", "is_return", "noself_is_return", "return_uv", "noself_return_uv", "total_return_uv",
+            "share_pv", "total_share_uv"
+          )){
+            if (!record.isNull(labelKey)){
+              labels.put(labelKey, record.getString(labelKey))
+            }
+          }
+          //5 Build the log key header.
+          val apptype = record.getString("apptype")
+          val pagesource = record.getString("pagesource")
+          val mid = record.getString("mid")
+          // vid was already extracted above
+          val ts = record.getString("ts")
+          val abcode = record.getString("abcode")
+          val level = if (record.isNull("level")) "0" else record.getString("level")
+          val logKey = (apptype, pagesource, mid, vid, ts, abcode, level).productIterator.mkString(",")
+          val labelKey = labels.toString()
+          val featureKey = featureMap.toString()
+          //6 Assemble the record and save.
+          logKey + "\t" + labelKey + "\t" + featureKey
+
+        })
+
+      // 4 Save data to HDFS
+      val savePartition = dt + hh
+      val hdfsPath = savePath + "/" + savePartition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+  def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
+    // match count, matched tags, max semantic similarity score, average semantic similarity score
+    val tagsList = tags.split(",")
+    var d1 = 0.0
+    val d2 = new ArrayBuffer[String]()
+    var d3 = 0.0
+    var d4 = 0.0
+    for (tag <- tagsList){
+      if (title.contains(tag)){
+        d1 = d1 + 1.0
+        d2.add(tag)
+      }
+      val score = Similarity.conceptSimilarity(tag, title)
+      d3 = if (score > d3) score else d3
+      d4 = d4 + score
+    }
+    d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
+    (d1, d2.mkString(","), d3, d4)
+  }
+}
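For readability, a hedged sketch of the three-column record this job writes to 13_sample_data (logKey, labelKey JSON, featureKey JSON); the concrete values below are invented:

import com.alibaba.fastjson.JSONObject

object OriginRowSketch {
  def main(args: Array[String]): Unit = {
    // logKey: apptype,pagesource,mid,vid,ts,abcode,level (comma-joined, as in the code above)
    val logKey = Seq("0", "recommend", "mid_demo", "vid_demo", "1720000000", "ab0", "0").mkString(",")

    val labels = new JSONObject()
    labels.put("is_return", "1")

    val featureMap = new JSONObject()
    featureMap.put("b123_1h_STR", 0.25)  // share pv / exposure pv in the 1h window
    featureMap.put("total_time", 58.0)

    // One output line: logKey \t labelKey \t featureKey
    println(logKey + "\t" + labels.toString + "\t" + featureMap.toString)
  }
}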

+ 91 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_qiao/makedata_14_valueData_20240705.scala

@@ -0,0 +1,91 @@
+package com.aliyun.odps.spark.examples.makedata_qiao
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+  Flatten the JSON feature maps from 13_sample_data into dense, comma-separated value vectors ordered by 20240608_feature_name.txt.
+ */
+
+object makedata_14_valueData_20240705 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240608_feature_name.txt")
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+    val contentList_bc = sc.broadcast(contentList)
+
+
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/13_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/14_feature_data/")
+    val repartition = param.getOrElse("repartition", "200").toInt
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val data = sc.textFile(readPath + "/" + date + "*")
+      val data1 = data.map(r => {
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val featureKey = rList(2)
+        (logKey, labelKey, featureKey)
+      }).filter(r =>
+        r._1.split(",")(6).equals("0")
+      ).mapPartitions(row => {
+        val result = new ArrayBuffer[String]()
+        val contentList = contentList_bc.value
+        row.foreach {
+          case (logKey, labelKey, featureKey) =>
+            val featureJson = JSON.parseObject(featureKey)
+
+            val featureValues = contentList.map(key => {
+              if (featureJson.containsKey(key)) {
+                featureJson.getDouble(key)
+              } else {
+                0.0
+              }
+            })
+            result.add(logKey + "\t" + labelKey + "\t" + featureValues.mkString(","))
+        }
+        result.iterator
+      })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data1.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+
+  }
+}
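A minimal sketch of the dense-vector step above: the ordered names in 20240608_feature_name.txt fix the column order, and missing keys fall back to 0.0 (the three names and JSON values here are invented):

import com.alibaba.fastjson.JSON

object ValueVectorSketch {
  def main(args: Array[String]): Unit = {
    // Stand-in for the broadcast contentList read from 20240608_feature_name.txt.
    val contentList = List("b123_1h_STR", "b123_1h_ROV", "total_time")

    val featureKey = """{"b123_1h_STR":0.25,"total_time":58.0}"""
    val featureJson = JSON.parseObject(featureKey)

    val featureValues = contentList.map { key =>
      if (featureJson.containsKey(key)) featureJson.getDoubleValue(key) else 0.0
    }
    println(featureValues.mkString(","))  // 0.25,0.0,58.0
  }
}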

+ 127 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_qiao/makedata_16_bucketData_20240705.scala

@@ -0,0 +1,127 @@
+package com.aliyun.odps.spark.examples.makedata_qiao
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+  Bucketize the dense 14_feature_data vectors into name:bucketScore pairs for training, using 20240609_bucket_274.txt.
+ */
+
+object makedata_16_bucketData_20240705 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240608_feature_name.txt")
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+    val contentList_br = sc.broadcast(contentList)
+
+    val resourceUrlBucket = loader.getResource("20240609_bucket_274.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r =>{
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/14_feature_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/16_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240606")
+    val endStr = param.getOrElse("endStr", "20240607")
+    val repartition = param.getOrElse("repartition", "200").toInt
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("开始执行:" + date)
+      val data = sc.textFile(readPath + date).map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val features = rList(2).split(",").map(_.toDouble)
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            val pagesource = logKeyList(1)
+            Set("0", "4", "5", "21", "3", "6").contains(apptype) && pagesource.endsWith("recommend")
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault("is_return", "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+        val result = new ArrayBuffer[String]()
+        val contentList = contentList_br.value
+        val bucketsMap = bucketsMap_br.value
+        row.foreach{
+          case (label, features) =>
+            val featuresBucket = contentList.indices.map(i =>{
+              val featureName = contentList(i)
+              val score = features(i)
+              if (score > 1E-8){
+                val (bucketNum, buckets) = bucketsMap(featureName)
+                val scoreNew = 1.0 / bucketNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                featureName + ":" + scoreNew.toString
+              }else{
+                ""
+              }
+            }).filter(_.nonEmpty)
+            result.add(label + "\t" + featuresBucket.mkString("\t"))
+        }
+        result.iterator
+      })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+
+
+
+  }
+}

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_43_bucketData_20240709.scala

@@ -93,7 +93,7 @@ object makedata_recsys_43_bucketData_20240709 {
                 case (name, score) =>
                   var ifFilter = false
                   if (filterNames.nonEmpty){
-                    filterNames.foreach(r=> if (!ifFilter && name.startsWith(r)) {ifFilter = true} )
+                    filterNames.foreach(r=> if (!ifFilter && name.contains(r)) {ifFilter = true} )
                   }
                   if (ifFilter){
                     ""

+ 141 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_43_bucketData_20240709_vid.scala

@@ -0,0 +1,141 @@
+package com.aliyun.odps.spark.examples.makedata_recsys
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+
+object makedata_recsys_43_bucketData_20240709_vid {
+  def main(args: Array[String]): Unit = {
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    param.foreach {
+      case (key, value) => {
+        println("Key: " + key + "; Value: " + value)
+      }
+    }
+
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/41_recsys_sample_data_v1/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/43_recsys_train_data_v1/")
+    val beginStr = param.getOrElse("beginStr", "20240703")
+    val endStr = param.getOrElse("endStr", "20240703")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+    val filterVids = param.getOrElse("filterVids", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "is_return")
+    val whatApps = param.getOrElse("whatApps", "0,4,5,21,3,6").split(",").toSet
+    val fileName = param.getOrElse("fileName", "20240709_recsys_bucket_314.txt")
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource(fileName)
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("开始执行:" + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r => {
+          val rList = r.split("\t")
+          val logKey = rList(0)
+          val labelKey = rList(1)
+          val jsons = JSON.parseObject(rList(2))
+          val features = scala.collection.mutable.Map[String, Double]()
+          jsons.foreach(r => {
+            features.put(r._1, jsons.getDoubleValue(r._1))
+          })
+          (logKey, labelKey, features)
+        })
+        .filter {
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            val pagesource = logKeyList(1)
+            whatApps.contains(apptype) && pagesource.endsWith("recommend")
+        }
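+        // Keep only the requested vids; an empty filterVids set keeps every sample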
+        .filter {
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val vid = logKeyList(3)
+            filterVids.isEmpty || filterVids.contains(vid)
+        }
+        .map {
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            val vid = logKey.split(",")(3)
+            (label, vid, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
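+          // Drop filtered feature names and near-zero scores; bucketize the rest into (insertPosition + 1) / bucketsNum, keeping the raw score when the feature has no buckets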
+          row.foreach {
+            case (label, vid, features) =>
+              val featuresBucket = features.map {
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty) {
+                    filterNames.foreach(r => if (!ifFilter && name.contains(r)) {
+                      ifFilter = true
+                    })
+                  }
+                  if (ifFilter) {
+                    ""
+                  } else {
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + vid + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+        })
+
+      // 4. Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting existing path and writing data: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, cannot write: " + hdfsPath)
+      }
+    }
+  }
+}
+

+ 136 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_43_bucketData_fu_sample_20240709.scala

@@ -0,0 +1,136 @@
+package com.aliyun.odps.spark.examples.makedata_recsys
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+import scala.util.Random
+/*
+ Variant of makedata_recsys_43_bucketData_20240709 with negative down-sampling: positives are always kept, negatives are kept with probability fuSampleRate.
+ */
+
+object makedata_recsys_43_bucketData_fu_sample_20240709 {
+  def main(args: Array[String]): Unit = {
+
+    // 1. Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/41_recsys_sample_data_v1/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/43_recsys_train_data_v1/")
+    val beginStr = param.getOrElse("beginStr", "20240703")
+    val endStr = param.getOrElse("endStr", "20240703")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "XXXXXXXXXX").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "is_return")
+    val whatApps = param.getOrElse("whatApps", "0,4,5,21,3,6").split(",").toSet
+    val fuSampleRate= param.getOrElse("fuSampleRate", "0.1").toDouble
+    val fileName = param.getOrElse("fileName", "20240709_recsys_bucket_314.txt")
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource(fileName)
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val source = Source.fromURL(resourceUrlBucket)
+        val buckets = try source.getLines().mkString("\n") finally source.close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r =>{
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+
+
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("Start processing: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val jsons = JSON.parseObject(rList(2))
+        val features = scala.collection.mutable.Map[String, Double]()
+        jsons.foreach(r => {
+          features.put(r._1, jsons.getDoubleValue(r._1))
+        })
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            val pagesource = logKeyList(1)
+            whatApps.contains(apptype) && pagesource.endsWith("recommend")
+        }.filter{
+          case (logKey, labelKey, features) =>
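+            // Down-sample negatives: positives are always kept, negatives pass with probability fuSampleRate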
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            "1".equals(label) || Random.nextDouble() <= fuSampleRate
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach{
+            case (label, features) =>
+              val featuresBucket = features.map{
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty){
+                    filterNames.foreach(r=> if (!ifFilter && name.contains(r)) {ifFilter = true} )
+                  }
+                  if (ifFilter){
+                    ""
+                  }else{
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+        })
+
+      // 4. Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting existing path and writing data: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, cannot write: " + hdfsPath)
+      }
+    }
+  }
+}
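
A minimal sketch of the bucketing transform both new jobs apply, for reference; the bucket boundaries below are invented for illustration, while ExtractorUtils.findInsertPosition is the project's own helper used in the code above.

import examples.extractor.ExtractorUtils

object BucketizeSketch {
  def main(args: Array[String]): Unit = {
    val bucketsNum = 4.0
    val buckets = Array(0.1, 0.5, 2.0, 10.0) // assumed boundaries, not taken from the resource file
    val score = 0.7
    // Same formula as in the jobs above: shift the insert position by 1 and normalize into (0, 1].
    val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
    println("feature:" + scoreNew) // 0.75 if findInsertPosition returns 2 for this score
  }
}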

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/myUtils/ParamUtils.scala

@@ -3,7 +3,7 @@ package com.aliyun.odps.spark.examples.myUtils
 import scala.collection.mutable
 object ParamUtils {
   def parseArgs(args: Array[String]): mutable.HashMap[String, String] = {
-    println("args size:" + args.size)
+    println("args size:" + args.length)
 
     val rst = new mutable.HashMap[String, String]() {
       override def default(key: String) = "无参数传入"

+ 4 - 4
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-广告

@@ -4,11 +4,11 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 --master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 tablePart:64 repartition:16 \
-beginStr:2024070108 endStr:2024070323 \
+beginStr:2024072408 endStr:2024072423 \
 savePath:/dw/recommend/model/31_ad_sample_data_v3/ \
 table:alg_recsys_ad_sample_all filterHours:00,01,02,03,04,05,06,07 \
 idDefaultValue:0.01 \
-> p31_2024070108.log 2>&1 &
+> p31_2024072423.log 2>&1 &
 
 
 nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
@@ -28,9 +28,9 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 readPath:/dw/recommend/model/31_ad_sample_data_v3/ \
 savePath:/dw/recommend/model/33_ad_train_data_v3/ \
-beginStr:20240703 endStr:20240703 repartition:100 \
+beginStr:20240724 endStr:20240724 repartition:100 \
 filterNames:adid_,targeting_conversion_ \
-> p33_20240703_.log 2>&1 &
+> p33_20240724_.log 2>&1 &
 
 filterNames:adid_,targeting_conversion_ \
 filterNames:cid_,adid_,adverid_,targeting_conversion_ \

+ 5 - 0
src/main/scala/com/tzld/recommend/recall/algo/CollaborativeFilteringAlgo.scala

@@ -0,0 +1,5 @@
+package com.tzld.recommend.recall.algo
+
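+// Empty stub for a collaborative-filtering recall algorithm.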
+class CollaborativeFilteringAlgo {
+
+}

+ 0 - 0
zhangbo/01_train.sh


+ 0 - 0
zhangbo/02_train_go.sh


+ 0 - 0
zhangbo/03_predict.sh


+ 0 - 0
zhangbo/04_upload.sh


+ 0 - 0
zhangbo/05_update_everyday_2model.sh


+ 0 - 0
zhangbo/05_update_everyday_str.sh


+ 0 - 0
zhangbo/06_update_everyday_feature.sh


+ 0 - 0
zhangbo/50_delete_hdfs.sh


+ 0 - 0
zhangbo/train.sh


+ 0 - 0
zhangbo/up.sh


+ 0 - 0
zhangbo/up2.sh


+ 1 - 1
zhangbo/utils.py

@@ -92,7 +92,7 @@ if __name__ == '__main__':
     elif args.excute_program == "check_user_hive":
         check_user_hive(args)
     elif args.excute_program == "check_hive":
-            check_hive(args)
+        check_hive(args)
     else:
         print("无合法参数,验证失败。")
         exit(999)

Some files were not shown because too many files changed in this diff