
Merge branch 'master' into feature/zhangbo_makedata_v2

zhangbo 4 months ago
parent
commit
adbdd80996
72 changed files with 6863 additions and 30 deletions
  1. .gitignore (+45 -0)
  2. ad/00_common.sh (+16 -0)
  3. ad/01_ad_model_update.sh (+417 -0)
  4. ad/02_ad_model_update_test.sh (+21 -0)
  5. ad/21_ad_model_add_dt_train_predict_auc.sh (+71 -0)
  6. ad/22_ad_model_predict_auc.sh (+60 -0)
  7. ad/23_ad_model_batch_calc_cid_score_avg.sh (+29 -0)
  8. ad/24_supplementary_data.sh (+99 -0)
  9. ad/25_xgb_make_data_origin_bucket.sh (+87 -0)
  10. ad/30_delete_timer_file.sh (+75 -0)
  11. ad/ad_monitor_util.py (+141 -0)
  12. ad/ad_utils.py (+64 -0)
  13. ad/holidays.txt (+53 -0)
  14. ad/model_predict_analyse.py (+198 -0)
  15. pom.xml (+6 -2)
  16. recommend/01_recommend_model_new_train.sh (+46 -0)
  17. recommend/02_train_go.sh (+52 -0)
  18. recommend/03_predict.sh (+14 -0)
  19. recommend/20_vid_avg_score.sh (+67 -0)
  20. recommend/21_make_data_new_table.sh (+89 -0)
  21. recommend/22_supplementary_data_new_table.sh (+78 -0)
  22. spark-examples.iml (+8 -0)
  23. src/main/java/examples/sparksql/SparkAdCTRSampleTester.java (+62 -0)
  24. src/main/java/examples/sparksql/SparkAdCVRSampleLoader.java (+99 -0)
  25. src/main/java/examples/sparksql/SparkAdCVRSampleTester.java (+59 -0)
  26. src/main/java/examples/sparksql/SparkAdFeaToRedisHourLoader.java (+95 -0)
  27. src/main/java/examples/utils/AdUtil.java (+67 -0)
  28. src/main/java/examples/utils/DateTimeUtil.java (+22 -0)
  29. src/main/resources/20240718_ad_bucket_517.txt (+8 -0)
  30. src/main/resources/20240718_ad_bucket_688.txt (+9 -0)
  31. src/main/resources/20240718_ad_feature_name.txt (+689 -0)
  32. src/main/resources/20240718_ad_feature_name_517.txt (+518 -0)
  33. src/main/resources/weight_ad_feature_name.txt (+1 -0)
  34. src/main/scala/com/aliyun/odps/spark/ad/xgboost/v20240808/XGBoostTrain.scala (+131 -0)
  35. src/main/scala/com/aliyun/odps/spark/examples/SparkPi.scala (+16 -16)
  36. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_31_originData_20240620.scala (+1 -1)
  37. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_32_bucket_20240622.scala (+1 -1)
  38. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_33_bucketDataPrint_20240628.scala (+1 -1)
  39. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_33_bucketData_20240622.scala (+2 -2)
  40. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_31_originData_20240718.scala (+431 -0)
  41. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_32_bucket_20240718.scala (+105 -0)
  42. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketDataPrint_20240718.scala (+429 -0)
  43. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240718.scala (+128 -0)
  44. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240718_sample.scala (+135 -0)
  45. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240726.scala (+158 -0)
  46. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240729.scala (+152 -0)
  47. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240729_copy_zheng.scala (+181 -0)
  48. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240729_reduce_feature.scala (+129 -0)
  49. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_default_value_20240718.scala (+140 -0)
  50. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_34_statistics_20241111.scala (+24 -0)
  51. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/xgb/makedata_31_bucketDataPrint_20240821.scala (+549 -0)
  52. src/main/scala/com/aliyun/odps/spark/examples/makedata_qiao/makedata_13_originData_20240705.scala (+278 -0)
  53. src/main/scala/com/aliyun/odps/spark/examples/makedata_qiao/makedata_14_valueData_20240705.scala (+91 -0)
  54. src/main/scala/com/aliyun/odps/spark/examples/makedata_qiao/makedata_16_bucketData_20240705.scala (+127 -0)
  55. src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_43_bucketData_20240709.scala (+1 -1)
  56. src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_43_bucketData_20240709_vid.scala (+141 -0)
  57. src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_43_bucketData_fu_sample_20240709.scala (+136 -0)
  58. src/main/scala/com/aliyun/odps/spark/examples/myUtils/ParamUtils.scala (+1 -1)
  59. src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-广告 (+4 -4)
  60. src/main/scala/com/tzld/recommend/recall/algo/CollaborativeFilteringAlgo.scala (+5 -0)
  61. zhangbo/01_train.sh (+0 -0)
  62. zhangbo/02_train_go.sh (+0 -0)
  63. zhangbo/03_predict.sh (+0 -0)
  64. zhangbo/04_upload.sh (+0 -0)
  65. zhangbo/05_update_everyday_2model.sh (+0 -0)
  66. zhangbo/05_update_everyday_str.sh (+0 -0)
  67. zhangbo/06_update_everyday_feature.sh (+0 -0)
  68. zhangbo/50_delete_hdfs.sh (+0 -0)
  69. zhangbo/train.sh (+0 -0)
  70. zhangbo/up.sh (+0 -0)
  71. zhangbo/up2.sh (+0 -0)
  72. zhangbo/utils.py (+1 -1)

+ 45 - 0
.gitignore

@@ -0,0 +1,45 @@
+HELP.md
+target/
+!.mvn/wrapper/maven-wrapper.jar
+!**/src/main/**/target/
+!**/src/test/**/target/
+
+### STS ###
+.apt_generated
+.classpath
+.factorypath
+.project
+.settings
+.springBeans
+.sts4-cache
+
+### IntelliJ IDEA ###
+.idea
+*.iws
+*.iml
+*.ipr
+
+### NetBeans ###
+/nbproject/private/
+/nbbuild/
+/dist/
+/nbdist/
+/.nb-gradle/
+build/
+!**/src/main/**/build/
+!**/src/test/**/build/
+
+### VS Code ###
+.vscode/
+
+apollo-cache-dir
+sentinel
+weblog
+xxl-job
+
+.DS_Store
+logs
+
+model
+predict
+.idea

+ 16 - 0
ad/00_common.sh

@@ -0,0 +1,16 @@
+#!/bin/sh
+
+is_not_holidays() {
+    if [ -z "$1" ]; then
+        echo "0"
+        return
+    fi
+    
+    path=$(dirname $0)
+
+    if grep -w "$1" "${path}/holidays.txt" > /dev/null; then
+        echo "0"
+    else
+        echo "1"
+    fi 
+}
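Note: a minimal usage sketch of is_not_holidays (the date below is a placeholder; the function echoes "1" for dates not listed in ad/holidays.txt and "0" otherwise, which is how ad/01_ad_model_update.sh consumes it):

    source ./ad/00_common.sh
    dt=20241001    # placeholder date
    if [ "$(is_not_holidays ${dt})" -eq 1 ]; then
        echo "${dt} is not a holiday, keep it as a training partition"
    else
        echo "${dt} is a holiday, skip it"
    fi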

+ 417 - 0
ad/01_ad_model_update.sh

@@ -0,0 +1,417 @@
+#!/bin/sh
+set -x
+
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+sh_path=$(cd $(dirname $0); pwd)
+source ${sh_path}/00_common.sh
+
+source /root/anaconda3/bin/activate py37
+
+
+# 全局常量
+LOG_PREFIX=广告模型训练任务
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+TRAIN_PATH=/dw/recommend/model/31_ad_sample_data_v4
+BUCKET_FEATURE_PATH=/dw/recommend/model/33_ad_train_data_v4
+TABLE=alg_recsys_ad_sample_all
+# 特征文件名
+feature_file=20240703_ad_feature_name.txt
+# 模型本地临时保存路径
+model_local_home=/root/zhaohp/XGB/
+
+# 模型HDFS保存路径,测试时修改为其他路径,避免影响线上
+MODEL_PATH=/dw/recommend/model/35_ad_model
+# 预测结果保存路径,测试时修改为其他路径,避免影响线上
+PREDICT_RESULT_SAVE_PATH=/dw/recommend/model/34_ad_predict_data
+# 模型OSS保存路径,测试时修改为其他路径,避免影响线上
+MODEL_OSS_PATH=oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/
+# 线上模型名,测试时修改为其他模型名,避免影响线上
+model_name=model_xgb_351_1000_v2
+# 线上校准文件名
+OSS_CALIBRATION_FILE_NAME=model_xgb_351_1000_v2_calibration
+# 用于存放一些临时的文件
+PREDICT_CACHE_PATH=/root/zhaohp/XGB/predict_cache
+
+
+# 本地保存HDFS模型路径文件,测试时修改为其他模型名,避免影响线上
+model_path_file=${model_local_home}/online_model_path.txt
+# 获取当前是星期几,1表示星期一
+current_day_of_week="$(date +"%u")"
+
+# 任务开始时间
+start_time=$(date +%s)
+# 前一天
+today_early_1="$(date -d '1 days ago' +%Y%m%d)"
+# 线上模型在HDFS中的路径
+online_model_path=`cat ${model_path_file}`
+# 训练用的数据路径
+train_data_path=""
+# 评估用的数据路径
+predict_date_path=""
+#评估结果保存路径
+new_model_predict_result_path=""
+# 模型保存路径
+model_save_path=""
+# 评测结果保存路径,后续需要根据此文件评估是否要更新模型
+predict_analyse_file_path=""
+# 校准文件保存路径
+calibration_file_path=""
+
+# 保存模型评估的分析结果
+old_incr_rate_avg=0
+new_incr_rate_avg=0
+# Top10的详情
+top10_msg=""
+# AUC值
+old_auc=0
+new_auc=0
+
+declare -A real_score_map
+declare -A old_score_map
+declare -A new_score_map
+
+# 校验命令的退出码
+check_run_status() {
+    local status=$1
+    local step_start_time=$2
+    local step_name=$3
+    local msg=$4
+
+    local step_end_time=$(date +%s)
+    local step_elapsed=$(($step_end_time - $step_start_time))
+
+    if [[ -n "${old_auc}" && "${old_auc}" != "0" ]]; then
+      msg+="\n\t - 老模型AUC: ${old_auc}"
+    fi
+    if [[ -n "${new_auc}" && "${new_auc}" != "0" ]]; then
+      msg+="\n\t - 新模型AUC: ${new_auc}"
+    fi
+
+
+    if [ ${status} -ne 0 ]; then
+        echo "${LOG_PREFIX} -- ${step_name}失败: 耗时 ${step_elapsed}"
+        local elapsed=$(($step_end_time - $start_time))
+        /root/anaconda3/bin/python ${sh_path}/ad_monitor_util.py --level error --msg "${msg}" --start "${start_time}" --elapsed "${elapsed}" --top10 "${top10_msg}"
+        exit 1
+    else
+        echo "${LOG_PREFIX} -- ${step_name}成功: 耗时 ${step_elapsed}"
+    fi
+}
+
+send_success_upload_msg(){ 
+  # 发送更新成功通知
+  local msg=" 广告模型文件更新完成"
+  msg+="\n\t - 老模型AUC: ${old_auc}"
+  msg+="\n\t - 新模型AUC: ${new_auc}"
+  msg+="\n\t - 老模型Top10差异平均值: ${old_incr_rate_avg}"
+  msg+="\n\t - 新模型Top10差异平均值: ${new_incr_rate_avg}"
+  msg+="\n\t - 模型在HDFS中的路径: ${model_save_path}"
+  msg+="\n\t - 模型上传OSS中的路径: ${MODEL_OSS_PATH}/${model_name}.tar.gz"
+
+  local step_end_time=$(date +%s)
+  local elapsed=$((${step_end_time} - ${start_time}))
+
+  /root/anaconda3/bin/python ${sh_path}/ad_monitor_util.py --level info --msg "${msg}" --start "${start_time}" --elapsed "${elapsed}" --top10 "${top10_msg}"
+}
+
+init() {
+  
+  declare -a date_keys=()
+  local count=1
+  local current_data="$(date -d '2 days ago' +%Y%m%d)"
+  # 循环获取前 n 天的非节日日期
+  while [[ ${count} -le 7 ]]; do
+    date_key=$(date -d "${current_data}" +%Y%m%d)
+    # 判断是否是节日,并拼接训练数据路径
+    if [ $(is_not_holidays ${date_key}) -eq 1 ]; then
+
+      # 将 date_key 放入数组
+      date_keys+=("${date_key}")
+
+      if [[ -z ${train_data_path} ]]; then
+        train_data_path="${BUCKET_FEATURE_PATH}/${date_key}"
+      else
+        train_data_path="${BUCKET_FEATURE_PATH}/${date_key},${train_data_path}"
+      fi 
+      count=$((count + 1))
+    else
+      echo "日期: ${date_key}是节日,跳过"
+    fi
+    current_data=$(date -d "${current_data} -1 day" +%Y%m%d)
+  done
+
+  last_index=$((${#date_keys[@]} - 1))
+  train_first_day=${date_keys[$last_index]}
+  train_last_day=${date_keys[0]}
+
+  model_save_path=${MODEL_PATH}/${model_name}_${train_first_day: -4}_${train_last_day: -4}
+  predict_date_path=${BUCKET_FEATURE_PATH}/${today_early_1}
+  new_model_predict_result_path=${PREDICT_RESULT_SAVE_PATH}/${today_early_1}_351_1000_${train_first_day: -4}_${train_last_day: -4}
+  online_model_predict_result_path=${PREDICT_RESULT_SAVE_PATH}/${today_early_1}_351_1000_${online_model_path: -9}
+  predict_analyse_file_path=${model_local_home}/predict_analyse_file/${today_early_1}_351_1000_analyse.txt
+  calibration_file_path=${model_local_home}/${OSS_CALIBRATION_FILE_NAME}.txt
+
+  echo "init param train_data_path: ${train_data_path}"
+  echo "init param predict_date_path: ${predict_date_path}"
+  echo "init param new_model_predict_result_path: ${new_model_predict_result_path}"
+  echo "init param online_model_predict_result_path: ${online_model_predict_result_path}"
+  echo "init param model_save_path: ${model_save_path}"
+  echo "init param online_model_path: ${online_model_path}"
+  echo "init param feature_file: ${feature_file}"
+  echo "init param model_name: ${model_name}"
+  echo "init param model_local_home: ${model_local_home}"
+  echo "init param model_oss_path: ${MODEL_OSS_PATH}"
+  echo "init param predict_analyse_file_path: ${predict_analyse_file_path}"
+  echo "init param calibration_file_path: ${calibration_file_path}"
+  echo "init param current_day_of_week: ${current_day_of_week}"
+
+  echo "当前Python环境安装的Python版本: $(python --version)"
+  echo "当前Python环境安装的三方包: $(python -m pip list)"
+}
+
+# 校验大数据任务是否执行完成
+check_ad_hive() {
+  local step_start_time=$(date +%s)
+  local max_hour=05
+  local max_minute=30
+  local elapsed=0
+  while true; do
+      local python_return_code=$(python ${sh_path}/ad_utils.py --excute_program check_ad_origin_hive --partition ${today_early_1} --hh 23)
+
+      elapsed=$(($(date +%s) - ${step_start_time}))
+      if [ "${python_return_code}" -eq 0 ]; then
+          break
+      fi
+      echo "Python程序返回非0值,等待五分钟后再次调用。"
+      sleep 300
+      local current_hour=$(date +%H)
+      local current_minute=$(date +%M)
+      if (( ${current_hour} > ${max_hour} || ( ${current_hour} == ${max_hour} && ${current_minute} >= ${max_minute} ) )); then
+          local msg="大数据数据生产校验失败, 分区: ${today_early_1}"
+          echo -e "${LOG_PREFIX} -- 大数据数据生产校验 -- ${msg}: 耗时 ${elapsed}"
+          /root/anaconda3/bin/python ${sh_path}/ad_monitor_util.py --level error --msg "${msg}" --start "${start_time}" --elapsed "${elapsed}"
+          exit 1
+      fi
+  done
+  echo "${LOG_PREFIX} -- 大数据数据生产校验 -- 大数据数据生产校验通过: 耗时 ${elapsed}"
+}
+
+origin_data() {
+  (
+    source ${sh_path}/25_xgb_make_data_origin_bucket.sh
+    make_origin_data
+  )
+}
+
+bucket_feature() {
+  (
+    source ${sh_path}/25_xgb_make_data_origin_bucket.sh
+    make_bucket_feature
+  )
+}
+
+xgb_train() {
+  local step_start_time=$(date +%s)
+
+  /opt/apps/SPARK3/spark-3.3.1-hadoop3.2-1.0.5/bin/spark-class org.apache.spark.deploy.SparkSubmit \
+  --class com.tzld.piaoquan.recommend.model.train_01_xgb_ad_20240808 \
+  --master yarn --driver-memory 6G --executor-memory 10G --executor-cores 1 --num-executors 31 \
+  --conf spark.yarn.executor.memoryoverhead=2048 \
+  --conf spark.shuffle.service.enabled=true \
+  --conf spark.shuffle.service.port=7337 \
+  --conf spark.shuffle.consolidateFiles=true \
+  --conf spark.shuffle.manager=sort \
+  --conf spark.storage.memoryFraction=0.4 \
+  --conf spark.shuffle.memoryFraction=0.5 \
+  --conf spark.default.parallelism=200 \
+  /root/zhangbo/recommend-model/recommend-model-produce/target/recommend-model-produce-jar-with-dependencies.jar \
+  featureFile:20240703_ad_feature_name.txt \
+  trainPath:${train_data_path} \
+  testPath:${predict_date_path} \
+  savePath:${new_model_predict_result_path} \
+  modelPath:${model_save_path} \
+  eta:0.01 gamma:0.0 max_depth:5 num_round:1000 num_worker:30 repartition:20
+
+  local return_code=$?
+  check_run_status ${return_code} ${step_start_time} "XGB模型训练任务" "XGB模型训练失败"
+}
+
+calc_model_predict() {
+  local count=0
+  local max_line=10
+  local old_total_diff=0
+  local new_total_diff=0
+  top10_msg="| CID  | 老模型相对真实CTCVR的变化 | 新模型相对真实CTCVR的变化 |"
+  top10_msg+=" \n| ---- | --------- | -------- |"
+  while read -r line && [ ${count} -lt ${max_line} ]; do
+
+      # 跳过包含 "cid" 的表头行,只处理数据行
+      if [[ "${line}" == *"cid"* ]]; then
+          continue
+      fi
+
+      read -a numbers <<< "${line}"
+
+      # 分数分别保存
+      real_score_map[${numbers[0]}]=${numbers[3]}
+      old_score_map[${numbers[0]}]=${numbers[6]}
+      new_score_map[${numbers[0]}]=${numbers[7]}
+
+      # 拼接Top10详情的飞书消息
+      top10_msg="${top10_msg} \n| ${numbers[0]} | ${numbers[6]} | ${numbers[7]} | "
+
+      # 计算top10相对误差绝对值的均值
+      old_abs_score=$( echo "${numbers[6]} * ((${numbers[6]} >= 0) * 2 - 1)" | bc -l )
+      new_abs_score=$( echo "${numbers[7]} * ((${numbers[7]} >= 0) * 2 - 1)" | bc -l )
+
+      old_total_diff=$( echo "${old_total_diff} + ${old_abs_score}" | bc -l )
+      new_total_diff=$( echo "${new_total_diff} + ${new_abs_score}" | bc -l )
+
+      count=$((${count} + 1))
+
+  done < "${predict_analyse_file_path}"
+
+  local return_code=$?
+  check_run_status ${return_code} ${step_start_time} "计算Top10差异" "计算Top10差异异常"
+
+  old_incr_rate_avg=$( echo "scale=6; ${old_total_diff} / ${count}" | bc -l )
+  check_run_status $? ${step_start_time} "计算老模型Top10差异" "计算老模型Top10差异异常"
+
+
+  new_incr_rate_avg=$( echo "scale=6; ${new_total_diff} / ${count}" | bc -l )
+  check_run_status $? ${step_start_time} "计算新模型Top10差异" "计算新模型Top10差异异常"
+
+  echo "老模型Top10差异平均值: ${old_incr_rate_avg}"
+  echo "新模型Top10差异平均值: ${new_incr_rate_avg}"
+  echo "新老模型分数对比: "
+  for cid in "${!new_score_map[@]}"; do
+    echo "\t CID: $cid, 老模型分数: ${old_score_map[$cid]}, 新模型分数: ${new_score_map[$cid]}"
+  done
+}
+
+calc_auc() {
+  old_auc=`cat ${PREDICT_CACHE_PATH}/old_1.txt | /root/sunmingze/AUC/AUC`
+  new_auc=`cat ${PREDICT_CACHE_PATH}/new_1.txt | /root/sunmingze/AUC/AUC`
+}
+
+model_predict() {
+
+  # 线上模型评估最新的数据
+  local step_start_time=$(date +%s)
+  /opt/apps/SPARK3/spark-3.3.1-hadoop3.2-1.0.5/bin/spark-class org.apache.spark.deploy.SparkSubmit \
+  --class com.tzld.piaoquan.recommend.model.pred_01_xgb_ad_hdfsfile_20240813 \
+  --master yarn --driver-memory 1G --executor-memory 3G --executor-cores 1 --num-executors 30 \
+  --conf spark.yarn.executor.memoryoverhead=1024 \
+  --conf spark.shuffle.service.enabled=true \
+  --conf spark.shuffle.service.port=7337 \
+  --conf spark.shuffle.consolidateFiles=true \
+  --conf spark.shuffle.manager=sort \
+  --conf spark.storage.memoryFraction=0.4 \
+  --conf spark.shuffle.memoryFraction=0.5 \
+  --conf spark.default.parallelism=200 \
+  /root/zhangbo/recommend-model/recommend-model-produce/target/recommend-model-produce-jar-with-dependencies.jar \
+  featureFile:20240703_ad_feature_name.txt \
+  testPath:${predict_date_path} \
+  savePath:${online_model_predict_result_path} \
+  modelPath:${online_model_path}
+
+  local return_code=$?
+  check_run_status ${return_code} ${step_start_time} "线上模型评估${predict_date_path: -8}的数据" "线上模型评估${predict_date_path: -8}的数据失败"
+
+  # 结果分析
+  local python_return_code=$(python ${sh_path}/model_predict_analyse.py -op ${online_model_predict_result_path} -np ${new_model_predict_result_path} -af ${predict_analyse_file_path} -cf ${calibration_file_path})
+  check_run_status ${python_return_code} ${step_start_time} "分析线上模型评估${predict_date_path: -8}的数据" "分析线上模型评估${predict_date_path: -8}的数据失败"
+
+  calc_model_predict
+
+  calc_auc
+
+  if (( $(echo "${new_incr_rate_avg} > 0.100000" | bc -l ) ));then 
+    echo "线上模型评估${predict_date_path: -8}的数据,绝对误差大于0.1,请检查"
+    check_run_status 1 ${step_start_time} "${predict_date_path: -8}的数据,绝对误差大于0.1" "线上模型评估${predict_date_path: -8}的数据,绝对误差大于0.1,请检查"
+    exit 1
+  fi 
+
+
+  # 对比两个模型的差异
+  score_diff=$( echo "${new_incr_rate_avg} - ${old_incr_rate_avg}" | bc -l )
+  if (( $(echo "${score_diff} > 0.050000" | bc -l ) ));then 
+    echo "两个模型评估${predict_date_path: -8}的数据,两个模型分数差异为: ${score_diff}, 大于0.05, 请检查"
+    check_run_status 1 ${step_start_time} "两个模型评估${predict_date_path: -8}的数据" "两个模型评估${predict_date_path: -8}的数据,两个模型分数差异为: ${score_diff}, 大于0.05"
+    exit 1
+  fi 
+
+}
+
+model_upload_oss() {
+  local step_start_time=$(date +%s)
+
+  (
+    cd ${model_local_home}
+
+    ${HADOOP} fs -get ${model_save_path} ${model_name}
+    if [ ! -d ${model_name} ]; then
+      echo "从HDFS下载模型失败"
+      check_run_status 1 ${step_start_time} "HDFS下载模型任务" "HDFS下载模型失败" 
+      exit 1 
+    fi
+
+    tar -czvf ${model_name}.tar.gz -C ${model_name} .
+
+    rm -rf ${model_name}.tar.gz.crc
+
+    # 从OSS中移除模型文件和校准文件
+    ${HADOOP} fs -rm -r -skipTrash ${MODEL_OSS_PATH}/${model_name}.tar.gz ${MODEL_OSS_PATH}/${OSS_CALIBRATION_FILE_NAME}.txt
+    
+    # 将模型文件和校准文件推送到OSS上
+    ${HADOOP} fs -put ${model_name}.tar.gz ${OSS_CALIBRATION_FILE_NAME}.txt ${MODEL_OSS_PATH}
+    local return_code=$?
+    check_run_status ${return_code} ${step_start_time} "模型上传OSS任务" "模型上传OSS失败"
+
+    echo ${model_save_path} > ${model_path_file}
+
+    # 清理本地临时文件
+    rm -f ./${model_name}.tar.gz
+    rm -rf ./${model_name}
+    rm -rf ${OSS_CALIBRATION_FILE_NAME}.txt
+  )
+
+  local return_code=$?
+  check_run_status ${return_code} ${step_start_time} "模型上传OSS任务" "模型上传OSS失败"
+
+  local step_end_time=$(date +%s)
+  local elapsed=$((${step_end_time} - ${start_time}))
+  echo -e "${LOG_PREFIX} -- 模型更新完成 -- 模型更新成功: 耗时 ${elapsed}"
+  
+  send_success_upload_msg
+}
+
+# 主方法
+main() {
+  init
+
+  check_ad_hive
+
+  origin_data
+
+  bucket_feature
+
+  if [ "${current_day_of_week}" -eq 1 ] || [ "${current_day_of_week}" -eq 3 ] || [ "${current_day_of_week}" -eq 5 ]; then
+    echo "当前是周一,周三或周五,开始训练并更新模型"
+    
+    xgb_train
+
+    model_predict
+
+    model_upload_oss
+  else
+    echo "当前是周一,周三或周五,不更新模型"
+  fi 
+
+}
+
+
+main
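Note: a hedged scheduling sketch for this update script (the cron time and log path are assumptions, not taken from this commit; the comment style follows the cron line in recommend/21_make_data_new_table.sh):

    # 0 1 * * * cd /root/zhaohp/recommend-emr-dataprocess && /bin/sh ./ad/01_ad_model_update.sh > logs/ad/01_ad_model_update/$(date +\%Y\%m\%d\%H\%M).log 2>&1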

+ 21 - 0
ad/02_ad_model_update_test.sh

@@ -0,0 +1,21 @@
+#!/bin/sh
+set -x
+
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+export PREDICT_CACHE_PATH=/root/zhaohp/XGB/test/predict_cache/
+export SEGMENT_BASE_PATH=/root/zhaohp/XGB/test/predict_analyse_file/
+
+
+sh_path=$(cd $(dirname $0); pwd)
+source ${sh_path}/00_common.sh
+
+online_model_predict_result_path=/dw/recommend/model/34_ad_predict_data/20241110_351_1000_1031_1106
+new_model_predict_result_path=/dw/recommend/model/34_ad_predict_data/20241110_351_1000_1103_1109
+predict_analyse_file_path=/root/zhaohp/XGB/test/predict_analyse_file/20241110_351_1000_analyse.txt
+calibration_file_path=/root/zhaohp/XGB/test/model_xgb_351_1000_v2_calibration.txt
+
+
+python_return_code=$(python ${sh_path}/model_predict_analyse.py -op ${online_model_predict_result_path} -np ${new_model_predict_result_path} -af ${predict_analyse_file_path} -cf ${calibration_file_path})
+echo "${python_return_code}"

+ 71 - 0
ad/21_ad_model_add_dt_train_predict_auc.sh

@@ -0,0 +1,71 @@
+#!/bin/sh
+
+# 指定基础模型,模型增量训练,预测,计算AUC脚本
+
+set -x
+
+begin_date=$1
+end_date=$2
+model_name=$3
+train_dim=$4
+predict_dim=$5
+
+PROJECT_HOME=/root/zhaohp/recommend-emr-dataprocess
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+HDFS_TRAIN_DATE_PATH=/dw/recommend/model/33_ad_train_data_v4
+MODEL_PATH=${PROJECT_HOME}/model
+PREDICT_PATH=${PROJECT_HOME}/predict
+
+FM_TRAIN=/root/sunmingze/alphaFM/bin/fm_train
+FM_PREDICT=/root/sunmingze/alphaFM/bin/fm_predict
+
+train_date=$begin_date
+
+# 计算模型的AUC,从训练日期的后一天到参数的end_date
+predict_auc() {
+    echo -e "\t==================== 开始预测 $train_date 模型 ===================="
+
+    predict_date=$(date -d "$train_date +1 day" +%Y%m%d)
+    predict_end_date=$(date -d "$end_date +1 day" +%Y%m%d)
+    while [ "$predict_date" != "$predict_end_date" ]; do
+
+        $HADOOP fs -text ${HDFS_TRAIN_DATE_PATH}/${predict_date}/* | ${FM_PREDICT} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${predict_dim} -core 8 -out ${PREDICT_PATH}/${model_name}_${train_date}_${predict_date}.txt
+        auc=`cat ${PREDICT_PATH}/${model_name}_${train_date}_${predict_date}.txt | /root/sunmingze/AUC/AUC`
+
+        echo "模型训练日期: ${train_date}, 模型预测日期: ${predict_date}, AUC: ${auc}, 模型路径: ${MODEL_PATH}/${model_name}_${train_date}.txt"
+
+        predict_date=$(date -d "$predict_date +1 day" +%Y%m%d)
+
+    done
+
+    echo -e "\n\t==================== 预测 $train_date 模型结束 ===================="
+
+}
+main() {
+
+    # 增量训练模型
+    while [ "$train_date" != "$end_date" ]; do
+        echo "==================== 开始训练 $train_date 模型 ===================="
+
+        # 模型训练
+        yesterday=$(date -d "$train_date -1 day" +%Y%m%d)
+
+        input_model=${MODEL_PATH}/${model_name}_${yesterday}.txt
+        if [ ! -e "${input_model}" ]; then
+            echo "输入模型: ${input_model} 不存在,退出"
+            exit 1
+        fi
+
+        $HADOOP fs -text ${HDFS_TRAIN_DATE_PATH}/${train_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${train_dim} -core 8 -im ${input_model}
+
+        predict_auc
+
+        train_date=$(date -d "$train_date +1 day" +%Y%m%d)
+
+        echo "==================== 训练 $train_date 模型结束 ===================="
+        echo -e "\n\n\n\n\n\n"
+    done
+
+}
+
+main
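Note: a hedged invocation sketch in the style of the nohup comment in ad/22_ad_model_predict_auc.sh (dates, model name and dim values are placeholders; positional arguments are begin_date, end_date, model_name, train_dim, predict_dim):

    # nohup ./ad/21_ad_model_add_dt_train_predict_auc.sh 20240712 20240717 model_bkb8_v4 1,1,8 8 > logs/21_ad_model_add_dt_train_predict_auc.log 2>&1 &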

+ 60 - 0
ad/22_ad_model_predict_auc.sh

@@ -0,0 +1,60 @@
+#!/bin/sh
+
+# 训练新模型,并使用后面的数据计算AUC,评估模型效果
+
+set -x
+
+begin_date=$1
+end_date=$2
+model_name=$3
+predict_dim=$4
+
+PROJECT_HOME=/root/zhaohp/20240723
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+HDFS_TRAIN_DATE_PATH=/dw/recommend/model/33_ad_train_data_v4_idn1
+MODEL_PATH=${PROJECT_HOME}/model
+PREDICT_PATH=${PROJECT_HOME}/predict
+
+FM_TRAIN=/root/sunmingze/alphaFM/bin/fm_train
+FM_PREDICT=/root/sunmingze/alphaFM/bin/fm_predict
+
+train_date=$begin_date
+
+# 计算模型的AUC,从训练日期的后一天到参数的end_date
+predict_auc() {
+    echo -e "\t==================== 开始预测 $train_date 模型 ===================="
+
+    predict_date=$(date -d "$train_date +1 day" +%Y%m%d)
+    predict_end_date=$(date -d "$end_date +1 day" +%Y%m%d)
+    while [ "$predict_date" != "$predict_end_date" ]; do
+
+        $HADOOP fs -text ${HDFS_TRAIN_DATE_PATH}/${predict_date}/* | ${FM_PREDICT} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${predict_dim} -core 8 -out ${PREDICT_PATH}/${model_name}_${train_date}.txt
+        auc=`cat ${PREDICT_PATH}/${model_name}_${train_date}.txt | /root/sunmingze/AUC/AUC`
+
+        echo "模型训练日期: ${train_date}, 模型预测日期: ${predict_date}, AUC: ${auc}, 模型路径: ${MODEL_PATH}/${model_name}_${train_date}.txt"
+
+        predict_date=$(date -d "$predict_date +1 day" +%Y%m%d)
+
+    done
+
+    echo -e "\n\t==================== 预测 $train_date 模型结束 ===================="
+
+}
+main() {
+
+    # 增量训练模型
+    while [ "$train_date" != "$end_date" ]; do
+        echo "==================== 开始训练 $train_date 模型 ===================="
+
+        predict_auc
+
+        echo -e "==================== 训练 $train_date 模型结束 ==================== \n\n\n\n\n\n"
+        train_date=$(date -d "$train_date +1 day" +%Y%m%d)
+    done
+
+}
+
+main
+
+
+# nohup ./22_ad_model_predict_auc.sh 20240712 20240717 model_bkb8_v4_idn1 8  > logs/22_ad_model_predict_auc.log 2>&1 &

+ 29 - 0
ad/23_ad_model_batch_calc_cid_score_avg.sh

@@ -0,0 +1,29 @@
+#!/bin/sh
+
+# 计算模型对某天,某个CID的打分情况,输出平均值
+
+set -x
+
+cids=$1
+model=$2
+hdfs_path=$3
+bias=$4
+
+MODEL_PATH=/root/zhaohp/recommend-emr-dataprocess/model/ad
+PREDICT_PATH=/root/zhaohp/recommend-emr-dataprocess/predict/ad
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+FM_HOME=/root/sunmingze/alphaFM
+
+# 将cids中的逗号分隔列表拆分为数组
+IFS=',' read -ra cid_array <<< "$cids"
+
+for cid in "${cid_array[@]}"; do
+    # 对每个CID执行打分计算并输出平均值
+    $HADOOP fs -text ${hdfs_path}/* | grep "cid_${cid}" | ${FM_HOME}/bin/fm_predict -m ${MODEL_PATH}/${model}.txt -dim ${bias} -core 8 -out ${PREDICT_PATH}/${model}_${cid}.txt
+
+    score_avg=`awk '{ sum += $2; count++ } END { if (count > 0) print sum / count }' ${PREDICT_PATH}/${model}_${cid}.txt`
+
+    echo -e "CID- ${cid} -平均分计算结果: ${score_avg} \n\t模型: ${MODEL_PATH}/${model} \n\tHDFS数据路径: ${hdfs_path} \n\t"
+done
+
+# nohup ./ad/23_ad_model_batch_calc_cid_score_avg.sh 3024,2966,2670,3163,3595,3594,3364,3365,3593,3363,3180,1910,2660,3478,3431,3772,3060,3178,3056,3771,3208,3041,2910,3690,1626,3318,3357,3628,3766,3770,3763,3769,3768,3541,3534,2806,3755,3760,3319,3758,3746,3759,3747,3754,3767,3745,3756,3437,3608,3527,3691,3197,3361,3362,3212,3344,3343,3346,3345,3612,3540,3526,3611,3761,3617,3762,3618,3616,3623,3765,3624,3764,3198,3542,3353,2374,3200 model_bkb8_v55_20240804 /dw/recommend/model/33_ad_train_data_v4/20240806 8 > logs/model_bkb8_v55_20240804_cid_06_12.log 2>&1 &

+ 99 - 0
ad/24_supplementary_data.sh

@@ -0,0 +1,99 @@
+#!/bin/sh
+set -x
+
+# 广告补数据脚本,修改{today_early_1}补单天的数据
+
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+sh_path=$(cd $(dirname $0); pwd)
+source ${sh_path}/00_common.sh
+
+source /root/anaconda3/bin/activate py37
+
+
+# 全局常量
+LOG_PREFIX=广告模型训练任务
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+TRAIN_PATH=/dw/recommend/model/31_ad_sample_data_v4
+BUCKET_FEATURE_PATH=/dw/recommend/model/33_ad_train_data_v4
+TABLE=alg_recsys_ad_sample_all
+
+# 任务开始时间
+start_time=$(date +%s)
+# 前一天
+today_early_1="$(date -d '1 days ago' +%Y%m%d)"
+
+# 校验命令的退出码
+check_run_status() {
+    local status=$1
+    local step_start_time=$2
+    local step_name=$3
+    local msg=$4
+
+    local step_end_time=$(date +%s)
+    local step_elapsed=$((${step_end_time} - ${step_start_time}))
+
+    if [ ${status} -ne 0 ]; then
+        echo "${LOG_PREFIX} -- ${step_name}失败: 耗时 ${step_elapsed}"
+        local elapsed=$((${step_end_time} - ${start_time}))
+        /root/anaconda3/bin/python ${sh_path}/ad_monitor_util.py --level error --msg "${msg}" --start "${start_time}" --elapsed "${elapsed}" --top10 "${top10_msg}"
+        exit 1
+    else
+        echo "${LOG_PREFIX} -- ${step_name}成功: 耗时 ${step_elapsed}"
+    fi
+}
+
+# 校验大数据任务是否执行完成
+check_ad_hive() {
+  local step_start_time=$(date +%s)
+  local max_hour=05
+  local max_minute=30
+  local elapsed=0
+  while true; do
+      local python_return_code=$(python ${sh_path}/ad_utils.py --excute_program check_ad_origin_hive --partition ${today_early_1} --hh 23)
+
+      elapsed=$(($(date +%s) - ${step_start_time}))
+      if [ "${python_return_code}" -eq 0 ]; then
+          break
+      fi
+      echo "Python程序返回非0值,等待五分钟后再次调用。"
+      sleep 300
+      local current_hour=$(date +%H)
+      local current_minute=$(date +%M)
+      if (( ${current_hour} > ${max_hour} || ( ${current_hour} == ${max_hour} && ${current_minute} >= ${max_minute} ) )); then
+          local msg="大数据数据生产校验失败, 分区: ${today_early_1}"
+          echo -e "${LOG_PREFIX} -- 大数据数据生产校验 -- ${msg}: 耗时 ${elapsed}"
+          /root/anaconda3/bin/python ${sh_path}/ad_monitor_util.py --level error --msg "${msg}" --start "${start_time}" --elapsed "${elapsed}"
+          exit 1
+      fi
+  done
+  echo "${LOG_PREFIX} -- 大数据数据生产校验 -- 大数据数据生产校验通过: 耗时 $elapsed"
+}
+
+origin_data() {
+  (
+    source ${sh_path}/25_xgb_make_data_origin_bucket.sh
+    make_origin_data
+  )
+}
+
+bucket_feature() {
+  (
+    source ${sh_path}/25_xgb_make_data_origin_bucket.sh
+    make_bucket_feature
+  )
+}
+
+# 主方法
+main() {
+  check_ad_hive
+
+  origin_data
+
+  bucket_feature
+}
+
+
+main

+ 87 - 0
ad/25_xgb_make_data_origin_bucket.sh

@@ -0,0 +1,87 @@
+#!/bin/sh
+set -x
+
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+
+
+sh_path=$(dirname $0)
+source ${sh_path}/00_common.sh
+
+source /root/anaconda3/bin/activate py37
+
+make_origin_data() {
+  
+  local step_start_time=$(date +%s)
+
+  /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+  --class com.aliyun.odps.spark.examples.makedata_ad.v20240718.makedata_ad_31_originData_20240718 \
+  --master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+  ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+  tablePart:64 repartition:32 \
+  beginStr:${today_early_1}00 endStr:${today_early_1}12 \
+  savePath:${TRAIN_PATH} \
+  table:${TABLE} \
+  filterHours:00,01,02,03,04,05,06,07 \
+  idDefaultValue:0.1 &
+  local task1=$!
+
+  /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+  --class com.aliyun.odps.spark.examples.makedata_ad.v20240718.makedata_ad_31_originData_20240718 \
+  --master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+  ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+  tablePart:64 repartition:32 \
+  beginStr:${today_early_1}13 endStr:${today_early_1}18 \
+  savePath:${TRAIN_PATH} \
+  table:${TABLE} \
+  filterHours:00,01,02,03,04,05,06,07 \
+  idDefaultValue:0.1 &
+  local task2=$!
+
+  /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+  --class com.aliyun.odps.spark.examples.makedata_ad.v20240718.makedata_ad_31_originData_20240718 \
+  --master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+  ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+  tablePart:64 repartition:32 \
+  beginStr:${today_early_1}19 endStr:${today_early_1}23 \
+  savePath:${TRAIN_PATH} \
+  table:${TABLE} \
+  filterHours:00,01,02,03,04,05,06,07 \
+  idDefaultValue:0.1 &
+  local task3=$!
+
+  wait ${task1}
+  local task1_return_code=$?
+
+  wait ${task2}
+  local task2_return_code=$?
+
+  wait ${task3}
+  local task3_return_code=$?
+
+
+  check_run_status ${task1_return_code} ${step_start_time} "spark原始样本生产任务: 生产00~12数据异常"
+  check_run_status ${task2_return_code} ${step_start_time} "spark原始样本生产任务: 生产13~18数据异常"
+  check_run_status ${task3_return_code} ${step_start_time} "spark原始样本生产任务: 生产19~23数据异常"
+}
+
+
+
+make_bucket_feature() {
+
+  local step_start_time=$(date +%s)
+  
+  /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+  --class com.aliyun.odps.spark.examples.makedata_ad.v20240718.makedata_ad_33_bucketData_20240718 \
+  --master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+  ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+  beginStr:${today_early_1} endStr:${today_early_1} repartition:100 \
+  filterNames:_4h_,_5h_,adid_,targeting_conversion_ \
+  readPath:${TRAIN_PATH} \
+  savePath:${BUCKET_FEATURE_PATH}
+
+  local return_code=$?
+  check_run_status ${return_code} ${step_start_time} "spark特征分桶任务"
+}

+ 75 - 0
ad/30_delete_timer_file.sh

@@ -0,0 +1,75 @@
+#!/bin/sh
+
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+
+PREDICT_HOME=/root/zhaohp/recommend-emr-dataprocess/predict
+origin_data_hdfs_dir=/dw/recommend/model/31_ad_sample_data_v3_auto
+bucket_feature_hdfs_dir=/dw/recommend/model/33_ad_train_data_v3_auto
+
+
+# 删除五天之前的预测结果文件
+delete_predict_5d_ago() {
+
+    echo "=========== 开始删除五天前的预测结果文件 $(date "+%Y-%m-%d %H:%M:%d") ==========="
+
+    tmp_file_name=./files_to_delete.txt
+
+    # 查询五天前的预测结果文件,并保存到临时文件
+    find "$PREDICT_HOME" -type f -mtime +5 > "${tmp_file_name}"
+
+    # 逐行读取临时文件中的路径并删除文件
+    while IFS= read -r file; do
+        echo "Deleting: $file"
+        rm -f "$file"
+    done < "${tmp_file_name}"
+
+    # 删除临时文件
+    rm -f "${tmp_file_name}"
+
+    echo "=========== 删除五天前的预测结果文件结束 $(date "+%Y-%m-%d %H:%M:%d") ==========="
+}
+
+# 删除HDFS中的目录
+delete_hdfs_path() {
+    if [ "$#" -ne 2 ]; then
+        echo "Usage: delete_path <early> <path>"
+        return 1
+    fi
+
+    early=$1
+    path=$2
+
+    echo "=========== $(date "+%Y-%m-%d %H:%M:%d") 开始删除目录 ${path}下 ${early}天前的文件  ==========="
+
+    EARLY_DAYS_AGO=$(date -d "${early} days ago" +%Y-%m-%d)
+
+    $HADOOP fs -ls $path | grep '^d' | while read line;
+    do
+        dir=$(echo $line | awk '{print $8}')
+        modified_date=$(echo $line | awk '{print $6}')
+        echo "${line}"
+        if [[ "${modified_date}" < "${EARLY_DAYS_AGO}" ]]; then
+            echo "Deleting: ${dir}"
+            $HADOOP fs -rm -r -skipTrash ${dir}
+        fi
+
+    done
+
+    echo "=========== $(date "+%Y-%m-%d %H:%M:%d") 删除目录 ${path}下 ${early}天前的文件结束  ==========="
+
+}
+
+
+main() {
+    # 删除五天前的预测结果文件
+    delete_predict_5d_ago
+    # 删除七天之前的HDFS中的特征原始数据
+    delete_hdfs_path 7 $origin_data_hdfs_dir
+    # 删除七天之前的HDFS中的特征分桶数据
+    delete_hdfs_path 7 $bucket_feature_hdfs_dir
+}
+
+
+main
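Note: a hedged scheduling sketch for this cleanup script (cron time and log path are assumptions, not taken from this commit):

    # 0 9 * * * cd /root/zhaohp/recommend-emr-dataprocess && /bin/sh ./ad/30_delete_timer_file.sh > logs/ad/30_delete_timer_file/$(date +\%Y\%m\%d\%H\%M).log 2>&1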

+ 141 - 0
ad/ad_monitor_util.py

@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+import argparse
+import json
+
+import pytz
+import requests
+
+from datetime import datetime
+
+server_robot = {
+    'webhook': 'https://open.feishu.cn/open-apis/bot/v2/hook/926982f5-e7af-40f5-81fd-27d8f42718e4',
+}
+
+level_header_template_map = {
+    "info": "turquoise",
+    "error": "red",
+    "warn": "yellow"
+}
+
+level_header_title_content_map = {
+    "info": "广告模型自动更新通知",
+    "error": "广告模型自动更新告警",
+    "warn": "广告模型自动更新告警"
+}
+
+level_task_status_map = {
+    "info": "任务执行成功",
+    "error": "任务执行失败",
+    "warn": "任务执行失败",
+}
+
+
+def send_card_msg_to_feishu(webhook, card_json):
+    """发送消息到飞书"""
+    headers = {'Content-Type': 'application/json'}
+    payload_message = {
+        "msg_type": "interactive",
+        "card": card_json
+    }
+    print(f"推送飞书消息内容: {json.dumps(payload_message)}")
+    response = requests.request('POST', url=webhook, headers=headers, data=json.dumps(payload_message))
+    print(response.text)
+
+
+def timestamp_format(timestamp: str) -> str:
+    try:
+        return (datetime.utcfromtimestamp(int(timestamp))
+                .replace(tzinfo=pytz.UTC)
+                .astimezone(pytz.timezone('Asia/Shanghai'))
+                .strftime('%Y-%m-%d %H:%M:%S')
+                )
+    except ValueError as e:
+        return timestamp
+
+
+def seconds_convert(seconds):
+    hours = seconds // 3600
+    minutes = (seconds % 3600) // 60
+    seconds = seconds % 60
+    return f"{hours}小时 {minutes}分钟 {seconds}秒"
+
+
+def _monitor(level, msg: str, start, elapsed, top10):
+    """消息推送"""
+    """消息推送"""
+    now = datetime.now()
+    msg = msg.replace("\\n", "\n").replace("\\t", "\t")
+    mgs_text = f"- 当前时间: {now.strftime('%Y-%m-%d %H:%M:%S')}" \
+               f"\n- 任务开始时间: {timestamp_format(start)}" \
+               f"\n- 任务状态: {level_task_status_map[level]}" \
+               f"\n- 任务耗时: {seconds_convert(elapsed)}" \
+               f"\n- 任务描述: {msg}"
+    card_json = {
+        "schema": "2.0",
+        "header": {
+            "title": {
+                "tag": "plain_text",
+                "content": level_header_title_content_map[level]
+            },
+            "template": level_header_template_map[level]
+        },
+        "body": {
+            "elements": [
+                {
+                    "tag": "markdown",
+                    "content": mgs_text,
+                    "text_align": "left",
+                    "text_size": "normal",
+                    "element_id": "overview"
+                }
+            ]
+        }
+    }
+    if top10 is not None and len(top10) > 0:
+        collapsible_panel = {
+            "tag": "collapsible_panel",
+            "header": {
+                "title": {
+                    "tag": "markdown",
+                    "content": "**Top10差异详情**"
+                },
+                "vertical_align": "center",
+                "padding": "4px 0px 4px 8px"
+            },
+            "border": {
+                "color": "grey",
+                "corner_radius": "5px"
+            },
+            "element_id": "detail",
+            "elements": [
+                {
+                    "tag": "markdown",
+                    "content": top10.replace("\\n", "\n").replace("\\t", "\t"),
+                    "element_id": "Top10CID"
+                }
+            ]
+        }
+        card_json['body']['elements'].append(collapsible_panel)
+
+    send_card_msg_to_feishu(
+        webhook=server_robot.get('webhook'),
+        card_json=card_json
+    )
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='告警Utils')
+    parser.add_argument('--level', type=str, help='通知级别, info, warn, error', required=True)
+    parser.add_argument('--msg', type=str, help='消息', required=True)
+    parser.add_argument('--start', type=str, help='任务开始时间', required=True)
+    parser.add_argument('--elapsed', type=int, help='任务耗时【秒】', required=True)
+    parser.add_argument("--top10", type=str, help='Top10打分详情', required=False)
+    args = parser.parse_args()
+
+    _monitor(
+        level=args.level,
+        msg=args.msg,
+        start=args.start,
+        elapsed=args.elapsed,
+        top10=args.top10
+    )
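Note: a minimal invocation sketch of this monitor, mirroring the call in ad/01_ad_model_update.sh (the message, elapsed value and top10 text are placeholders):

    start=$(date +%s)
    /root/anaconda3/bin/python ./ad/ad_monitor_util.py \
        --level info \
        --msg "广告模型文件更新完成" \
        --start "${start}" \
        --elapsed 3600 \
        --top10 "| CID | 老模型相对真实CTCVR的变化 | 新模型相对真实CTCVR的变化 |"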

+ 64 - 0
ad/ad_utils.py

@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+from odps import ODPS
+import argparse
+
+ODPS_CONFIG = {
+    'ENDPOINT': 'http://service.cn.maxcompute.aliyun.com/api',
+    'ACCESSID': 'LTAIWYUujJAm7CbH',
+    'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
+}
+
+
+def check_data_hh(project, table, partition, hh) -> int:
+    """检查数据是否准备好,输出数据条数"""
+    odps = ODPS(
+        access_id=ODPS_CONFIG['ACCESSID'],
+        secret_access_key=ODPS_CONFIG['ACCESSKEY'],
+        project=project,
+        endpoint=ODPS_CONFIG['ENDPOINT'],
+        connect_timeout=3000,
+        read_timeout=500000,
+        pool_maxsize=1000,
+        pool_connections=1000
+    )
+    try:
+        t = odps.get_table(name=table)
+        check_res = t.exist_partition(partition_spec=f'dt={partition},hh={hh}')
+        if check_res:
+            sql = f'select * from {project}.{table} where dt = {partition}'
+            with odps.execute_sql(sql=sql).open_reader() as reader:
+                data_count = reader.count
+        else:
+            data_count = 0
+    except Exception as e:
+        print("error:" + str(e))
+        data_count = 0
+    return data_count
+
+
+def check_ad_origin_hive(args):
+    project = "loghubods"
+    table = "alg_recsys_ad_sample_all"
+    partition = args.partition
+    hh = args.hh
+    count = check_data_hh(project, table, partition, hh)
+    if count == 0:
+        print("1")
+        exit(1)
+    else:
+        print("0")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='脚本utils')
+    parser.add_argument('--excute_program', type=str, help='执行程序')
+    parser.add_argument('--partition', type=str, help='表分区')
+    parser.add_argument('--hh', type=str, help='小时级分区时的小时')
+    parser.add_argument('--project', type=str, help='表空间')
+    parser.add_argument('--table', type=str, help='表名')
+    args = parser.parse_args()
+    if args.excute_program == "check_ad_origin_hive":
+        check_ad_origin_hive(args)
+    else:
+        print("无合法参数,验证失败。")
+        exit(999)
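Note: a minimal invocation sketch, mirroring the call in ad/01_ad_model_update.sh (the partition is a placeholder; the script prints "0" when the dt/hh partition exists and has rows, otherwise prints "1" and exits non-zero):

    ret=$(python ./ad/ad_utils.py --excute_program check_ad_origin_hive --partition 20240801 --hh 23)
    if [ "${ret}" -eq 0 ]; then
        echo "partition data is ready"
    fi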

+ 53 - 0
ad/holidays.txt

@@ -0,0 +1,53 @@
+国庆节
+20241001
+2024-10-01
+重阳节
+20241011
+2024-10-11
+样本有问题
+20241112
+2024-11-12
+20241113
+2024-11-13
+圣诞节
+20241225
+2024-12-25
+元旦
+20250101
+2025-01-01
+春节
+20250129
+2025-01-29
+元宵节
+20250215
+2025-02-15
+妇女节
+20250308
+2025-03-08
+劳动节
+20250501
+2025-05-01
+青年节
+20250504
+2025-05-04
+端午节
+20250531
+2025-05-31
+儿童节
+20250601
+2025-06-01
+建党节
+20250701
+2025-07-01
+建军节
+20250801
+2025-08-01
+七夕节
+20250829
+2025-08-29
+2025国庆节
+20251001
+2025-10-01
+中秋节
+20251006
+2025-10-06

+ 198 - 0
ad/model_predict_analyse.py

@@ -0,0 +1,198 @@
+import argparse
+import gzip
+import os.path
+from collections import OrderedDict
+
+import pandas as pd
+from hdfs import InsecureClient
+
+client = InsecureClient("http://master-1-1.c-7f31a3eea195cb73.cn-hangzhou.emr.aliyuncs.com:9870", user="spark")
+
+SEGMENT_BASE_PATH = os.environ.get("SEGMENT_BASE_PATH", "/dw/recommend/model/36_score_calibration_file")
+PREDICT_CACHE_PATH = os.environ.get("PREDICT_CACHE_PATH", "/root/zhaohp/XGB/predict_cache")
+
+
+def read_predict_from_local_txt(txt_file) -> list:
+    result = []
+    with open(txt_file, "r") as f:
+        for line in f.readlines():
+            sp = line.replace("\n", "").split("\t")
+            if len(sp) == 4:
+                label = int(sp[0])
+                cid = sp[3].split("_")[0]
+                score = float(sp[2].replace("[", "").replace("]", "").split(",")[1])
+                result.append({
+                    "label": label,
+                    "cid": cid,
+                    "score": score
+                })
+    return result
+
+
+def read_predict_from_hdfs(hdfs_path: str) -> list:
+    if not hdfs_path.endswith("/"):
+        hdfs_path += "/"
+    result = []
+    for file in client.list(hdfs_path):
+        with client.read(hdfs_path + file) as reader:
+            with gzip.GzipFile(fileobj=reader, mode="rb") as gz_file:
+                for line in gz_file.read().decode("utf-8").split("\n"):
+                    split = line.split("\t")
+                    if len(split) == 4:
+                        cid = split[3].split("_")[0]
+                        label = int(split[0])
+                        score = float(split[2].replace("[", "").replace("]", "").split(",")[1])
+                        result.append({
+                            "cid": cid,
+                            "label": label,
+                            "score": score
+                        })
+
+    return result
+
+
+def _segment_v1(scores, step):
+    bins = []
+    for i in range(0, len(scores), int((len(scores) / step))):
+        if i == 0:
+            bins.append(0)
+        else:
+            bins.append(scores[i])
+    bins.append(1)
+    return list(OrderedDict.fromkeys(bins))
+
+
+def segment_calc_diff_rate_by_score(df: pd.DataFrame, segment_file_path: str, step=100) -> [pd.DataFrame, pd.DataFrame]:
+    sored_df = df.sort_values(by=['score'])
+    # 评估分数分段
+    scores = sored_df['score'].values
+
+    bins = _segment_v1(scores, step)
+
+    # 等分分桶
+    # split_indices = np.array_split(np.arange(len(scores)), step)
+    # bins = [scores[index[0]] for index in split_indices] + [scores[split_indices[-1][-1]]]
+
+    sored_df['score_segment'] = pd.cut(sored_df['score'], bins=bins)
+
+    # 计算分段内分数的差异
+    group_df = sored_df.groupby("score_segment", observed=True).agg(
+        segment_label_sum=('label', 'sum'),
+        segment_label_cnt=('label', 'count'),
+        segment_score_avg=('score', 'mean'),
+    ).reset_index()
+    group_df['segment_true_score'] = group_df['segment_label_sum'] / group_df['segment_label_cnt']
+    group_df['segment_diff_rate'] = (group_df['segment_score_avg'] / group_df['segment_true_score'] - 1).mask(group_df['segment_true_score'] == 0, 0)
+
+    # 完整的分段文件保存
+    csv_data = group_df.to_csv(sep="\t", index=False)
+    with client.write(segment_file_path, encoding='utf-8', overwrite=True) as writer:
+        writer.write(csv_data)
+
+    filtered_df = group_df[(abs(group_df['segment_diff_rate']) >= 0.2) & (group_df['segment_label_cnt'] >= 1000)]
+    filtered_df = filtered_df[['score_segment', 'segment_diff_rate']]
+    # 每条曝光数据添加对应分数的diff
+    merged_df = pd.merge(sored_df, filtered_df, on="score_segment", how="left")
+
+    merged_df['segment_diff_rate'] = merged_df['segment_diff_rate'].fillna(0)
+    return merged_df, filtered_df
+
+
+def read_and_calibration_predict(predict_path: str, step=100) -> [pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    读取评估结果,并进行校准
+    """
+    # 本地调试使用
+    # predicts = read_predict_from_local_txt(predict_path)
+    predicts = read_predict_from_hdfs(predict_path)
+    df = pd.DataFrame(predicts)
+
+    # 模型分分段计算与真实ctcvr的diff_rate
+    predict_basename = os.path.basename(predict_path)
+    if predict_basename.endswith("/"):
+        predict_basename = predict_basename[:-1]
+    df, segment_df = segment_calc_diff_rate_by_score(df, segment_file_path=f"{SEGMENT_BASE_PATH}/{predict_basename}.txt", step=100)
+
+    # 生成校准后的分数
+    df['score_2'] = df['score'] / (1 + df['segment_diff_rate'])
+
+    # 按CID统计真实ctcvr和校准前后的平均模型分
+    grouped_df = df.groupby("cid").agg(
+        view=('cid', 'size'),
+        conv=('label', 'sum'),
+        score_avg=('score', lambda x: round(x.mean(), 6)),
+        score_2_avg=('score_2', lambda x: round(x.mean(), 6)),
+    ).reset_index()
+    grouped_df['true_ctcvr'] = grouped_df['conv'] / grouped_df['view']
+
+    return df, grouped_df, segment_df
+
+
+def predict_local_save_for_auc(old_df: pd.DataFrame, new_df: pd.DataFrame):
+    """
+    本地保存一份评估结果, 计算AUC使用
+    """
+    d = {"old": old_df, "new": new_df}
+    for key in d:
+        df = d[key][['label', "score"]]
+        df.to_csv(f"{PREDICT_CACHE_PATH}/{key}_1.txt", sep="\t", index=False, header=False)
+        df = d[key][['label', "score_2"]]
+        df.to_csv(f"{PREDICT_CACHE_PATH}/{key}_2.txt", sep="\t", index=False, header=False)
+
+
+def _main(old_predict_path: str, new_predict_path: str, calibration_file: str, analyse_file: str):
+    old_df, old_group_df, old_segment_df = read_and_calibration_predict(old_predict_path)
+    new_df, new_group_df, new_segment_df = read_and_calibration_predict(new_predict_path)
+
+    predict_local_save_for_auc(old_df, new_df)
+
+    # 分段文件保存, 此处保留的最后使用的分段文件,不是所有的分段
+    new_segment_df.to_csv(calibration_file, sep='\t', index=False, header=False)
+
+    # 字段重命名,和列过滤
+    old_group_df.rename(columns={'score_avg': 'old_score_avg', 'score_2_avg': 'old_score_2_avg'}, inplace=True)
+    new_group_df.rename(columns={'score_avg': 'new_score_avg', 'score_2_avg': 'new_score_2_avg'}, inplace=True)
+    old_group_df = old_group_df[['cid', 'view', 'conv', 'true_ctcvr', 'old_score_avg', 'old_score_2_avg']]
+    new_group_df = new_group_df[['cid', 'new_score_avg', 'new_score_2_avg']]
+
+    merged = pd.merge(old_group_df, new_group_df, on='cid', how='left')
+
+    # 计算与真实ctcvr的差异值
+    merged["(new-true)/true"] = (merged['new_score_avg'] / merged['true_ctcvr'] - 1).mask(merged['true_ctcvr'] == 0, 0)
+    merged["(old-true)/true"] = (merged['old_score_avg'] / merged['true_ctcvr'] - 1).mask(merged['true_ctcvr'] == 0, 0)
+
+    # 计算校准后的模型分与ctcvr的差异值
+    merged["(new2-true)/true"] = (merged['new_score_2_avg'] / merged['true_ctcvr'] - 1).mask(merged['true_ctcvr'] == 0, 0)
+    merged["(old2-true)/true"] = (merged['old_score_2_avg'] / merged['true_ctcvr'] - 1).mask(merged['true_ctcvr'] == 0, 0)
+
+    # 按照曝光排序,写入本地文件
+    merged = merged.sort_values(by=['view'], ascending=False)
+    merged = merged[[
+        'cid', 'view', "conv", "true_ctcvr",
+        "old_score_avg", "new_score_avg", "(old-true)/true", "(new-true)/true",
+        "old_score_2_avg", "new_score_2_avg", "(old2-true)/true", "(new2-true)/true",
+    ]]
+
+    # 根据文件名保存不同的格式
+    if analyse_file.endswith(".csv"):
+        merged.to_csv(analyse_file, index=False)
+    else:
+        with open(analyse_file, "w") as writer:
+            writer.write(merged.to_string(index=False))
+    print("0")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="model_predict_analyse.py")
+    parser.add_argument("-op", "--old_predict_path", required=True, help="老模型评估结果")
+    parser.add_argument("-np", "--new_predict_path", required=True, help="新模型评估结果")
+    parser.add_argument("-af", "--analyse_file", required=True, help="最后计算结果的保存路径")
+    parser.add_argument("-cf", "--calibration_file", required=True, help="线上使用的segment文件保存路径")
+    args = parser.parse_args()
+
+    _main(
+        old_predict_path=args.old_predict_path,
+        new_predict_path=args.new_predict_path,
+        calibration_file=args.calibration_file,
+        analyse_file=args.analyse_file
+    )
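Note: a minimal invocation sketch, mirroring the calls in ad/01_ad_model_update.sh and ad/02_ad_model_update_test.sh (all paths are placeholders):

    python ./ad/model_predict_analyse.py \
        -op /dw/recommend/model/34_ad_predict_data/20241110_351_1000_1031_1106 \
        -np /dw/recommend/model/34_ad_predict_data/20241110_351_1000_1103_1109 \
        -af ./predict_analyse_file/20241110_351_1000_analyse.txt \
        -cf ./model_xgb_351_1000_v2_calibration.txt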

+ 6 - 2
pom.xml

@@ -26,7 +26,7 @@
     <properties>
         <spark.version>2.3.0</spark.version>
         <cupid.sdk.version>3.3.8-public</cupid.sdk.version>
-        <scala.version>2.11.8</scala.version>
+        <scala.version>2.11.12</scala.version>
         <scala.binary.version>2.11</scala.binary.version>
         <java.version>1.8</java.version>
         <maven.compiler.source>${java.version}</maven.compiler.source>
@@ -176,7 +176,11 @@
             <artifactId>lombok</artifactId>
             <version>1.18.24</version>
         </dependency>
-
+        <dependency>
+            <groupId>ml.dmlc</groupId>
+            <artifactId>xgboost4j-spark_2.11</artifactId>
+            <version>1.1.2</version>
+        </dependency>
     </dependencies>
 
     <build>
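Note: the shell scripts in this commit submit ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar, so the module needs to be packaged before they run; a hedged build sketch (working directory and flags are assumptions, not taken from this commit):

    cd /root/zhaohp/recommend-emr-dataprocess && mvn clean package -DskipTests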

+ 46 - 0
recommend/01_recommend_model_new_train.sh

@@ -0,0 +1,46 @@
+#!/bin/sh
+
+# 重新训练模型
+
+set -x
+
+begin_date=$1
+end_date=$2
+model_name=$3
+train_dim=$4
+hdfs_path=$5
+
+
+PROJECT_HOME=/root/zhaohp/recommend-emr-dataprocess
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+MODEL_PATH=${PROJECT_HOME}/model/recommend
+
+FM_TRAIN=/root/sunmingze/alphaFM/bin/fm_train
+
+train_date=$begin_date
+
+main() {
+
+    end_date=$(date -d "$end_date +1 day" +%Y%m%d)
+
+    # 增量训练模型
+    while [ "$train_date" != "$end_date" ]; do
+        echo "==================== 开始训练 $train_date 模型 ===================="
+
+        if [ "$train_date" == "$begin_date" ]; then
+            $HADOOP fs -text ${hdfs_path}/${train_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${train_dim} -core 8
+        else
+            yesterday=$(date -d "$train_date -1 day" +%Y%m%d)
+            $HADOOP fs -text ${hdfs_path}/${train_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${train_dim} -core 8 -im ${MODEL_PATH}/${model_name}_${yesterday}.txt
+        fi
+
+        echo -e "==================== 训练 $train_date 模型结束 ====================\n\n\n\n\n\n"
+
+        train_date=$(date -d "$train_date +1 day" +%Y%m%d)
+    done
+
+}
+
+main
+
+# nohup ./recommend/01_recommend_model_new_train.sh 20240815 20240821 model_nba8_v3 1,1,8 /dw/recommend/model/43_recsys_train_data_new_table_274_sample_01/ > logs/25_recommend_model_new_train.log 2>&1 &

+ 52 - 0
recommend/02_train_go.sh

@@ -0,0 +1,52 @@
+#!/bin/sh
+
+# 训练新模型,并使用后面的数据计算AUC,评估模型效果
+
+set -x
+
+begin_date=$1
+end_date=$2
+model_name=$3
+train_dim=$4
+hdfs_path=$5
+
+PROJECT_HOME=/root/zhaohp/recommend-emr-dataprocess/
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+MODEL_PATH=${PROJECT_HOME}/model/recommend/
+PREDICT_PATH=${PROJECT_HOME}/predict/recommend/
+
+FM_TRAIN=/root/sunmingze/alphaFM/bin/fm_train
+
+
+train_date=$begin_date
+train_end_time=$(date -d "$end_date +1 day" +%Y%m%d)
+
+main() {
+
+    # 增量训练模型
+
+    while [ "$train_date" != "$train_end_time" ]; do
+        echo "==================== 开始训练 $train_date 模型 ===================="
+
+        if [ "$train_date" == 20240801 ]; then
+            echo -e "\t\t 无效的数据分区: $train_date, 跳过"
+        elif [ "$train_date" == "$begin_date" ]; then
+            $HADOOP fs -text ${hdfs_path}/${train_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${train_dim} -core 8
+        else
+            if [ "$train_date" == 20240801 ]; then
+                yesterday=$(date -d "$train_date -2 day" +%Y%m%d)
+            else
+                yesterday=$(date -d "$train_date -1 day" +%Y%m%d)
+            fi
+            $HADOOP fs -text ${hdfs_path}/${train_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${train_date}.txt -dim ${train_dim} -core 8 -im ${MODEL_PATH}/${model_name}_${yesterday}.txt
+        fi
+
+        echo -e "==================== 训练 $train_date 模型结束 ====================\n\n\n\n\n\n"
+
+        train_date=$(date -d "$train_date +1 day" +%Y%m%d)
+    done
+
+}
+
+main
+

+ 14 - 0
recommend/03_predict.sh

@@ -0,0 +1,14 @@
+#!/bin/sh
+set -e
+set -x
+
+day=$1
+train_path=$2
+model_name=$3
+output_file=$4
+bias=$5
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+$HADOOP fs -text ${train_path}/${day}/* | /root/sunmingze/alphaFM/bin/fm_predict -m model/$model_name -dim ${bias} -core 8 -out predict/${output_file}_$day.txt
+cat predict/${output_file}_$day.txt | /root/sunmingze/AUC/AUC
+
+

+ 67 - 0
recommend/20_vid_avg_score.sh

@@ -0,0 +1,67 @@
+#!/bin/sh
+
+# 计算不同VID的平均分
+
+set -x
+
+predict_date=$1
+model_name=$2
+predict_dim=$3
+
+PROJECT_HOME=/root/zhaohp/recommend-emr-dataprocess/
+MODEL_PATH=${PROJECT_HOME}/model/20240805/
+PREDICT_PATH=${PROJECT_HOME}/predict/recommend/
+TXT_PATH=/mnt/disk1/20240729
+
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+FM_PREDICT=/root/sunmingze/alphaFM/bin/fm_predict
+
+vids=(22895200 22751457 14146727 22847440 22927926 22858609 22974689 22563167 22959023 22970515 22946931 22994781 20720060 22979110)
+
+
+restore_score() {
+    for(( i = 0; i < ${#vids[@]}; i++)) do
+        vid=${vids[i]}
+        score_avg=$(awk '{
+            score = $2
+            new_score = ( 0.1 * score ) / ( 1 - 0.9 * score)
+            sum += new_score
+            count++
+        } END {
+            if ( count > 0 ){
+                print sum / count
+            } else {
+                print "NaN"
+            }
+        }' ${PREDICT_PATH}/${model_name}_${predict_date}_${vid}.txt)
+        echo -e "VID: ${vid} 平均分计算结果: ${score_avg} \n\t数据路径: ${PREDICT_PATH}/${model_name}_${predict_date}_${vid}.txt"
+    done
+}
+
+main() {
+    for(( i = 0; i < ${#vids[@]}; i++)) do
+        vid=${vids[i]}
+        cat ${TXT_PATH}/${predict_date}.txt | \
+        awk -v vid="$vid" -F'\t' '{
+            if ($2 == vid) {
+                split($0, fields, "\t");
+                OFS="\t";
+                line="";
+                for (i=1; i<= length(fields); i++){ 
+                    if (i != 2) {
+                        line = (line ? line "\t" : "") fields[i];
+                    }
+                }
+                print line
+            }
+        }' | \
+        ${FM_PREDICT} -m ${MODEL_PATH}/${model_name}.txt -dim ${predict_dim} -core 8 -out ${PREDICT_PATH}/${model_name}_${predict_date}_${vid}.txt
+        score_avg=`awk '{ sum += $2; count++ } END { if (count > 0) print sum / count }' ${PREDICT_PATH}/${model_name}_${predict_date}_${vid}.txt`
+        echo -e "VID: ${vid} 平均分计算结果: ${score_avg} \n\t模型路径: ${MODEL_PATH}/${model_name}.txt \n\t评估数据路径: ${TXT_PATH}/${predict_date}.txt"
+    done
+}
+
+main
+
+
+# nohup ./recommend/20_vid_avg_score.sh 20240729 model_recommend_v3_sample_01_20240728 8 > logs/20_vid_model_recommend_v3_20240728.log 2>&1 &
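restore_score (defined above but not invoked by main) undoes negative down-sampling before averaging: if q is the score of a model trained with negatives kept at rate w, the calibrated probability is p = w*q / (1 - q + w*q), and with w = 0.1 this is exactly the (0.1*score)/(1-0.9*score) in the awk block; the 0.1 appears to correspond to the fuSampleRate:0.1 used when building the training data. A minimal Scala sketch of the same correction:

    object ScoreCalibration {
      // Undo negative down-sampling: q = score from a model trained with negatives kept at rate w.
      def restore(q: Double, w: Double = 0.1): Double = w * q / (1 - q + w * q)

      def main(args: Array[String]): Unit = {
        println(restore(0.5))       // 0.1*0.5 / (1 - 0.9*0.5) = 0.0909...
        println(restore(0.5, 0.2))  // hypothetical alternative sampling rate
      }
    }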

+ 89 - 0
recommend/21_make_data_new_table.sh

@@ -0,0 +1,89 @@
+#!/bin/sh
+set -x
+
+
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+source /root/anaconda3/bin/activate py37
+
+# Name of the source ODPS table
+table='alg_recsys_sample_all_v2'
+# Partition window: recommend data is produced with a one-day lag, so the run on the 5th uses hours 00-23 of the 3rd to build the new-model data
+begin_early_2_Str="$(date -d '2 days ago' +%Y%m%d)"
+end_early_2_Str="$(date -d '2 days ago' +%Y%m%d)"
+beginHhStr=00
+endHhStr=23
+max_hour=05
+max_minute=00
+# Absolute HDFS paths produced by each step
+# Raw sample data
+originDataPath=/dw/recommend/model/41_recsys_sample_data_new_table/
+# Bucketized feature data
+bucketDataPath=/dw/recommend/model/43_recsys_train_data_new_table/
+# hadoop
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+
+# 1. Produce the raw samples
+echo "$(date +%Y-%m-%d_%H-%M-%S)----------step1------------开始根据${table}生产原始数据"
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_41_originData_20240709 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:${begin_early_2_Str}00 endStr:${end_early_2_Str}09 \
+savePath:${originDataPath} \
+table:${table} &
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_41_originData_20240709 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:${begin_early_2_Str}10 endStr:${end_early_2_Str}15 \
+savePath:${originDataPath} \
+table:${table} &
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_41_originData_20240709 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:${begin_early_2_Str}16 endStr:${end_early_2_Str}23 \
+savePath:${originDataPath} \
+table:${table} &
+
+
+# A bare `wait` always exits 0, so collect each background job's exit status instead
+fail=0
+for pid in $(jobs -p); do
+    wait "$pid" || fail=1
+done
+if [ $fail -ne 0 ]; then
+   echo "Spark原始样本生产任务执行失败"
+   exit 1
+else
+   echo "spark原始样本生产执行成功"
+fi
+
+# Feature sampling and bucketing
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_43_bucketData_fu_sample_20240709 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:/dw/recommend/model/41_recsys_sample_data_new_table \
+savePath:/dw/recommend/model/43_recsys_train_data_new_table_274_sample_01 \
+beginStr:${begin_early_2_Str} endStr:${end_early_2_Str} repartition:500 \
+filterNames:ROS fuSampleRate:0.1 \
+fileName:20240609_bucket_314.txt \
+whatLabel:is_return whatApps:0,3,4,21,17
+
+if [ $? -ne 0 ]; then
+   echo "Spark特征分桶任务执行失败"
+   exit 1
+else
+   echo "spark特征分桶任务执行成功"
+fi
+
+
+# Cron schedule
+# 0 11 * * * cd /root/zhaohp/recommend-emr-dataprocess && /bin/sh ./recommend/21_make_data_new_table.sh > logs/recommend/21_make_data_new_table/$(date +\%Y\%m\%d\%H\%M).log 2>&1
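The three spark-submit calls above split one day's 24 hourly partitions into the ranges 00-09, 10-15 and 16-23 so they run in parallel; each job expands its beginStr/endStr (yyyyMMddHH) into dt=...,hh=... ODPS partition specs, presumably via a helper like the MyDateUtils.getDateHourRange used by the ad jobs in this repo. A minimal standalone sketch of that expansion, for illustration only:

    import java.time.LocalDate
    import java.time.format.DateTimeFormatter
    import scala.collection.mutable.ArrayBuffer

    object HourPartitionSketch {
      private val dayFmt = DateTimeFormatter.ofPattern("yyyyMMdd")

      // Expand e.g. ("2024110800", "2024110809") into dt=20241108,hh=00 ... dt=20241108,hh=09
      def partitions(beginStr: String, endStr: String): Seq[String] = {
        var cur = LocalDate.parse(beginStr.take(8), dayFmt).atTime(beginStr.drop(8).toInt, 0)
        val end = LocalDate.parse(endStr.take(8), dayFmt).atTime(endStr.drop(8).toInt, 0)
        val out = ArrayBuffer[String]()
        while (!cur.isAfter(end)) {
          out += f"dt=${cur.format(dayFmt)},hh=${cur.getHour}%02d"
          cur = cur.plusHours(1)
        }
        out
      }

      def main(args: Array[String]): Unit =
        partitions("2024110800", "2024110803").foreach(println)
    }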

+ 78 - 0
recommend/22_supplementary_data_new_table.sh

@@ -0,0 +1,78 @@
+#!/bin/sh
+set -x
+
+
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+source /root/anaconda3/bin/activate py37
+
+# Name of the source ODPS table
+table='alg_recsys_sample_all_v2'
+# Absolute HDFS paths produced by each step
+# Raw sample data
+originDataPath=/dw/recommend/model/41_recsys_sample_data_new_table/
+# Bucketized feature data
+bucketDataPath=/dw/recommend/model/43_recsys_train_data_new_table/
+# hadoop
+HADOOP=/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop
+
+# 1. Produce the raw samples
+echo "$(date +%Y-%m-%d_%H-%M-%S)----------step1------------开始根据${table}生产原始数据"
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_41_originData_20240709 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:2024110800 endStr:2024110808 \
+savePath:${originDataPath} \
+table:${table} &
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_41_originData_20240709 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:2024110809 endStr:2024110816 \
+savePath:${originDataPath} \
+table:${table} &
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_41_originData_20240709 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:2024110817 endStr:2024110823 \
+savePath:${originDataPath} \
+table:${table} &
+
+# A bare `wait` always exits 0, so collect each background job's exit status instead
+fail=0
+for pid in $(jobs -p); do
+    wait "$pid" || fail=1
+done
+if [ $fail -ne 0 ]; then
+   echo "Spark原始样本生产任务执行失败"
+   exit 1
+else
+   echo "spark原始样本生产执行成功"
+fi
+
+# Feature sampling and bucketing
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_43_bucketData_fu_sample_20240709 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:/dw/recommend/model/41_recsys_sample_data_new_table \
+savePath:/dw/recommend/model/43_recsys_train_data_new_table_274_sample_01 \
+beginStr:20241108 endStr:20241108 repartition:500 \
+filterNames:ROS fuSampleRate:0.1 \
+fileName:20240609_bucket_314.txt \
+whatLabel:is_return whatApps:0,3,4,21,17
+
+if [ $? -ne 0 ]; then
+   echo "Spark特征分桶任务执行失败"
+   exit 1
+else
+   echo "spark特征分桶任务执行成功"
+fi
+

+ 8 - 0
spark-examples.iml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module version="4">
+  <component name="FacetManager">
+    <facet type="Python" name="Python">
+      <configuration sdkName="Python 3.12 (recommend-emr-dataprocess)" />
+    </facet>
+  </component>
+</module>

+ 62 - 0
src/main/java/examples/sparksql/SparkAdCTRSampleTester.java

@@ -0,0 +1,62 @@
+package examples.sparksql;
+
+import com.aliyun.odps.TableSchema;
+import com.aliyun.odps.data.Record;
+import org.apache.spark.SparkConf;
+import org.apache.spark.aliyun.odps.OdpsOps;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.Function2;
+
+import java.util.ArrayList;
+
+
+public class SparkAdCTRSampleTester {
+
+    public static void main(String[] args) {
+
+        String partition = args[0];
+        String accessId = "LTAIWYUujJAm7CbH";
+        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+        String odpsUrl = "http://service.odps.aliyun.com/api";
+        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+        String project = "loghubods";
+        String table = "alg_ad_view_sample";
+        String hdfsPath = "/dw/recommend/model/ad_ctr_samples_test/" + partition;
+
+        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+        System.out.println("Read odps table...");
+
+        JavaRDD<Record> readData = odpsOps.readTableWithJava(project, table, partition, new RecordsToSamples(), Integer.valueOf(30));
+        readData.filter(row -> row.get("type") != null)
+                .filter(row -> row.get("lrsample") != null)
+                .map(line -> singleParse2(line))
+                .saveAsTextFile(hdfsPath);
+    }
+
+
+    static class RecordsToSamples implements Function2<Record, TableSchema, Record> {
+        @Override
+        public Record call(Record record, TableSchema schema) throws Exception {
+            return record;
+        }
+    }
+
+
+    // Processing logic for a single log record
+    public static String singleParse2(Record record) {
+        // Parse the record
+        String label = record.getString("adclick_ornot");
+        if (label == null || label.equals("1")) {
+            label = "0";
+        } else {
+            label = "1";
+        }
+        String samples = record.getString("lrsample").replaceAll("\\\\t","\t");
+        return label + "\t" +  samples;
+    }
+
+
+}
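For reference, singleParse2 maps adclick_ornot (null or "1" is treated as negative) to a 0/1 label and converts the literal "\t" escapes stored in lrsample back into real tabs. A minimal Scala sketch of the same transformation, with a made-up lrsample value:

    object CtrSampleParseSketch {
      // Mirror of singleParse2: (adclick_ornot, lrsample) -> "label<TAB>feat<TAB>feat..."
      def parse(adClickOrNot: String, lrSample: String): String = {
        val label = if (adClickOrNot == null || adClickOrNot == "1") "0" else "1"
        label + "\t" + lrSample.replaceAll("\\\\t", "\t")
      }

      def main(args: Array[String]): Unit = {
        // hypothetical lrsample value as stored in the ODPS table (tabs escaped as the two characters '\' 't')
        println(parse("0", """123:1\t456:1\t789:1"""))
        // => 1<TAB>123:1<TAB>456:1<TAB>789:1
      }
    }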

+ 99 - 0
src/main/java/examples/sparksql/SparkAdCVRSampleLoader.java

@@ -0,0 +1,99 @@
+//package examples.sparksql;
+//
+//import com.aliyun.odps.TableSchema;
+//import com.aliyun.odps.data.Record;
+//import com.tzld.piaoquan.recommend.feature.domain.ad.base.*;
+//import com.tzld.piaoquan.recommend.feature.domain.ad.feature.VlogAdCtrLRFeatureExtractor;
+//import com.tzld.piaoquan.recommend.feature.model.sample.BaseFeature;
+//import com.tzld.piaoquan.recommend.feature.model.sample.GroupedFeature;
+//import com.tzld.piaoquan.recommend.feature.model.sample.LRSamples;
+//import examples.dataloader.AdSampleConstructor;
+//import org.apache.spark.SparkConf;
+//import org.apache.spark.aliyun.odps.OdpsOps;
+//import org.apache.spark.api.java.JavaRDD;
+//import org.apache.spark.api.java.JavaSparkContext;
+//import org.apache.spark.api.java.function.Function2;
+//
+//import java.util.ArrayList;
+//
+//
+//public class SparkAdCVRSampleLoader {
+//
+//    public static void main(String[] args) {
+//
+//        String partition = args[0];
+//        String accessId = "LTAIWYUujJAm7CbH";
+//        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+//        String odpsUrl = "http://service.odps.aliyun.com/api";
+//        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+//        String project = "loghubods";
+//        String table = "alg_ad_view_sample";
+//        String hdfsPath = "/dw/recommend/model/ad_cvr_samples/" + partition;
+//
+//        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+//        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+//        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+//        System.out.println("Read odps table...");
+//
+//        JavaRDD<Record> readData = odpsOps.readTableWithJava(project, table, partition, new RecordsToSamples(), Integer.valueOf(30));
+//        readData.filter(row -> row.getString("adclick_ornot").equals("0")).map(line -> singleParse(line)).saveAsTextFile(hdfsPath);
+//    }
+//
+//
+//    static class RecordsToSamples implements Function2<Record, TableSchema, Record> {
+//        @Override
+//        public Record call(Record record, TableSchema schema) throws Exception {
+//            return record;
+//        }
+//    }
+//
+//
+//    // 单条日志处理逻辑
+//    public static String singleParse(Record record) {
+//        // 数据解析
+//        String label = record.getString("adinvert_ornot");
+//        if (label == null || label.equals("1")) {
+//            label = "0";
+//        } else {
+//            label = "1";
+//        }
+//
+//
+//        // 从sql的 record中 初始化对象内容
+//        AdRequestContext requestContext = AdSampleConstructor.constructRequestContext(record);
+//        UserAdFeature userFeature = AdSampleConstructor.constructUserFeature(record);
+//        AdItemFeature itemFeature = AdSampleConstructor.constructItemFeature(record);
+//
+//        // 转化成bytes
+//        AdRequestContextBytesFeature adRequestContextBytesFeature = new AdRequestContextBytesFeature(requestContext);
+//        UserAdBytesFeature userBytesFeature = new UserAdBytesFeature(userFeature);
+//        AdItemBytesFeature adItemBytesFeature = new AdItemBytesFeature(itemFeature);
+//
+//        // 特征抽取
+//        VlogAdCtrLRFeatureExtractor bytesFeatureExtractor;
+//        bytesFeatureExtractor = new VlogAdCtrLRFeatureExtractor();
+//
+//        LRSamples lrSamples = bytesFeatureExtractor.single(userBytesFeature, adItemBytesFeature, adRequestContextBytesFeature);
+//
+//        return parseSamplesToString2(label, lrSamples);
+//    }
+//
+//
+//    // 构建样本的字符串
+//    public static String parseSamplesToString2(String label, LRSamples lrSamples) {
+//        ArrayList<String> featureList = new ArrayList<String>();
+//        for (int i = 0; i < lrSamples.getFeaturesCount(); i++) {
+//            GroupedFeature groupedFeature = lrSamples.getFeatures(i);
+//            if (groupedFeature != null && groupedFeature.getFeaturesCount() != 0) {
+//                for (int j = 0; j < groupedFeature.getFeaturesCount(); j++) {
+//                    BaseFeature baseFeature = groupedFeature.getFeatures(j);
+//                    if (baseFeature != null) {
+//                        featureList.add(String.valueOf(baseFeature.getIdentifier()) + ":1");
+//                    }
+//                }
+//            }
+//        }
+//        return label + "\t" + String.join("\t", featureList);
+//    }
+//
+//}

+ 59 - 0
src/main/java/examples/sparksql/SparkAdCVRSampleTester.java

@@ -0,0 +1,59 @@
+package examples.sparksql;
+
+import com.aliyun.odps.TableSchema;
+import com.aliyun.odps.data.Record;
+import org.apache.spark.SparkConf;
+import org.apache.spark.aliyun.odps.OdpsOps;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.Function2;
+
+import java.util.ArrayList;
+
+
+public class SparkAdCVRSampleTester {
+
+    public static void main(String[] args) {
+
+        String partition = args[0];
+        String accessId = "LTAIWYUujJAm7CbH";
+        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+        String odpsUrl = "http://service.odps.aliyun.com/api";
+        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+        String project = "loghubods";
+        String table = "alg_ad_view_sample";
+        String hdfsPath = "/dw/recommend/model/ad_cvr_samples_test/" + partition;
+
+        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+        System.out.println("Read odps table...");
+
+        JavaRDD<Record> readData = odpsOps.readTableWithJava(project, table, partition, new RecordsToSamples(), Integer.valueOf(30));
+        readData.filter(row -> row.get("type") != null)
+                .filter(row -> row.get("lrsample") != null)
+                .filter(row -> row.getString("adclick_ornot").equals("0"))
+                .map(line -> singleParse(line))
+                .saveAsTextFile(hdfsPath);
+    }
+
+
+    static class RecordsToSamples implements Function2<Record, TableSchema, Record> {
+        @Override
+        public Record call(Record record, TableSchema schema) throws Exception {
+            return record;
+        }
+    }
+
+    public static String singleParse(Record record) {
+        // Parse the record
+        String label = record.getString("adinvert_ornot");
+        if (label == null || label.equals("1")) {
+            label = "0";
+        } else {
+            label = "1";
+        }
+        String samples = record.getString("lrsample").replaceAll("\\\\t","\t");
+        return label + "\t" + samples;
+    }
+}

+ 95 - 0
src/main/java/examples/sparksql/SparkAdFeaToRedisHourLoader.java

@@ -0,0 +1,95 @@
+//package examples.sparksql;
+//
+//import com.aliyun.odps.TableSchema;
+//import com.aliyun.odps.data.Record;
+//import com.tzld.piaoquan.recommend.feature.domain.ad.base.AdItemFeature;
+//import com.tzld.piaoquan.recommend.feature.domain.ad.base.UserAdFeature;
+//import examples.dataloader.AdRedisFeatureConstructor;
+//import org.apache.spark.SparkConf;
+//import org.apache.spark.aliyun.odps.OdpsOps;
+//import org.apache.spark.api.java.JavaRDD;
+//import org.apache.spark.api.java.JavaSparkContext;
+//import org.apache.spark.api.java.function.Function2;
+//import org.springframework.data.redis.connection.RedisStandaloneConfiguration;
+//import org.springframework.data.redis.connection.jedis.JedisConnectionFactory;
+//import org.springframework.data.redis.core.RedisTemplate;
+//import org.springframework.data.redis.serializer.StringRedisSerializer;
+//
+//import java.util.ArrayList;
+//import java.util.HashMap;
+//import java.util.List;
+//import java.util.Map;
+//
+//
+//public class SparkAdFeaToRedisHourLoader {
+//
+//    private static final String adKeyFormat = "ad:%s";
+//
+//
+//    public static RedisTemplate<String, String> buildRedisTemplate() {
+//        RedisStandaloneConfiguration rsc = new RedisStandaloneConfiguration();
+//        rsc.setPort(6379);
+//        rsc.setPassword("Wqsd@2019");
+//        rsc.setHostName("r-bp1pi8wyv6lzvgjy5z.redis.rds.aliyuncs.com");
+//        RedisTemplate<String, String> template = new RedisTemplate<>();
+//        JedisConnectionFactory fac = new JedisConnectionFactory(rsc);
+//        fac.afterPropertiesSet();
+//        template.setDefaultSerializer(new StringRedisSerializer());
+//        template.setConnectionFactory(fac);
+//        template.afterPropertiesSet();
+//        return template;
+//    }
+//
+//
+//    public static void loadFeatureToRedis(RedisTemplate<String, String> redisTemplate, List<String> line) {
+//        Map<String, String> redisFormat = new HashMap<String, String>();
+//        String key = line.get(0);
+//        String value = line.get(1);
+//        redisFormat.put(key, value);
+//        redisTemplate.opsForValue().multiSet(redisFormat);
+//    }
+//
+//
+//    static class RecordsToAdRedisKV implements Function2<Record, TableSchema, List<String>> {
+//        @Override
+//        public List<String> call(Record record, TableSchema schema) throws Exception {
+//            AdItemFeature adItemFeature = AdRedisFeatureConstructor.constructItemFeature(record);
+//            // ad feature 中的key以creativeID拼接
+//            String key = String.format(adKeyFormat, adItemFeature.getCreativeId());
+//            String value = adItemFeature.getValue();
+//            List<String> kv = new ArrayList<String>();
+//            kv.add(key);
+//            kv.add(value);
+//            return kv;
+//        }
+//    }
+//
+//
+//
+//    public static void main(String[] args) {
+//
+//        String partition = args[0];
+//        String accessId = "LTAIWYUujJAm7CbH";
+//        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+//        String odpsUrl = "http://service.odps.aliyun.com/api";
+//        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+//        String project = "loghubods";
+//        String tableAdInfo = "alg_ad_item_info";
+//
+//        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+//        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+//        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+//        System.out.println("Read odps table...");
+//
+//
+//        // load Ad features
+//        JavaRDD<List<String>> readAdData = odpsOps.readTableWithJava(project, tableAdInfo, partition, new RecordsToAdRedisKV(), Integer.valueOf(10));
+//        readAdData.foreachPartition(
+//                rowIterator -> {
+//                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
+//                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
+//                }
+//        );
+//    }
+//
+//}

+ 67 - 0
src/main/java/examples/utils/AdUtil.java

@@ -0,0 +1,67 @@
+package examples.utils;
+
+import com.alibaba.fastjson.JSON;
+import com.alibaba.fastjson.JSONObject;
+import com.aliyun.odps.data.Record;
+import org.apache.commons.collections4.MapUtils;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class AdUtil {
+
+    public static final String IS_API_FLAG = "1";
+
+    /**
+     * Replace the keys of the online metaFeature JSON with the column names used in the data-warehouse tables, for troubleshooting
+     */
+    public static JSONObject keyReplace(JSONObject featureJson) {
+        JSONObject newJson = new JSONObject();
+        Map<String, String> keyMap = new HashMap<String, String>() {{
+            put("alg_cid_feature_basic_info", "b1_feature");
+            put("alg_cid_feature_adver_action", "b2_feature");
+            put("alg_cid_feature_cid_action", "b3_feature");
+            put("alg_cid_feature_region_action", "b4_feature");
+            put("alg_cid_feature_app_action", "b5_feature");
+            put("alg_cid_feature_week_action", "b6_feature");
+            put("alg_cid_feature_hour_action", "b7_feature");
+            put("alg_cid_feature_brand_action", "b8_feature");
+            put("alg_cid_feature_weChatVersion_action", "b9_feature");
+            put("alg_mid_feature_ad_action", "c1_feature");
+            put("alg_cid_feature_vid_cf", "d1_feature");
+            put("alg_cid_feature_vid_cf_rank", "d2_feature");
+            put("alg_vid_feature_basic_info", "d3_feature");
+            put("alg_mid_feature_return_tags", "e1_feature");
+            put("alg_mid_feature_share_tags", "e2_feature");
+        }};
+
+        for (Map.Entry<String, Object> entry : featureJson.entrySet()) {
+            String key = keyMap.getOrDefault(entry.getKey(), entry.getKey());
+            newJson.put(key, entry.getValue());
+        }
+
+        return newJson;
+    }
+
+    /**
+     * Whether the record has API postback attribution; used to filter samples when producing training data
+     */
+    public static boolean isApi(Record record) {
+
+        if (record.isNull("extend_alg")) {
+            return false;
+        }
+
+        JSONObject extendAlgJson = JSON.parseObject(record.getString("extend_alg"));
+        if (MapUtils.isEmpty(extendAlgJson)) {
+            return false;
+        }
+        if (extendAlgJson.containsKey("extinfo")) {
+            return IS_API_FLAG.equals(extendAlgJson.getJSONObject("extinfo").getString("isApi"));
+        }
+        if (extendAlgJson.containsKey("is_api")) {
+            return IS_API_FLAG.equals(extendAlgJson.getString("is_api"));
+        }
+        return true;
+    }
+}
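A small usage sketch of keyReplace, mapping the online metaFeature keys onto the warehouse column names (the JSON payload below is made up for illustration; isApi takes an ODPS Record and is not shown here):

    import com.alibaba.fastjson.JSON
    import examples.utils.AdUtil

    object AdUtilUsageSketch {
      def main(args: Array[String]): Unit = {
        val online = JSON.parseObject(
          """{"alg_cid_feature_basic_info":{"adid":"123"},"alg_mid_feature_ad_action":{"ad_click_1d":"2"}}""")
        // keys become b1_feature and c1_feature; unknown keys pass through unchanged
        println(AdUtil.keyReplace(online))
      }
    }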

+ 22 - 0
src/main/java/examples/utils/DateTimeUtil.java

@@ -0,0 +1,22 @@
+package examples.utils;
+
+import java.time.Instant;
+import java.time.LocalDateTime;
+import java.time.ZoneId;
+
+public class DateTimeUtil {
+
+    public static int getHourByTimestamp(long timestamp) {
+        return LocalDateTime
+                .ofInstant(Instant.ofEpochSecond(timestamp), ZoneId.systemDefault())
+                .getHour();
+    }
+
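+    // Returns the ISO-8601 day of week (1 = Monday ... 7 = Sunday) for an epoch-second timestamp.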
+    public static int getDayOrWeekByTimestamp(long timestamp) {
+        return LocalDateTime
+                .ofInstant(Instant.ofEpochSecond(timestamp), ZoneId.systemDefault())
+                .getDayOfWeek()
+                .getValue();
+    }
+}
+

File diff suppressed because it is too large
+ 8 - 0
src/main/resources/20240718_ad_bucket_517.txt


File diff suppressed because it is too large
+ 9 - 0
src/main/resources/20240718_ad_bucket_688.txt


+ 689 - 0
src/main/resources/20240718_ad_feature_name.txt

@@ -0,0 +1,689 @@
+cpa
+b2_1h_ctr
+b2_1h_ctcvr
+b2_1h_cvr
+b2_1h_conver
+b2_1h_ecpm
+b2_1h_click
+b2_1h_conver*log(view)
+b2_1h_conver*ctcvr
+b2_2h_ctr
+b2_2h_ctcvr
+b2_2h_cvr
+b2_2h_conver
+b2_2h_ecpm
+b2_2h_click
+b2_2h_conver*log(view)
+b2_2h_conver*ctcvr
+b2_3h_ctr
+b2_3h_ctcvr
+b2_3h_cvr
+b2_3h_conver
+b2_3h_ecpm
+b2_3h_click
+b2_3h_conver*log(view)
+b2_3h_conver*ctcvr
+b2_4h_ctr
+b2_4h_ctcvr
+b2_4h_cvr
+b2_4h_conver
+b2_4h_ecpm
+b2_4h_click
+b2_4h_conver*log(view)
+b2_4h_conver*ctcvr
+b2_5h_ctr
+b2_5h_ctcvr
+b2_5h_cvr
+b2_5h_conver
+b2_5h_ecpm
+b2_5h_click
+b2_5h_conver*log(view)
+b2_5h_conver*ctcvr
+b2_6h_ctr
+b2_6h_ctcvr
+b2_6h_cvr
+b2_6h_conver
+b2_6h_ecpm
+b2_6h_click
+b2_6h_conver*log(view)
+b2_6h_conver*ctcvr
+b2_12h_ctr
+b2_12h_ctcvr
+b2_12h_cvr
+b2_12h_conver
+b2_12h_ecpm
+b2_12h_click
+b2_12h_conver*log(view)
+b2_12h_conver*ctcvr
+b2_1d_ctr
+b2_1d_ctcvr
+b2_1d_cvr
+b2_1d_conver
+b2_1d_ecpm
+b2_1d_click
+b2_1d_conver*log(view)
+b2_1d_conver*ctcvr
+b2_3d_ctr
+b2_3d_ctcvr
+b2_3d_cvr
+b2_3d_conver
+b2_3d_ecpm
+b2_3d_click
+b2_3d_conver*log(view)
+b2_3d_conver*ctcvr
+b2_7d_ctr
+b2_7d_ctcvr
+b2_7d_cvr
+b2_7d_conver
+b2_7d_ecpm
+b2_7d_click
+b2_7d_conver*log(view)
+b2_7d_conver*ctcvr
+b2_today_ctr
+b2_today_ctcvr
+b2_today_cvr
+b2_today_conver
+b2_today_ecpm
+b2_today_click
+b2_today_conver*log(view)
+b2_today_conver*ctcvr
+b2_yesterday_ctr
+b2_yesterday_ctcvr
+b2_yesterday_cvr
+b2_yesterday_conver
+b2_yesterday_ecpm
+b2_yesterday_click
+b2_yesterday_conver*log(view)
+b2_yesterday_conver*ctcvr
+b3_1h_ctr
+b3_1h_ctcvr
+b3_1h_cvr
+b3_1h_conver
+b3_1h_ecpm
+b3_1h_click
+b3_1h_conver*log(view)
+b3_1h_conver*ctcvr
+b3_2h_ctr
+b3_2h_ctcvr
+b3_2h_cvr
+b3_2h_conver
+b3_2h_ecpm
+b3_2h_click
+b3_2h_conver*log(view)
+b3_2h_conver*ctcvr
+b3_3h_ctr
+b3_3h_ctcvr
+b3_3h_cvr
+b3_3h_conver
+b3_3h_ecpm
+b3_3h_click
+b3_3h_conver*log(view)
+b3_3h_conver*ctcvr
+b3_4h_ctr
+b3_4h_ctcvr
+b3_4h_cvr
+b3_4h_conver
+b3_4h_ecpm
+b3_4h_click
+b3_4h_conver*log(view)
+b3_4h_conver*ctcvr
+b3_5h_ctr
+b3_5h_ctcvr
+b3_5h_cvr
+b3_5h_conver
+b3_5h_ecpm
+b3_5h_click
+b3_5h_conver*log(view)
+b3_5h_conver*ctcvr
+b3_6h_ctr
+b3_6h_ctcvr
+b3_6h_cvr
+b3_6h_conver
+b3_6h_ecpm
+b3_6h_click
+b3_6h_conver*log(view)
+b3_6h_conver*ctcvr
+b3_12h_ctr
+b3_12h_ctcvr
+b3_12h_cvr
+b3_12h_conver
+b3_12h_ecpm
+b3_12h_click
+b3_12h_conver*log(view)
+b3_12h_conver*ctcvr
+b3_1d_ctr
+b3_1d_ctcvr
+b3_1d_cvr
+b3_1d_conver
+b3_1d_ecpm
+b3_1d_click
+b3_1d_conver*log(view)
+b3_1d_conver*ctcvr
+b3_3d_ctr
+b3_3d_ctcvr
+b3_3d_cvr
+b3_3d_conver
+b3_3d_ecpm
+b3_3d_click
+b3_3d_conver*log(view)
+b3_3d_conver*ctcvr
+b3_7d_ctr
+b3_7d_ctcvr
+b3_7d_cvr
+b3_7d_conver
+b3_7d_ecpm
+b3_7d_click
+b3_7d_conver*log(view)
+b3_7d_conver*ctcvr
+b3_today_ctr
+b3_today_ctcvr
+b3_today_cvr
+b3_today_conver
+b3_today_ecpm
+b3_today_click
+b3_today_conver*log(view)
+b3_today_conver*ctcvr
+b3_yesterday_ctr
+b3_yesterday_ctcvr
+b3_yesterday_cvr
+b3_yesterday_conver
+b3_yesterday_ecpm
+b3_yesterday_click
+b3_yesterday_conver*log(view)
+b3_yesterday_conver*ctcvr
+b4_1h_ctr
+b4_1h_ctcvr
+b4_1h_cvr
+b4_1h_conver
+b4_1h_ecpm
+b4_1h_click
+b4_1h_conver*log(view)
+b4_1h_conver*ctcvr
+b4_2h_ctr
+b4_2h_ctcvr
+b4_2h_cvr
+b4_2h_conver
+b4_2h_ecpm
+b4_2h_click
+b4_2h_conver*log(view)
+b4_2h_conver*ctcvr
+b4_3h_ctr
+b4_3h_ctcvr
+b4_3h_cvr
+b4_3h_conver
+b4_3h_ecpm
+b4_3h_click
+b4_3h_conver*log(view)
+b4_3h_conver*ctcvr
+b4_4h_ctr
+b4_4h_ctcvr
+b4_4h_cvr
+b4_4h_conver
+b4_4h_ecpm
+b4_4h_click
+b4_4h_conver*log(view)
+b4_4h_conver*ctcvr
+b4_5h_ctr
+b4_5h_ctcvr
+b4_5h_cvr
+b4_5h_conver
+b4_5h_ecpm
+b4_5h_click
+b4_5h_conver*log(view)
+b4_5h_conver*ctcvr
+b4_6h_ctr
+b4_6h_ctcvr
+b4_6h_cvr
+b4_6h_conver
+b4_6h_ecpm
+b4_6h_click
+b4_6h_conver*log(view)
+b4_6h_conver*ctcvr
+b4_12h_ctr
+b4_12h_ctcvr
+b4_12h_cvr
+b4_12h_conver
+b4_12h_ecpm
+b4_12h_click
+b4_12h_conver*log(view)
+b4_12h_conver*ctcvr
+b4_1d_ctr
+b4_1d_ctcvr
+b4_1d_cvr
+b4_1d_conver
+b4_1d_ecpm
+b4_1d_click
+b4_1d_conver*log(view)
+b4_1d_conver*ctcvr
+b4_3d_ctr
+b4_3d_ctcvr
+b4_3d_cvr
+b4_3d_conver
+b4_3d_ecpm
+b4_3d_click
+b4_3d_conver*log(view)
+b4_3d_conver*ctcvr
+b4_7d_ctr
+b4_7d_ctcvr
+b4_7d_cvr
+b4_7d_conver
+b4_7d_ecpm
+b4_7d_click
+b4_7d_conver*log(view)
+b4_7d_conver*ctcvr
+b4_today_ctr
+b4_today_ctcvr
+b4_today_cvr
+b4_today_conver
+b4_today_ecpm
+b4_today_click
+b4_today_conver*log(view)
+b4_today_conver*ctcvr
+b4_yesterday_ctr
+b4_yesterday_ctcvr
+b4_yesterday_cvr
+b4_yesterday_conver
+b4_yesterday_ecpm
+b4_yesterday_click
+b4_yesterday_conver*log(view)
+b4_yesterday_conver*ctcvr
+b5_1h_ctr
+b5_1h_ctcvr
+b5_1h_cvr
+b5_1h_conver
+b5_1h_ecpm
+b5_1h_click
+b5_1h_conver*log(view)
+b5_1h_conver*ctcvr
+b5_2h_ctr
+b5_2h_ctcvr
+b5_2h_cvr
+b5_2h_conver
+b5_2h_ecpm
+b5_2h_click
+b5_2h_conver*log(view)
+b5_2h_conver*ctcvr
+b5_3h_ctr
+b5_3h_ctcvr
+b5_3h_cvr
+b5_3h_conver
+b5_3h_ecpm
+b5_3h_click
+b5_3h_conver*log(view)
+b5_3h_conver*ctcvr
+b5_4h_ctr
+b5_4h_ctcvr
+b5_4h_cvr
+b5_4h_conver
+b5_4h_ecpm
+b5_4h_click
+b5_4h_conver*log(view)
+b5_4h_conver*ctcvr
+b5_5h_ctr
+b5_5h_ctcvr
+b5_5h_cvr
+b5_5h_conver
+b5_5h_ecpm
+b5_5h_click
+b5_5h_conver*log(view)
+b5_5h_conver*ctcvr
+b5_6h_ctr
+b5_6h_ctcvr
+b5_6h_cvr
+b5_6h_conver
+b5_6h_ecpm
+b5_6h_click
+b5_6h_conver*log(view)
+b5_6h_conver*ctcvr
+b5_12h_ctr
+b5_12h_ctcvr
+b5_12h_cvr
+b5_12h_conver
+b5_12h_ecpm
+b5_12h_click
+b5_12h_conver*log(view)
+b5_12h_conver*ctcvr
+b5_1d_ctr
+b5_1d_ctcvr
+b5_1d_cvr
+b5_1d_conver
+b5_1d_ecpm
+b5_1d_click
+b5_1d_conver*log(view)
+b5_1d_conver*ctcvr
+b5_3d_ctr
+b5_3d_ctcvr
+b5_3d_cvr
+b5_3d_conver
+b5_3d_ecpm
+b5_3d_click
+b5_3d_conver*log(view)
+b5_3d_conver*ctcvr
+b5_7d_ctr
+b5_7d_ctcvr
+b5_7d_cvr
+b5_7d_conver
+b5_7d_ecpm
+b5_7d_click
+b5_7d_conver*log(view)
+b5_7d_conver*ctcvr
+b5_today_ctr
+b5_today_ctcvr
+b5_today_cvr
+b5_today_conver
+b5_today_ecpm
+b5_today_click
+b5_today_conver*log(view)
+b5_today_conver*ctcvr
+b5_yesterday_ctr
+b5_yesterday_ctcvr
+b5_yesterday_cvr
+b5_yesterday_conver
+b5_yesterday_ecpm
+b5_yesterday_click
+b5_yesterday_conver*log(view)
+b5_yesterday_conver*ctcvr
+b8_1h_ctr
+b8_1h_ctcvr
+b8_1h_cvr
+b8_1h_conver
+b8_1h_ecpm
+b8_1h_click
+b8_1h_conver*log(view)
+b8_1h_conver*ctcvr
+b8_2h_ctr
+b8_2h_ctcvr
+b8_2h_cvr
+b8_2h_conver
+b8_2h_ecpm
+b8_2h_click
+b8_2h_conver*log(view)
+b8_2h_conver*ctcvr
+b8_3h_ctr
+b8_3h_ctcvr
+b8_3h_cvr
+b8_3h_conver
+b8_3h_ecpm
+b8_3h_click
+b8_3h_conver*log(view)
+b8_3h_conver*ctcvr
+b8_4h_ctr
+b8_4h_ctcvr
+b8_4h_cvr
+b8_4h_conver
+b8_4h_ecpm
+b8_4h_click
+b8_4h_conver*log(view)
+b8_4h_conver*ctcvr
+b8_5h_ctr
+b8_5h_ctcvr
+b8_5h_cvr
+b8_5h_conver
+b8_5h_ecpm
+b8_5h_click
+b8_5h_conver*log(view)
+b8_5h_conver*ctcvr
+b8_6h_ctr
+b8_6h_ctcvr
+b8_6h_cvr
+b8_6h_conver
+b8_6h_ecpm
+b8_6h_click
+b8_6h_conver*log(view)
+b8_6h_conver*ctcvr
+b8_12h_ctr
+b8_12h_ctcvr
+b8_12h_cvr
+b8_12h_conver
+b8_12h_ecpm
+b8_12h_click
+b8_12h_conver*log(view)
+b8_12h_conver*ctcvr
+b8_1d_ctr
+b8_1d_ctcvr
+b8_1d_cvr
+b8_1d_conver
+b8_1d_ecpm
+b8_1d_click
+b8_1d_conver*log(view)
+b8_1d_conver*ctcvr
+b8_3d_ctr
+b8_3d_ctcvr
+b8_3d_cvr
+b8_3d_conver
+b8_3d_ecpm
+b8_3d_click
+b8_3d_conver*log(view)
+b8_3d_conver*ctcvr
+b8_7d_ctr
+b8_7d_ctcvr
+b8_7d_cvr
+b8_7d_conver
+b8_7d_ecpm
+b8_7d_click
+b8_7d_conver*log(view)
+b8_7d_conver*ctcvr
+b8_today_ctr
+b8_today_ctcvr
+b8_today_cvr
+b8_today_conver
+b8_today_ecpm
+b8_today_click
+b8_today_conver*log(view)
+b8_today_conver*ctcvr
+b8_yesterday_ctr
+b8_yesterday_ctcvr
+b8_yesterday_cvr
+b8_yesterday_conver
+b8_yesterday_ecpm
+b8_yesterday_click
+b8_yesterday_conver*log(view)
+b8_yesterday_conver*ctcvr
+b9_1h_ctr
+b9_1h_ctcvr
+b9_1h_cvr
+b9_1h_conver
+b9_1h_ecpm
+b9_1h_click
+b9_1h_conver*log(view)
+b9_1h_conver*ctcvr
+b9_2h_ctr
+b9_2h_ctcvr
+b9_2h_cvr
+b9_2h_conver
+b9_2h_ecpm
+b9_2h_click
+b9_2h_conver*log(view)
+b9_2h_conver*ctcvr
+b9_3h_ctr
+b9_3h_ctcvr
+b9_3h_cvr
+b9_3h_conver
+b9_3h_ecpm
+b9_3h_click
+b9_3h_conver*log(view)
+b9_3h_conver*ctcvr
+b9_4h_ctr
+b9_4h_ctcvr
+b9_4h_cvr
+b9_4h_conver
+b9_4h_ecpm
+b9_4h_click
+b9_4h_conver*log(view)
+b9_4h_conver*ctcvr
+b9_5h_ctr
+b9_5h_ctcvr
+b9_5h_cvr
+b9_5h_conver
+b9_5h_ecpm
+b9_5h_click
+b9_5h_conver*log(view)
+b9_5h_conver*ctcvr
+b9_6h_ctr
+b9_6h_ctcvr
+b9_6h_cvr
+b9_6h_conver
+b9_6h_ecpm
+b9_6h_click
+b9_6h_conver*log(view)
+b9_6h_conver*ctcvr
+b9_12h_ctr
+b9_12h_ctcvr
+b9_12h_cvr
+b9_12h_conver
+b9_12h_ecpm
+b9_12h_click
+b9_12h_conver*log(view)
+b9_12h_conver*ctcvr
+b9_1d_ctr
+b9_1d_ctcvr
+b9_1d_cvr
+b9_1d_conver
+b9_1d_ecpm
+b9_1d_click
+b9_1d_conver*log(view)
+b9_1d_conver*ctcvr
+b9_3d_ctr
+b9_3d_ctcvr
+b9_3d_cvr
+b9_3d_conver
+b9_3d_ecpm
+b9_3d_click
+b9_3d_conver*log(view)
+b9_3d_conver*ctcvr
+b9_7d_ctr
+b9_7d_ctcvr
+b9_7d_cvr
+b9_7d_conver
+b9_7d_ecpm
+b9_7d_click
+b9_7d_conver*log(view)
+b9_7d_conver*ctcvr
+b9_today_ctr
+b9_today_ctcvr
+b9_today_cvr
+b9_today_conver
+b9_today_ecpm
+b9_today_click
+b9_today_conver*log(view)
+b9_today_conver*ctcvr
+b9_yesterday_ctr
+b9_yesterday_ctcvr
+b9_yesterday_cvr
+b9_yesterday_conver
+b9_yesterday_ecpm
+b9_yesterday_click
+b9_yesterday_conver*log(view)
+b9_yesterday_conver*ctcvr
+b6_7d_ctr
+b6_7d_ctcvr
+b6_7d_cvr
+b6_7d_conver
+b6_7d_ecpm
+b6_7d_click
+b6_7d_conver*log(view)
+b6_7d_conver*ctcvr
+b6_14d_ctr
+b6_14d_ctcvr
+b6_14d_cvr
+b6_14d_conver
+b6_14d_ecpm
+b6_14d_click
+b6_14d_conver*log(view)
+b6_14d_conver*ctcvr
+b7_7d_ctr
+b7_7d_ctcvr
+b7_7d_cvr
+b7_7d_conver
+b7_7d_ecpm
+b7_7d_click
+b7_7d_conver*log(view)
+b7_7d_conver*ctcvr
+b7_14d_ctr
+b7_14d_ctcvr
+b7_14d_cvr
+b7_14d_conver
+b7_14d_ecpm
+b7_14d_click
+b7_14d_conver*log(view)
+b7_14d_conver*ctcvr
+viewAll
+clickAll
+converAll
+incomeAll
+ctr_all
+ctcvr_all
+cvr_all
+ecpm_all
+timediff_view
+timediff_click
+timediff_conver
+actionstatic_view
+actionstatic_click
+actionstatic_conver
+actionstatic_income
+actionstatic_ctr
+actionstatic_ctcvr
+actionstatic_cvr
+e1_tags_3d_matchnum
+e1_tags_3d_maxscore
+e1_tags_3d_avgscore
+e1_tags_7d_matchnum
+e1_tags_7d_maxscore
+e1_tags_7d_avgscore
+e1_tags_14d_matchnum
+e1_tags_14d_maxscore
+e1_tags_14d_avgscore
+e2_tags_3d_matchnum
+e2_tags_3d_maxscore
+e2_tags_3d_avgscore
+e2_tags_7d_matchnum
+e2_tags_7d_maxscore
+e2_tags_7d_avgscore
+e2_tags_14d_matchnum
+e2_tags_14d_maxscore
+e2_tags_14d_avgscore
+d1_feature_3h_ctr
+d1_feature_3h_ctcvr
+d1_feature_3h_cvr
+d1_feature_3h_conver
+d1_feature_3h_ecpm
+d1_feature_6h_ctr
+d1_feature_6h_ctcvr
+d1_feature_6h_cvr
+d1_feature_6h_conver
+d1_feature_6h_ecpm
+d1_feature_12h_ctr
+d1_feature_12h_ctcvr
+d1_feature_12h_cvr
+d1_feature_12h_conver
+d1_feature_12h_ecpm
+d1_feature_1d_ctr
+d1_feature_1d_ctcvr
+d1_feature_1d_cvr
+d1_feature_1d_conver
+d1_feature_1d_ecpm
+d1_feature_3d_ctr
+d1_feature_3d_ctcvr
+d1_feature_3d_cvr
+d1_feature_3d_conver
+d1_feature_3d_ecpm
+d1_feature_7d_ctr
+d1_feature_7d_ctcvr
+d1_feature_7d_cvr
+d1_feature_7d_conver
+d1_feature_7d_ecpm
+vid_rank_ctr_1d
+vid_rank_ctr_3d
+vid_rank_ctr_7d
+vid_rank_ctr_14d
+vid_rank_ctcvr_1d
+vid_rank_ctcvr_3d
+vid_rank_ctcvr_7d
+vid_rank_ctcvr_14d
+vid_rank_ecpm_1d
+vid_rank_ecpm_3d
+vid_rank_ecpm_7d
+vid_rank_ecpm_14d
+ctitle_vtitle_similarity
+weight

+ 518 - 0
src/main/resources/20240718_ad_feature_name_517.txt

@@ -0,0 +1,518 @@
+cpa
+b2_1h_ctr
+b2_1h_ctcvr
+b2_1h_cvr
+b2_1h_conver
+b2_1h_click
+b2_1h_conver*log(view)
+b2_1h_conver*ctcvr
+b2_2h_ctr
+b2_2h_ctcvr
+b2_2h_cvr
+b2_2h_conver
+b2_2h_click
+b2_2h_conver*log(view)
+b2_2h_conver*ctcvr
+b2_3h_ctr
+b2_3h_ctcvr
+b2_3h_cvr
+b2_3h_conver
+b2_3h_click
+b2_3h_conver*log(view)
+b2_3h_conver*ctcvr
+b2_6h_ctr
+b2_6h_ctcvr
+b2_6h_cvr
+b2_6h_conver
+b2_6h_click
+b2_6h_conver*log(view)
+b2_6h_conver*ctcvr
+b2_12h_ctr
+b2_12h_ctcvr
+b2_12h_cvr
+b2_12h_conver
+b2_12h_click
+b2_12h_conver*log(view)
+b2_12h_conver*ctcvr
+b2_1d_ctr
+b2_1d_ctcvr
+b2_1d_cvr
+b2_1d_conver
+b2_1d_click
+b2_1d_conver*log(view)
+b2_1d_conver*ctcvr
+b2_3d_ctr
+b2_3d_ctcvr
+b2_3d_cvr
+b2_3d_conver
+b2_3d_click
+b2_3d_conver*log(view)
+b2_3d_conver*ctcvr
+b2_7d_ctr
+b2_7d_ctcvr
+b2_7d_cvr
+b2_7d_conver
+b2_7d_click
+b2_7d_conver*log(view)
+b2_7d_conver*ctcvr
+b2_yesterday_ctr
+b2_yesterday_ctcvr
+b2_yesterday_cvr
+b2_yesterday_conver
+b2_yesterday_click
+b2_yesterday_conver*log(view)
+b2_yesterday_conver*ctcvr
+b2_today_ctr
+b2_today_ctcvr
+b2_today_cvr
+b2_today_conver
+b2_today_click
+b2_today_conver*log(view)
+b2_today_conver*ctcvr
+b3_1h_ctr
+b3_1h_ctcvr
+b3_1h_cvr
+b3_1h_conver
+b3_1h_click
+b3_1h_conver*log(view)
+b3_1h_conver*ctcvr
+b3_2h_ctr
+b3_2h_ctcvr
+b3_2h_cvr
+b3_2h_conver
+b3_2h_click
+b3_2h_conver*log(view)
+b3_2h_conver*ctcvr
+b3_3h_ctr
+b3_3h_ctcvr
+b3_3h_cvr
+b3_3h_conver
+b3_3h_click
+b3_3h_conver*log(view)
+b3_3h_conver*ctcvr
+b3_6h_ctr
+b3_6h_ctcvr
+b3_6h_cvr
+b3_6h_conver
+b3_6h_click
+b3_6h_conver*log(view)
+b3_6h_conver*ctcvr
+b3_12h_ctr
+b3_12h_ctcvr
+b3_12h_cvr
+b3_12h_conver
+b3_12h_click
+b3_12h_conver*log(view)
+b3_12h_conver*ctcvr
+b3_1d_ctr
+b3_1d_ctcvr
+b3_1d_cvr
+b3_1d_conver
+b3_1d_click
+b3_1d_conver*log(view)
+b3_1d_conver*ctcvr
+b3_3d_ctr
+b3_3d_ctcvr
+b3_3d_cvr
+b3_3d_conver
+b3_3d_click
+b3_3d_conver*log(view)
+b3_3d_conver*ctcvr
+b3_7d_ctr
+b3_7d_ctcvr
+b3_7d_cvr
+b3_7d_conver
+b3_7d_click
+b3_7d_conver*log(view)
+b3_7d_conver*ctcvr
+b3_yesterday_ctr
+b3_yesterday_ctcvr
+b3_yesterday_cvr
+b3_yesterday_conver
+b3_yesterday_click
+b3_yesterday_conver*log(view)
+b3_yesterday_conver*ctcvr
+b3_today_ctr
+b3_today_ctcvr
+b3_today_cvr
+b3_today_conver
+b3_today_click
+b3_today_conver*log(view)
+b3_today_conver*ctcvr
+b4_1h_ctr
+b4_1h_ctcvr
+b4_1h_cvr
+b4_1h_conver
+b4_1h_click
+b4_1h_conver*log(view)
+b4_1h_conver*ctcvr
+b4_2h_ctr
+b4_2h_ctcvr
+b4_2h_cvr
+b4_2h_conver
+b4_2h_click
+b4_2h_conver*log(view)
+b4_2h_conver*ctcvr
+b4_3h_ctr
+b4_3h_ctcvr
+b4_3h_cvr
+b4_3h_conver
+b4_3h_click
+b4_3h_conver*log(view)
+b4_3h_conver*ctcvr
+b4_6h_ctr
+b4_6h_ctcvr
+b4_6h_cvr
+b4_6h_conver
+b4_6h_click
+b4_6h_conver*log(view)
+b4_6h_conver*ctcvr
+b4_12h_ctr
+b4_12h_ctcvr
+b4_12h_cvr
+b4_12h_conver
+b4_12h_click
+b4_12h_conver*log(view)
+b4_12h_conver*ctcvr
+b4_1d_ctr
+b4_1d_ctcvr
+b4_1d_cvr
+b4_1d_conver
+b4_1d_click
+b4_1d_conver*log(view)
+b4_1d_conver*ctcvr
+b4_3d_ctr
+b4_3d_ctcvr
+b4_3d_cvr
+b4_3d_conver
+b4_3d_click
+b4_3d_conver*log(view)
+b4_3d_conver*ctcvr
+b4_7d_ctr
+b4_7d_ctcvr
+b4_7d_cvr
+b4_7d_conver
+b4_7d_click
+b4_7d_conver*log(view)
+b4_7d_conver*ctcvr
+b4_yesterday_ctr
+b4_yesterday_ctcvr
+b4_yesterday_cvr
+b4_yesterday_conver
+b4_yesterday_click
+b4_yesterday_conver*log(view)
+b4_yesterday_conver*ctcvr
+b4_today_ctr
+b4_today_ctcvr
+b4_today_cvr
+b4_today_conver
+b4_today_click
+b4_today_conver*log(view)
+b4_today_conver*ctcvr
+b5_1h_ctr
+b5_1h_ctcvr
+b5_1h_cvr
+b5_1h_conver
+b5_1h_click
+b5_1h_conver*log(view)
+b5_1h_conver*ctcvr
+b5_2h_ctr
+b5_2h_ctcvr
+b5_2h_cvr
+b5_2h_conver
+b5_2h_click
+b5_2h_conver*log(view)
+b5_2h_conver*ctcvr
+b5_3h_ctr
+b5_3h_ctcvr
+b5_3h_cvr
+b5_3h_conver
+b5_3h_click
+b5_3h_conver*log(view)
+b5_3h_conver*ctcvr
+b5_6h_ctr
+b5_6h_ctcvr
+b5_6h_cvr
+b5_6h_conver
+b5_6h_click
+b5_6h_conver*log(view)
+b5_6h_conver*ctcvr
+b5_12h_ctr
+b5_12h_ctcvr
+b5_12h_cvr
+b5_12h_conver
+b5_12h_click
+b5_12h_conver*log(view)
+b5_12h_conver*ctcvr
+b5_1d_ctr
+b5_1d_ctcvr
+b5_1d_cvr
+b5_1d_conver
+b5_1d_click
+b5_1d_conver*log(view)
+b5_1d_conver*ctcvr
+b5_3d_ctr
+b5_3d_ctcvr
+b5_3d_cvr
+b5_3d_conver
+b5_3d_click
+b5_3d_conver*log(view)
+b5_3d_conver*ctcvr
+b5_7d_ctr
+b5_7d_ctcvr
+b5_7d_cvr
+b5_7d_conver
+b5_7d_click
+b5_7d_conver*log(view)
+b5_7d_conver*ctcvr
+b5_yesterday_ctr
+b5_yesterday_ctcvr
+b5_yesterday_cvr
+b5_yesterday_conver
+b5_yesterday_click
+b5_yesterday_conver*log(view)
+b5_yesterday_conver*ctcvr
+b5_today_ctr
+b5_today_ctcvr
+b5_today_cvr
+b5_today_conver
+b5_today_click
+b5_today_conver*log(view)
+b5_today_conver*ctcvr
+b8_1h_ctr
+b8_1h_ctcvr
+b8_1h_cvr
+b8_1h_conver
+b8_1h_click
+b8_1h_conver*log(view)
+b8_1h_conver*ctcvr
+b8_2h_ctr
+b8_2h_ctcvr
+b8_2h_cvr
+b8_2h_conver
+b8_2h_click
+b8_2h_conver*log(view)
+b8_2h_conver*ctcvr
+b8_3h_ctr
+b8_3h_ctcvr
+b8_3h_cvr
+b8_3h_conver
+b8_3h_click
+b8_3h_conver*log(view)
+b8_3h_conver*ctcvr
+b8_6h_ctr
+b8_6h_ctcvr
+b8_6h_cvr
+b8_6h_conver
+b8_6h_click
+b8_6h_conver*log(view)
+b8_6h_conver*ctcvr
+b8_12h_ctr
+b8_12h_ctcvr
+b8_12h_cvr
+b8_12h_conver
+b8_12h_click
+b8_12h_conver*log(view)
+b8_12h_conver*ctcvr
+b8_1d_ctr
+b8_1d_ctcvr
+b8_1d_cvr
+b8_1d_conver
+b8_1d_click
+b8_1d_conver*log(view)
+b8_1d_conver*ctcvr
+b8_3d_ctr
+b8_3d_ctcvr
+b8_3d_cvr
+b8_3d_conver
+b8_3d_click
+b8_3d_conver*log(view)
+b8_3d_conver*ctcvr
+b8_7d_ctr
+b8_7d_ctcvr
+b8_7d_cvr
+b8_7d_conver
+b8_7d_click
+b8_7d_conver*log(view)
+b8_7d_conver*ctcvr
+b8_yesterday_ctr
+b8_yesterday_ctcvr
+b8_yesterday_cvr
+b8_yesterday_conver
+b8_yesterday_click
+b8_yesterday_conver*log(view)
+b8_yesterday_conver*ctcvr
+b8_today_ctr
+b8_today_ctcvr
+b8_today_cvr
+b8_today_conver
+b8_today_click
+b8_today_conver*log(view)
+b8_today_conver*ctcvr
+b9_1h_ctr
+b9_1h_ctcvr
+b9_1h_cvr
+b9_1h_conver
+b9_1h_click
+b9_1h_conver*log(view)
+b9_1h_conver*ctcvr
+b9_2h_ctr
+b9_2h_ctcvr
+b9_2h_cvr
+b9_2h_conver
+b9_2h_click
+b9_2h_conver*log(view)
+b9_2h_conver*ctcvr
+b9_3h_ctr
+b9_3h_ctcvr
+b9_3h_cvr
+b9_3h_conver
+b9_3h_click
+b9_3h_conver*log(view)
+b9_3h_conver*ctcvr
+b9_6h_ctr
+b9_6h_ctcvr
+b9_6h_cvr
+b9_6h_conver
+b9_6h_click
+b9_6h_conver*log(view)
+b9_6h_conver*ctcvr
+b9_12h_ctr
+b9_12h_ctcvr
+b9_12h_cvr
+b9_12h_conver
+b9_12h_click
+b9_12h_conver*log(view)
+b9_12h_conver*ctcvr
+b9_1d_ctr
+b9_1d_ctcvr
+b9_1d_cvr
+b9_1d_conver
+b9_1d_click
+b9_1d_conver*log(view)
+b9_1d_conver*ctcvr
+b9_3d_ctr
+b9_3d_ctcvr
+b9_3d_cvr
+b9_3d_conver
+b9_3d_click
+b9_3d_conver*log(view)
+b9_3d_conver*ctcvr
+b9_7d_ctr
+b9_7d_ctcvr
+b9_7d_cvr
+b9_7d_conver
+b9_7d_click
+b9_7d_conver*log(view)
+b9_7d_conver*ctcvr
+b9_yesterday_ctr
+b9_yesterday_ctcvr
+b9_yesterday_cvr
+b9_yesterday_conver
+b9_yesterday_click
+b9_yesterday_conver*log(view)
+b9_yesterday_conver*ctcvr
+b9_today_ctr
+b9_today_ctcvr
+b9_today_cvr
+b9_today_conver
+b9_today_click
+b9_today_conver*log(view)
+b9_today_conver*ctcvr
+b6_7d_ctr
+b6_7d_ctcvr
+b6_7d_cvr
+b6_7d_conver
+b6_7d_click
+b6_7d_conver*log(view)
+b6_7d_conver*ctcvr
+b6_14d_ctr
+b6_14d_ctcvr
+b6_14d_cvr
+b6_14d_conver
+b6_14d_click
+b6_14d_conver*log(view)
+b6_14d_conver*ctcvr
+b7_7d_ctr
+b7_7d_ctcvr
+b7_7d_cvr
+b7_7d_conver
+b7_7d_click
+b7_7d_conver*log(view)
+b7_7d_conver*ctcvr
+b7_14d_ctr
+b7_14d_ctcvr
+b7_14d_cvr
+b7_14d_conver
+b7_14d_click
+b7_14d_conver*log(view)
+b7_14d_conver*ctcvr
+viewAll
+clickAll
+converAll
+incomeAll
+ctr_all
+ctcvr_all
+cvr_all
+timediff_view
+timediff_click
+timediff_conver
+actionstatic_view
+actionstatic_click
+actionstatic_conver
+actionstatic_income
+actionstatic_ctr
+actionstatic_ctcvr
+actionstatic_cvr
+e1_tags_3d_matchnum
+e1_tags_3d_maxscore
+e1_tags_3d_avgscore
+e1_tags_7d_matchnum
+e1_tags_7d_maxscore
+e1_tags_7d_avgscore
+e1_tags_14d_matchnum
+e1_tags_14d_maxscore
+e1_tags_14d_avgscore
+e2_tags_3d_matchnum
+e2_tags_3d_maxscore
+e2_tags_3d_avgscore
+e2_tags_7d_matchnum
+e2_tags_7d_maxscore
+e2_tags_7d_avgscore
+e2_tags_14d_matchnum
+e2_tags_14d_maxscore
+e2_tags_14d_avgscore
+d1_feature_3h_ctr
+d1_feature_3h_ctcvr
+d1_feature_3h_cvr
+d1_feature_3h_conver
+d1_feature_6h_ctr
+d1_feature_6h_ctcvr
+d1_feature_6h_cvr
+d1_feature_6h_conver
+d1_feature_12h_ctr
+d1_feature_12h_ctcvr
+d1_feature_12h_cvr
+d1_feature_12h_conver
+d1_feature_1d_ctr
+d1_feature_1d_ctcvr
+d1_feature_1d_cvr
+d1_feature_1d_conver
+d1_feature_3d_ctr
+d1_feature_3d_ctcvr
+d1_feature_3d_cvr
+d1_feature_3d_conver
+d1_feature_7d_ctr
+d1_feature_7d_ctcvr
+d1_feature_7d_cvr
+d1_feature_7d_conver
+vid_rank_ctr_1d
+vid_rank_ctr_3d
+vid_rank_ctr_7d
+vid_rank_ctr_14d
+vid_rank_ctcvr_1d
+vid_rank_ctcvr_3d
+vid_rank_ctcvr_7d
+vid_rank_ctcvr_14d
+ctitle_vtitle_similarity
+weight

+ 1 - 0
src/main/resources/weight_ad_feature_name.txt

@@ -0,0 +1 @@
+weight

+ 131 - 0
src/main/scala/com/aliyun/odps/spark/ad/xgboost/v20240808/XGBoostTrain.scala

@@ -0,0 +1,131 @@
+package com.aliyun.odps.spark.ad.xgboost.v20240808
+
+import com.aliyun.odps.spark.examples.myUtils.ParamUtils
+import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
+import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.math.NumberUtils
+import org.apache.spark.ml.feature.VectorAssembler
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.types.{DataTypes, StructField}
+import org.apache.spark.sql.{Dataset, Row, SparkSession}
+
+import java.net.URL
+import java.time.LocalDateTime
+import java.time.format.DateTimeFormatter
+import scala.io.Source
+
+object XGBoostTrain {
+  def main(args: Array[String]): Unit = {
+    try {
+
+      val param = ParamUtils.parseArgs(args)
+
+      val dt = LocalDateTime.now.format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss"))
+
+      val spark = SparkSession.builder()
+        .appName("XGBoostTrain:" + dt)
+        .getOrCreate()
+      val sc = spark.sparkContext
+
+      val loader = getClass.getClassLoader
+
+      val readPath = param.getOrElse("trainReadPath", "")
+      val predictReadPath = param.getOrElse("predictReadPath", "")
+      val filterNameSet = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+      val featureNameFile = param.getOrElse("featureNameFile", "20240718_ad_feature_name.txt")
+
+      val featureNameContent = readFile(loader.getResource(featureNameFile))
+
+      val featureNameList: List[String] = featureNameContent.split("\n")
+        .map(r => r.replace(" ", "").replaceAll("\n", ""))
+        .filter(r => r.nonEmpty)
+        .filter(r => !containsAny(filterNameSet, r))
+        .toList
+
+      val rowRDD = dataMap(sc.textFile(readPath), featureNameList)
+
+      println(s"rowRDD count ${rowRDD.count()}")
+
+      val fields: Array[StructField] = Array(
+        DataTypes.createStructField("label", DataTypes.IntegerType, true)
+      ) ++ featureNameList.map(f => DataTypes.createStructField(f, DataTypes.DoubleType, true))
+
+      val trainDataSet: Dataset[Row] = spark.createDataFrame(rowRDD, DataTypes.createStructType(fields))
+
+      val vectorAssembler = new VectorAssembler().setInputCols(featureNameList.toArray).setOutputCol("features")
+
+      val xgbInput = vectorAssembler.transform(trainDataSet).select("features", "label")
+      xgbInput.show()
+
+      // Build the XGBoostClassifier
+      val xgbClassifier = new XGBoostClassifier()
+        .setEta(0.01f)
+        .setMissing(0.0f)
+        .setMaxDepth(5)
+        .setNumRound(1000)
+        .setSubsample(0.8)
+        .setColsampleBytree(0.8)
+        .setScalePosWeight(1)
+        .setObjective("binary:logistic")
+        .setEvalMetric("auc")
+        .setFeaturesCol("features")
+        .setLabelCol("label")
+        .setNthread(1)
+        .setNumWorkers(22)
+
+      // Train the model
+      val model = xgbClassifier.fit(xgbInput)
+
+
+
+    }
+    catch {
+      case e: Throwable => e.printStackTrace()
+    }
+  }
+
+  private def readFile(filePath: URL): String = {
+    var source: Option[Source] = None
+    try {
+      source = Some(Source.fromURL(filePath))
+      return source.get.getLines().mkString("\n")
+    }
+    catch {
+      case e: Exception => println("文件读取异常: " + e.toString)
+    }
+    finally {
+      source.foreach(_.close())
+    }
+    ""
+  }
+
+  private def containsAny(list: Iterable[String], s: String): Boolean = {
+    for (item <- list) {
+      if (s.contains(item)) {
+        return true
+      }
+    }
+    false
+  }
+
+  private def dataMap(data: RDD[String], featureNameList: List[String]): RDD[Row] = {
+    data.map(r => {
+      val line: Array[String] = StringUtils.split(r, "\t")
+      val label: Int = NumberUtils.toInt(line(0))
+
+      val map: Map[String, Double] = line.drop(1).map { entry =>
+        val Array(key, value) = entry.split(":")
+        key -> NumberUtils.toDouble(value, 0.0)
+      }.toMap
+
+      val v: Array[Any] = Array.ofDim[Any](featureNameList.length + 1)
+      v(0) = label
+
+      for (index <- featureNameList.indices) {
+        v(index + 1) = map.getOrElse(featureNameList(index), 0.0)
+      }
+
+      Row.fromSeq(v)
+    })
+  }
+}
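The commit trains the model but never uses predictReadPath or persists the result. A hedged sketch of what could follow fit() inside the try block, reusing the file's own dataMap, fields and vectorAssembler; the modelSavePath parameter and the evaluator choice are assumptions rather than part of this commit, it needs import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator, and ML persistence requires an xgboost4j-spark version that supports it:

      // Score the held-out set read from predictReadPath and report AUC.
      val predictRowRDD = dataMap(sc.textFile(predictReadPath), featureNameList)
      val predictDataSet = spark.createDataFrame(predictRowRDD, DataTypes.createStructType(fields))
      val predictInput = vectorAssembler.transform(predictDataSet).select("features", "label")
      val predictions = model.transform(predictInput)

      val auc = new BinaryClassificationEvaluator()
        .setLabelCol("label")
        .setRawPredictionCol("rawPrediction")
        .setMetricName("areaUnderROC")
        .evaluate(predictions)
      println(s"predict data auc: $auc")

      // Persist the fitted model so it can be reloaded for batch scoring (hypothetical save path parameter).
      model.write.overwrite().save(param.getOrElse("modelSavePath", "/dw/recommend/model/ad_xgb_model/" + dt))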

+ 16 - 16
src/main/scala/com/aliyun/odps/spark/examples/SparkPi.scala

@@ -1,20 +1,20 @@
 /**
-  * Licensed to the Apache Software Foundation (ASF) under one
-  * or more contributor license agreements.  See the NOTICE file
-  * distributed with this work for additional information
-  * regarding copyright ownership.  The ASF licenses this file
-  * to you under the Apache License, Version 2.0 (the
-  * "License"); you may not use this file except in compliance
-  * with the License.  You may obtain a copy of the License at
-  * <p>
-  * http://www.apache.org/licenses/LICENSE-2.0
-  * <p>
-  * Unless required by applicable law or agreed to in writing, software
-  * distributed under the License is distributed on an "AS IS" BASIS,
-  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  * See the License for the specific language governing permissions and
-  * limitations under the License.
-  */
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 package com.aliyun.odps.spark.examples
 

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_31_originData_20240620.scala → src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_31_originData_20240620.scala

@@ -1,4 +1,4 @@
-package com.aliyun.odps.spark.examples.makedata_ad
+package com.aliyun.odps.spark.examples.makedata_ad.v20240620
 
 import com.alibaba.fastjson.{JSON, JSONObject}
 import com.aliyun.odps.TableSchema

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_32_bucket_20240622.scala → src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_32_bucket_20240622.scala

@@ -1,4 +1,4 @@
-package com.aliyun.odps.spark.examples.makedata_ad
+package com.aliyun.odps.spark.examples.makedata_ad.v20240620
 
 import com.alibaba.fastjson.JSON
 import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils}

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_33_bucketDataPrint_20240628.scala → src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_33_bucketDataPrint_20240628.scala

@@ -1,4 +1,4 @@
-package com.aliyun.odps.spark.examples.makedata_ad
+package com.aliyun.odps.spark.examples.makedata_ad.v20240620
 
 import com.alibaba.fastjson.{JSON, JSONObject}
 import com.aliyun.odps.TableSchema

+ 2 - 2
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_33_bucketData_20240622.scala → src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240620/makedata_ad_33_bucketData_20240622.scala

@@ -1,4 +1,4 @@
-package com.aliyun.odps.spark.examples.makedata_ad
+package com.aliyun.odps.spark.examples.makedata_ad.v20240620
 
 import com.alibaba.fastjson.JSON
 import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
@@ -51,7 +51,7 @@ object makedata_ad_33_bucketData_20240622 {
     val beginStr = param.getOrElse("beginStr", "20240620")
     val endStr = param.getOrElse("endStr", "20240620")
     val repartition = param.getOrElse("repartition", "100").toInt
-    val filterNames = param.getOrElse("filterNames", "").split(",").toSet
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
     val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
 
     val dateRange = MyDateUtils.getDateRange(beginStr, endStr)

+ 431 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_31_originData_20240718.scala

@@ -0,0 +1,431 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.RankExtractorFeature_20240530
+import examples.utils.DateTimeUtil
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import org.xm.Similarity
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+
+/*
+   20240608 feature extraction
+ */
+
+object makedata_ad_31_originData_20240718 {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1. Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val beginStr = param.getOrElse("beginStr", "2024062008")
+    val endStr = param.getOrElse("endStr", "2024062023")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/31_ad_sample_data/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_ad_sample_all")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterHours = param.getOrElse("filterHours", "00,01,02,03,04,05,06,07").split(",").toSet
+    val idDefaultValue = param.getOrElse("idDefaultValue", "1.0").toDouble
+    // 2. Read the ODPS table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3. Loop over the partitions and produce the data
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    for (dt_hh <- timeRange) {
+      val dt = dt_hh.substring(0, 8)
+      val hh = dt_hh.substring(8, 10)
+      val partition = s"dt=$dt,hh=$hh"
+      if (filterHours.nonEmpty && filterHours.contains(hh)) {
+        println("skip partition: " + partition)
+      } else {
+        println("start processing partition: " + partition)
+        val odpsData = odpsOps.readTable(project = project,
+            table = table,
+            partition = partition,
+            transfer = func,
+            numPartition = tablePart)
+          .filter(record => {
+            val extendAlg: JSONObject = if (record.isNull("extend_alg")) new JSONObject() else
+              JSON.parseObject(record.getString("extend_alg"))
+            val isApi = extendAlg.getString("is_api")
+            "1".equals(isApi)
+          })
+          .map(record => {
+
+            val ts = record.getString("ts").toInt
+            val cid = record.getString("cid")
+            val apptype = record.getString("apptype")
+            val extend: JSONObject = if (record.isNull("extend")) new JSONObject() else
+              JSON.parseObject(record.getString("extend"))
+
+            val featureMap = new JSONObject()
+
+            val b1: JSONObject = if (record.isNull("b1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b1_feature"))
+            val b2: JSONObject = if (record.isNull("b2_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b2_feature"))
+            val b3: JSONObject = if (record.isNull("b3_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b3_feature"))
+            val b4: JSONObject = if (record.isNull("b4_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b4_feature"))
+            val b5: JSONObject = if (record.isNull("b5_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b5_feature"))
+            val b6: JSONObject = if (record.isNull("b6_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b6_feature"))
+            val b7: JSONObject = if (record.isNull("b7_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b7_feature"))
+            val b8: JSONObject = if (record.isNull("b8_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b8_feature"))
+            val b9: JSONObject = if (record.isNull("b9_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b9_feature"))
+
+
+            featureMap.put("cid_" + cid, idDefaultValue)
+            if (b1.containsKey("adid") && b1.getString("adid").nonEmpty) {
+              featureMap.put("adid_" + b1.getString("adid"), idDefaultValue)
+            }
+            if (b1.containsKey("adverid") && b1.getString("adverid").nonEmpty) {
+              featureMap.put("adverid_" + b1.getString("adverid"), idDefaultValue)
+            }
+            if (b1.containsKey("targeting_conversion") && b1.getString("targeting_conversion").nonEmpty) {
+              featureMap.put("targeting_conversion_" + b1.getString("targeting_conversion"), idDefaultValue)
+            }
+
+            val hour = DateTimeUtil.getHourByTimestamp(ts)
+            featureMap.put("hour_" + hour, idDefaultValue)
+
+            val dayOfWeek = DateTimeUtil.getDayOrWeekByTimestamp(ts)
+            featureMap.put("dayofweek_" + dayOfWeek, idDefaultValue);
+
+            featureMap.put("apptype_" + apptype, idDefaultValue);
+
+            if (extend.containsKey("abcode") && extend.getString("abcode").nonEmpty) {
+              featureMap.put("abcode_" + extend.getString("abcode"), idDefaultValue)
+            }
+
+
+            if (b1.containsKey("cpa")) {
+              featureMap.put("cpa", b1.getString("cpa").toDouble)
+            }
+            if (b1.containsKey("weight") && b1.getString("weight").nonEmpty) {
+              featureMap.put("weight", b1.getString("weight").toDouble)
+            }
+
+            for ((bn, prefix1) <- List(
+              (b2, "b2"), (b3, "b3"), (b4, "b4"), (b5, "b5"), (b8, "b8"), (b9, "b9")
+            )) {
+              for (prefix2 <- List(
+                "1h", "2h", "3h", "4h", "5h", "6h", "12h", "1d", "3d", "7d", "today", "yesterday"
+              )) {
+                val view = if (bn.isEmpty) 0D else bn.getIntValue("ad_view_" + prefix2).toDouble
+                val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
+                val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
+                val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
+                val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+                val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+                val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+                val f4 = conver
+                val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver", f4)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
+
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "click", click)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*log(view)", conver * RankExtractorFeature_20240530.calLog(view))
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*ctcvr", conver * f2)
+              }
+            }
+
+            for ((bn, prefix1) <- List(
+              (b6, "b6"), (b7, "b7")
+            )) {
+              for (prefix2 <- List(
+                "7d", "14d"
+              )) {
+                val view = if (bn.isEmpty) 0D else bn.getIntValue("ad_view_" + prefix2).toDouble
+                val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
+                val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
+                val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
+                val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+                val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+                val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+                val f4 = conver
+                val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver", f4)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
+
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "click", click)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*log(view)", conver * RankExtractorFeature_20240530.calLog(view))
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*ctcvr", conver * f2)
+              }
+            }
+
+            val c1: JSONObject = if (record.isNull("c1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("c1_feature"))
+
+            val midActionList = if (c1.containsKey("action") && c1.getString("action").nonEmpty) {
+              c1.getString("action").split(",").map(r => {
+                val rList = r.split(":")
+                (rList(0), (rList(1).toInt, rList(2).toInt, rList(3).toInt, rList(4).toInt, rList(5)))
+              }).sortBy(-_._2._1).toList
+            } else {
+              new ArrayBuffer[(String, (Int, Int, Int, Int, String))]().toList
+            }
+            // user-level (u) features
+            val viewAll = midActionList.size.toDouble
+            val clickAll = midActionList.map(_._2._2).sum.toDouble
+            val converAll = midActionList.map(_._2._3).sum.toDouble
+            val incomeAll = midActionList.map(_._2._4).sum.toDouble
+            featureMap.put("viewAll", viewAll)
+            featureMap.put("clickAll", clickAll)
+            featureMap.put("converAll", converAll)
+            featureMap.put("incomeAll", incomeAll)
+            featureMap.put("ctr_all", RankExtractorFeature_20240530.calDiv(clickAll, viewAll))
+            featureMap.put("ctcvr_all", RankExtractorFeature_20240530.calDiv(converAll, viewAll))
+            featureMap.put("cvr_all", RankExtractorFeature_20240530.calDiv(converAll, clickAll))
+            featureMap.put("ecpm_all", RankExtractorFeature_20240530.calDiv(incomeAll * 1000, viewAll))
+
+            // user-to-ad (ui) features
+            val midTimeDiff = scala.collection.mutable.Map[String, Double]()
+            midActionList.foreach {
+              case (cid, (ts_history, click, conver, income, title)) =>
+                if (!midTimeDiff.contains("timediff_view_" + cid)) {
+                  midTimeDiff.put("timediff_view_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+                }
+                if (!midTimeDiff.contains("timediff_click_" + cid) && click > 0) {
+                  midTimeDiff.put("timediff_click_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+                }
+                if (!midTimeDiff.contains("timediff_conver_" + cid) && conver > 0) {
+                  midTimeDiff.put("timediff_conver_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+                }
+            }
+
+            val midActionStatic = scala.collection.mutable.Map[String, Double]()
+            midActionList.foreach {
+              case (cid, (ts_history, click, conver, income, title)) =>
+                midActionStatic.put("actionstatic_view_" + cid, 1.0 + midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0))
+                midActionStatic.put("actionstatic_click_" + cid, click + midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0))
+                midActionStatic.put("actionstatic_conver_" + cid, conver + midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0))
+                midActionStatic.put("actionstatic_income_" + cid, income + midActionStatic.getOrDefault("actionstatic_income_" + cid, 0.0))
+            }
+
+            if (midTimeDiff.contains("timediff_view_" + cid)) {
+              featureMap.put("timediff_view", midTimeDiff.getOrDefault("timediff_view_" + cid, 0.0))
+            }
+            if (midTimeDiff.contains("timediff_click_" + cid)) {
+              featureMap.put("timediff_click", midTimeDiff.getOrDefault("timediff_click_" + cid, 0.0))
+            }
+            if (midTimeDiff.contains("timediff_conver_" + cid)) {
+              featureMap.put("timediff_conver", midTimeDiff.getOrDefault("timediff_conver_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_view_" + cid)) {
+              featureMap.put("actionstatic_view", midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_click_" + cid)) {
+              featureMap.put("actionstatic_click", midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_conver_" + cid)) {
+              featureMap.put("actionstatic_conver", midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_income_" + cid)) {
+              featureMap.put("actionstatic_income", midActionStatic.getOrDefault("actionstatic_income_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("actionstatic_click_" + cid)) {
+              featureMap.put("actionstatic_ctr", RankExtractorFeature_20240530.calDiv(
+                midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0),
+                midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
+              ))
+            }
+            if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("actionstatic_conver_" + cid)) {
+              featureMap.put("actionstatic_ctcvr", RankExtractorFeature_20240530.calDiv(
+                midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0),
+                midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
+              ))
+            }
+            if (midActionStatic.contains("actionstatic_conver_" + cid) && midActionStatic.contains("actionstatic_click_" + cid)) {
+              featureMap.put("actionstatic_cvr", RankExtractorFeature_20240530.calDiv(
+                midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0),
+                midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0)
+              ))
+            }
+
+            val e1: JSONObject = if (record.isNull("e1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("e1_feature"))
+            val e2: JSONObject = if (record.isNull("e2_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("e2_feature"))
+            val title = b1.getOrDefault("cidtitle", "").toString
+            if (title.nonEmpty) {
+              for ((en, prefix1) <- List((e1, "e1"), (e2, "e2"))) {
+                for (prefix2 <- List("tags_3d", "tags_7d", "tags_14d")) {
+                  if (en.nonEmpty && en.containsKey(prefix2) && en.getString(prefix2).nonEmpty) {
+                    val (f1, f2, f3, f4) = funcC34567ForTags(en.getString(prefix2), title)
+                    featureMap.put(prefix1 + "_" + prefix2 + "_matchnum", f1)
+                    featureMap.put(prefix1 + "_" + prefix2 + "_maxscore", f3)
+                    featureMap.put(prefix1 + "_" + prefix2 + "_avgscore", f4)
+
+                  }
+                }
+              }
+            }
+
+            val d1: JSONObject = if (record.isNull("d1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("d1_feature"))
+            val d2: JSONObject = if (record.isNull("d2_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("d2_feature"))
+            val d3: JSONObject = if (record.isNull("d3_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("d3_feature"))
+
+            if (d1.nonEmpty) {
+              for (prefix <- List("3h", "6h", "12h", "1d", "3d", "7d")) {
+                val view = if (!d1.containsKey("ad_view_" + prefix)) 0D else d1.getIntValue("ad_view_" + prefix).toDouble
+                val click = if (!d1.containsKey("ad_click_" + prefix)) 0D else d1.getIntValue("ad_click_" + prefix).toDouble
+                val conver = if (!d1.containsKey("ad_conversion_" + prefix)) 0D else d1.getIntValue("ad_conversion_" + prefix).toDouble
+                val income = if (!d1.containsKey("ad_income_" + prefix)) 0D else d1.getIntValue("ad_income_" + prefix).toDouble
+                val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+                val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+                val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+                val f4 = conver
+                val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "ctr", f1)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "ctcvr", f2)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "cvr", f3)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "conver", f4)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "ecpm", f5)
+              }
+            }
+
+            val vidRankMaps = scala.collection.mutable.Map[String, scala.collection.immutable.Map[String, Double]]()
+            if (d2.nonEmpty) {
+              d2.foreach(r => {
+                val key = r._1
+                val value = d2.getString(key).split(",").map(r => {
+                  val rList = r.split(":")
+                  (rList(0), rList(2).toDouble)
+                }).toMap
+                vidRankMaps.put(key, value)
+              })
+            }
+            for (prefix1 <- List("ctr", "ctcvr", "ecpm")) {
+              for (prefix2 <- List("1d", "3d", "7d", "14d")) {
+                if (vidRankMaps.contains(prefix1 + "_" + prefix2)) {
+                  val rank = vidRankMaps(prefix1 + "_" + prefix2).getOrDefault(cid, 0.0)
+                  if (rank >= 1.0) {
+                    featureMap.put("vid_rank_" + prefix1 + "_" + prefix2, 1.0 / rank)
+                  }
+                }
+              }
+            }
+
+            if (d3.nonEmpty) {
+              val vTitle = d3.getString("title")
+              val score = Similarity.conceptSimilarity(title, vTitle)
+              featureMap.put("ctitle_vtitle_similarity", score);
+            }
+
+            /*
+            Ads
+              sparse: cid adid adverid targeting_conversion
+
+              cpa --> 1 feature
+              per adverid: 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr conver ecpm --> 30 features
+              per cid: 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              geo // per cid: 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              app // per cid: 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              phone brand // per cid: 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              OS: no data
+              week // per cid: 7d 14d x ctr ctcvr cvr ecpm conver --> 10 features
+              hour // per cid: 7d 14d x ctr ctcvr cvr ecpm conver --> 10 features
+
+            User
+              title tags of the user's historical clicks/conversions over 3d 7d 14d vs. the cid title: match count / max score / avg score --> 18 features
+              user's 14d views / clicks / conversions / income, plus ctr cvr ctcvr ecpm --> 8 features
+
+              user-to-cid (ui) features --> 10 features
+                1 / time since the user last viewed this cid
+                1 / time since the user last clicked this cid
+                1 / time since the user last converted on this cid
+                how many times the user viewed this cid
+                how many times the user clicked this cid
+                how many times the user converted on this cid
+                how much the user spent on this cid
+                the user's ctr, ctcvr and cvr on this cid
+
+            Video
+              sim-score-1/-2 between title and cid: no data
+              vid // per cid: 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              vid // per cid: 1d 3d 7d 14d x rank of ctr ctcvr ecpm, taken as reciprocal --> 12 features
+
+             */
+
+
+            // 4 Build the label info.
+            val labels = new JSONObject
+            for (labelKey <- List("ad_is_click", "ad_is_conversion")) {
+              if (!record.isNull(labelKey)) {
+                labels.put(labelKey, record.getString(labelKey))
+              }
+            }
+            // 5 Build the log key header.
+            val mid = record.getString("mid")
+            val headvideoid = record.getString("headvideoid")
+            val logKey = (apptype, mid, cid, ts, headvideoid).productIterator.mkString(",")
+            val labelKey = labels.toString()
+            val featureKey = featureMap.toString()
+            // 6 Join the fields and save.
+            logKey + "\t" + labelKey + "\t" + featureKey
+          })
+
+        // 4 Save data to HDFS
+        val savePartition = dt + hh
+        val hdfsPath = savePath + "/" + savePartition
+        if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+          println("Deleting path and writing data to: " + hdfsPath)
+          MyHdfsUtils.delete_hdfs_path(hdfsPath)
+          odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        } else {
+          println("Invalid path, not writing: " + hdfsPath)
+        }
+      }
+
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
+    // match count, matched words, max semantic similarity score, avg semantic similarity score
+    val tagsList = tags.split(",")
+    var d1 = 0.0
+    val d2 = new ArrayBuffer[String]()
+    var d3 = 0.0
+    var d4 = 0.0
+    for (tag <- tagsList) {
+      if (title.contains(tag)) {
+        d1 = d1 + 1.0
+        d2.add(tag)
+      }
+      val score = Similarity.conceptSimilarity(tag, title)
+      d3 = if (score > d3) score else d3
+      d4 = d4 + score
+    }
+    d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
+    (d1, d2.mkString(","), d3, d4)
+  }
+}
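Every b*-window in the job above expands into the same eight derived features built from guarded ratios: ctr = click/view, ctcvr = conver/view, cvr = conver/click, ecpm = 1000*income/view, plus the raw conver and click counts and two interaction terms. The following is a minimal, self-contained sketch of that expansion; calDiv and calLog from RankExtractorFeature_20240530 are replaced here by local stand-ins (a zero-denominator guard and a natural log clamped at 1), which is an assumption about their behavior rather than the project's actual implementation.

object RatioFeatureSketch {
  // Stand-in for the assumed zero-denominator guard in calDiv.
  def calDiv(a: Double, b: Double): Double = if (b == 0.0) 0.0 else a / b

  // One b*-window expands into the eight derived features used in the job.
  def windowFeatures(prefix: String, view: Double, click: Double,
                     conver: Double, income: Double): Map[String, Double] = {
    val ctr = calDiv(click, view)
    val ctcvr = calDiv(conver, view)
    val cvr = calDiv(conver, click)
    val ecpm = calDiv(income * 1000, view)
    Map(
      s"${prefix}_ctr" -> ctr,
      s"${prefix}_ctcvr" -> ctcvr,
      s"${prefix}_cvr" -> cvr,
      s"${prefix}_conver" -> conver,
      s"${prefix}_ecpm" -> ecpm,
      s"${prefix}_click" -> click,
      // calLog is approximated by a natural log clamped at 1 (assumption).
      s"${prefix}_conver*log(view)" -> conver * math.log(view max 1.0),
      s"${prefix}_conver*ctcvr" -> conver * ctcvr
    )
  }

  def main(args: Array[String]): Unit = {
    // Example: 1000 views, 30 clicks, 3 conversions, income 12 in the b2 1d window.
    windowFeatures("b2_1d", 1000, 30, 3, 12).foreach(println)
  }
}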

+ 105 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_32_bucket_20240718.scala

@@ -0,0 +1,105 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+
+ */
+
+object makedata_ad_32_bucket_20240718 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/20240620*")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/32_bucket_data/")
+    val fileName = param.getOrElse("fileName", "20240620_100")
+    val sampleRate = param.getOrElse("sampleRate", "1.0").toDouble
+    val bucketNum = param.getOrElse("bucketNum", "100").toInt
+    val featureNameFile = param.getOrElse("featureNameFile", "20240718_ad_feature_name.txt");
+
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource(featureNameFile)
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+
+
+
+    val data = sc.textFile(readPath)
+    println("Malformed row count: " + data.filter(r=>r.split("\t").length != 3).count())
+    val data1 = data.map(r => {
+      val rList = r.split("\t")
+      val jsons = JSON.parseObject(rList(2))
+      val doubles = scala.collection.mutable.Map[String, Double]()
+      jsons.foreach(r =>{
+        doubles.put(r._1, jsons.getDoubleValue(r._1))
+      })
+      doubles
+    }).sample(false, sampleRate ).repartition(20)
+
+    val result = new ArrayBuffer[String]()
+
+    for (i <- contentList.indices){
+      println("Feature: " + contentList(i))
+      val data2 = data1.map(r => r.getOrDefault(contentList(i), 0D)).filter(_ > 1E-8).collect().sorted
+      val len = data2.length
+      if (len == 0){
+        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + "0")
+      }else{
+        val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // ensure each bucket gets at least one element
+        val buffers = new ArrayBuffer[Double]()
+
+        var lastBucketValue = data2(0) // previous split point
+        for (j <- 0 until len by oneBucketNum) {
+          val d = data2(j)
+          if (j > 0 && d != lastBucketValue) {
+            // keep the current value as a split point only if it differs from the previous one
+            buffers += d
+          }
+          lastBucketValue = d // update the previous split point
+        }
+
+        // the last bucket must end at the array's final element
+        if (!buffers.contains(data2.last)) {
+          buffers += data2.last
+        }
+        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
+      }
+    }
+    val data3 = sc.parallelize(result)
+
+
+    // 4 Save data to HDFS
+    val hdfsPath = savePath + "/" + fileName
+    if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+      println("Deleting path and writing data to: " + hdfsPath)
+      MyHdfsUtils.delete_hdfs_path(hdfsPath)
+      data3.repartition(1).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+    } else {
+      println("Invalid path, not writing: " + hdfsPath)
+    }
+  }
+}
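For each feature, the job above collects the sorted non-zero values and picks at most bucketNum - 1 distinct split points at a fixed stride of (len - 1) / (bucketNum - 1) + 1, so the resulting buckets are roughly equal-frequency. A stand-alone sketch of that boundary selection with the same stepping rule (plain Scala, no Spark):

import scala.collection.mutable.ArrayBuffer

object BucketBoundarySketch {
  // Pick up to bucketNum - 1 distinct split points from sorted non-zero values,
  // stepping so that each bucket holds roughly the same number of samples.
  def boundaries(sortedValues: Array[Double], bucketNum: Int): Array[Double] = {
    val len = sortedValues.length
    if (len == 0) return Array.empty
    val step = (len - 1) / (bucketNum - 1) + 1
    val buffers = new ArrayBuffer[Double]()
    var last = sortedValues(0)
    for (j <- 0 until len by step) {
      val d = sortedValues(j)
      if (j > 0 && d != last) buffers += d
      last = d
    }
    // the last bucket must end at the array's final element
    if (!buffers.contains(sortedValues.last)) buffers += sortedValues.last
    buffers.toArray
  }

  def main(args: Array[String]): Unit = {
    val values = Array(0.1, 0.2, 0.2, 0.3, 0.5, 0.8, 1.3, 2.1, 3.4, 5.5)
    println(boundaries(values, 4).mkString(",")) // -> 0.5,3.4,5.5
  }
}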

File diff suppressed because it is too large
+ 429 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketDataPrint_20240718.scala


+ 128 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240718.scala

@@ -0,0 +1,128 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+
+ */
+
+object makedata_ad_33_bucketData_20240718 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_688.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r =>{
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("Processing date: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val jsons = JSON.parseObject(rList(2))
+        val features = scala.collection.mutable.Map[String, Double]()
+        jsons.foreach(r => {
+          features.put(r._1, jsons.getDoubleValue(r._1))
+        })
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12", "13").contains(apptype)
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach{
+            case (label, features) =>
+              val featuresBucket = features.map{
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty){
+                    filterNames.foreach(r=> if (!ifFilter && name.contains(r)) {ifFilter = true} )
+                  }
+                  if (ifFilter){
+                    ""
+                  }else{
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+      })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting path and writing data to: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, not writing: " + hdfsPath)
+      }
+    }
+
+
+
+  }
+}
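Applying the bucket file goes the other way: a raw feature value is mapped to (insert position + 1) / bucketNum among that feature's boundaries. ExtractorUtils.findInsertPosition is assumed here to return the 0-based insertion index into the ascending boundary array; the sketch re-implements that assumption with a binary search so the mapping can be checked in isolation.

object BucketizeSketch {
  // Assumption: ExtractorUtils.findInsertPosition returns the index at which `score`
  // would be inserted into the ascending `buckets` array (0-based).
  def findInsertPosition(buckets: Array[Double], score: Double): Int = {
    val idx = java.util.Arrays.binarySearch(buckets, score)
    if (idx >= 0) idx else -(idx + 1)
  }

  // Map a raw score into (position + 1) / bucketNum, as done in the job above.
  def bucketize(buckets: Array[Double], bucketNum: Double, score: Double): Double =
    1.0 / bucketNum * (findInsertPosition(buckets, score).toDouble + 1.0)

  def main(args: Array[String]): Unit = {
    val buckets = Array(0.5, 3.4, 5.5)
    println(bucketize(buckets, 100.0, 2.0)) // -> 0.02 (falls past the first boundary)
  }
}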

+ 135 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240718_sample.scala

@@ -0,0 +1,135 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+import scala.util.Random
+
+/*
+
+ */
+
+object makedata_ad_33_bucketData_20240718_sample {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
+    val sampleRate = param.getOrElse("sampleRate", "0.1").toDouble
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_688.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("Processing date: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r => {
+          val rList = r.split("\t")
+          val logKey = rList(0)
+          val labelKey = rList(1)
+          val jsons = JSON.parseObject(rList(2))
+          val features = scala.collection.mutable.Map[String, Double]()
+          jsons.foreach(r => {
+            features.put(r._1, jsons.getDoubleValue(r._1))
+          })
+          (logKey, labelKey, features)
+        })
+        .filter {
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12", "13").contains(apptype)
+        }.filter {
+          case (logKey, labelKey, features) =>
+            new Random().nextDouble() < sampleRate
+        }
+        .map {
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach {
+            case (label, features) =>
+              val featuresBucket = features.map {
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty) {
+                    filterNames.foreach(r => if (!ifFilter && name.contains(r)) {
+                      ifFilter = true
+                    })
+                  }
+                  if (ifFilter) {
+                    ""
+                  } else {
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+        })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting path and writing data to: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, not writing: " + hdfsPath)
+      }
+    }
+
+
+  }
+}
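The _sample variant only adds a row-level Bernoulli filter: each record survives with probability sampleRate, decided by a fresh Random per record. Spark's built-in RDD.sample expresses the same semantics and makes the result reproducible through a seed; a sketch under that assumption (the helper name and default seed are illustrative, not part of the job):

import org.apache.spark.rdd.RDD

object SampleSketch {
  // Bernoulli row sampling: each record is kept independently with probability sampleRate,
  // matching the `new Random().nextDouble() < sampleRate` filter, but seeded for reproducibility.
  def bernoulliSample[T](rdd: RDD[T], sampleRate: Double, seed: Long = 42L): RDD[T] =
    rdd.sample(withReplacement = false, fraction = sampleRate, seed = seed)
}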

+ 158 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240726.scala

@@ -0,0 +1,158 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+
+/*
+
+ */
+
+object makedata_ad_33_bucketData_20240726 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
+    val featureNameFile = param.getOrElse("featureNameFile", "20240718_ad_feature_name_517.txt");
+
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_517.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+    val resourceUrl = loader.getResource(featureNameFile)
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+
+    println()
+    println()
+    println()
+    println(content)
+    val contentList = content.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty).toList
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("Processing date: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r => {
+          val rList = r.split("\t")
+          val logKey = rList(0)
+          val labelKey = rList(1)
+          val jsons = JSON.parseObject(rList(2))
+          val features = scala.collection.mutable.Map[String, Double]()
+          jsons.foreach(r => {
+            features.put(r._1, jsons.getDoubleValue(r._1))
+          })
+
+          for (name <- contentList) {
+            if (!features.contains(name)) {
+              features.put(name, 0)
+            }
+          }
+
+          (logKey, labelKey, features)
+        })
+        .filter {
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12", "13").contains(apptype)
+        }
+        .map {
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach {
+            case (label, features) =>
+              val featuresBucket = features.map {
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty) {
+                    filterNames.foreach(r => if (!ifFilter && name.contains(r)) {
+                      ifFilter = true
+                    })
+                  }
+                  if (ifFilter) {
+                    ""
+                  } else {
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 0.01 + (1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0))
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      name + ":" + "0.01"
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+        })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting path and writing data to: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, not writing: " + hdfsPath)
+      }
+    }
+
+
+  }
+}
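Relative to the 20240718 job, this 20240726 variant back-fills every name from featureNameFile with 0 so the emitted vector is dense over that list, then shifts bucketized scores by 0.01 and writes 0.01 for near-zero scores instead of dropping them, keeping a present-but-zero feature distinguishable from an absent one. A sketch of the changed scoring rule for features that have bucket boundaries (features without boundaries still pass their raw score through, as in the code above); findInsertPosition carries the same assumption as in the earlier sketch:

object OffsetBucketizeSketch {
  // Same insertion-position assumption as in the earlier bucketize sketch.
  def findInsertPosition(buckets: Array[Double], score: Double): Int = {
    val idx = java.util.Arrays.binarySearch(buckets, score)
    if (idx >= 0) idx else -(idx + 1)
  }

  // 20240726 rule for a feature with bucket boundaries: bucketized scores get a 0.01 offset,
  // and near-zero scores are emitted as 0.01 rather than dropped.
  def score20240726(buckets: Array[Double], bucketNum: Double, score: Double): Double =
    if (score > 1e-8)
      0.01 + 1.0 / bucketNum * (findInsertPosition(buckets, score).toDouble + 1.0)
    else
      0.01
}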

+ 152 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240729.scala

@@ -0,0 +1,152 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+import scala.util.Random
+
+/*
+
+ */
+
+object makedata_ad_33_bucketData_20240729 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_688.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    val cidCountMap = scala.collection.mutable.Map[String, Int]()
+    for (date <- dateRange) {
+      println("Processing date: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r => {
+          val rList = r.split("\t")
+          val logKey = rList(0)
+          val labelKey = rList(1)
+          val jsons = JSON.parseObject(rList(2))
+          val features = scala.collection.mutable.Map[String, Double]()
+          jsons.foreach(r => {
+            features.put(r._1, jsons.getDoubleValue(r._1))
+          })
+          (logKey, labelKey, features)
+        })
+        .filter {
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12", "13").contains(apptype)
+        }.filter {
+          case (logKey, labelKey, features) =>
+            var key = ""
+            for (elem <- features) {
+              if (elem._1.contains("cid_")) {
+                key = elem._1
+              }
+            }
+
+            if (key.equals("cid_3319")) {
+              true
+            } else if (key.equals("cid_3024")) {
+              // create a Random instance
+              val rand = new Random()
+
+              // draw a random double in [0, 1)
+              val randomDouble = rand.nextDouble()
+
+              randomDouble < 0.01
+            } else {
+              false
+            }
+        }.map {
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach {
+            case (label, features) =>
+              val featuresBucket = features.map {
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty) {
+                    filterNames.foreach(r => if (!ifFilter && name.contains(r)) {
+                      ifFilter = true
+                    })
+                  }
+                  if (ifFilter) {
+                    ""
+                  } else {
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+        })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting path and writing data to: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, not writing: " + hdfsPath)
+      }
+    }
+
+
+  }
+}
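The 20240729 job narrows the training set to two creatives: rows whose sparse cid key is cid_3319 are all kept, rows for cid_3024 are downsampled to roughly 1%, and everything else is dropped. A sketch of that predicate over the parsed feature map; it takes the first key containing "cid_" (the loop above keeps the last match), and the two cids and the 0.01 rate are copied from the code:

import scala.util.Random

object CidFilterSketch {
  // Decide whether to keep a row, given its sparse feature keys.
  def keepRow(features: collection.Map[String, Double], rand: Random = new Random()): Boolean = {
    val cidKey = features.keys.find(_.contains("cid_")).getOrElse("")
    cidKey match {
      case "cid_3319" => true                      // keep every row for this creative
      case "cid_3024" => rand.nextDouble() < 0.01  // keep ~1% of rows for this creative
      case _          => false                     // drop all other creatives
    }
  }
}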

+ 181 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240729_copy_zheng.scala

@@ -0,0 +1,181 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+import scala.util.Random
+
+/*
+
+ */
+
+object makedata_ad_33_bucketData_20240729_copy_zheng {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_688.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    val cidCountMap = scala.collection.mutable.Map[String, Int]()
+    for (date <- dateRange) {
+      println("Processing date: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r => {
+          val rList = r.split("\t")
+          val logKey = rList(0)
+          val labelKey = rList(1)
+          val jsons = JSON.parseObject(rList(2))
+          val features = scala.collection.mutable.Map[String, Double]()
+          jsons.foreach(r => {
+            features.put(r._1, jsons.getDoubleValue(r._1))
+          })
+          (logKey, labelKey, features)
+        })
+        .filter {
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12", "13").contains(apptype)
+        }.filter {
+          case (logKey, labelKey, features) =>
+            var key = ""
+            for (elem <- features) {
+              if (elem._1.contains("cid_")) {
+                key = elem._1
+              }
+            }
+
+            if (key.equals("cid_3319")) {
+              true
+            } else if (key.equals("cid_3024")) {
+              // create a Random instance
+              val rand = new Random()
+
+              // draw a random double in [0, 1)
+              val randomDouble = rand.nextDouble()
+
+              randomDouble < 0.01
+            } else {
+              false
+            }
+        }.flatMap {
+          case (logKey, labelKey, features) =>
+            var key = ""
+            for (elem <- features) {
+              if (elem._1.contains("cid_")) {
+                key = elem._1
+              }
+            }
+            if (key.equals("cid_3319")) {
+              val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+              if (!label.equals("0")) {
+                Seq(
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features),
+                  (logKey, labelKey, features)
+                )
+              } else {
+                Seq((logKey, labelKey, features))
+              }
+            } else {
+              Seq((logKey, labelKey, features))
+            }
+        }.map {
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach {
+            case (label, features) =>
+              val featuresBucket = features.map {
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty) {
+                    filterNames.foreach(r => if (!ifFilter && name.contains(r)) {
+                      ifFilter = true
+                    })
+                  }
+                  if (ifFilter) {
+                    ""
+                  } else {
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+        })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting path and writing data to: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, not writing: " + hdfsPath)
+      }
+    }
+
+
+  }
+}
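The _copy_zheng variant layers positive oversampling on top of the cid filter: a cid_3319 row whose label is non-zero is emitted ten times by the flatMap, while negatives and other rows pass through once. A compact sketch of that replication (the helper name is illustrative; Seq.fill replaces the ten hand-written tuples):

object OversampleSketch {
  // Replicate a positive cid_3319 row `copies` times inside a flatMap;
  // everything else is emitted exactly once.
  def oversample[T](row: T, isCid3319: Boolean, isPositive: Boolean, copies: Int = 10): Seq[T] =
    if (isCid3319 && isPositive) Seq.fill(copies)(row) else Seq(row)
}

Inside the job it would sit in the flatMap, e.g. oversample((logKey, labelKey, features), isCid3319, isPositive), with the two flags computed from the cid key and the parsed label as above.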

+ 129 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_20240729_reduce_feature.scala

@@ -0,0 +1,129 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+
+/*
+
+ */
+
+object makedata_ad_33_bucketData_20240729_reduce_feature {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val retainNames = param.getOrElse("retainNames", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_688.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    val cidCountMap = scala.collection.mutable.Map[String, Int]()
+    for (date <- dateRange) {
+      println("Processing date: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r => {
+          val rList = r.split("\t")
+          val logKey = rList(0)
+          val labelKey = rList(1)
+          val jsons = JSON.parseObject(rList(2))
+          val features = scala.collection.mutable.Map[String, Double]()
+          jsons.foreach(r => {
+            features.put(r._1, jsons.getDoubleValue(r._1))
+          })
+          (logKey, labelKey, features)
+        })
+        .filter {
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12", "13").contains(apptype)
+        }.map {
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach {
+            case (label, features) =>
+              val featuresBucket = features.map {
+                case (name, score) =>
+                  var isRetain = false
+                  if (retainNames.nonEmpty) {
+                    retainNames.foreach(r => if (!isRetain && name.contains(r)) {
+                      isRetain = true
+                    })
+                  }
+                  if (isRetain) {
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
+                  } else {
+                    ""
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+        })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting path and writing data to: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, not writing: " + hdfsPath)
+      }
+    }
+
+
+  }
+}
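The _reduce_feature variant inverts the name test used by the other jobs: instead of dropping features whose name contains one of filterNames, it keeps only features whose name contains one of retainNames. Both tests are plain substring matches over the feature name; a sketch of the two predicates side by side (object and method names are illustrative):

object FeatureNameFilterSketch {
  // Blacklist used by most jobs above: drop a feature whose name contains any filter term.
  def dropByFilter(name: String, filterNames: Set[String]): Boolean =
    filterNames.exists(name.contains)

  // Whitelist used by _reduce_feature: keep a feature only if its name contains a retain term.
  def keepByRetain(name: String, retainNames: Set[String]): Boolean =
    retainNames.exists(name.contains)
}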

+ 140 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketData_default_value_20240718.scala

@@ -0,0 +1,140 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+
+ */
+
+object makedata_ad_33_bucketData_default_value_20240718 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_688.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r =>{
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
+    val modifyFeatureName= param.getOrElse("modifyName", "").split(",").filter(_.nonEmpty).toSet
+    val defaultValue= param.getOrElse("defaultValue", "0.01")
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("Processing date: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val jsons = JSON.parseObject(rList(2))
+        val features = scala.collection.mutable.Map[String, Double]()
+        jsons.foreach(r => {
+          features.put(r._1, jsons.getDoubleValue(r._1))
+        })
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12", "13").contains(apptype)
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach{
+            case (label, features) =>
+              val featuresBucket = features.map{
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty){
+                    filterNames.foreach(r=> if (!ifFilter && name.contains(r)) {ifFilter = true} )
+                  }
+                  if (ifFilter){
+                    ""
+                  }else{
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        var isModify = false
+                        if (modifyFeatureName.nonEmpty) {
+                          modifyFeatureName.foreach(r => if (!isModify && name.startsWith(r)) {
+                            isModify = true
+                          })
+                        }
+                        if (isModify) {
+                          name + ":" + defaultValue
+                        } else {
+                          name + ":" + score.toString
+                        }
+                      }
+                    } else {
+                      ""
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+      })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+
+
+
+  }
+}
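To make the inputs above easier to follow, a hedged sketch of how one bucket-file line and the modifyName/defaultValue options interact; the line content and feature values are invented, only the tab/comma layout is taken from the parsing code above:

object DefaultValueSketch {
  def main(args: Array[String]): Unit = {
    // One made-up line in the style of 20240718_ad_bucket_688.txt: name \t bucketsNum \t boundaries
    val line = "b1_3h_ctr\t4\t0.1,0.5,2.0"
    val cols = line.split("\t")
    val bucketsMap = Map(cols(0) -> (cols(1).toDouble, cols(2).split(",").map(_.toDouble)))

    val modifyFeatureName = Set("cid_")  // --modifyName cid_
    val defaultValue = "0.01"            // --defaultValue 0.01

    // A sparse id feature with no bucket entry but a matching prefix is written with the default value.
    val (name, score) = ("cid_12345", 1.0)
    val encoded =
      if (bucketsMap.contains(name)) name + ":bucketed"   // would be bucketized as in the job above
      else if (modifyFeatureName.exists(p => name.startsWith(p))) name + ":" + defaultValue
      else name + ":" + score
    println(encoded)                     // cid_12345:0.01
  }
}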

+ 24 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_34_statistics_20241111.scala

@@ -0,0 +1,24 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.aliyun.odps.spark.examples.myUtils.ParamUtils
+import org.apache.spark.sql.SparkSession
+
+/**
+ * Attachment (report) generation.
+ * <br>
+ * 1. Aggregate by CID: total exposures, conversions, and related metrics.
+ */
+object makedata_ad_34_statistics_20241111 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+    val loader = getClass.getClassLoader
+
+    val param = ParamUtils.parseArgs(args)
+
+  }
+}
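The object above is still an empty skeleton; below is a purely hypothetical sketch of the CID-level rollup its comment describes, with invented sample rows (the real input table and column set are not shown in this diff):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, count, lit, sum}

object CidStatisticsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("CidStatisticsSketch").master("local[1]").getOrCreate()
    import spark.implicits._

    // Invented sample rows: (cid, ad_is_click, ad_is_conversion)
    val samples = Seq(("c1", 1, 0), ("c1", 0, 1), ("c2", 1, 1))
      .toDF("cid", "ad_is_click", "ad_is_conversion")

    // Per-CID exposure, click and conversion totals.
    val stats = samples.groupBy(col("cid")).agg(
      count(lit(1)).as("exposure"),
      sum(col("ad_is_click")).as("clicks"),
      sum(col("ad_is_conversion")).as("conversions"))

    stats.show(false)
    spark.stop()
  }
}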

+ 549 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/xgb/makedata_31_bucketDataPrint_20240821.scala

@@ -0,0 +1,549 @@
+package com.aliyun.odps.spark.examples.makedata_ad.xgb
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.{ExtractorUtils, RankExtractorFeature_20240530}
+import examples.utils.DateTimeUtil
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import org.xm.Similarity
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+
+object makedata_31_bucketDataPrint_20240821 {
+  def main(args: Array[String]): Unit = {
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val beginStr = param.getOrElse("beginStr", "2024061500")
+    val endStr = param.getOrElse("endStr", "2024061523")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_for_check")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_ad_sample_all")
+    val repartition = param.getOrElse("repartition", "32").toInt
+    val readDate = param.getOrElse("readDate", "20240615")
+    val featureNameFile = param.getOrElse("featureName", "20240718_ad_feature_name_517.txt")
+    val featureBucketFile = param.getOrElse("featureBucketFile", "20240718_ad_bucket_517.txt");
+    val filterHours = param.getOrElse("filterHours", "00,01,02,03,04,05,06,07").split(",").toSet
+    val idDefaultValue = param.getOrElse("idDefaultValue", "1.0").toDouble
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val featureNameUrl = loader.getResource(featureNameFile)
+    val content =
+      if (featureNameUrl != null) {
+        val content = Source.fromURL(featureNameUrl).getLines().mkString("\n")
+        Source.fromURL(featureNameUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val featureNameList = content.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty).toList
+    val contentList_br = sc.broadcast(featureNameList)
+
+    val resourceUrlBucket = loader.getResource(featureBucketFile)
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+    // 2 Read ODPS table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Loop over time partitions and produce data
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    for (dt_hh <- timeRange) {
+      val dt = dt_hh.substring(0, 8)
+      val hh = dt_hh.substring(8, 10)
+      val partition = s"dt=$dt,hh=$hh"
+      if (filterHours.nonEmpty && filterHours.contains(hh)) {
+        println("不执行partiton:" + partition)
+      } else {
+        println("开始执行partiton:" + partition)
+        val odpsData = odpsOps.readTable(project = project,
+            table = table,
+            partition = partition,
+            transfer = func,
+            numPartition = tablePart)
+          .map(record => {
+
+
+            val ts = record.getString("ts").toInt
+            val cid = record.getString("cid")
+            val apptype = record.getString("apptype")
+            val extend: JSONObject = if (record.isNull("extend")) new JSONObject() else
+              JSON.parseObject(record.getString("extend"))
+
+
+            val featureMap = new JSONObject()
+
+            val b1: JSONObject = if (record.isNull("b1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b1_feature"))
+            val b2: JSONObject = if (record.isNull("b2_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b2_feature"))
+            val b3: JSONObject = if (record.isNull("b3_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b3_feature"))
+            val b4: JSONObject = if (record.isNull("b4_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b4_feature"))
+            val b5: JSONObject = if (record.isNull("b5_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b5_feature"))
+            val b6: JSONObject = if (record.isNull("b6_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b6_feature"))
+            val b7: JSONObject = if (record.isNull("b7_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b7_feature"))
+            val b8: JSONObject = if (record.isNull("b8_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b8_feature"))
+            val b9: JSONObject = if (record.isNull("b9_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("b9_feature"))
+
+
+            featureMap.put("cid_" + cid, idDefaultValue)
+            if (b1.containsKey("adid") && b1.getString("adid").nonEmpty) {
+              featureMap.put("adid_" + b1.getString("adid"), idDefaultValue)
+            }
+            if (b1.containsKey("adverid") && b1.getString("adverid").nonEmpty) {
+              featureMap.put("adverid_" + b1.getString("adverid"), idDefaultValue)
+            }
+            if (b1.containsKey("targeting_conversion") && b1.getString("targeting_conversion").nonEmpty) {
+              featureMap.put("targeting_conversion_" + b1.getString("targeting_conversion"), idDefaultValue)
+            }
+
+            val hour = DateTimeUtil.getHourByTimestamp(ts)
+            featureMap.put("hour_" + hour, idDefaultValue)
+
+            val dayOfWeek = DateTimeUtil.getDayOrWeekByTimestamp(ts)
+            featureMap.put("dayofweek_" + dayOfWeek, idDefaultValue);
+
+            featureMap.put("apptype_" + apptype, idDefaultValue);
+
+            if (extend.containsKey("abcode") && extend.getString("abcode").nonEmpty) {
+              featureMap.put("abcode_" + extend.getString("abcode"), idDefaultValue)
+            }
+
+
+            if (b1.containsKey("cpa")) {
+              featureMap.put("cpa", b1.getString("cpa").toDouble)
+            }
+            if (b1.containsKey("weight") && b1.getString("weight").nonEmpty) {
+              featureMap.put("weight", b1.getString("weight").toDouble)
+            }
+
+            for ((bn, prefix1) <- List(
+              (b2, "b2"), (b3, "b3"), (b4, "b4"), (b5, "b5"), (b8, "b8"), (b9, "b9")
+            )) {
+              for (prefix2 <- List(
+                "1h", "2h", "3h", "4h", "5h", "6h", "12h", "1d", "3d", "7d", "today", "yesterday"
+              )) {
+                val view = if (bn.isEmpty) 0D else bn.getIntValue("ad_view_" + prefix2).toDouble
+                val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
+                val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
+                val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
+                val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+                val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+                val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+                val f4 = conver
+                val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver", f4)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
+
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "click", click)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*log(view)", conver * RankExtractorFeature_20240530.calLog(view))
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*ctcvr", conver * f2)
+              }
+            }
+
+            for ((bn, prefix1) <- List(
+              (b6, "b6"), (b7, "b7")
+            )) {
+              for (prefix2 <- List(
+                "7d", "14d"
+              )) {
+                val view = if (bn.isEmpty) 0D else bn.getIntValue("ad_view_" + prefix2).toDouble
+                val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
+                val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
+                val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
+                val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+                val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+                val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+                val f4 = conver
+                val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver", f4)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
+
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "click", click)
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*log(view)", conver * RankExtractorFeature_20240530.calLog(view))
+                featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*ctcvr", conver * f2)
+              }
+            }
+
+            val c1: JSONObject = if (record.isNull("c1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("c1_feature"))
+
+            val midActionList = if (c1.containsKey("action") && c1.getString("action").nonEmpty) {
+              c1.getString("action").split(",").map(r => {
+                val rList = r.split(":")
+                (rList(0), (rList(1).toInt, rList(2).toInt, rList(3).toInt, rList(4).toInt, rList(5)))
+              }).sortBy(-_._2._1).toList
+            } else {
+              new ArrayBuffer[(String, (Int, Int, Int, Int, String))]().toList
+            }
+            // user-level (u) features
+            val viewAll = midActionList.size.toDouble
+            val clickAll = midActionList.map(_._2._2).sum.toDouble
+            val converAll = midActionList.map(_._2._3).sum.toDouble
+            val incomeAll = midActionList.map(_._2._4).sum.toDouble
+            featureMap.put("viewAll", viewAll)
+            featureMap.put("clickAll", clickAll)
+            featureMap.put("converAll", converAll)
+            featureMap.put("incomeAll", incomeAll)
+            featureMap.put("ctr_all", RankExtractorFeature_20240530.calDiv(clickAll, viewAll))
+            featureMap.put("ctcvr_all", RankExtractorFeature_20240530.calDiv(converAll, viewAll))
+            featureMap.put("cvr_all", RankExtractorFeature_20240530.calDiv(clickAll, converAll))
+            featureMap.put("ecpm_all", RankExtractorFeature_20240530.calDiv(incomeAll * 1000, viewAll))
+
+            // user-to-cid (ui) features
+            val midTimeDiff = scala.collection.mutable.Map[String, Double]()
+            midActionList.foreach {
+              case (cid, (ts_history, click, conver, income, title)) =>
+                if (!midTimeDiff.contains("timediff_view_" + cid)) {
+                  midTimeDiff.put("timediff_view_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+                }
+                if (!midTimeDiff.contains("timediff_click_" + cid) && click > 0) {
+                  midTimeDiff.put("timediff_click_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+                }
+                if (!midTimeDiff.contains("timediff_conver_" + cid) && conver > 0) {
+                  midTimeDiff.put("timediff_conver_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+                }
+            }
+
+            val midActionStatic = scala.collection.mutable.Map[String, Double]()
+            midActionList.foreach {
+              case (cid, (ts_history, click, conver, income, title)) =>
+                midActionStatic.put("actionstatic_view_" + cid, 1.0 + midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0))
+                midActionStatic.put("actionstatic_click_" + cid, click + midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0))
+                midActionStatic.put("actionstatic_conver_" + cid, conver + midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0))
+                midActionStatic.put("actionstatic_income_" + cid, income + midActionStatic.getOrDefault("actionstatic_income_" + cid, 0.0))
+            }
+
+            if (midTimeDiff.contains("timediff_view_" + cid)) {
+              featureMap.put("timediff_view", midTimeDiff.getOrDefault("timediff_view_" + cid, 0.0))
+            }
+            if (midTimeDiff.contains("timediff_click_" + cid)) {
+              featureMap.put("timediff_click", midTimeDiff.getOrDefault("timediff_click_" + cid, 0.0))
+            }
+            if (midTimeDiff.contains("timediff_conver_" + cid)) {
+              featureMap.put("timediff_conver", midTimeDiff.getOrDefault("timediff_conver_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_view_" + cid)) {
+              featureMap.put("actionstatic_view", midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_click_" + cid)) {
+              featureMap.put("actionstatic_click", midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_conver_" + cid)) {
+              featureMap.put("actionstatic_conver", midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_income_" + cid)) {
+              featureMap.put("actionstatic_income", midActionStatic.getOrDefault("actionstatic_income_" + cid, 0.0))
+            }
+            if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("actionstatic_click_" + cid)) {
+              featureMap.put("actionstatic_ctr", RankExtractorFeature_20240530.calDiv(
+                midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0),
+                midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
+              ))
+            }
+            if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("actionstatic_conver_" + cid)) {
+              featureMap.put("actionstatic_ctcvr", RankExtractorFeature_20240530.calDiv(
+                midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0),
+                midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
+              ))
+            }
+            if (midActionStatic.contains("actionstatic_conver_" + cid) && midActionStatic.contains("actionstatic_click_" + cid)) {
+              featureMap.put("actionstatic_cvr", RankExtractorFeature_20240530.calDiv(
+                midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0),
+                midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0)
+              ))
+            }
+
+            val e1: JSONObject = if (record.isNull("e1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("e1_feature"))
+            val e2: JSONObject = if (record.isNull("e2_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("e2_feature"))
+            val title = b1.getOrDefault("cidtitle", "").toString
+            if (title.nonEmpty) {
+              for ((en, prefix1) <- List((e1, "e1"), (e2, "e2"))) {
+                for (prefix2 <- List("tags_3d", "tags_7d", "tags_14d")) {
+                  if (en.nonEmpty && en.containsKey(prefix2) && en.getString(prefix2).nonEmpty) {
+                    val (f1, f2, f3, f4) = funcC34567ForTags(en.getString(prefix2), title)
+                    featureMap.put(prefix1 + "_" + prefix2 + "_matchnum", f1)
+                    featureMap.put(prefix1 + "_" + prefix2 + "_maxscore", f3)
+                    featureMap.put(prefix1 + "_" + prefix2 + "_avgscore", f4)
+
+                  }
+                }
+              }
+            }
+
+            val d1: JSONObject = if (record.isNull("d1_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("d1_feature"))
+            val d2: JSONObject = if (record.isNull("d2_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("d2_feature"))
+            val d3: JSONObject = if (record.isNull("d3_feature")) new JSONObject() else
+              JSON.parseObject(record.getString("d3_feature"))
+
+            if (d1.nonEmpty) {
+              for (prefix <- List("3h", "6h", "12h", "1d", "3d", "7d")) {
+                val view = if (!d1.containsKey("ad_view_" + prefix)) 0D else d1.getIntValue("ad_view_" + prefix).toDouble
+                val click = if (!d1.containsKey("ad_click_" + prefix)) 0D else d1.getIntValue("ad_click_" + prefix).toDouble
+                val conver = if (!d1.containsKey("ad_conversion_" + prefix)) 0D else d1.getIntValue("ad_conversion_" + prefix).toDouble
+                val income = if (!d1.containsKey("ad_income_" + prefix)) 0D else d1.getIntValue("ad_income_" + prefix).toDouble
+                val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+                val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+                val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+                val f4 = conver
+                val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "ctr", f1)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "ctcvr", f2)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "cvr", f3)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "conver", f4)
+                featureMap.put("d1_feature" + "_" + prefix + "_" + "ecpm", f5)
+              }
+            }
+
+            val vidRankMaps = scala.collection.mutable.Map[String, scala.collection.immutable.Map[String, Double]]()
+            if (d2.nonEmpty) {
+              d2.foreach(r => {
+                val key = r._1
+                val value = d2.getString(key).split(",").map(r => {
+                  val rList = r.split(":")
+                  (rList(0), rList(2).toDouble)
+                }).toMap
+                vidRankMaps.put(key, value)
+              })
+            }
+            for (prefix1 <- List("ctr", "ctcvr", "ecpm")) {
+              for (prefix2 <- List("1d", "3d", "7d", "14d")) {
+                if (vidRankMaps.contains(prefix1 + "_" + prefix2)) {
+                  val rank = vidRankMaps(prefix1 + "_" + prefix2).getOrDefault(cid, 0.0)
+                  if (rank >= 1.0) {
+                    featureMap.put("vid_rank_" + prefix1 + "_" + prefix2, 1.0 / rank)
+                  }
+                }
+              }
+            }
+
+            if (d3.nonEmpty) {
+              val vTitle = d3.getString("title")
+              val score = Similarity.conceptSimilarity(title, vTitle)
+              featureMap.put("ctitle_vtitle_similarity", score);
+            }
+
+            /*
+            Ad
+              sparse: cid adid adverid targeting_conversion
+
+              cpa --> 1 feature
+              per adverid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr conver ecpm --> 30 features
+              per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              geo // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              app // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              phone brand // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              OS: no data
+              week // per cid, 7d 14d x ctr ctcvr cvr ecpm conver --> 10 features
+              hour // per cid, 7d 14d x ctr ctcvr cvr ecpm conver --> 10 features
+
+            User
+              title tags the user clicked / converted on; 3d 7d 14d; matched against the cid title; match count / max score / avg score --> 18 features
+              user history over 14d: views / clicks / conversions / income; ctr cvr ctcvr ecpm --> 8 features
+
+              user-to-cid ui features --> 10 features
+                1 / time gap since the user last viewed this cid
+                1 / time gap since the user last clicked this cid
+                1 / time gap since the user last converted on this cid
+                how many times the user viewed this cid
+                how many times the user clicked this cid
+                how many times the user converted on this cid
+                how much the user spent on this cid
+                the user's ctr ctcvr cvr on this cid
+
+            Video
+              sim-score-1/-2 between title and cid: no data
+              vid // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+              vid // per cid, 1d 3d 7d 14d x reciprocal rank of ctr ctcvr ecpm --> 12 features
+
+             */
+
+
+            //4 Build the label info.
+            val labels = new JSONObject
+            for (labelKey <- List("ad_is_click", "ad_is_conversion")) {
+              if (!record.isNull(labelKey)) {
+                labels.put(labelKey, record.getString(labelKey))
+              }
+            }
+            //5 Build the log key header.
+            val mid = record.getString("mid")
+            val allfeature = if (record.isNull("allfeaturemap")) new JSONObject() else
+              JSON.parseObject(record.getString("allfeaturemap"))
+
+            val headvideoid = record.getString("headvideoid")
+            // val logKey = (apptype, mid, cid, ts, headvideoid).productIterator.mkString(",")
+            val labelKey = labels.toString()
+            val label = record.getString("ad_is_conversion")
+            //6 Assemble the record for saving.
+            (apptype, mid, cid, ts, headvideoid, label, allfeature, featureMap)
+          }).filter {
+            case (apptype, mid, cid, ts, headvideoid, label, allfeature, featureMap) =>
+              !(allfeature.isEmpty || allfeature.containsKey("weight_sum") || allfeature.contains("weight"))
+          }.mapPartitions(row => {
+            val result = new ArrayBuffer[String]()
+            val bucketsMap = bucketsMap_br.value
+            row.foreach {
+              case (apptype, mid, cid, ts, headvideoid, label, allfeature, featureMap) =>
+                val offlineFeatureMap = featureMap.map(r => {
+                  val score = r._2.toString.toDouble
+                  val name = r._1
+                  if (score > 1E-8) {
+                    if (bucketsMap.contains(name)) {
+                      val (bucketsNum, buckets) = bucketsMap(name)
+                      val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                      name + ":" + scoreNew.toString
+                    } else {
+                      name + ":" + score.toString
+                    }
+                  } else {
+                    ""
+                  }
+                }).filter(_.nonEmpty)
+                result.add(
+                  (apptype, mid, cid, ts, headvideoid, label, allfeature.toString(), offlineFeatureMap.iterator.mkString(",")).productIterator.mkString("\t")
+                )
+            }
+            result.iterator
+          })
+
+        // 4 Save data to HDFS
+        val savePartition = dt + hh
+        val hdfsPath = savePath + "/" + savePartition
+        if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+          println("删除路径并开始数据写入:" + hdfsPath)
+          MyHdfsUtils.delete_hdfs_path(hdfsPath)
+          odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        } else {
+          println("路径不合法,无法写入:" + hdfsPath)
+        }
+      }
+    }
+
+
+    val data2 = sc.textFile(savePath + "/" + readDate + "*").mapPartitions(row => {
+      val result = new ArrayBuffer[(String, List[String], List[String])]()
+      // experiment 680, 517 features
+      row.foreach(r => {
+        val rList = r.split("\t")
+        val label = rList(5).toString
+        val allFeatureMap = JSON.parseObject(rList(6)).toMap.map(r => (r._1, r._2.toString))
+        val offlineFeature = rList(7).split(",").map(r => (r.split(":")(0), r.split(":")(1))).toMap
+
+        val offlineFeatureList = allFeatureMap.map {
+          case (key, value) =>
+            key + ":" + value
+        }.filter(_.nonEmpty).toList
+
+        val b8FeatureSet = Set("b8_3h_ctr", "b8_3h_ctcvr", "b8_3h_cvr", "b8_3h_conver", "b8_3h_ecpm", "b8_3h_click", "b8_3h_conver*log(view)", "b8_3h_conver*ctcvr", "b8_6h_ctr", "b8_6h_ctcvr", "b8_6h_cvr", "b8_6h_conver", "b8_6h_ecpm", "b8_6h_click", "b8_6h_conver*log(view)", "b8_6h_conver*ctcvr", "b8_12h_ctr", "b8_12h_ctcvr", "b8_12h_cvr", "b8_12h_conver", "b8_12h_ecpm", "b8_12h_click", "b8_12h_conver*log(view)", "b8_12h_conver*ctcvr", "b8_1d_ctr", "b8_1d_ctcvr", "b8_1d_cvr", "b8_1d_conver", "b8_1d_ecpm", "b8_1d_click", "b8_1d_conver*log(view)", "b8_1d_conver*ctcvr", "b8_3d_ctr", "b8_3d_ctcvr", "b8_3d_cvr", "b8_3d_conver", "b8_3d_ecpm", "b8_3d_click", "b8_3d_conver*log(view)", "b8_3d_conver*ctcvr", "b8_7d_ctr", "b8_7d_ctcvr", "b8_7d_cvr", "b8_7d_conver", "b8_7d_ecpm", "b8_7d_click", "b8_7d_conver*log(view)", "b8_7d_conver*ctcvr")
+        val b8AllFeatureMap = new JSONObject()
+        for (elem <- allFeatureMap) {
+          b8AllFeatureMap.put(elem._1, elem._2)
+        }
+        for (elem <- b8FeatureSet) {
+          if (!b8AllFeatureMap.containsKey(elem) && offlineFeature.contains(elem)) {
+            b8AllFeatureMap.put(elem, offlineFeature(elem))
+          }
+        }
+        val b8AllFeature = b8AllFeatureMap.map {
+          case (key, value) =>
+            key + ":" + value
+        }.filter(_.nonEmpty).toList
+
+
+
+        result.add((label, offlineFeatureList, b8AllFeature))
+      })
+
+      result.iterator
+    })
+
+    val offlineSave = "/dw/recommend/model/33_for_check_all/" + readDate
+    if (offlineSave.nonEmpty && offlineSave.startsWith("/dw/recommend/model/")) {
+      println("删除路径并开始数据写入:" + offlineSave)
+      MyHdfsUtils.delete_hdfs_path(offlineSave)
+      data2.map(r => r._1 + "\t" + r._2.mkString("\t")).saveAsTextFile(offlineSave, classOf[GzipCodec])
+    } else {
+      println("路径不合法,无法写入:" + offlineSave)
+    }
+
+    val allFeatureV1 = "/dw/recommend/model/33_for_check_all_b8/" + readDate
+    if (allFeatureV1.nonEmpty && allFeatureV1.startsWith("/dw/recommend/model/")) {
+      println("删除路径并开始数据写入:" + allFeatureV1)
+      MyHdfsUtils.delete_hdfs_path(allFeatureV1)
+      data2.map(r => r._1 + "\t" + r._3.mkString("\t")).saveAsTextFile(allFeatureV1, classOf[GzipCodec])
+    } else {
+      println("路径不合法,无法写入:" + allFeatureV1)
+    }
+
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
+    // match count, matched tags, max semantic similarity score, average semantic similarity score
+    val tagsList = tags.split(",")
+    var d1 = 0.0
+    val d2 = new ArrayBuffer[String]()
+    var d3 = 0.0
+    var d4 = 0.0
+    for (tag <- tagsList) {
+      if (title.contains(tag)) {
+        d1 = d1 + 1.0
+        d2.add(tag)
+      }
+      val score = Similarity.conceptSimilarity(tag, title)
+      d3 = if (score > d3) score else d3
+      d4 = d4 + score
+    }
+    d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
+    (d1, d2.mkString(","), d3, d4)
+  }
+}
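A small, hedged illustration of the tab-separated check record assembled above (all field values are invented; only the column order comes from the code):

object CheckRowSketch {
  def main(args: Array[String]): Unit = {
    // apptype, mid, cid, ts, headvideoid, label, allfeaturemap JSON, offline features joined by ","
    val apptype = "4"
    val mid = "mid_demo"
    val cid = "1001"
    val ts = 1724200000
    val headvideoid = "v123"
    val label = "0"
    val allfeature = """{"cpa":12.0}"""
    val offlineFeatureMap = Seq("cid_1001:1.0", "b2_1h_ctr:0.75")
    val row = (apptype, mid, cid, ts, headvideoid, label, allfeature, offlineFeatureMap.mkString(","))
      .productIterator.mkString("\t")
    println(row)
  }
}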

+ 278 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_qiao/makedata_13_originData_20240705.scala

@@ -0,0 +1,278 @@
+package com.aliyun.odps.spark.examples.makedata_qiao
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.RankExtractorFeature_20240530
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import org.xm.Similarity
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+/*
+   20240608 feature extraction
+ */
+
+object makedata_13_originData_20240705 {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val beginStr = param.getOrElse("beginStr", "2023010100")
+    val endStr = param.getOrElse("endStr", "2023010123")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/13_sample_data/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "XXXX")
+    val repartition = param.getOrElse("repartition", "100").toInt
+
+    // 2 Read ODPS table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Loop over time partitions and produce data
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    for (dt_hh <- timeRange) {
+      val dt = dt_hh.substring(0, 8)
+      val hh = dt_hh.substring(8, 10)
+      val partition = s"dt=$dt,hh=$hh"
+      println("开始执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .map(record => {
+
+          val featureMap = new JSONObject()
+
+          // a. video features
+          val b1: JSONObject = if (record.isNull("b1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b1_feature"))
+          val b2: JSONObject = if (record.isNull("b2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b2_feature"))
+          val b3: JSONObject = if (record.isNull("b3_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b3_feature"))
+          val b6: JSONObject = if (record.isNull("b6_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b6_feature"))
+          val b7: JSONObject = if (record.isNull("b7_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b7_feature"))
+
+          val b8: JSONObject = if (record.isNull("b8_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b8_feature"))
+          val b9: JSONObject = if (record.isNull("b9_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b9_feature"))
+          val b10: JSONObject = if (record.isNull("b10_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b10_feature"))
+          val b11: JSONObject = if (record.isNull("b11_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b11_feature"))
+          val b12: JSONObject = if (record.isNull("b12_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b12_feature"))
+          val b13: JSONObject = if (record.isNull("b13_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b13_feature"))
+          val b17: JSONObject = if (record.isNull("b17_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b17_feature"))
+          val b18: JSONObject = if (record.isNull("b18_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b18_feature"))
+          val b19: JSONObject = if (record.isNull("b19_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b19_feature"))
+
+
+          val origin_data = List(
+            (b1, b2, b3, "b123"), (b1, b6, b7, "b167"),
+            (b8, b9, b10, "b8910"), (b11, b12, b13, "b111213"),
+            (b17, b18, b19, "b171819")
+          )
+          for ((b_1, b_2, b_3, prefix1) <- origin_data){
+            for (prefix2 <- List(
+              "1h", "2h", "3h", "4h", "12h", "1d", "3d", "7d"
+            )){
+              val exp = if (b_1.isEmpty) 0D else b_1.getIntValue("exp_pv_" + prefix2).toDouble
+              val share = if (b_2.isEmpty) 0D else b_2.getIntValue("share_pv_" + prefix2).toDouble
+              val returns = if (b_3.isEmpty) 0D else b_3.getIntValue("return_uv_" + prefix2).toDouble
+              val f1 = RankExtractorFeature_20240530.calDiv(share, exp)
+              val f2 = RankExtractorFeature_20240530.calLog(share)
+              val f3 = RankExtractorFeature_20240530.calDiv(returns, exp)
+              val f4 = RankExtractorFeature_20240530.calLog(returns)
+              val f5 = f3 * f4
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "STR", f1)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "log(share)", f2)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ROV", f3)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "log(return)", f4)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ROV*log(return)", f5)
+            }
+          }
+
+          val video_info: JSONObject = if (record.isNull("t_v_info_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("t_v_info_feature"))
+          featureMap.put("total_time", if (video_info.containsKey("total_time")) video_info.getIntValue("total_time").toDouble else 0D)
+          featureMap.put("bit_rate", if (video_info.containsKey("bit_rate")) video_info.getIntValue("bit_rate").toDouble else 0D)
+
+          val c1: JSONObject = if (record.isNull("c1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("c1_feature"))
+          if (c1.nonEmpty){
+            featureMap.put("playcnt_6h", if (c1.containsKey("playcnt_6h")) c1.getIntValue("playcnt_6h").toDouble else 0D)
+            featureMap.put("playcnt_1d", if (c1.containsKey("playcnt_1d")) c1.getIntValue("playcnt_1d").toDouble else 0D)
+            featureMap.put("playcnt_3d", if (c1.containsKey("playcnt_3d")) c1.getIntValue("playcnt_3d").toDouble else 0D)
+            featureMap.put("playcnt_7d", if (c1.containsKey("playcnt_7d")) c1.getIntValue("playcnt_7d").toDouble else 0D)
+          }
+          val c2: JSONObject = if (record.isNull("c2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("c2_feature"))
+          if (c2.nonEmpty){
+            featureMap.put("share_pv_12h", if (c2.containsKey("share_pv_12h")) c2.getIntValue("share_pv_12h").toDouble else 0D)
+            featureMap.put("share_pv_1d", if (c2.containsKey("share_pv_1d")) c2.getIntValue("share_pv_1d").toDouble else 0D)
+            featureMap.put("share_pv_3d", if (c2.containsKey("share_pv_3d")) c2.getIntValue("share_pv_3d").toDouble else 0D)
+            featureMap.put("share_pv_7d", if (c2.containsKey("share_pv_7d")) c2.getIntValue("share_pv_7d").toDouble else 0D)
+            featureMap.put("return_uv_12h", if (c2.containsKey("return_uv_12h")) c2.getIntValue("return_uv_12h").toDouble else 0D)
+            featureMap.put("return_uv_1d", if (c2.containsKey("return_uv_1d")) c2.getIntValue("return_uv_1d").toDouble else 0D)
+            featureMap.put("return_uv_3d", if (c2.containsKey("return_uv_3d")) c2.getIntValue("return_uv_3d").toDouble else 0D)
+            featureMap.put("return_uv_7d", if (c2.containsKey("return_uv_7d")) c2.getIntValue("return_uv_7d").toDouble else 0D)
+          }
+
+          val title = if (video_info.containsKey("title")) video_info.getString("title") else ""
+          if (!title.equals("")){
+            for (key_feature <- List("c3_feature", "c4_feature", "c5_feature", "c6_feature", "c7_feature")){
+              val c34567: JSONObject = if (record.isNull(key_feature)) new JSONObject() else
+                JSON.parseObject(record.getString(key_feature))
+              for (key_time <- List("tags_1d", "tags_3d", "tags_7d")) {
+                val tags = if (c34567.containsKey(key_time)) c34567.getString(key_time) else ""
+                if (!tags.equals("")){
+                  val (f1, f2, f3, f4) = funcC34567ForTags(tags, title)
+                  featureMap.put(key_feature + "_" + key_time + "_matchnum", f1)
+                  featureMap.put(key_feature + "_" + key_time + "_maxscore", f3)
+                  featureMap.put(key_feature + "_" + key_time + "_avgscore", f4)
+                }
+              }
+            }
+          }
+
+          val vid = if (record.isNull("vid")) "" else record.getString("vid")
+          if (!vid.equals("")){
+            for (key_feature <- List("c8_feature", "c9_feature")){
+              val c89: JSONObject = if (record.isNull(key_feature)) new JSONObject() else
+                JSON.parseObject(record.getString(key_feature))
+              for (key_action <- List("share", "return")){
+                  val cfListStr = if (c89.containsKey(key_action)) c89.getString(key_action) else ""
+                  if (!cfListStr.equals("")){
+                    val cfMap = cfListStr.split(",").map(r =>{
+                      val rList = r.split(":")
+                      (rList(0), (rList(1), rList(2), rList(3)))
+                    }).toMap
+                    if (cfMap.contains(vid)){
+                      val (score, num, rank) = cfMap(vid)
+                      featureMap.put(key_feature + "_" + key_action + "_score", score.toDouble)
+                      featureMap.put(key_feature + "_" + key_action + "_num", num.toDouble)
+                      featureMap.put(key_feature + "_" + key_action + "_rank", 1.0 / rank.toDouble)
+                    }
+                  }
+              }
+            }
+          }
+
+          val d1: JSONObject = if (record.isNull("d1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("d1_feature"))
+          if (d1.nonEmpty){
+            featureMap.put("d1_exp", if (d1.containsKey("exp")) d1.getString("exp").toDouble else 0D)
+            featureMap.put("d1_return_n", if (d1.containsKey("return_n")) d1.getString("return_n").toDouble else 0D)
+            featureMap.put("d1_rovn", if (d1.containsKey("rovn")) d1.getString("rovn").toDouble else 0D)
+          }
+
+
+          /*
+
+
+          Video:
+          exposure uses pv, share uses pv, return uses uv --> 1h 2h 3h 4h 12h 1d 3d 7d
+          STR log(share) ROV log(return) ROV*log(return)
+          40 feature combinations
+          overall, overall-exposure counterpart, recommend non-cold-start root, recommend cold-start root, per-province root
+          200 feature values
+
+          Video:
+          duration, bit rate
+
+          User:
+          play count --> 6h 1d 3d 7d --> 4 features
+          share pv / return uv brought back --> 12h 1d 3d 7d --> 8 features
+          User + vid title:
+          played / returned / shared points, cumulative shares, cumulative returns --> 1d 3d 7d --> match count, max semantic similarity, avg semantic similarity --> 45 features
+          User + vid CF:
+          based on share behavior / return behavior --> "share CF" + "return-click CF": similarity score, similar count, reciprocal of similarity rank --> 12 features
+
+          Head video:
+          exposure, return, ROVn --> 3 features
+
+          Context:
+          hour, weekday, apptype, city, province, pagesource, device model
+           */
+
+
+
+          //4 Build the label info.
+          val labels = new JSONObject
+          for (labelKey <- List(
+            "is_play", "is_share", "is_return", "noself_is_return", "return_uv", "noself_return_uv", "total_return_uv",
+            "share_pv", "total_share_uv"
+          )){
+            if (!record.isNull(labelKey)){
+              labels.put(labelKey, record.getString(labelKey))
+            }
+          }
+          //5 Build the log key header.
+          val apptype = record.getString("apptype")
+          val pagesource = record.getString("pagesource")
+          val mid = record.getString("mid")
+          // vid was already extracted above
+          val ts = record.getString("ts")
+          val abcode = record.getString("abcode")
+          val level = if (record.isNull("level")) "0" else record.getString("level")
+          val logKey = (apptype, pagesource, mid, vid, ts, abcode, level).productIterator.mkString(",")
+          val labelKey = labels.toString()
+          val featureKey = featureMap.toString()
+          //6 Assemble the record and save.
+          logKey + "\t" + labelKey + "\t" + featureKey
+
+        })
+
+      // 4 Save data to HDFS
+      val savePartition = dt + hh
+      val hdfsPath = savePath + "/" + savePartition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+  def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
+    // match count, matched tags, max semantic similarity score, average semantic similarity score
+    val tagsList = tags.split(",")
+    var d1 = 0.0
+    val d2 = new ArrayBuffer[String]()
+    var d3 = 0.0
+    var d4 = 0.0
+    for (tag <- tagsList){
+      if (title.contains(tag)){
+        d1 = d1 + 1.0
+        d2.add(tag)
+      }
+      val score = Similarity.conceptSimilarity(tag, title)
+      d3 = if (score > d3) score else d3
+      d4 = d4 + score
+    }
+    d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
+    (d1, d2.mkString(","), d3, d4)
+  }
+}
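For readability, a hedged sketch of the three-column record this job writes to 13_sample_data (logKey, labelKey JSON, featureKey JSON); the concrete values below are invented:

import com.alibaba.fastjson.JSONObject

object OriginRowSketch {
  def main(args: Array[String]): Unit = {
    // logKey: apptype,pagesource,mid,vid,ts,abcode,level (comma-joined, as in the code above)
    val logKey = Seq("0", "recommend", "mid_demo", "vid_demo", "1720000000", "ab0", "0").mkString(",")

    val labels = new JSONObject()
    labels.put("is_return", "1")

    val featureMap = new JSONObject()
    featureMap.put("b123_1h_STR", 0.25)  // share pv / exposure pv in the 1h window
    featureMap.put("total_time", 58.0)

    // One output line: logKey \t labelKey \t featureKey
    println(logKey + "\t" + labels.toString + "\t" + featureMap.toString)
  }
}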

+ 91 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_qiao/makedata_14_valueData_20240705.scala

@@ -0,0 +1,91 @@
+package com.aliyun.odps.spark.examples.makedata_qiao
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+  Flatten the JSON feature maps from 13_sample_data into dense, comma-separated value vectors ordered by 20240608_feature_name.txt.
+ */
+
+object makedata_14_valueData_20240705 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240608_feature_name.txt")
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+    val contentList_bc = sc.broadcast(contentList)
+
+
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/13_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/14_feature_data/")
+    val repartition = param.getOrElse("repartition", "200").toInt
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val data = sc.textFile(readPath + "/" + date + "*")
+      val data1 = data.map(r => {
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val featureKey = rList(2)
+        (logKey, labelKey, featureKey)
+      }).filter(r =>
+        r._1.split(",")(6).equals("0")
+      ).mapPartitions(row => {
+        val result = new ArrayBuffer[String]()
+        val contentList = contentList_bc.value
+        row.foreach {
+          case (logKey, labelKey, featureKey) =>
+            val featureJson = JSON.parseObject(featureKey)
+
+            val featureValues = contentList.map(key => {
+              if (featureJson.containsKey(key)) {
+                featureJson.getDouble(key)
+              } else {
+                0.0
+              }
+            })
+            result.add(logKey + "\t" + labelKey + "\t" + featureValues.mkString(","))
+        }
+        result.iterator
+      })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data1.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+
+  }
+}
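A minimal sketch of the dense-vector step above: the ordered names in 20240608_feature_name.txt fix the column order, and missing keys fall back to 0.0 (the three names and JSON values here are invented):

import com.alibaba.fastjson.JSON

object ValueVectorSketch {
  def main(args: Array[String]): Unit = {
    // Stand-in for the broadcast contentList read from 20240608_feature_name.txt.
    val contentList = List("b123_1h_STR", "b123_1h_ROV", "total_time")

    val featureKey = """{"b123_1h_STR":0.25,"total_time":58.0}"""
    val featureJson = JSON.parseObject(featureKey)

    val featureValues = contentList.map { key =>
      if (featureJson.containsKey(key)) featureJson.getDoubleValue(key) else 0.0
    }
    println(featureValues.mkString(","))  // 0.25,0.0,58.0
  }
}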

+ 127 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_qiao/makedata_16_bucketData_20240705.scala

@@ -0,0 +1,127 @@
+package com.aliyun.odps.spark.examples.makedata_qiao
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+  Bucketize the dense 14_feature_data vectors into name:bucketScore pairs for training, using 20240609_bucket_274.txt.
+ */
+
+object makedata_16_bucketData_20240705 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240608_feature_name.txt")
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+    val contentList_br = sc.broadcast(contentList)
+
+    val resourceUrlBucket = loader.getResource("20240609_bucket_274.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r =>{
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/14_feature_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/16_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240606")
+    val endStr = param.getOrElse("endStr", "20240607")
+    val repartition = param.getOrElse("repartition", "200").toInt
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("开始执行:" + date)
+      val data = sc.textFile(readPath + date).map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val features = rList(2).split(",").map(_.toDouble)
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            val pagesource = logKeyList(1)
+            Set("0", "4", "5", "21", "3", "6").contains(apptype) && pagesource.endsWith("recommend")
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault("is_return", "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+        val result = new ArrayBuffer[String]()
+        val contentList = contentList_br.value
+        val bucketsMap = bucketsMap_br.value
+        row.foreach{
+          case (label, features) =>
+            val featuresBucket = contentList.indices.map(i =>{
+              val featureName = contentList(i)
+              val score = features(i)
+              if (score > 1E-8){
+                val (bucketNum, buckets) = bucketsMap(featureName)
+                val scoreNew = 1.0 / bucketNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                featureName + ":" + scoreNew.toString
+              }else{
+                ""
+              }
+            }).filter(_.nonEmpty)
+            result.add(label + "\t" + featuresBucket.mkString("\t"))
+        }
+        result.iterator
+      })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+
+
+
+  }
+}

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_43_bucketData_20240709.scala

@@ -93,7 +93,7 @@ object makedata_recsys_43_bucketData_20240709 {
                 case (name, score) =>
                   var ifFilter = false
                   if (filterNames.nonEmpty){
-                    filterNames.foreach(r=> if (!ifFilter && name.startsWith(r)) {ifFilter = true} )
+                    filterNames.foreach(r=> if (!ifFilter && name.contains(r)) {ifFilter = true} )
                   }
                   if (ifFilter){
                     ""

+ 141 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_43_bucketData_20240709_vid.scala

@@ -0,0 +1,141 @@
+package com.aliyun.odps.spark.examples.makedata_recsys
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+
+object makedata_recsys_43_bucketData_20240709_vid {
+  def main(args: Array[String]): Unit = {
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    param.foreach {
+      case (key, value) => {
+        println("Key: " + key + "; Value: " + value)
+      }
+    }
+
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/41_recsys_sample_data_v1/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/43_recsys_train_data_v1/")
+    val beginStr = param.getOrElse("beginStr", "20240703")
+    val endStr = param.getOrElse("endStr", "20240703")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+    val filterVids = param.getOrElse("filterVids", "").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "is_return")
+    val whatApps = param.getOrElse("whatApps", "0,4,5,21,3,6").split(",").toSet
+    val fileName = param.getOrElse("fileName", "20240709_recsys_bucket_314.txt")
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource(fileName)
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("开始执行:" + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r => {
+          val rList = r.split("\t")
+          val logKey = rList(0)
+          val labelKey = rList(1)
+          val jsons = JSON.parseObject(rList(2))
+          val features = scala.collection.mutable.Map[String, Double]()
+          jsons.foreach(r => {
+            features.put(r._1, jsons.getDoubleValue(r._1))
+          })
+          (logKey, labelKey, features)
+        })
+        .filter {
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            val pagesource = logKeyList(1)
+            whatApps.contains(apptype) && pagesource.endsWith("recommend")
+        }
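+        // Keep only the requested vids; an empty filterVids set keeps every sample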
+        .filter {
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val vid = logKeyList(3)
+            filterVids.isEmpty || filterVids.contains(vid)
+        }
+        .map {
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            val vid = logKey.split(",")(3)
+            (label, vid, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
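+          // Drop filtered feature names and near-zero scores; bucketize the rest into (insertPosition + 1) / bucketsNum, keeping the raw score when the feature has no buckets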
+          row.foreach {
+            case (label, vid, features) =>
+              val featuresBucket = features.map {
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty) {
+                    filterNames.foreach(r => if (!ifFilter && name.contains(r)) {
+                      ifFilter = true
+                    })
+                  }
+                  if (ifFilter) {
+                    ""
+                  } else {
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + vid + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+        })
+
+      // 4. Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting existing path and writing data: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, cannot write: " + hdfsPath)
+      }
+    }
+  }
+}
+

+ 136 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_43_bucketData_fu_sample_20240709.scala

@@ -0,0 +1,136 @@
+package com.aliyun.odps.spark.examples.makedata_recsys
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+import scala.util.Random
+/*
+ Variant of makedata_recsys_43_bucketData_20240709 with negative down-sampling: positives are always kept, negatives are kept with probability fuSampleRate.
+ */
+
+object makedata_recsys_43_bucketData_fu_sample_20240709 {
+  def main(args: Array[String]): Unit = {
+
+    // 1. Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/41_recsys_sample_data_v1/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/43_recsys_train_data_v1/")
+    val beginStr = param.getOrElse("beginStr", "20240703")
+    val endStr = param.getOrElse("endStr", "20240703")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "XXXXXXXXXX").split(",").filter(_.nonEmpty).toSet
+    val whatLabel = param.getOrElse("whatLabel", "is_return")
+    val whatApps = param.getOrElse("whatApps", "0,4,5,21,3,6").split(",").toSet
+    val fuSampleRate= param.getOrElse("fuSampleRate", "0.1").toDouble
+    val fileName = param.getOrElse("fileName", "20240709_recsys_bucket_314.txt")
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource(fileName)
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val source = Source.fromURL(resourceUrlBucket)
+        val buckets = try source.getLines().mkString("\n") finally source.close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r =>{
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+
+
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("Start processing: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val jsons = JSON.parseObject(rList(2))
+        val features = scala.collection.mutable.Map[String, Double]()
+        jsons.foreach(r => {
+          features.put(r._1, jsons.getDoubleValue(r._1))
+        })
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            val pagesource = logKeyList(1)
+            whatApps.contains(apptype) && pagesource.endsWith("recommend")
+        }.filter{
+          case (logKey, labelKey, features) =>
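+            // Down-sample negatives: positives are always kept, negatives pass with probability fuSampleRate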
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            "1".equals(label) || Random.nextDouble() <= fuSampleRate
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach{
+            case (label, features) =>
+              val featuresBucket = features.map{
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty){
+                    filterNames.foreach(r=> if (!ifFilter && name.contains(r)) {ifFilter = true} )
+                  }
+                  if (ifFilter){
+                    ""
+                  }else{
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+        })
+
+      // 4. Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting existing path and writing data: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, cannot write: " + hdfsPath)
+      }
+    }
+  }
+}
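
A minimal sketch of the bucketing transform both new jobs apply, for reference; the bucket boundaries below are invented for illustration, while ExtractorUtils.findInsertPosition is the project's own helper used in the code above.

import examples.extractor.ExtractorUtils

object BucketizeSketch {
  def main(args: Array[String]): Unit = {
    val bucketsNum = 4.0
    val buckets = Array(0.1, 0.5, 2.0, 10.0) // assumed boundaries, not taken from the resource file
    val score = 0.7
    // Same formula as in the jobs above: shift the insert position by 1 and normalize into (0, 1].
    val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
    println("feature:" + scoreNew) // 0.75 if findInsertPosition returns 2 for this score
  }
}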

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/myUtils/ParamUtils.scala

@@ -3,7 +3,7 @@ package com.aliyun.odps.spark.examples.myUtils
 import scala.collection.mutable
 object ParamUtils {
   def parseArgs(args: Array[String]): mutable.HashMap[String, String] = {
-    println("args size:" + args.size)
+    println("args size:" + args.length)
 
     val rst = new mutable.HashMap[String, String]() {
       override def default(key: String) = "无参数传入"

+ 4 - 4
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-广告

@@ -4,11 +4,11 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 --master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 tablePart:64 repartition:16 \
-beginStr:2024070108 endStr:2024070323 \
+beginStr:2024072408 endStr:2024072423 \
 savePath:/dw/recommend/model/31_ad_sample_data_v3/ \
 table:alg_recsys_ad_sample_all filterHours:00,01,02,03,04,05,06,07 \
 idDefaultValue:0.01 \
-> p31_2024070108.log 2>&1 &
+> p31_2024072423.log 2>&1 &
 
 
 nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
@@ -28,9 +28,9 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 readPath:/dw/recommend/model/31_ad_sample_data_v3/ \
 savePath:/dw/recommend/model/33_ad_train_data_v3/ \
-beginStr:20240703 endStr:20240703 repartition:100 \
+beginStr:20240724 endStr:20240724 repartition:100 \
 filterNames:adid_,targeting_conversion_ \
-> p33_20240703_.log 2>&1 &
+> p33_20240724_.log 2>&1 &
 
 filterNames:adid_,targeting_conversion_ \
 filterNames:cid_,adid_,adverid_,targeting_conversion_ \

+ 5 - 0
src/main/scala/com/tzld/recommend/recall/algo/CollaborativeFilteringAlgo.scala

@@ -0,0 +1,5 @@
+package com.tzld.recommend.recall.algo
+
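+// Empty stub for a collaborative-filtering recall algorithm.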
+class CollaborativeFilteringAlgo {
+
+}

+ 0 - 0
zhangbo/01_train.sh


+ 0 - 0
zhangbo/02_train_go.sh


+ 0 - 0
zhangbo/03_predict.sh


+ 0 - 0
zhangbo/04_upload.sh


+ 0 - 0
zhangbo/05_update_everyday_2model.sh


+ 0 - 0
zhangbo/05_update_everyday_str.sh


+ 0 - 0
zhangbo/06_update_everyday_feature.sh


+ 0 - 0
zhangbo/50_delete_hdfs.sh


+ 0 - 0
zhangbo/train.sh


+ 0 - 0
zhangbo/up.sh


+ 0 - 0
zhangbo/up2.sh


+ 1 - 1
zhangbo/utils.py

@@ -92,7 +92,7 @@ if __name__ == '__main__':
     elif args.excute_program == "check_user_hive":
         check_user_hive(args)
     elif args.excute_program == "check_hive":
-            check_hive(args)
+        check_hive(args)
     else:
         print("无合法参数,验证失败。")
         exit(999)

Some files were not shown because too many files changed in this diff