
feat: modify scripts

zhaohaipeng 10 months ago
parent commit 658693329d

+ 4 - 4
src/main/scala/com/aliyun/odps/spark/zhp/makedata_ad_31_originData_20240620.scala → src/main/scala/com/aliyun/odps/spark/zhp/makedata_ad/makedata_ad_31_originData_20240620.scala

@@ -1,4 +1,4 @@
-package com.aliyun.odps.spark.zhp
+package com.aliyun.odps.spark.zhp.makedata_ad
 
 import com.alibaba.fastjson.{JSON, JSONObject}
 import com.aliyun.odps.TableSchema
@@ -215,16 +215,16 @@ object makedata_ad_31_originData_20240620 {
               midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
             ))
           }
-          if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("timediff_conver_" + cid)) {
+          if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("actionstatic_conver_" + cid)) {
             featureMap.put("actionstatic_ctcvr", RankExtractorFeature_20240530.calDiv(
-              midActionStatic.getOrDefault("timediff_conver_" + cid, 0.0),
+              midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0),
               midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
             ))
           }
           if (midActionStatic.contains("actionstatic_conver_" + cid) && midActionStatic.contains("actionstatic_click_" + cid)) {
             featureMap.put("actionstatic_cvr", RankExtractorFeature_20240530.calDiv(
               midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0),
-              midActionStatic.getOrDefault("timediff_conver_" + cid, 0.0)
+              midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0)
             ))
           }
 
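The hunk above fixes a mismatched key: actionstatic_ctcvr and actionstatic_cvr were reading the conversion count from timediff_conver_ instead of actionstatic_conver_, so both ratios could be computed from the wrong statistic. A minimal sketch of the corrected computation, assuming calDiv(a, b) returns a / b with a zero-denominator guard (the body of RankExtractorFeature_20240530.calDiv is not shown in this commit):

// Sketch only: calDiv semantics assumed, argument order copied from the commit.
def calDiv(a: Double, b: Double): Double = if (b == 0.0) 0.0 else a / b

val view   = midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
val click  = midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0)
val conver = midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0)

featureMap.put("actionstatic_ctcvr", calDiv(conver, view)) // conversions per view
featureMap.put("actionstatic_cvr", calDiv(click, conver))  // as ordered in the commit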

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/zhp/makedata_ad_32_bucket_20240622.scala → src/main/scala/com/aliyun/odps/spark/zhp/makedata_ad/makedata_ad_32_bucket_20240622.scala

@@ -1,4 +1,4 @@
-package com.aliyun.odps.spark.zhp
+package com.aliyun.odps.spark.zhp.makedata_ad
 
 import com.alibaba.fastjson.JSON
 import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils}

Changes are not shown because this file is too large.
+ 396 - 0
src/main/scala/com/aliyun/odps/spark/zhp/makedata_ad/makedata_ad_33_bucketDataPrint_20240628.scala


+ 20 - 11
src/main/scala/com/aliyun/odps/spark/zhp/makedata_ad_33_bucketData_20240622.scala → src/main/scala/com/aliyun/odps/spark/zhp/makedata_ad/makedata_ad_33_bucketData_20240622.scala

@@ -1,4 +1,4 @@
-package com.aliyun.odps.spark.zhp
+package com.aliyun.odps.spark.zhp.makedata_ad
 
 import com.alibaba.fastjson.JSON
 import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
@@ -24,7 +24,7 @@ object makedata_ad_33_bucketData_20240622 {
 
     val loader = getClass.getClassLoader
 
-    val resourceUrlBucket = loader.getResource("20240622_ad_bucket_249.txt")
+    val resourceUrlBucket = loader.getResource("20240624_ad_bucket_249.txt")
     val buckets =
       if (resourceUrlBucket != null) {
         val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
@@ -51,6 +51,7 @@ object makedata_ad_33_bucketData_20240622 {
     val beginStr = param.getOrElse("beginStr", "20240620")
     val endStr = param.getOrElse("endStr", "20240620")
     val repartition = param.getOrElse("repartition", "200").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").toSet
 
     val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
     for (date <- dateRange) {
@@ -84,16 +85,24 @@ object makedata_ad_33_bucketData_20240622 {
             case (label, features) =>
               val featuresBucket = features.map{
                 case (name, score) =>
-                  if (score > 1E-8) {
-                    if (bucketsMap.contains(name)){
-                      val (_, buckets) = bucketsMap(name)
-                      val scoreNew = 1.0 / (buckets.length + 1) * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
-                      name + ":" + scoreNew.toString
-                    }else{
-                      name + ":" + score.toString
-                    }
-                  } else {
+                  var ifFilter = false
+                  if (filterNames.nonEmpty){
+                    filterNames.foreach(r=> if (!ifFilter && name.startsWith(r)) {ifFilter = true} )
+                  }
+                  if (ifFilter){
                     ""
+                  }else{
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (_, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / (buckets.length + 1) * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
                   }
               }.filter(_.nonEmpty)
               result.add(label + "\t" + featuresBucket.mkString("\t"))
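
The new filterNames branch above drops any feature whose name starts with one of the configured prefixes before bucketing. One caveat worth noting: "".split(",") yields Array(""), so when the filterNames parameter is absent the set contains the empty string and name.startsWith("") matches every feature; filtering out empty tokens avoids that. A sketch of the same logic without the mutable flag, under that assumption:

// Sketch only: equivalent prefix filter, guarding the empty-token case.
val filterNames = param.getOrElse("filterNames", "")
  .split(",").filter(_.nonEmpty).toSet

val featuresBucket = features.flatMap { case (name, score) =>
  if (filterNames.exists(name.startsWith)) None // drop filtered prefixes
  else if (score <= 1E-8) None                  // drop near-zero scores
  else bucketsMap.get(name) match {
    case Some((_, buckets)) =>
      // normalize the raw score to a bucket index in (0, 1]
      val scoreNew = 1.0 / (buckets.length + 1) *
        (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
      Some(name + ":" + scoreNew)
    case None => Some(name + ":" + score)
  }
}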

Changes are not shown because this file is too large.
+ 0 - 19
src/main/scala/com/aliyun/odps/spark/zhp/makedata_ad_31_originData_20240628.scala


+ 60 - 0
src/main/scala/com/aliyun/odps/spark/zhp/临时记录的脚本-广告

@@ -0,0 +1,60 @@
+
+
+// Model feature generation
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_ad.makedata_ad_31_originData_20240620 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:2024062008 endStr:2024062223 \
+savePath:/dw/recommend/model/31_ad_sample_data_fix/ \
+table:alg_recsys_ad_sample_all \
+> p31_2024062008.log 2>&1 &
+
+
+
+// Feature bucketing -- generate the bucket file
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_ad.makedata_ad_32_bucket_20240622 \
+--master yarn --driver-memory 16G --executor-memory 1G --executor-cores 1 --num-executors 16 \
+--conf spark.driver.maxResultSize=16G \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+fileName:20240620_100_fix \
+savePath:/dw/recommend/model/32_bucket_data/ \
+> p32_data.log 2>&1 &
+
+
+// Feature bucketing -- filter features with fixed prefixes
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_ad.makedata_ad_33_bucketData_20240622 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+beginStr:20240623 endStr:20240623 repartition:400 \
+filterNames:XXXXXXX \
+> p33_data.log 2>&1 &
+
+filterNames:cid_,adid_,adverid_,targeting_conversion_ \
+savePath:/dw/recommend/model/33_ad_train_data_nosparse/ \
+
+
+/dw/recommend/model/31_ad_sample_data/
+/dw/recommend/model/33_ad_train_data/
+
+/dw/recommend/model/31_ad_sample_data_fix/
+
+
+
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_ad.makedata_ad_33_bucketDataPrint_20240628 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+beginStr:2024062717 endStr:2024062723 \
+readDate:20240627 \
+table:alg_recsys_ad_sample_all_new \
+savePath:/dw/recommend/model/33_for_check/ \
+> p33_data_check.log 2>&1 &
+
+
+/dw/recommend/model/33_for_check_v1/

Too many files were changed in this diff, so some files are not shown.