Browse Source

dssm过滤掉无品类的item

zhangbo 4 months ago
parent
commit
cfc9bd3986

+ 11 - 1
src/main/scala/com/aliyun/odps/spark/examples/makedata_dssm/makedata_i2i_06_itemPred_20241206.scala

@@ -32,6 +32,7 @@ object makedata_i2i_06_itemPred_20241206 {
     val savePath = param.getOrElse("savePath", "/dw/recommend/model/56_dssm_i2i_itempredData/")
     val project = param.getOrElse("project", "loghubods")
     val repartition = param.getOrElse("repartition", "100").toInt
+    val ifFilterCate = param.getOrElse("ifFilterCate", "true").toBoolean
 
     // 2 读取onehot文件
     val onehotMap_br = sc.broadcast(
@@ -126,7 +127,16 @@ object makedata_i2i_06_itemPred_20241206 {
           result.add((vid, (feature, feature_action, feature_cate1, feature_cate2)))
       }
       result.iterator
-    }).mapPartitions(row =>{
+    }).filter{
+      // Keep every row when ifFilterCate is off; otherwise drop rows where both category1 and category2_1 are absent ("无").
+      case (_, (feature, _, _, _)) =>
+        !ifFilterCate || {
+          val featureJson = JSON.parseObject(feature) // parse the feature string once and reuse it for both lookups
+          val cate1 = featureJson.getOrDefault("category1", "无").toString
+          val cate2 = featureJson.getOrDefault("category2_1", "无").toString
+          !cate1.equals("无") || !cate2.equals("无")
+        }
+    }.mapPartitions(row =>{
       val result = new ArrayBuffer[String]()
       val onehotMap = onehotMap_br.value
       val bucketsMap = bucketsMap_br.value

+ 2 - 2
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-I2I

@@ -67,7 +67,7 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 --master yarn --driver-memory 2G --executor-memory 2G --executor-cores 1 --num-executors 16 \
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 onehotPath:/dw/recommend/model/53_dssm_i2i_onehot/after_20241201_file \
-bucketFile:20241128_recsys_i2i_bucket_47_v2.txt repartition:100 \
-dt:20241206 \
+bucketFile:20241128_recsys_i2i_bucket_47_v2.txt repartition:10 ifFilterCate:true \
+dt:20241225 \
 savePath:/dw/recommend/model/56_dssm_i2i_itempredData/ \
 > p56.log 2>&1 &

+ 2 - 1
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-广告

@@ -51,10 +51,11 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 --class com.aliyun.odps.spark.examples.makedata_ad.v20240718.makedata_ad_34_bucketDataPrint_20241217 \
 --master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
-beginStr:2024121708 endStr:2024121709 \
+beginStr:2024121708 endStr:2024121708 \
 readDate:20241217 \
 table:alg_recsys_ad_sample_all \
 savePath:/dw/recommend/model/34_for_check/ \
+filterNames:adid_,targeting_conversion_ \
 > p34_data_check.log 2>&1 &