소스 검색

feat:修改str特征生产脚本

zhaohaipeng 1 개월 전
부모
커밋
3301b2439a

+ 2 - 24
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/v20250218/makedata_recsys_41_str_train_data_sample_20250319.scala

@@ -9,11 +9,8 @@ import examples.extractor.v20250218.ExtractFeature20250218
 import examples.utils.{FestiveUtil, SimilarityUtils, StatisticsUtil}
 import org.apache.hadoop.io.compress.GzipCodec
 import org.apache.spark.sql.SparkSession
-import org.xm.Similarity
 
 import java.util
-import scala.collection.JavaConversions._
-import scala.collection.mutable.ArrayBuffer
 import scala.util.Random
 
 /*
@@ -178,13 +175,13 @@ object makedata_recsys_41_str_train_data_sample_20250319 {
         })
     }
     }.reduce(_ union _)
-
+    println(s"odos count: " + odpsData.count())
     // 4 保存数据到hdfs
     val hdfsPath = savePath
     if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
       println("删除路径并开始数据写入:" + hdfsPath)
       MyHdfsUtils.delete_hdfs_path(hdfsPath)
-      odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      odpsData.coalesce(repartition, shuffle = true).saveAsTextFile(hdfsPath, classOf[GzipCodec])
     } else {
       println("路径不合法,无法写入:" + hdfsPath)
     }
@@ -195,23 +192,4 @@ object makedata_recsys_41_str_train_data_sample_20250319 {
     record
   }
 
-  def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
-    // 匹配数量 匹配词 语义最高相似度分 语义平均相似度分
-    val tagsList = tags.split(",")
-    var d1 = 0.0
-    val d2 = new ArrayBuffer[String]()
-    var d3 = 0.0
-    var d4 = 0.0
-    for (tag <- tagsList) {
-      if (title.contains(tag)) {
-        d1 = d1 + 1.0
-        d2.add(tag)
-      }
-      val score = Similarity.conceptSimilarity(tag, title)
-      d3 = if (score > d3) score else d3
-      d4 = d4 + score
-    }
-    d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
-    (d1, d2.mkString(","), d3, d4)
-  }
 }