|
@@ -9,11 +9,8 @@ import examples.extractor.v20250218.ExtractFeature20250218
|
|
|
import examples.utils.{FestiveUtil, SimilarityUtils, StatisticsUtil}
|
|
|
import org.apache.hadoop.io.compress.GzipCodec
|
|
|
import org.apache.spark.sql.SparkSession
|
|
|
-import org.xm.Similarity
|
|
|
|
|
|
import java.util
|
|
|
-import scala.collection.JavaConversions._
|
|
|
-import scala.collection.mutable.ArrayBuffer
|
|
|
import scala.util.Random
|
|
|
|
|
|
/*
|
|
@@ -178,13 +175,13 @@ object makedata_recsys_41_str_train_data_sample_20250319 {
|
|
|
})
|
|
|
}
|
|
|
}.reduce(_ union _)
|
|
|
-
|
|
|
+ println(s"odps count: " + odpsData.count())
|
|
|
// 4 保存数据到hdfs
|
|
|
val hdfsPath = savePath
|
|
|
if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
|
|
|
println("删除路径并开始数据写入:" + hdfsPath)
|
|
|
MyHdfsUtils.delete_hdfs_path(hdfsPath)
|
|
|
- odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
|
|
|
+ odpsData.coalesce(repartition, shuffle = true).saveAsTextFile(hdfsPath, classOf[GzipCodec])
|
|
|
} else {
|
|
|
println("路径不合法,无法写入:" + hdfsPath)
|
|
|
}
|
|
@@ -195,23 +192,4 @@ object makedata_recsys_41_str_train_data_sample_20250319 {
|
|
|
record
|
|
|
}
|
|
|
|
|
|
- def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
|
|
|
- // 匹配数量 匹配词 语义最高相似度分 语义平均相似度分
|
|
|
- val tagsList = tags.split(",")
|
|
|
- var d1 = 0.0
|
|
|
- val d2 = new ArrayBuffer[String]()
|
|
|
- var d3 = 0.0
|
|
|
- var d4 = 0.0
|
|
|
- for (tag <- tagsList) {
|
|
|
- if (title.contains(tag)) {
|
|
|
- d1 = d1 + 1.0
|
|
|
- d2.add(tag)
|
|
|
- }
|
|
|
- val score = Similarity.conceptSimilarity(tag, title)
|
|
|
- d3 = if (score > d3) score else d3
|
|
|
- d4 = d4 + score
|
|
|
- }
|
|
|
- d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
|
|
|
- (d1, d2.mkString(","), d3, d4)
|
|
|
- }
|
|
|
}
|