|
@@ -56,6 +56,13 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
|
|
(rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
|
|
(rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
|
|
}).toMap
|
|
}).toMap
|
|
val bucketsMap_br = sc.broadcast(bucketsMap)
|
|
val bucketsMap_br = sc.broadcast(bucketsMap)
|
|
|
|
+ val denseFeatureNames = bucketsMap.keySet
|
|
|
|
+ val sparseFeatureNames = Set(
|
|
|
|
+ "cid", "adid", "adverid", "targeting_conversion",
|
|
|
|
+ "region", "city", "brand",
|
|
|
|
+ "vid", "cate1", "cate2",
|
|
|
|
+ "user_cid_click_list", "user_cid_conver_list",
|
|
|
|
+ "user_vid_return_tags_2h", "user_vid_return_tags_1d", "user_vid_return_tags_3d", "user_vid_return_tags_7d", "user_vid_return_tags_14d")
|
|
|
|
|
|
|
|
|
|
// 2 读取odps+表信息
|
|
// 2 读取odps+表信息
|
|
@@ -469,38 +476,35 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
|
|
val headvideoid = record.getString("headvideoid")
|
|
val headvideoid = record.getString("headvideoid")
|
|
val logKey = (apptype, mid, cid, ts, headvideoid).productIterator.mkString(",")
|
|
val logKey = (apptype, mid, cid, ts, headvideoid).productIterator.mkString(",")
|
|
val labelKey = labels.toString()
|
|
val labelKey = labels.toString()
|
|
- val featureKey = featureMap.toString()
|
|
|
|
-
|
|
|
|
- val mutableMap = scala.collection.mutable.Map[String, String]()
|
|
|
|
- //6 拼接数据,保存。
|
|
|
|
- mutableMap.put("logKey", logKey)
|
|
|
|
- mutableMap.put("labelKey", labelKey)
|
|
|
|
- mutableMap.put("featureKey", featureKey)
|
|
|
|
- mutableMap
|
|
|
|
|
|
+ (logKey, labelKey, featureMap)
|
|
})
|
|
})
|
|
odpsData
|
|
odpsData
|
|
}).reduce(_ union _)
|
|
}).reduce(_ union _)
|
|
- .map(record => {
|
|
|
|
- val logKey = record.getOrElse("logKey", "")
|
|
|
|
- val labelKey = record.getOrElse("labelKey", "")
|
|
|
|
- val featureKey = record.getOrElse("featureKey", "")
|
|
|
|
- val jsons = JSON.parseObject(featureKey)
|
|
|
|
- val features = scala.collection.mutable.Map[String, Double]()
|
|
|
|
- jsons.foreach(r => {
|
|
|
|
- features.put(r._1, jsons.getDoubleValue(r._1))
|
|
|
|
|
|
+ .map{ case (logKey, labelKey, jsons) =>
|
|
|
|
+ val denseFeatures = scala.collection.mutable.Map[String, Double]()
|
|
|
|
+ val sparseFeatures = scala.collection.mutable.Map[String, String]()
|
|
|
|
+ denseFeatureNames.foreach(r => {
|
|
|
|
+ if (jsons.containsKey(r)) {
|
|
|
|
+ denseFeatures.put(r, jsons.getDoubleValue(r))
|
|
|
|
+ }
|
|
})
|
|
})
|
|
- (logKey, labelKey, features)
|
|
|
|
- }).filter {
|
|
|
|
- case (logKey, labelKey, features) =>
|
|
|
|
|
|
+ sparseFeatureNames.foreach(r => {
|
|
|
|
+ if (jsons.containsKey(r)) {
|
|
|
|
+ sparseFeatures.put(r, jsons.get(r).toString)
|
|
|
|
+ }
|
|
|
|
+ })
|
|
|
|
+ (logKey, labelKey, denseFeatures, sparseFeatures)
|
|
|
|
+ }.filter {
|
|
|
|
+ case (logKey, labelKey, denseFeatures, sparseFeatures) =>
|
|
val logKeyList = logKey.split(",")
|
|
val logKeyList = logKey.split(",")
|
|
val apptype = logKeyList(0)
|
|
val apptype = logKeyList(0)
|
|
!Set("12", "13").contains(apptype)
|
|
!Set("12", "13").contains(apptype)
|
|
}
|
|
}
|
|
.map {
|
|
.map {
|
|
- case (logKey, labelKey, features) =>
|
|
|
|
|
|
+ case (logKey, labelKey, denseFeatures, sparseFeatures) =>
|
|
val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
|
|
val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
|
|
val bucketsMap = bucketsMap_br.value
|
|
val bucketsMap = bucketsMap_br.value
|
|
- var resultMap = features.collect {
|
|
|
|
|
|
+ var resultMap = denseFeatures.collect {
|
|
case (name, score) if !filterNames.exists(name.contains) && score > 1E-8 =>
|
|
case (name, score) if !filterNames.exists(name.contains) && score > 1E-8 =>
|
|
var key = name.replace("*", "_x_").replace("(view)", "_view")
|
|
var key = name.replace("*", "_x_").replace("(view)", "_view")
|
|
if (key == "ad_is_click") {
|
|
if (key == "ad_is_click") {
|
|
@@ -514,6 +518,9 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
|
|
}
|
|
}
|
|
key -> value.toString
|
|
key -> value.toString
|
|
}.toMap
|
|
}.toMap
|
|
|
|
+ sparseFeatures.foreach(kv => {
|
|
|
|
+ resultMap += (kv._1 -> kv._2)
|
|
|
|
+ })
|
|
resultMap += ("has_conversion" -> label)
|
|
resultMap += ("has_conversion" -> label)
|
|
resultMap += ("logkey" -> logKey)
|
|
resultMap += ("logkey" -> logKey)
|
|
(label.toInt, resultMap, Random.nextDouble())
|
|
(label.toInt, resultMap, Random.nextDouble())
|
|
@@ -535,13 +542,13 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
|
|
try {
|
|
try {
|
|
columnType.getTypeName match {
|
|
columnType.getTypeName match {
|
|
case "STRING" =>
|
|
case "STRING" =>
|
|
- record.setString(columnIndex, value.toString)
|
|
|
|
|
|
+ record.setString(columnIndex, value)
|
|
case "BIGINT" =>
|
|
case "BIGINT" =>
|
|
- record.setBigint(columnIndex, value.toString.toLong)
|
|
|
|
|
|
+ record.setBigint(columnIndex, value.toLong)
|
|
case "DOUBLE" =>
|
|
case "DOUBLE" =>
|
|
- record.setDouble(columnIndex, value.toString.toDouble)
|
|
|
|
|
|
+ record.setDouble(columnIndex, value.toDouble)
|
|
case "BOOLEAN" =>
|
|
case "BOOLEAN" =>
|
|
- record.setBoolean(columnIndex, value.toString.toBoolean)
|
|
|
|
|
|
+ record.setBoolean(columnIndex, value.toBoolean)
|
|
case other =>
|
|
case other =>
|
|
throw new IllegalArgumentException(s"Unsupported column type: $other")
|
|
throw new IllegalArgumentException(s"Unsupported column type: $other")
|
|
}
|
|
}
|