@@ -1,15 +1,5 @@
 package com.aliyun.odps.spark.examples.makedata
 
-import com.alibaba.fastjson.JSON
-import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
-import examples.extractor.ExtractorUtils
-import org.apache.hadoop.io.compress.GzipCodec
-import org.apache.spark.sql.SparkSession
-import com.alibaba.fastjson.{JSON, JSONObject}
-
-import scala.collection.JavaConversions._
-import scala.collection.mutable.ArrayBuffer
-import scala.io.Source
 
 /*
 */
@@ -35,8 +25,8 @@ object makedata_16_bucketData_20240609_check {
     }
     println(content)
     val contentList = content.split("\n")
-      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
-      .filter(r=> r.nonEmpty).toList
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty).toList
     val contentList_br = sc.broadcast(contentList)
 
     val resourceUrlBucket = loader.getResource("20240609_bucket_274.txt")
@@ -52,7 +42,7 @@ object makedata_16_bucketData_20240609_check {
     val bucketsMap = buckets.split("\n")
       .map(r => r.replace(" ", "").replaceAll("\n", ""))
       .filter(r => r.nonEmpty)
-      .map(r =>{
+      .map(r => {
         val rList = r.split("\t")
         (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
       }).toMap
@@ -72,16 +62,16 @@ object makedata_16_bucketData_20240609_check {
     val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
     for (date <- dateRange) {
       println("开始执行:" + date)
-      val data = sc.textFile(readPath + date).map(r=>{
+      val data = sc.textFile(readPath + date).map(r => {
         val rList = r.split("\t")
         val logKey = rList(0)
         val labelKey = rList(1)
         val features = rList(2).split(",").map(_.toDouble)
         val allFeature: JSONObject = if (rList(3).equals("\\N")) new JSONObject() else
-        JSON.parseObject(rList(3))
+          JSON.parseObject(rList(3))
         (logKey, labelKey, features, allFeature)
       })
-        .filter{
+        .filter {
           case (logKey, labelKey, features, allFeature) =>
             val logKeyList = logKey.split(",")
             val apptype = logKeyList(0)
@@ -91,42 +81,42 @@ object makedata_16_bucketData_20240609_check {
             APPSETS.contains(apptype) && pagesource.endsWith("recommend") &&
               ABSETS.contains(abcode) && level.equals("0")
         }
-        .map{
+        .map {
           case (logKey, labelKey, features, allFeature) =>
             val label = JSON.parseObject(labelKey).getOrDefault("is_return", "0").toString
             (label, features, allFeature)
         }
         .mapPartitions(row => {
-        val result = new ArrayBuffer[String]()
-        val contentList = contentList_br.value
-        val bucketsMap = bucketsMap_br.value
-        row.foreach{
-          case (label, features, allFeature) =>
-            val featuresBucket = contentList.indices.map(i =>{
-              val featureName = contentList(i)
-              val score = features(i)
-              // 用户
-              if (featureName.startsWith("c")) {
-                val scoreNew = allFeature.getOrDefault(featureName, "").toString
-                if (scoreNew.equals("")) {
-                  ""
-                } else {
-                  featureName + ":" + scoreNew.toString
-                }
-              } else {
-                if (score > 1E-8) {
-                  val (bucketNum, buckets) = bucketsMap(featureName)
-                  val scoreNew = 1.0 / bucketNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
-                  featureName + ":" + scoreNew.toString
+          val result = new ArrayBuffer[String]()
+          val contentList = contentList_br.value
+          val bucketsMap = bucketsMap_br.value
+          row.foreach {
+            case (label, features, allFeature) =>
+              val featuresBucket = contentList.indices.map(i => {
+                val featureName = contentList(i)
+                val score = features(i)
+                // 用户
+                if (featureName.startsWith("c") || featureName.startsWith("b")) {
+                  val scoreNew = allFeature.getOrDefault(featureName, "").toString
+                  if (scoreNew.equals("")) {
+                    ""
+                  } else {
+                    featureName + ":" + scoreNew
+                  }
                 } else {
-                  ""
+                  if (score > 1E-8) {
+                    val (bucketNum, buckets) = bucketsMap(featureName)
+                    val scoreNew = 1.0 / bucketNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                    featureName + ":" + scoreNew.toString
+                  } else {
+                    ""
+                  }
                 }
-            }
-          }).filter(_.nonEmpty)
-          result.add(label + "\t" + featuresBucket.mkString("\t"))
-        }
-        result.iterator
-      })
+              }).filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+        })
 
       // 4 保存数据到hdfs
       val hdfsPath = savePath + "/" + date
@@ -140,6 +130,5 @@ object makedata_16_bucketData_20240609_check {
     }
 
 
-
   }
 }
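
For reference, the bucketing rule inside the reformatted mapPartitions block can be read in isolation. Below is a minimal, self-contained Scala sketch, not the project's code: BucketizeSketch and its binary-search findInsertPosition are assumptions standing in for examples.extractor.ExtractorUtils.findInsertPosition, whose exact handling of scores that tie with a boundary is not visible in this diff.

// Sketch only (assumed semantics): map a raw feature score into (0, 1]
// using the (bucketNum, boundaries) pairs loaded from the bucket file.
object BucketizeSketch {
  // Index at which `score` would be inserted to keep `boundaries` sorted;
  // hypothetical stand-in for ExtractorUtils.findInsertPosition.
  def findInsertPosition(boundaries: Array[Double], score: Double): Int = {
    val idx = java.util.Arrays.binarySearch(boundaries, score)
    if (idx >= 0) idx else -idx - 1 // binarySearch encodes a miss as -(insertionPoint) - 1
  }

  // Mirrors the else-branch of the diff: bucket k of n becomes the score k / n.
  def bucketize(featureName: String, score: Double,
                bucketsMap: Map[String, (Double, Array[Double])]): String = {
    if (score > 1e-8) {
      val (bucketNum, boundaries) = bucketsMap(featureName)
      val scoreNew = 1.0 / bucketNum * (findInsertPosition(boundaries, score).toDouble + 1.0)
      featureName + ":" + scoreNew
    } else {
      "" // near-zero scores become empty strings, later dropped by .filter(_.nonEmpty)
    }
  }

  def main(args: Array[String]): Unit = {
    // One bucket-file entry: 4 buckets with boundaries 0.25, 0.5, 0.75.
    val bucketsMap = Map("d1" -> (4.0, Array(0.25, 0.5, 0.75)))
    println(bucketize("d1", 0.6, bucketsMap)) // d1:0.75 (insert position 2 -> bucket 3 of 4)
    println(bucketize("d1", 0.0, bucketsMap)) // ""      (filtered by the 1E-8 guard)
  }
}

Features whose names start with "c" (and, after this change, also "b") bypass this bucketing and take their raw value from the allFeature JSON object instead; that widened prefix check is the one behavioral change in this diff, alongside the whitespace cleanup.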