|
@@ -129,23 +129,28 @@ object train_recsys_61_xgb_nor_20241209 {
|
|
|
}
|
|
|
|
|
|
def createData(data: RDD[String], features: Array[String]): RDD[Row] = {
|
|
|
- data.map(r => {
|
|
|
- val line: Array[String] = StringUtils.split(r, '\t')
|
|
|
- // val logKey = line(0)
|
|
|
- val label: Double = NumberUtils.toDouble(line(1))
|
|
|
- // val scoresMap = line(2)
|
|
|
- val map: util.Map[String, Double] = new util.HashMap[String, Double]
|
|
|
- for (i <- 3 until line.length) {
|
|
|
- val fv: Array[String] = StringUtils.split(line(i), ':')
|
|
|
- map.put(fv(0), NumberUtils.toDouble(fv(1), 0.0))
|
|
|
- }
|
|
|
+ data
|
|
|
+ .filter(r => {
|
|
|
+ val line: Array[String] = StringUtils.split(r, '\t')
|
|
|
+ line.length > 10
|
|
|
+ })
|
|
|
+ .map(r => {
|
|
|
+ val line: Array[String] = StringUtils.split(r, '\t')
|
|
|
+ // val logKey = line(0)
|
|
|
+ val label: Double = NumberUtils.toDouble(line(1))
|
|
|
+ // val scoresMap = line(2)
|
|
|
+ val map: util.Map[String, Double] = new util.HashMap[String, Double]
|
|
|
+ for (i <- 3 until line.length) {
|
|
|
+ val fv: Array[String] = StringUtils.split(line(i), ':')
|
|
|
+ map.put(fv(0), NumberUtils.toDouble(fv(1), 0.0))
|
|
|
+ }
|
|
|
|
|
|
- val v: Array[Any] = new Array[Any](features.length + 1)
|
|
|
- v(0) = label
|
|
|
- for (i <- 0 until features.length) {
|
|
|
- v(i + 1) = map.getOrDefault(features(i), 0.0d)
|
|
|
- }
|
|
|
- Row(v: _*)
|
|
|
- })
|
|
|
+ val v: Array[Any] = new Array[Any](features.length + 1)
|
|
|
+ v(0) = label
|
|
|
+ for (i <- 0 until features.length) {
|
|
|
+ v(i + 1) = map.getOrDefault(features(i), 0.0d)
|
|
|
+ }
|
|
|
+ Row(v: _*)
|
|
|
+ })
|
|
|
}
|
|
|
}
|