|
@@ -86,13 +86,15 @@ object train_01_xgb_ad_20240808{
|
|
|
.setNumRound(num_round)
|
|
|
.setSubsample(0.8)
|
|
|
.setColsampleBytree(0.8)
|
|
|
-// .setScalePosWeight(1)
|
|
|
+ .setScalePosWeight(1)
|
|
|
.setObjective(func_object)
|
|
|
.setEvalMetric(func_metric)
|
|
|
.setFeaturesCol("features")
|
|
|
.setLabelCol("label")
|
|
|
.setNthread(1)
|
|
|
.setNumWorkers(num_worker)
|
|
|
+ .setSeed(2024)
|
|
|
+ .setMinChildWeight(1)
|
|
|
val model = xgbClassifier.fit(xgbInput)
|
|
|
|
|
|
|
|
@@ -126,6 +128,22 @@ object train_01_xgb_ad_20240808{
|
|
|
.setMetricName("areaUnderROC")
|
|
|
val auc = evaluator.evaluate(predictions.select("label", "probability"))
|
|
|
println("zhangbo:auc:" + auc)
|
|
|
+
|
|
|
+ // Per-cid statistics: for each cid found in the lines of hdfsPath, print row count, label sum, score sum, mean label and mean score.
|
|
|
+ sc.textFile(hdfsPath).map(r=>{ // one record per text line
|
|
|
+ val rList = r.split("\t") // tab-separated fields; indices 0, 2 and 3 are read below
|
|
|
+ val cid = rList(3) // grouping key; throws if the line has fewer than 4 fields
|
|
|
+ val score = rList(2).replace("[", "").replace("]", "") // strip the square brackets around field 2
|
|
|
+ .split(",")(2).toDouble // third comma-separated value; NOTE(review): presumably a serialized score vector - confirm it always has at least 3 entries
|
|
|
+ val label = rList(0).toDouble // field 0 parsed as a numeric label
|
|
|
+ (cid, (1, label, score)) // the leading 1 is a per-row counter that is summed in reduceByKey
|
|
|
+ }).reduceByKey{
|
|
|
+ case (a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3) // element-wise sum: count, label sum, score sum
|
|
|
+ }.map{
|
|
|
+ case (cid, (all, zheng, scores)) => // all = row count, zheng = summed labels, scores = summed scores
|
|
|
+ (cid, all, zheng, scores, zheng / all, scores / all) // the last two columns are the per-cid mean label and mean score
|
|
|
+ }.collect().sortBy(_._1).map(_.productIterator.mkString("\t")).foreach(println) // sort the collected rows by cid and print each as a tab-joined line
|
|
|
+
|
|
|
}
|
|
|
|
|
|
|