@@ -1,75 +0,0 @@
-package com.tzld.piaoquan.recommend.model
-
-import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
-import org.apache.commons.lang.math.NumberUtils
-import org.apache.commons.lang3.StringUtils
-import org.apache.hadoop.io.compress.GzipCodec
-import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
-import org.apache.spark.ml.feature.VectorAssembler
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.types.DataTypes
-import org.apache.spark.sql.{Dataset, Row, SparkSession}
-
-import java.util
-import scala.io.Source
-
-object ana_01_xgb_ad_20240809{
-  def main(args: Array[String]): Unit = {
-    val spark = SparkSession
-      .builder()
-      .appName(this.getClass.getName)
-      .getOrCreate()
-    val sc = spark.sparkContext
-
-    val param = ParamUtils.parseArgs(args)
-    val savePath = param.getOrElse("savePath", "/dw/recommend/model/34_ad_predict_data/")
-
-    val hdfsPath = savePath
-    // 统计分cid的分数 (per-cid score statistics)
-    sc.textFile(hdfsPath).map(r=>{
-      val rList = r.split("\t")
-      val cid = rList(3)
-      val score = rList(2).replace("[", "").replace("]", "")
-        .split(",")(1).toDouble
-      val label = rList(0).toDouble
-      (cid, (1, label, score))
-    }).reduceByKey{
-      case (a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3)
-    }.map{
-      case (cid, (all, zheng, scores)) =>
-        (cid, all, zheng, scores, zheng / all, scores / all)
-    }.collect().sortBy(_._1).map(_.productIterator.mkString("\t")).foreach(println)
-
-  }
-
-
-}
-
-
-
-//rabit_timeout -> -1
-//scale_pos_weight -> 1.0
-//seed -> 0
-//handle_invalid -> error
-//features_col -> features
-//label_col -> label
-//num_workers -> 1
-//subsample -> 0.8
-//max_depth -> 5
-//probability_col -> probability
-//raw_prediction_col -> rawPrediction
-//tree_limit -> 0
-//dmlc_worker_connect_retry -> 5
-//train_test_ratio -> 1.0
-//use_external_memory -> false
-//objective -> binary:logistic
-//eval_metric -> auc
-//num_round -> 1000
-//missing -> 0.0
-//rabit_ring_reduce_threshold -> 32768
-//tracker_conf -> TrackerConf(0,python,,)
-//eta -> 0.009999999776482582
-//colsample_bytree -> 0.8
-//allow_non_zero_for_missing -> false
-//nthread -> 8
-//prediction_col -> prediction
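
The deleted job reads prediction output lines of the form label \t ... \t [p0,p1] \t cid, then aggregates per cid to print record count, positive-label count, score sum, empirical positive rate, and mean predicted score. The trailing comment block is a dump of the XGBoost training parameters from the run that produced those predictions (objective binary:logistic, eval_metric auc, eta ~0.01, max_depth 5, subsample and colsample_bytree 0.8, num_round 1000). Below is a minimal sketch of how such a parameter map is typically passed to xgboost4j-spark's XGBoostClassifier; the object name, the subset of parameters chosen, and the column names are assumptions for illustration, since the deleted file itself never constructed the classifier.

import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier

// Hypothetical sketch, not part of the deleted file.
object XgbAdParamsSketch {
  def main(args: Array[String]): Unit = {
    // Values copied from the parameter dump above; eta is rounded to 0.01.
    val xgbParams: Map[String, Any] = Map(
      "objective"        -> "binary:logistic",
      "eval_metric"      -> "auc",
      "eta"              -> 0.01,
      "max_depth"        -> 5,
      "subsample"        -> 0.8,
      "colsample_bytree" -> 0.8,
      "scale_pos_weight" -> 1.0,
      "num_round"        -> 1000,
      "num_workers"      -> 1,
      "nthread"          -> 8
    )
    // "features" / "label" mirror the features_col / label_col entries in the dump.
    val classifier = new XGBoostClassifier(xgbParams)
      .setFeaturesCol("features")
      .setLabelCol("label")
    // Print the resolved configuration; training data and fitting are out of scope here.
    println(classifier.explainParams())
  }
}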