|  | @@ -1,248 +1,176 @@
 | 
											
												
													
														|  | -//package com.aliyun.odps.spark.examples.makedata
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//import com.aliyun.odps.TableSchema
 |  | 
 | 
											
												
													
														|  | -//import com.aliyun.odps.data.Record
 |  | 
 | 
											
												
													
														|  | -//import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils, env}
 |  | 
 | 
											
												
													
														|  | -//import com.google.gson.GsonBuilder
 |  | 
 | 
											
												
													
														|  | -//import examples.dataloader.RequestContextOffline
 |  | 
 | 
											
												
													
														|  | -//import org.apache.hadoop.io.compress.GzipCodec
 |  | 
 | 
											
												
													
														|  | -//import org.apache.spark.sql.SparkSession
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//import java.util
 |  | 
 | 
											
												
													
														|  | -//import java.util.concurrent.TimeUnit
 |  | 
 | 
											
												
													
														|  | -//import scala.collection.JavaConversions._
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//object makedata_09_user2redis {
 |  | 
 | 
											
												
													
														|  | -//  def main(args: Array[String]) {
 |  | 
 | 
											
												
													
														|  | -//    val spark = SparkSession
 |  | 
 | 
											
												
													
														|  | -//      .builder()
 |  | 
 | 
											
												
													
														|  | -//      .appName(this.getClass.getName)
 |  | 
 | 
											
												
													
														|  | -//      .getOrCreate()
 |  | 
 | 
											
												
													
														|  | -//    val sc = spark.sparkContext
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//    // 1 读取参数
 |  | 
 | 
											
												
													
														|  | -//    val param = ParamUtils.parseArgs(args)
 |  | 
 | 
											
												
													
														|  | -//    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
 |  | 
 | 
											
												
													
														|  | -//    val tablePart = param.getOrElse("tablePart", "64").toInt
 |  | 
 | 
											
												
													
														|  | -//    val ifUser = param.getOrDefault("ifUser", "False").toBoolean
 |  | 
 | 
											
												
													
														|  | -//    val ifVideo = param.getOrDefault("ifVideo", "False").toBoolean
 |  | 
 | 
											
												
													
														|  | -//    val date = param.getOrDefault("date", "20231220")
 |  | 
 | 
											
												
													
														|  | -//    val expireDay = param.getOrDefault("expireDay", "2").toInt
 |  | 
 | 
											
												
													
														|  | -//    val ifDebug = param.getOrDefault("ifDebug", "False").toBoolean
 |  | 
 | 
											
												
													
														|  | -//    val ifDeleteRedisUser = param.getOrDefault("ifDeleteRedisUser", "False").toBoolean
 |  | 
 | 
											
												
													
														|  | -//    val ifWriteRedisUser = param.getOrDefault("ifWriteRedisUser", "False").toBoolean
 |  | 
 | 
											
												
													
														|  | -//    val ifWriteRedis = param.getOrDefault("ifWriteRedis", "True").toBoolean
 |  | 
 | 
											
												
													
														|  | -//    val partition = partitionPrefix + date
 |  | 
 | 
											
												
													
														|  | -//    val savePathUser = param.getOrDefault("savePathUser", "")
 |  | 
 | 
											
												
													
														|  | -//    val savePathVideo = param.getOrDefault("savePathVideo", "")
 |  | 
 | 
											
												
													
														|  | -//    val userSampleIDs = param.getOrDefault("userSampleIDs", "")
 |  | 
 | 
											
												
													
														|  | -//    val sampleRate = param.getOrDefault("sampleRate", "1.0").toDouble
 |  | 
 | 
											
												
													
														|  | -////    val userSampleIDsPathFix = param.getOrDefault("userSampleIDsPathFix", "")
 |  | 
 | 
											
												
													
														|  | -//    //  /dw/recommend/model/feature/
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//    // 2 读取数据库odps
 |  | 
 | 
											
												
													
														|  | -//    val odpsOps = env.getODPS(sc)
 |  | 
 | 
											
												
													
														|  | -//    val project = "loghubods"
 |  | 
 | 
											
												
													
														|  | -//    val tableUser = "alg_recsys_user_info"
 |  | 
 | 
											
												
													
														|  | -//    val tableItem = "alg_recsys_video_info"
 |  | 
 | 
											
												
													
														|  | -//    val userRedisKeyPrefix = "user_info_4video_"
 |  | 
 | 
											
												
													
														|  | -//    val videoRedisKeyPrefix = "video_info_"
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//    // 3 用户测特征处理
 |  | 
 | 
											
												
													
														|  | -//    if (ifUser){
 |  | 
 | 
											
												
													
														|  | -//      println("user特征处理")
 |  | 
 | 
											
												
													
														|  | -//      var userData = odpsOps.readTable(project = project, table = tableUser, partition = partition, transfer = handleUser, numPartition = tablePart)
 |  | 
 | 
											
												
													
														|  | -//        .filter {
 |  | 
 | 
											
												
													
														|  | -//          case (mid, fea, feaSize) =>
 |  | 
 | 
											
												
													
														|  | -//            mid.nonEmpty && fea.nonEmpty && feaSize > 0
 |  | 
 | 
											
												
													
														|  | -//        }
 |  | 
 | 
											
												
													
														|  | -//      if (userSampleIDs.nonEmpty){
 |  | 
 | 
											
												
													
														|  | -//        val IDs = userSampleIDs.split(",").filter(_.nonEmpty).map(_.toInt).toList
 |  | 
 | 
											
												
													
														|  | -//        userData = userData.filter(r => IDs.contains(r._1.hashCode % 10))
 |  | 
 | 
											
												
													
														|  | -//      }
 |  | 
 | 
											
												
													
														|  | -//      if (ifDebug){
 |  | 
 | 
											
												
													
														|  | -//        println("user特征处理-debug开启-只保留5条数据-特征数量大于1")
 |  | 
 | 
											
												
													
														|  | -//        val userDataTake = userData.take(5)
 |  | 
 | 
											
												
													
														|  | -//        userDataTake.foreach(r=> println(r._1 + "\t" + r._2 + "\t" + r._3))
 |  | 
 | 
											
												
													
														|  | -//        userData = sc.parallelize(userDataTake)
 |  | 
 | 
											
												
													
														|  | -//      }
 |  | 
 | 
											
												
													
														|  | -//      if (savePathUser.nonEmpty && savePathUser.startsWith("/dw/recommend/model/")) {
 |  | 
 | 
											
												
													
														|  | -//        var savePathPart = savePathUser + "/" + partition
 |  | 
 | 
											
												
													
														|  | -//        if (userSampleIDs.nonEmpty) {
 |  | 
 | 
											
												
													
														|  | -//          savePathPart = savePathPart + "_" + userSampleIDs
 |  | 
 | 
											
												
													
														|  | -//        }
 |  | 
 | 
											
												
													
														|  | -//        MyHdfsUtils.delete_hdfs_path(savePathPart)
 |  | 
 | 
											
												
													
														|  | -//        userData.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
 |  | 
 | 
											
												
													
														|  | -//      }
 |  | 
 | 
											
												
													
														|  | -//      println("user.action.count=" + userData.count())
 |  | 
 | 
											
												
													
														|  | -//    } else {
 |  | 
 | 
											
												
													
														|  | -//      println("不处理user")
 |  | 
 | 
											
												
													
														|  | -//    }
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//    if (ifDeleteRedisUser){
 |  | 
 | 
											
												
													
														|  | -//      println("user redis 删除")
 |  | 
 | 
											
												
													
														|  | -//      var savePathPart = savePathUser + "/" + partition
 |  | 
 | 
											
												
													
														|  | -//      if (userSampleIDs.nonEmpty) {
 |  | 
 | 
											
												
													
														|  | -//        savePathPart = savePathPart + "_" + userSampleIDs
 |  | 
 | 
											
												
													
														|  | -//      }
 |  | 
 | 
											
												
													
														|  | -//      println("读取数据路径:" + savePathPart)
 |  | 
 | 
											
												
													
														|  | -//      val userDataRead = sc.textFile(savePathPart)
 |  | 
 | 
											
												
													
														|  | -//      val userDataRead2 = userDataRead.filter(_.split("\t").length >= 2).map(r => {
 |  | 
 | 
											
												
													
														|  | -//        val rList = r.split("\t")
 |  | 
 | 
											
												
													
														|  | -//        (rList(0), rList(1))
 |  | 
 | 
											
												
													
														|  | -//      })
 |  | 
 | 
											
												
													
														|  | -//      println("预计删除数据量:" + userDataRead2.count())
 |  | 
 | 
											
												
													
														|  | -//      val userDataTakeRddRun = userDataRead2.mapPartitions(row => {
 |  | 
 | 
											
												
													
														|  | -//        val redisFormat = new util.HashMap[String, String]
 |  | 
 | 
											
												
													
														|  | -//        val redisTemplate = env.getRedisTemplate()
 |  | 
 | 
											
												
													
														|  | -//        var i = 1
 |  | 
 | 
											
												
													
														|  | -//        row.foreach {
 |  | 
 | 
											
												
													
														|  | -//          case (key, value) =>
 |  | 
 | 
											
												
													
														|  | -//            if (key.nonEmpty) {
 |  | 
 | 
											
												
													
														|  | -//              redisFormat.put(userRedisKeyPrefix + key, value)
 |  | 
 | 
											
												
													
														|  | -//            }
 |  | 
 | 
											
												
													
														|  | -//            if (i % 1000 == 0) {
 |  | 
 | 
											
												
													
														|  | -//              redisTemplate.delete(redisFormat.map(_._1))
 |  | 
 | 
											
												
													
														|  | -//              redisFormat.clear()
 |  | 
 | 
											
												
													
														|  | -//            }
 |  | 
 | 
											
												
													
														|  | -//            i = i + 1
 |  | 
 | 
											
												
													
														|  | -//        }
 |  | 
 | 
											
												
													
														|  | -//        redisTemplate.delete(redisFormat.map(_._1))
 |  | 
 | 
											
												
													
														|  | -//        redisFormat.clear()
 |  | 
 | 
											
												
													
														|  | -//        redisFormat.iterator
 |  | 
 | 
											
												
													
														|  | -//      })
 |  | 
 | 
											
												
													
														|  | -//      println("delete redis.count=" + userDataTakeRddRun.count())
 |  | 
 | 
											
												
													
														|  | -//    } else {
 |  | 
 | 
											
												
													
														|  | -//      println("不处理user的redis删除")
 |  | 
 | 
											
												
													
														|  | -//    }
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//    if (ifWriteRedisUser){
 |  | 
 | 
											
												
													
														|  | -//      println("user redis 写入")
 |  | 
 | 
											
												
													
														|  | -//      var savePathPart = savePathUser + "/" + partition
 |  | 
 | 
											
												
													
														|  | -//      if (userSampleIDs.nonEmpty) {
 |  | 
 | 
											
												
													
														|  | -//        savePathPart = savePathPart + "_" + userSampleIDs
 |  | 
 | 
											
												
													
														|  | -//      }
 |  | 
 | 
											
												
													
														|  | -//      val userDataRead = sc.textFile(savePathPart).filter(_.split("\t").length >= 2)
 |  | 
 | 
											
												
													
														|  | -//        .sample(false, sampleRate)
 |  | 
 | 
											
												
													
														|  | -//        .map(r => {
 |  | 
 | 
											
												
													
														|  | -//        val rList = r.split("\t")
 |  | 
 | 
											
												
													
														|  | -//        (rList(0), rList(1))
 |  | 
 | 
											
												
													
														|  | -//      })
 |  | 
 | 
											
												
													
														|  | -//      val userDataTakeRddRun = userDataRead.mapPartitions(row => {
 |  | 
 | 
											
												
													
														|  | -//        val redisFormat = new util.HashMap[String, String]
 |  | 
 | 
											
												
													
														|  | -//        val redisTemplate = env.getRedisTemplate()
 |  | 
 | 
											
												
													
														|  | -//        var i = 1
 |  | 
 | 
											
												
													
														|  | -//        row.foreach {
 |  | 
 | 
											
												
													
														|  | -//          case (key, value) =>
 |  | 
 | 
											
												
													
														|  | -//            if (key.nonEmpty) {
 |  | 
 | 
											
												
													
														|  | -//              redisFormat.put(userRedisKeyPrefix + key, value)
 |  | 
 | 
											
												
													
														|  | -//            }
 |  | 
 | 
											
												
													
														|  | -//            if (i % 1000 == 0) {
 |  | 
 | 
											
												
													
														|  | -//              redisTemplate.opsForValue.multiSet(redisFormat)
 |  | 
 | 
											
												
													
														|  | -//              redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
 |  | 
 | 
											
												
													
														|  | -//              redisFormat.clear()
 |  | 
 | 
											
												
													
														|  | -//            }
 |  | 
 | 
											
												
													
														|  | -//            i = i + 1
 |  | 
 | 
											
												
													
														|  | -//        }
 |  | 
 | 
											
												
													
														|  | -//        redisTemplate.opsForValue.multiSet(redisFormat)
 |  | 
 | 
											
												
													
														|  | -//        redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
 |  | 
 | 
											
												
													
														|  | -//        redisFormat.clear()
 |  | 
 | 
											
												
													
														|  | -//        redisFormat.iterator
 |  | 
 | 
											
												
													
														|  | -//      })
 |  | 
 | 
											
												
													
														|  | -//      println("put in redis.count=" + userDataTakeRddRun.count())
 |  | 
 | 
											
												
													
														|  | -//    } else {
 |  | 
 | 
											
												
													
														|  | -//      println("不处理user的redis写入")
 |  | 
 | 
											
												
													
														|  | -//    }
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//    // 4 video测特征处理
 |  | 
 | 
											
												
													
														|  | -//    if (ifVideo){
 |  | 
 | 
											
												
													
														|  | -//      println("video特征处理")
 |  | 
 | 
											
												
													
														|  | -//      val handleItemFunction: (Record, TableSchema) => Tuple3[String, String, Int] = handleItem(_, _, date)
 |  | 
 | 
											
												
													
														|  | -//      var itemData = odpsOps.readTable(project = project, table = tableItem, partition = partition, transfer = handleItemFunction, numPartition = tablePart)
 |  | 
 | 
											
												
													
														|  | -//      if (ifDebug) {
 |  | 
 | 
											
												
													
														|  | -//        println("video特征处理-debug开启-只保留5条数据-特征数量大于1")
 |  | 
 | 
											
												
													
														|  | -//        val itemDataTake = itemData.filter(_._3 > 1).take(5)
 |  | 
 | 
											
												
													
														|  | -//        itemDataTake.foreach(r => println(r._1 + "\t" + r._2 + "\t" + r._3))
 |  | 
 | 
											
												
													
														|  | -//        itemData = sc.parallelize(itemDataTake)
 |  | 
 | 
											
												
													
														|  | -//      }
 |  | 
 | 
											
												
													
														|  | -//      val itemDataTakeRddRun = itemData.mapPartitions(row => {
 |  | 
 | 
											
												
													
														|  | -//        val redisFormat = new util.HashMap[String, String]
 |  | 
 | 
											
												
													
														|  | -//        val redisTemplate = env.getRedisTemplate()
 |  | 
 | 
											
												
													
														|  | -//        row.foreach {
 |  | 
 | 
											
												
													
														|  | -//          case (key, value, _) =>
 |  | 
 | 
											
												
													
														|  | -//            if (key.nonEmpty && value != null && value.nonEmpty) {
 |  | 
 | 
											
												
													
														|  | -//              redisFormat.put(videoRedisKeyPrefix + key, value)
 |  | 
 | 
											
												
													
														|  | -//              if (ifWriteRedis) {
 |  | 
 | 
											
												
													
														|  | -//                redisTemplate.opsForValue.set(videoRedisKeyPrefix + key, value, 24 * expireDay, TimeUnit.HOURS)
 |  | 
 | 
											
												
													
														|  | -//              }
 |  | 
 | 
											
												
													
														|  | -//            }
 |  | 
 | 
											
												
													
														|  | -//        }
 |  | 
 | 
											
												
													
														|  | -////        if (ifWriteRedis){
 |  | 
 | 
											
												
													
														|  | -////          redisTemplate.opsForValue.multiSet(redisFormat)
 |  | 
 | 
											
												
													
														|  | -////          redisFormat.keySet.foreach(key => redisTemplate.expire(key, 24 * expireDay, TimeUnit.HOURS))
 |  | 
 | 
											
												
													
														|  | -////        }
 |  | 
 | 
											
												
													
														|  | -//        redisFormat.iterator
 |  | 
 | 
											
												
													
														|  | -//      })
 |  | 
 | 
											
												
													
														|  | -//      if (savePathVideo.nonEmpty && savePathVideo.startsWith("/dw/recommend/model/")){
 |  | 
 | 
											
												
													
														|  | -//        val savePathPart = savePathVideo + "/" + partition
 |  | 
 | 
											
												
													
														|  | -//        MyHdfsUtils.delete_hdfs_path(savePathPart)
 |  | 
 | 
											
												
													
														|  | -//        itemDataTakeRddRun.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
 |  | 
 | 
											
												
													
														|  | -//      }
 |  | 
 | 
											
												
													
														|  | -//      println("item.action.count=" + itemDataTakeRddRun.count())
 |  | 
 | 
											
												
													
														|  | -//    }else{
 |  | 
 | 
											
												
													
														|  | -//      println("不处理video")
 |  | 
 | 
											
												
													
														|  | -//    }
 |  | 
 | 
											
												
													
														|  | -//  }
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//  def handleUser(record: Record, schema: TableSchema): Tuple3[String, String, Int] = {
 |  | 
 | 
											
												
													
														|  | -//    val userKey = "mids"
 |  | 
 | 
											
												
													
														|  | -//    val mid = record.getString(userKey)
 |  | 
 | 
											
												
													
														|  | -//    val reqContext: RequestContextOffline = new RequestContextOffline()
 |  | 
 | 
											
												
													
														|  | -//    reqContext.putUserFeature(record)
 |  | 
 | 
											
												
													
														|  | -//    // reqContext.featureMap.put("mid", mid)
 |  | 
 | 
											
												
													
														|  | -//    val gson = (new GsonBuilder).serializeSpecialFloatingPointValues.create
 |  | 
 | 
											
												
													
														|  | -//    val value = gson.toJson(reqContext.featureMap)
 |  | 
 | 
											
												
													
														|  | -//    (mid, value, reqContext.featureMap.size())
 |  | 
 | 
											
												
													
														|  | -//  }
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//  def handleItem(record: Record, schema: TableSchema, date:String): Tuple3[String, String, Int] = {
 |  | 
 | 
											
												
													
														|  | -//    val videoKey = "videoid"
 |  | 
 | 
											
												
													
														|  | -//    val videoid = record.getBigint(videoKey).toString
 |  | 
 | 
											
												
													
														|  | -//    val reqContext: RequestContextOffline = new RequestContextOffline()
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//    //---------todo 有特征不在表里 临时修复---------
 |  | 
 | 
											
												
													
														|  | -////    val i_title_len =  if (record.getString("title") != null) record.getString("title").length.toString else ""
 |  | 
 | 
											
												
													
														|  | -////    val i_days_since_upload = if (record.getDatetime("gmt_create") != null){
 |  | 
 | 
											
												
													
														|  | -////      val format = new SimpleDateFormat("yyyyMMdd")
 |  | 
 | 
											
												
													
														|  | -////      val dateOld = format.format(record.getDatetime("gmt_create"))
 |  | 
 | 
											
												
													
														|  | -////      val dayDiff = MyDateUtils.calculateDateDifference(dateOld, date)
 |  | 
 | 
											
												
													
														|  | -////      dayDiff.toString
 |  | 
 | 
											
												
													
														|  | -////    }else{
 |  | 
 | 
											
												
													
														|  | -////      ""
 |  | 
 | 
											
												
													
														|  | -////    }
 |  | 
 | 
											
												
													
														|  | -////    if (i_title_len.nonEmpty){
 |  | 
 | 
											
												
													
														|  | -////      val d = reqContext.bucketRatioFeature(i_title_len.toDouble)
 |  | 
 | 
											
												
													
														|  | -////      reqContext.featureMap.put("i_title_len", d.toString)
 |  | 
 | 
											
												
													
														|  | -////    }
 |  | 
 | 
											
												
													
														|  | -////    if (i_days_since_upload.nonEmpty) {
 |  | 
 | 
											
												
													
														|  | -////      val d = reqContext.bucketRatioFeature(i_days_since_upload.toDouble)
 |  | 
 | 
											
												
													
														|  | -////      reqContext.featureMap.put("i_days_since_upload", d.toString)
 |  | 
 | 
											
												
													
														|  | -////    }
 |  | 
 | 
											
												
													
														|  | -//    //------修复完成---------
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//    reqContext.putItemFeature(record)
 |  | 
 | 
											
												
													
														|  | -//     reqContext.featureMap.put("videoid", videoid)
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//    val gson = (new GsonBuilder).serializeSpecialFloatingPointValues.create
 |  | 
 | 
											
												
													
														|  | -//    val value = gson.toJson(reqContext.featureMap)
 |  | 
 | 
											
												
													
														|  | -//    (videoid, value, reqContext.featureMap.size())
 |  | 
 | 
											
												
													
														|  | -//  }
 |  | 
 | 
											
												
													
														|  | -//
 |  | 
 | 
											
												
													
														|  | -//}
 |  | 
 | 
											
												
													
														|  | 
 |  | +package com.aliyun.odps.spark.examples.makedata
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +import com.aliyun.odps.TableSchema
 | 
											
												
													
														|  | 
 |  | +import com.aliyun.odps.data.Record
 | 
											
												
													
														|  | 
 |  | +import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils, env}
 | 
											
												
													
														|  | 
 |  | +import com.google.gson.GsonBuilder
 | 
											
												
													
														|  | 
 |  | +import examples.dataloader.RequestContextOffline
 | 
											
												
													
														|  | 
 |  | +import org.apache.hadoop.io.compress.GzipCodec
 | 
											
												
													
														|  | 
 |  | +import org.apache.spark.sql.SparkSession
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +import java.util
 | 
											
												
													
														|  | 
 |  | +import java.util.concurrent.TimeUnit
 | 
											
												
													
														|  | 
 |  | +import scala.collection.JavaConversions._
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +object makedata_09_user2redis {
 | 
											
												
													
														|  | 
 |  | +  def main(args: Array[String]) {
 | 
											
												
													
														|  | 
 |  | +    val spark = SparkSession
 | 
											
												
													
														|  | 
 |  | +      .builder()
 | 
											
												
													
														|  | 
 |  | +      .appName(this.getClass.getName)
 | 
											
												
													
														|  | 
 |  | +      .getOrCreate()
 | 
											
												
													
														|  | 
 |  | +    val sc = spark.sparkContext
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    // 1 读取参数
 | 
											
												
													
														|  | 
 |  | +    val param = ParamUtils.parseArgs(args)
 | 
											
												
													
														|  | 
 |  | +    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
 | 
											
												
													
														|  | 
 |  | +    val tablePart = param.getOrElse("tablePart", "64").toInt
 | 
											
												
													
														|  | 
 |  | +    val ifUser = param.getOrDefault("ifUser", "False").toBoolean
 | 
											
												
													
														|  | 
 |  | +    val ifVideo = param.getOrDefault("ifVideo", "False").toBoolean
 | 
											
												
													
														|  | 
 |  | +    val date = param.getOrDefault("date", "20231220")
 | 
											
												
													
														|  | 
 |  | +    val expireDay = param.getOrDefault("expireDay", "2").toInt
 | 
											
												
													
														|  | 
 |  | +    val ifDebug = param.getOrDefault("ifDebug", "False").toBoolean
 | 
											
												
													
														|  | 
 |  | +    val ifDeleteRedisUser = param.getOrDefault("ifDeleteRedisUser", "False").toBoolean
 | 
											
												
													
														|  | 
 |  | +    val ifWriteRedisUser = param.getOrDefault("ifWriteRedisUser", "False").toBoolean
 | 
											
												
													
														|  | 
 |  | +    val ifWriteRedis = param.getOrDefault("ifWriteRedis", "True").toBoolean
 | 
											
												
													
														|  | 
 |  | +    val partition = partitionPrefix + date
 | 
											
												
													
														|  | 
 |  | +    val savePathUser = param.getOrDefault("savePathUser", "")
 | 
											
												
													
														|  | 
 |  | +    val savePathVideo = param.getOrDefault("savePathVideo", "")
 | 
											
												
													
														|  | 
 |  | +    val userSampleIDs = param.getOrDefault("userSampleIDs", "")
 | 
											
												
													
														|  | 
 |  | +    val sampleRate = param.getOrDefault("sampleRate", "1.0").toDouble
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    // 2 读取数据库odps
 | 
											
												
													
														|  | 
 |  | +    val odpsOps = env.getODPS(sc)
 | 
											
												
													
														|  | 
 |  | +    val project = "loghubods"
 | 
											
												
													
														|  | 
 |  | +    val tableUser = "alg_recsys_user_info"
 | 
											
												
													
														|  | 
 |  | +    val userRedisKeyPrefix = "user_info_4video_"
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    // 3 用户测特征处理
 | 
											
												
													
														|  | 
 |  | +    if (ifUser){
 | 
											
												
													
														|  | 
 |  | +      println("user特征处理")
 | 
											
												
													
														|  | 
 |  | +      var userData = odpsOps.readTable(project = project, table = tableUser, partition = partition, transfer = handleUser, numPartition = tablePart)
 | 
											
												
													
														|  | 
 |  | +        .filter {
 | 
											
												
													
														|  | 
 |  | +          case (mid, fea, feaSize) =>
 | 
											
												
													
														|  | 
 |  | +            mid.nonEmpty && fea.nonEmpty && feaSize > 0
 | 
											
												
													
														|  | 
 |  | +        }
 | 
											
												
													
														|  | 
 |  | +      if (userSampleIDs.nonEmpty){
 | 
											
												
													
														|  | 
 |  | +        val IDs = userSampleIDs.split(",").filter(_.nonEmpty).map(_.toInt).toList
 | 
											
												
													
														|  | 
 |  | +        userData = userData.filter(r => IDs.contains(r._1.hashCode % 10))
 | 
											
												
													
														|  | 
 |  | +      }
 | 
											
												
													
														|  | 
 |  | +      if (ifDebug){
 | 
											
												
													
														|  | 
 |  | +        println("user特征处理-debug开启-只保留5条数据-特征数量大于1")
 | 
											
												
													
														|  | 
 |  | +        val userDataTake = userData.take(5)
 | 
											
												
													
														|  | 
 |  | +        userDataTake.foreach(r=> println(r._1 + "\t" + r._2 + "\t" + r._3))
 | 
											
												
													
														|  | 
 |  | +        userData = sc.parallelize(userDataTake)
 | 
											
												
													
														|  | 
 |  | +      }
 | 
											
												
													
														|  | 
 |  | +      if (savePathUser.nonEmpty && savePathUser.startsWith("/dw/recommend/model/")) {
 | 
											
												
													
														|  | 
 |  | +        var savePathPart = savePathUser + "/" + partition
 | 
											
												
													
														|  | 
 |  | +        if (userSampleIDs.nonEmpty) {
 | 
											
												
													
														|  | 
 |  | +          savePathPart = savePathPart + "_" + userSampleIDs
 | 
											
												
													
														|  | 
 |  | +        }
 | 
											
												
													
														|  | 
 |  | +        MyHdfsUtils.delete_hdfs_path(savePathPart)
 | 
											
												
													
														|  | 
 |  | +        userData.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
 | 
											
												
													
														|  | 
 |  | +      }
 | 
											
												
													
														|  | 
 |  | +      println("user.action.count=" + userData.count())
 | 
											
												
													
														|  | 
 |  | +    } else {
 | 
											
												
													
														|  | 
 |  | +      println("不处理user")
 | 
											
												
													
														|  | 
 |  | +    }
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    if (ifDeleteRedisUser){
 | 
											
												
													
														|  | 
 |  | +      println("user redis 删除")
 | 
											
												
													
														|  | 
 |  | +      var savePathPart = savePathUser + "/" + partition
 | 
											
												
													
														|  | 
 |  | +      if (userSampleIDs.nonEmpty) {
 | 
											
												
													
														|  | 
 |  | +        savePathPart = savePathPart + "_" + userSampleIDs
 | 
											
												
													
														|  | 
 |  | +      }
 | 
											
												
													
														|  | 
 |  | +      println("读取数据路径:" + savePathPart)
 | 
											
												
													
														|  | 
 |  | +      val userDataRead = sc.textFile(savePathPart)
 | 
											
												
													
														|  | 
 |  | +      val userDataRead2 = userDataRead.filter(_.split("\t").length >= 2).map(r => {
 | 
											
												
													
														|  | 
 |  | +        val rList = r.split("\t")
 | 
											
												
													
														|  | 
 |  | +        (rList(0), rList(1))
 | 
											
												
													
														|  | 
 |  | +      })
 | 
											
												
													
														|  | 
 |  | +      println("预计删除数据量:" + userDataRead2.count())
 | 
											
												
													
														|  | 
 |  | +      val userDataTakeRddRun = userDataRead2.mapPartitions(row => {
 | 
											
												
													
														|  | 
 |  | +        val redisFormat = new util.HashMap[String, String]
 | 
											
												
													
														|  | 
 |  | +        val redisTemplate = env.getRedisTemplate()
 | 
											
												
													
														|  | 
 |  | +        var i = 1
 | 
											
												
													
														|  | 
 |  | +        row.foreach {
 | 
											
												
													
														|  | 
 |  | +          case (key, value) =>
 | 
											
												
													
														|  | 
 |  | +            if (key.nonEmpty) {
 | 
											
												
													
														|  | 
 |  | +              redisFormat.put(userRedisKeyPrefix + key, value)
 | 
											
												
													
														|  | 
 |  | +            }
 | 
											
												
													
														|  | 
 |  | +            if (i % 1000 == 0) {
 | 
											
												
													
														|  | 
 |  | +              redisTemplate.delete(redisFormat.map(_._1))
 | 
											
												
													
														|  | 
 |  | +              redisFormat.clear()
 | 
											
												
													
														|  | 
 |  | +            }
 | 
											
												
													
														|  | 
 |  | +            i = i + 1
 | 
											
												
													
														|  | 
 |  | +        }
 | 
											
												
													
														|  | 
 |  | +        redisTemplate.delete(redisFormat.map(_._1))
 | 
											
												
													
														|  | 
 |  | +        redisFormat.clear()
 | 
											
												
													
														|  | 
 |  | +        redisFormat.iterator
 | 
											
												
													
														|  | 
 |  | +      })
 | 
											
												
													
														|  | 
 |  | +      println("delete redis.count=" + userDataTakeRddRun.count())
 | 
											
												
													
														|  | 
 |  | +    } else {
 | 
											
												
													
														|  | 
 |  | +      println("不处理user的redis删除")
 | 
											
												
													
														|  | 
 |  | +    }
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    if (ifWriteRedisUser){
 | 
											
												
													
														|  | 
 |  | +      println("user redis 写入")
 | 
											
												
													
														|  | 
 |  | +      var savePathPart = savePathUser + "/" + partition
 | 
											
												
													
														|  | 
 |  | +      if (userSampleIDs.nonEmpty) {
 | 
											
												
													
														|  | 
 |  | +        savePathPart = savePathPart + "_" + userSampleIDs
 | 
											
												
													
														|  | 
 |  | +      }
 | 
											
												
													
														|  | 
 |  | +      val userDataRead = sc.textFile(savePathPart).filter(_.split("\t").length >= 2)
 | 
											
												
													
														|  | 
 |  | +        .sample(false, sampleRate)
 | 
											
												
													
														|  | 
 |  | +        .map(r => {
 | 
											
												
													
														|  | 
 |  | +        val rList = r.split("\t")
 | 
											
												
													
														|  | 
 |  | +        (rList(0), rList(1))
 | 
											
												
													
														|  | 
 |  | +      })
 | 
											
												
													
														|  | 
 |  | +      val userDataTakeRddRun = userDataRead.mapPartitions(row => {
 | 
											
												
													
														|  | 
 |  | +        val redisFormat = new util.HashMap[String, String]
 | 
											
												
													
														|  | 
 |  | +        val redisTemplate = env.getRedisTemplate()
 | 
											
												
													
														|  | 
 |  | +        var i = 1
 | 
											
												
													
														|  | 
 |  | +        row.foreach {
 | 
											
												
													
														|  | 
 |  | +          case (key, value) =>
 | 
											
												
													
														|  | 
 |  | +            if (key.nonEmpty) {
 | 
											
												
													
														|  | 
 |  | +              redisFormat.put(userRedisKeyPrefix + key, value)
 | 
											
												
													
														|  | 
 |  | +            }
 | 
											
												
													
														|  | 
 |  | +            if (i % 1000 == 0) {
 | 
											
												
													
														|  | 
 |  | +              redisTemplate.opsForValue.multiSet(redisFormat)
 | 
											
												
													
														|  | 
 |  | +              redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
 | 
											
												
													
														|  | 
 |  | +              redisFormat.clear()
 | 
											
												
													
														|  | 
 |  | +            }
 | 
											
												
													
														|  | 
 |  | +            i = i + 1
 | 
											
												
													
														|  | 
 |  | +        }
 | 
											
												
													
														|  | 
 |  | +        redisTemplate.opsForValue.multiSet(redisFormat)
 | 
											
												
													
														|  | 
 |  | +        redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
 | 
											
												
													
														|  | 
 |  | +        redisFormat.clear()
 | 
											
												
													
														|  | 
 |  | +        redisFormat.iterator
 | 
											
												
													
														|  | 
 |  | +      })
 | 
											
												
													
														|  | 
 |  | +      println("put in redis.count=" + userDataTakeRddRun.count())
 | 
											
												
													
														|  | 
 |  | +    } else {
 | 
											
												
													
														|  | 
 |  | +      println("不处理user的redis写入")
 | 
											
												
													
														|  | 
 |  | +    }
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +  }
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +  def handleUser(record: Record, schema: TableSchema): Tuple3[String, String, Int] = {
 | 
											
												
													
														|  | 
 |  | +    val userKey = "mids"
 | 
											
												
													
														|  | 
 |  | +    val mid = record.getString(userKey)
 | 
											
												
													
														|  | 
 |  | +    val reqContext: RequestContextOffline = new RequestContextOffline()
 | 
											
												
													
														|  | 
 |  | +    reqContext.putUserFeature(record)
 | 
											
												
													
														|  | 
 |  | +    // reqContext.featureMap.put("mid", mid)
 | 
											
												
													
														|  | 
 |  | +    val gson = (new GsonBuilder).serializeSpecialFloatingPointValues.create
 | 
											
												
													
														|  | 
 |  | +    val value = gson.toJson(reqContext.featureMap)
 | 
											
												
													
														|  | 
 |  | +    (mid, value, reqContext.featureMap.size())
 | 
											
												
													
														|  | 
 |  | +  }
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +  def func(record: Record, schema: TableSchema): Record = {
 | 
											
												
													
														|  | 
 |  | +    record
 | 
											
												
													
														|  | 
 |  | +  }
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +}
 |