@@ -234,7 +234,7 @@ object makedata_06_originData {
try{
result.put(r, record.getString(r))
}catch {
- case Exception => result.put(r, String.valueOf(record.getBigint(r)))
+          case _: Exception => result.put(r, String.valueOf(record.getBigint(r)))
}
})
@@ -0,0 +1,248 @@
+//package com.aliyun.odps.spark.examples.makedata
+//
+//import com.aliyun.odps.TableSchema
+//import com.aliyun.odps.data.Record
+//import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils, env}
+//import com.google.gson.GsonBuilder
+//import examples.dataloader.RequestContextOffline
+//import org.apache.hadoop.io.compress.GzipCodec
+//import org.apache.spark.sql.SparkSession
+//import java.util
+//import java.util.concurrent.TimeUnit
+//import scala.collection.JavaConversions._
+//object makedata_09_user2redis {
+// def main(args: Array[String]) {
+// val spark = SparkSession
+// .builder()
+// .appName(this.getClass.getName)
+// .getOrCreate()
+// val sc = spark.sparkContext
+// // 1 读取参数
+// val param = ParamUtils.parseArgs(args)
+// val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+// val tablePart = param.getOrElse("tablePart", "64").toInt
+// val ifUser = param.getOrDefault("ifUser", "False").toBoolean
+// val ifVideo = param.getOrDefault("ifVideo", "False").toBoolean
+// val date = param.getOrDefault("date", "20231220")
+// val expireDay = param.getOrDefault("expireDay", "2").toInt
+// val ifDebug = param.getOrDefault("ifDebug", "False").toBoolean
+// val ifDeleteRedisUser = param.getOrDefault("ifDeleteRedisUser", "False").toBoolean
+// val ifWriteRedisUser = param.getOrDefault("ifWriteRedisUser", "False").toBoolean
+// val ifWriteRedis = param.getOrDefault("ifWriteRedis", "True").toBoolean
+// val partition = partitionPrefix + date
+// val savePathUser = param.getOrDefault("savePathUser", "")
+// val savePathVideo = param.getOrDefault("savePathVideo", "")
+// val userSampleIDs = param.getOrDefault("userSampleIDs", "")
+// val sampleRate = param.getOrDefault("sampleRate", "1.0").toDouble
+//// val userSampleIDsPathFix = param.getOrDefault("userSampleIDsPathFix", "")
+// // /dw/recommend/model/feature/
+// // 2 读取数据库odps
+// val odpsOps = env.getODPS(sc)
+// val project = "loghubods"
+// val tableUser = "alg_recsys_user_info"
+// val tableItem = "alg_recsys_video_info"
+// val userRedisKeyPrefix = "user_info_4video_"
+// val videoRedisKeyPrefix = "video_info_"
+// // 3 用户测特征处理
+// if (ifUser){
+// println("user特征处理")
+// var userData = odpsOps.readTable(project = project, table = tableUser, partition = partition, transfer = handleUser, numPartition = tablePart)
+// .filter {
+// case (mid, fea, feaSize) =>
+// mid.nonEmpty && fea.nonEmpty && feaSize > 0
+// }
+// if (userSampleIDs.nonEmpty){
+// val IDs = userSampleIDs.split(",").filter(_.nonEmpty).map(_.toInt).toList
+// userData = userData.filter(r => IDs.contains(r._1.hashCode % 10))
+// if (ifDebug){
+// println("user特征处理-debug开启-只保留5条数据-特征数量大于1")
+// val userDataTake = userData.take(5)
+// userDataTake.foreach(r=> println(r._1 + "\t" + r._2 + "\t" + r._3))
+// userData = sc.parallelize(userDataTake)
+// if (savePathUser.nonEmpty && savePathUser.startsWith("/dw/recommend/model/")) {
+// var savePathPart = savePathUser + "/" + partition
+// if (userSampleIDs.nonEmpty) {
+// savePathPart = savePathPart + "_" + userSampleIDs
+// MyHdfsUtils.delete_hdfs_path(savePathPart)
+// userData.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
+// println("user.action.count=" + userData.count())
+// } else {
+// println("不处理user")
+// if (ifDeleteRedisUser){
+// println("user redis 删除")
+// println("读取数据路径:" + savePathPart)
+// val userDataRead = sc.textFile(savePathPart)
+// val userDataRead2 = userDataRead.filter(_.split("\t").length >= 2).map(r => {
+// val rList = r.split("\t")
+// (rList(0), rList(1))
+// })
+// println("预计删除数据量:" + userDataRead2.count())
+// val userDataTakeRddRun = userDataRead2.mapPartitions(row => {
+// val redisFormat = new util.HashMap[String, String]
+// val redisTemplate = env.getRedisTemplate()
+// var i = 1
+// row.foreach {
+// case (key, value) =>
+// if (key.nonEmpty) {
+// redisFormat.put(userRedisKeyPrefix + key, value)
+// if (i % 1000 == 0) {
+// redisTemplate.delete(redisFormat.map(_._1))
+// redisFormat.clear()
+// i = i + 1
+// redisFormat.iterator
+// println("delete redis.count=" + userDataTakeRddRun.count())
+// println("不处理user的redis删除")
+// if (ifWriteRedisUser){
+// println("user redis 写入")
+// val userDataRead = sc.textFile(savePathPart).filter(_.split("\t").length >= 2)
+// .sample(false, sampleRate)
+// .map(r => {
+// val userDataTakeRddRun = userDataRead.mapPartitions(row => {
+// redisTemplate.opsForValue.multiSet(redisFormat)
+// redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+// println("put in redis.count=" + userDataTakeRddRun.count())
+// println("不处理user的redis写入")
+// // 4 video测特征处理
+// if (ifVideo){
+// println("video特征处理")
+// val handleItemFunction: (Record, TableSchema) => Tuple3[String, String, Int] = handleItem(_, _, date)
+// var itemData = odpsOps.readTable(project = project, table = tableItem, partition = partition, transfer = handleItemFunction, numPartition = tablePart)
+// if (ifDebug) {
+// println("video特征处理-debug开启-只保留5条数据-特征数量大于1")
+// val itemDataTake = itemData.filter(_._3 > 1).take(5)
+// itemDataTake.foreach(r => println(r._1 + "\t" + r._2 + "\t" + r._3))
+// itemData = sc.parallelize(itemDataTake)
+// val itemDataTakeRddRun = itemData.mapPartitions(row => {
+// case (key, value, _) =>
+// if (key.nonEmpty && value != null && value.nonEmpty) {
+// redisFormat.put(videoRedisKeyPrefix + key, value)
+// if (ifWriteRedis) {
+// redisTemplate.opsForValue.set(videoRedisKeyPrefix + key, value, 24 * expireDay, TimeUnit.HOURS)
+//// if (ifWriteRedis){
+//// redisTemplate.opsForValue.multiSet(redisFormat)
+//// redisFormat.keySet.foreach(key => redisTemplate.expire(key, 24 * expireDay, TimeUnit.HOURS))
+//// }
+// if (savePathVideo.nonEmpty && savePathVideo.startsWith("/dw/recommend/model/")){
+// val savePathPart = savePathVideo + "/" + partition
+// itemDataTakeRddRun.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
+// println("item.action.count=" + itemDataTakeRddRun.count())
+// }else{
+// println("不处理video")
+// def handleUser(record: Record, schema: TableSchema): Tuple3[String, String, Int] = {
+// val userKey = "mids"
+// val mid = record.getString(userKey)
+// val reqContext: RequestContextOffline = new RequestContextOffline()
+// reqContext.putUserFeature(record)
+// // reqContext.featureMap.put("mid", mid)
+// val gson = (new GsonBuilder).serializeSpecialFloatingPointValues.create
+// val value = gson.toJson(reqContext.featureMap)
+// (mid, value, reqContext.featureMap.size())
+// def handleItem(record: Record, schema: TableSchema, date:String): Tuple3[String, String, Int] = {
+// val videoKey = "videoid"
+// val videoid = record.getBigint(videoKey).toString
+// //---------todo 有特征不在表里 临时修复---------
+//// val i_title_len = if (record.getString("title") != null) record.getString("title").length.toString else ""
+//// val i_days_since_upload = if (record.getDatetime("gmt_create") != null){
+//// val format = new SimpleDateFormat("yyyyMMdd")
+//// val dateOld = format.format(record.getDatetime("gmt_create"))
+//// val dayDiff = MyDateUtils.calculateDateDifference(dateOld, date)
+//// dayDiff.toString
+//// }else{
+//// ""
+//// if (i_title_len.nonEmpty){
+//// val d = reqContext.bucketRatioFeature(i_title_len.toDouble)
+//// reqContext.featureMap.put("i_title_len", d.toString)
+//// if (i_days_since_upload.nonEmpty) {
+//// val d = reqContext.bucketRatioFeature(i_days_since_upload.toDouble)
+//// reqContext.featureMap.put("i_days_since_upload", d.toString)
+// //------修复完成---------
+// reqContext.putItemFeature(record)
+// reqContext.featureMap.put("videoid", videoid)
+// (videoid, value, reqContext.featureMap.size())
+//}