|
@@ -48,9 +48,9 @@ object makedata_12_rosData_v3 {
|
|
|
val feaStr = rList(2)
|
|
|
val labelJson = JSON.parseObject(labelStr)
|
|
|
val is_share = labelJson.getString("is_share")
|
|
|
- (logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp.toLong)
|
|
|
+ (mid, logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp.toLong)
|
|
|
}).filter({
|
|
|
- case (logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
|
|
|
+ case (mid, logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
|
|
|
val pages = Set("2")
|
|
|
val video_status = Set("-6")
|
|
|
val apps = Set("0", "4", "5", "21", "3", "6")
|
|
@@ -59,7 +59,7 @@ object makedata_12_rosData_v3 {
|
|
|
|
|
|
//2 样本采样(多个回流的样本复制,等价回流量的加权)
|
|
|
val data2 = data1.flatMap({
|
|
|
- case (logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
|
|
|
+ case (mid, logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
|
|
|
val res = ArrayBuffer[(String, JSONObject)]()
|
|
|
val feaJson = JSON.parseObject(feaStr)
|
|
|
val is_return = labelJson.getString("is_return")
|
|
@@ -70,15 +70,28 @@ object makedata_12_rosData_v3 {
|
|
|
val midReturn = r.split(":")(0)
|
|
|
val ts = r.split(":")(1).toLong
|
|
|
(midReturn, ts)
|
|
|
- }).sortBy(_._2)
|
|
|
- var midSet = scala.collection.mutable.HashSet[String]()
|
|
|
- for ((midReturn, tsReturn) <- return_mid_ts_list){
|
|
|
- if (!midSet.contains(midReturn)){
|
|
|
- midSet.add(midReturn)
|
|
|
- if ((tsReturn / 1000 - logtimestamp / 1000) <= 3600 && tsReturn - logtimestamp > 0){
|
|
|
- res.add(("1", feaJson))
|
|
|
+ }).filter(!_._1.equals(mid)).sortBy(_._2)
|
|
|
+ // 样本中做了一个必要的过滤,如果是自己的回流,过滤掉。
|
|
|
+
|
|
|
+ if (return_mid_ts_list.nonEmpty){
|
|
|
+ var flag = true
|
|
|
+ val midSet = scala.collection.mutable.HashSet[String]()
|
|
|
+ for ((midReturn, tsReturn) <- return_mid_ts_list) {
|
|
|
+ if (!midSet.contains(midReturn)) {
|
|
|
+ midSet.add(midReturn)
|
|
|
+ if ((tsReturn / 1000 - logtimestamp / 1000) <= 3600 && tsReturn - logtimestamp > 0) {
|
|
|
+ res.add(("1", feaJson))
|
|
|
+ flag = false
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
+ if (flag) {
|
|
|
+ // 如果上面一个正样本都没添加,那么添加一个负样本。代表近一个小时内没有回流。
|
|
|
+ res.add(("0", feaJson))
|
|
|
+ }
|
|
|
+ }else {
|
|
|
+ // 如果把自己的回流过滤掉了之后,没有其他回流,那么是负样本。
|
|
|
+ res.add(("0", feaJson))
|
|
|
}
|
|
|
}
|
|
|
res.iterator
|