Explorar o código

i2i样本制作,第5步。 训练数据

zhangbo hai 4 meses
pai
achega
61a3269603

+ 47 - 0
src/main/resources/20241128_recsys_i2i_bucket_47.txt

@@ -0,0 +1,47 @@
+action:rov_day336	100	0.019854,0.024964,0.025876,0.027656,0.03079,0.033211,0.034639,0.036093,0.036864,0.037524,0.037918,0.038907,0.039264,0.039654,0.040559,0.041864,0.042842,0.044022,0.04473,0.045306,0.04625,0.046736,0.047414,0.047688,0.047849,0.048131,0.048751,0.0488,0.049114,0.049622,0.050601,0.050742,0.050954,0.051239,0.052302,0.052783,0.052804,0.053545,0.054264,0.054668,0.055453,0.056391,0.057024,0.057573,0.058317,0.058989,0.059534,0.061089,0.06275,0.063433,0.064061,0.065245,0.065346,0.067091,0.067592,0.068102,0.068852,0.069307,0.070297,0.071266,0.072067,0.072495,0.073921,0.07496,0.075024,0.078836,0.079371,0.080406,0.081501,0.090802,0.103937,0.206944,0.23445
+cate1:ros_day30	100	0.568913,0.775484,0.780939,0.802548,0.869337,0.893388,0.928003,0.928705,0.935544,0.986266,0.994482,1.022163,1.054331,1.085911,1.176906,1.193622,1.227002,1.294849,1.402791
+action:ros_day21	100	0.25,0.328602,0.365854,0.387535,0.404136,0.452126,0.465116,0.479065,0.496933,0.516129,0.52012,0.53876,0.543434,0.556543,0.589769,0.6,0.604136,0.609922,0.617376,0.631866,0.651121,0.673048,0.688716,0.707252,0.720015,0.728878,0.746781,0.757868,0.767714,0.778018,0.793709,0.814338,0.81546,0.820637,0.835823,0.848272,0.860068,0.875434,0.893281,0.899405,0.908121,0.912633,0.916364,0.922581,0.92456,0.937168,0.956125,0.965547,0.966331,0.994555,1.004505,1.010526,1.017194,1.018069,1.023735,1.0359,1.054629,1.076291,1.096155,1.114377,1.120894,1.131766,1.169232,1.184152,1.187872,1.20162,1.209869,1.210319,1.223057,1.278414,1.302882,1.31542,1.359835,1.383904,1.418945,1.451771,1.493695,1.542568,1.622861,1.66065,1.793735,22.0
+action:str_day336	100	0.027696,0.030167,0.031642,0.033286,0.0335,0.03405,0.034097,0.034306,0.034368,0.034781,0.03511,0.03518,0.035215,0.035287,0.035835,0.036406,0.037134,0.037399,0.03829,0.038314,0.038991,0.039151,0.039866,0.040019,0.040565,0.041311,0.041606,0.041796,0.042902,0.042939,0.043499,0.043906,0.044136,0.044519,0.045218,0.045581,0.04618,0.046208,0.046557,0.047247,0.047858,0.048742,0.04893,0.049489,0.049953,0.050708,0.051685,0.052182,0.052284,0.05309,0.053489,0.054153,0.055099,0.056472,0.056837,0.056984,0.058334,0.059659,0.060481,0.061499,0.06314,0.064178,0.066172,0.0669,0.069459,0.070702,0.072543,0.076603,0.08157,0.084894,0.088682,0.093633,0.102991,1.278912
+action:rov_day7	100	0.011834,0.01391,0.015558,0.017769,0.019506,0.020764,0.020934,0.022329,0.024305,0.025658,0.026879,0.027964,0.02951,0.030255,0.030529,0.031355,0.03175,0.032465,0.033259,0.033396,0.033947,0.034491,0.034926,0.035653,0.036802,0.037338,0.037833,0.039025,0.039665,0.040042,0.04057,0.04097,0.041458,0.041772,0.042002,0.04288,0.04373,0.044321,0.044634,0.045391,0.046466,0.046521,0.046577,0.047264,0.047882,0.048202,0.048778,0.049052,0.049129,0.049817,0.049925,0.050304,0.050754,0.051228,0.051571,0.051769,0.051897,0.052922,0.053455,0.054602,0.055906,0.056395,0.056713,0.056761,0.057419,0.057487,0.058291,0.058677,0.058815,0.059958,0.062278,0.062582,0.063155,0.064239,0.066725,0.067458,0.06892,0.073396,0.074284,0.080075,0.082772,0.083973,9.666667
+cate2:rov_day1	100	0.026118,0.02717,0.0278,0.032997,0.033808,0.033995,0.037557,0.037572,0.040033,0.041213,0.04411,0.045117,0.047059,0.048886,0.049236,0.052352,0.053487,0.056874,0.059551,0.060377,0.063341,0.095622
+cate2:rov_day30	100	0.033632,0.038707,0.041573,0.044125,0.045851,0.046259,0.046815,0.047222,0.047335,0.047616,0.048904,0.04908,0.050429,0.051145,0.052511,0.054189,0.055182,0.057711,0.063536
+cate1:vovd1_day30	100	0.419087,0.421103,0.457025,0.480801,0.489344,0.498186,0.512282,0.514182,0.51446,0.517509,0.518859,0.525887,0.542596,0.571369,0.571914,0.58392,0.618863,0.662473,0.67297
+cate2:str_day1	100	0.035124,0.038644,0.039721,0.040222,0.040396,0.040966,0.041068,0.043825,0.045943,0.046239,0.046699,0.050678,0.052013,0.052262,0.059016,0.068581,0.071479,0.071686,0.080863,0.08236
+cate2:rov_day3	100	0.026846,0.027896,0.029803,0.032872,0.033258,0.034565,0.03626,0.03945,0.041853,0.044162,0.045321,0.046555,0.0479,0.050589,0.052187,0.053566,0.057363,0.059005,0.078679
+cate1:rov_day1	100	0.032978,0.041151,0.041269,0.04214,0.04282,0.043561,0.04411,0.046826,0.048145,0.048886,0.050366,0.051036,0.052371,0.054471,0.057223,0.0585,0.063412,0.068202,0.069998
+cate2:ros_day3	100	0.314491,0.376721,0.521202,0.62513,0.715105,0.723817,0.74503,0.779485,0.791123,0.821386,0.823861,0.832823,0.905328,0.9225,1.043391,1.107022,1.151774,1.159685,1.180504,1.236465,1.275264,1.469523
+cate2:str_day7	100	0.038319,0.039771,0.042056,0.042651,0.043288,0.043932,0.04456,0.044671,0.044821,0.045327,0.046423,0.046512,0.047975,0.054029,0.05444,0.055424,0.063758,0.068376,0.07106,0.081865,0.08189,0.094847
+cate1:rov_day3	100	0.033398,0.040212,0.042089,0.042606,0.042607,0.044162,0.045844,0.046173,0.047263,0.04779,0.0479,0.048133,0.04828,0.050624,0.051353,0.051611,0.054355,0.064011,0.064333,0.068423
+action:rov_day1	100	0.006645,0.010671,0.0131,0.015215,0.016408,0.017405,0.018827,0.019993,0.020532,0.0222,0.022965,0.023864,0.024839,0.02634,0.027364,0.028571,0.029044,0.030108,0.030729,0.03236,0.033761,0.03442,0.035693,0.037016,0.038361,0.039324,0.039844,0.040173,0.040317,0.040831,0.041873,0.042067,0.0426,0.042731,0.043056,0.043605,0.044204,0.044778,0.045463,0.046139,0.046308,0.04694,0.048285,0.048383,0.048873,0.048901,0.04912,0.049384,0.049504,0.049659,0.050326,0.050395,0.050727,0.050889,0.051361,0.051669,0.052166,0.052325,0.052673,0.052999,0.053938,0.054452,0.055513,0.055937,0.056182,0.056993,0.057215,0.057573,0.059021,0.059079,0.059399,0.059889,0.060132,0.060645,0.060833,0.063576,0.065285,0.068285,0.07037,0.072983,0.073639,0.075258,0.084614,0.091848,0.100404,7.0
+action:ros_day336	100	0.348613,0.434862,0.453572,0.513781,0.534658,0.556676,0.576444,0.603072,0.654221,0.693712,0.715447,0.731941,0.752071,0.761062,0.772958,0.797642,0.807481,0.836035,0.853369,0.88675,0.904697,0.931785,0.947704,0.965427,0.981116,0.996465,1.011519,1.027801,1.051123,1.055017,1.059061,1.060222,1.065558,1.076977,1.080115,1.10106,1.101198,1.113232,1.133651,1.142515,1.15843,1.19254,1.219361,1.245118,1.269212,1.26929,1.281102,1.316645,1.369174,1.418259,1.424134,1.457729,1.481606,1.508852,1.532746,1.552832,1.55388,1.561422,1.636015,1.703974,1.751432,1.776608,1.803191,1.80948,1.820939,1.822301,1.823567,1.855319,1.862946,1.899702,1.981808,2.041562,2.313638,2.334647,3.364988,5.065342
+action:vovd1_day7	100	0.137666,0.205238,0.239319,0.257397,0.278364,0.297826,0.308853,0.323599,0.330846,0.339474,0.345131,0.355611,0.365971,0.378241,0.386534,0.394485,0.401745,0.4053,0.4096,0.418047,0.423613,0.43029,0.437099,0.440698,0.44838,0.452893,0.457572,0.460936,0.461213,0.469853,0.472214,0.478025,0.485588,0.491587,0.492244,0.497435,0.498857,0.50138,0.502244,0.510647,0.51313,0.51375,0.518796,0.522852,0.52421,0.526356,0.531474,0.534258,0.54725,0.553808,0.555702,0.561948,0.57056,0.573226,0.574231,0.580617,0.592342,0.606824,0.61233,0.616977,0.619873,0.624001,0.629072,0.630574,0.638138,0.641281,0.6527,0.653959,0.659965,0.661916,0.665125,0.665898,0.676628,0.691067,0.69406,0.697332,0.697739,0.708397,0.720571,0.739653,0.765283,0.788233,0.820842,0.974022,66.0
+cate1:str_day30	100	0.043294,0.043718,0.044014,0.045551,0.045938,0.046901,0.04757,0.048139,0.050299,0.051885,0.052461,0.052698,0.053114,0.054098,0.060293,0.062666,0.063203,0.072972,0.083519
+cate1:rov_day30	100	0.041354,0.041515,0.045137,0.046642,0.047085,0.047451,0.047478,0.048904,0.049013,0.04908,0.049602,0.050241,0.050293,0.053031,0.053143,0.053609,0.054005,0.059483,0.060733
+cate2:ros_day7	100	0.390503,0.39766,0.578565,0.716623,0.752324,0.754474,0.776025,0.838096,0.868822,0.871489,0.878833,0.885504,0.888563,0.925512,1.012344,1.081793,1.102885,1.11219,1.127939,1.163207,1.238196,1.394579
+cate2:str_day30	100	0.036034,0.040884,0.042769,0.044628,0.04554,0.045578,0.046651,0.046863,0.047346,0.049194,0.051813,0.052461,0.052698,0.056434,0.061652,0.061676,0.065602,0.072413,0.076976
+cate1:vovd1_day1	100	0.381544,0.441132,0.445324,0.467233,0.469086,0.469838,0.478436,0.48103,0.519413,0.522035,0.523529,0.52888,0.53239,0.537782,0.565363,0.574103,0.575852,0.589783,0.706623,0.743416,0.912354
+cate1:vovd1_day3	100	0.373584,0.417485,0.445449,0.452683,0.460663,0.47318,0.473297,0.473597,0.475075,0.488597,0.492541,0.509136,0.510421,0.519506,0.555438,0.593366,0.653946,0.685791,0.798356
+cate1:ros_day3	100	0.540985,0.651424,0.687296,0.70954,0.760934,0.821386,0.838121,0.872193,0.900231,0.905328,0.967716,1.021737,1.063502,1.086678,1.097226,1.167411,1.181074,1.342978,1.369609,1.400263
+cate1:str_day7	100	0.040618,0.040838,0.041237,0.041273,0.042872,0.046365,0.046746,0.046876,0.047723,0.047737,0.048656,0.051212,0.053431,0.05444,0.055187,0.055424,0.06004,0.061695,0.062462,0.067108,0.067844,0.068583,0.085334
+cate2:rov_day7	100	0.030134,0.031978,0.033098,0.036565,0.037636,0.038902,0.04181,0.044573,0.045799,0.046316,0.046451,0.047662,0.048373,0.049034,0.049267,0.05026,0.051588,0.051632,0.056584,0.06822
+action:ros_day7	100	0.235294,0.285472,0.3,0.336364,0.346555,0.365527,0.370192,0.38741,0.401695,0.406203,0.435685,0.451705,0.472144,0.483173,0.506849,0.533366,0.548633,0.565574,0.584775,0.601219,0.628705,0.655896,0.670468,0.688501,0.696132,0.706748,0.722412,0.740503,0.753801,0.781575,0.791011,0.813642,0.815923,0.817254,0.82943,0.836958,0.847162,0.853241,0.856176,0.864239,0.886174,0.889372,0.894357,0.904244,0.922517,0.9375,0.950874,0.969653,0.991328,1.011533,1.020426,1.022207,1.041193,1.063075,1.069047,1.071736,1.076515,1.084513,1.085469,1.102302,1.110046,1.146137,1.165161,1.186323,1.189572,1.210379,1.217369,1.218697,1.230732,1.251783,1.289474,1.302491,1.318563,1.337901,1.339483,1.399507,1.431193,1.446613,1.448433,1.533333,1.602524,1.68916,1.705158,2.134192,56.0
+action:str_day21	100	0.02529,0.025918,0.026903,0.027762,0.028975,0.029358,0.029759,0.031034,0.031404,0.031486,0.031523,0.03199,0.033401,0.033404,0.033816,0.034923,0.035043,0.035242,0.035405,0.036101,0.036718,0.036796,0.036905,0.036934,0.037151,0.037436,0.037648,0.037743,0.038547,0.039119,0.03998,0.040804,0.041602,0.041932,0.042623,0.042741,0.042858,0.043654,0.043674,0.043831,0.044052,0.044966,0.045036,0.045468,0.04597,0.046178,0.047072,0.047778,0.047981,0.048224,0.048268,0.048517,0.048907,0.049482,0.050441,0.051567,0.052579,0.053673,0.055797,0.056378,0.057306,0.057499,0.059821,0.061331,0.061444,0.06305,0.063712,0.064352,0.067355,0.071703,0.076664,0.078362,0.080203,0.082609,0.083713,0.089455,0.104439,12.2
+action:str_day1	100	0.017324,0.021725,0.02407,0.025399,0.026721,0.027972,0.028949,0.030162,0.031023,0.032131,0.032741,0.033061,0.034113,0.034893,0.035262,0.035872,0.035971,0.036071,0.036502,0.037155,0.037493,0.037617,0.037677,0.038439,0.039401,0.040038,0.040611,0.040757,0.041109,0.041609,0.042446,0.042696,0.043116,0.043287,0.043652,0.043956,0.044368,0.044531,0.045105,0.045271,0.045521,0.045849,0.045905,0.046234,0.046542,0.046948,0.046957,0.047394,0.047995,0.048265,0.049201,0.049995,0.050314,0.051602,0.052819,0.053446,0.054108,0.05495,0.055071,0.055349,0.056438,0.056748,0.057858,0.059144,0.060907,0.063513,0.063704,0.065497,0.065569,0.066413,0.067175,0.067752,0.069989,0.07229,0.074536,0.076372,0.078269,0.079786,0.081651,0.087351,0.092959,0.09716,0.098375,0.111731,0.114471,4.0
+cate2:vovd1_day7	100	0.356425,0.362591,0.373599,0.40264,0.420118,0.426541,0.447309,0.452845,0.459098,0.463879,0.487661,0.491812,0.493725,0.499914,0.507697,0.5156,0.523791,0.534578,0.547378,0.561814,0.574644
+cate2:vovd1_day30	100	0.370944,0.443582,0.463276,0.482661,0.483314,0.48982,0.491756,0.49963,0.513591,0.514496,0.517509,0.518859,0.529267,0.537233,0.542512,0.551456,0.564547,0.62894,0.685266
+action:vovd1_day336	100	0.248048,0.296243,0.309127,0.337845,0.343714,0.372605,0.390476,0.400762,0.410739,0.429253,0.443089,0.451999,0.456237,0.463742,0.46671,0.473092,0.481418,0.482771,0.491281,0.497179,0.503485,0.509267,0.511139,0.511264,0.51902,0.522388,0.52348,0.529949,0.534409,0.537132,0.541402,0.543096,0.544218,0.546881,0.556071,0.55956,0.566337,0.570893,0.575426,0.577667,0.582255,0.585322,0.586476,0.592316,0.596397,0.597903,0.60241,0.610119,0.615851,0.624853,0.636624,0.649087,0.659438,0.662265,0.67485,0.679742,0.688169,0.693111,0.694488,0.699781,0.702592,0.708096,0.727242,0.749829,0.755498,0.758281,0.785222,0.786199,0.790469,0.791362,0.794589,0.827586,0.924981,1.108323,1.803599,2.663537
+cate1:str_day1	100	0.035608,0.03656,0.041586,0.042091,0.043074,0.046,0.046257,0.046782,0.047716,0.047823,0.048981,0.04941,0.052013,0.052262,0.053882,0.060194,0.064039,0.066976,0.067135,0.081139
+action:vovd1_day21	100	0.156087,0.199765,0.226021,0.251534,0.267941,0.280154,0.285975,0.291124,0.296886,0.313966,0.32,0.322216,0.327358,0.330584,0.334689,0.340277,0.343639,0.347172,0.350845,0.354745,0.362177,0.367886,0.371314,0.371816,0.38003,0.383879,0.392,0.396224,0.401546,0.408392,0.413505,0.417704,0.419192,0.419686,0.430666,0.43259,0.438847,0.443279,0.445256,0.447127,0.452189,0.461607,0.472475,0.473825,0.479948,0.487464,0.494902,0.500678,0.50193,0.506677,0.507536,0.511972,0.521198,0.529263,0.532299,0.54172,0.543133,0.544893,0.548595,0.551223,0.56902,0.579565,0.586334,0.5894,0.603461,0.611138,0.611801,0.620005,0.636173,0.663272,0.698996,0.713037,0.750082,0.761945,0.821366,0.844093,0.851573,29.0
+cate2:ros_day30	100	0.436914,0.518208,0.761409,0.784233,0.812472,0.829719,0.894708,0.896962,0.913583,0.928003,0.935544,0.951323,0.969855,1.029472,1.036543,1.106437,1.114378,1.120517,1.287282,1.310473,1.479188
+cate2:vovd1_day1	100	0.281409,0.303588,0.313957,0.366302,0.386282,0.392412,0.432701,0.436011,0.467233,0.48802,0.491061,0.506301,0.507795,0.528009,0.52888,0.538166,0.557429,0.580056,0.618626,0.639548,0.661279
+action:ros_day1	100	0.192308,0.273587,0.283582,0.310345,0.335807,0.357913,0.374771,0.4,0.425856,0.429936,0.45159,0.466262,0.48,0.494245,0.506438,0.521403,0.550201,0.571721,0.586611,0.60972,0.620032,0.643861,0.652411,0.660232,0.687753,0.716007,0.728519,0.74258,0.753153,0.764706,0.790535,0.796106,0.80349,0.836454,0.846076,0.860111,0.881054,0.884846,0.895225,0.905669,0.920299,0.923548,0.938824,0.9625,0.981852,0.992625,0.998067,1.016669,1.018298,1.022431,1.029801,1.052233,1.068822,1.071901,1.088889,1.108568,1.127615,1.156554,1.170245,1.22768,1.267618,1.26937,1.293097,1.311524,1.318015,1.329235,1.333881,1.366553,1.403406,1.438059,1.478431,1.49064,1.498601,1.505104,1.52908,1.566296,1.603306,1.617925,1.694509,1.769295,1.831797,1.965463,2.047187,17.0
+action:str_day7	100	0.02371,0.027264,0.029139,0.030503,0.031529,0.032865,0.03318,0.03385,0.034541,0.035331,0.035675,0.036936,0.037649,0.038091,0.038308,0.038784,0.039253,0.039561,0.04015,0.040387,0.040991,0.041353,0.041436,0.041776,0.042165,0.042581,0.042944,0.043717,0.043982,0.044402,0.045644,0.046125,0.046733,0.046875,0.047026,0.047115,0.047583,0.047815,0.048015,0.048511,0.048933,0.049401,0.04988,0.050733,0.051045,0.051934,0.052151,0.052965,0.053091,0.053144,0.053185,0.053187,0.054155,0.054389,0.05503,0.055528,0.056086,0.05609,0.056597,0.057627,0.058523,0.060062,0.060753,0.06266,0.063993,0.064689,0.066117,0.067578,0.068979,0.069121,0.07051,0.072706,0.076075,0.076781,0.08086,0.085863,0.086148,0.089955,0.091713,0.095259,0.106007,0.124617,0.127122,3.5
+cate1:ros_day1	100	0.519355,0.66865,0.69915,0.811369,0.84014,0.84403,0.870042,0.897146,0.939885,1.00774,1.143832,1.176866,1.183983,1.196572,1.211117,1.22336,1.395947,1.474438,1.496271
+cate1:str_day3	100	0.038626,0.040474,0.043725,0.043868,0.044441,0.045714,0.046753,0.046972,0.047257,0.049522,0.050513,0.052909,0.053765,0.055499,0.058611,0.060402,0.065406,0.066702,0.067354,0.074332,0.084941
+cate1:ros_day7	100	0.635802,0.653971,0.65945,0.786459,0.838096,0.855184,0.868264,0.870284,0.888563,0.922251,0.9723,0.998596,1.082772,1.098859,1.111132,1.140248,1.222733,1.26638,1.323897,1.333723
+cate1:vovd1_day7	100	0.429809,0.433815,0.448448,0.452044,0.456199,0.461723,0.468483,0.473863,0.482026,0.491812,0.505874,0.513677,0.5156,0.521871,0.585771,0.590815,0.623261,0.623742,0.654694,0.69024
+action:rov_day21	100	0.012194,0.015795,0.018156,0.019479,0.020871,0.022283,0.02243,0.022818,0.023734,0.023792,0.024912,0.02574,0.026363,0.026752,0.027949,0.027969,0.029016,0.029888,0.030381,0.030485,0.030778,0.030791,0.031497,0.031525,0.031665,0.032079,0.032328,0.033126,0.033836,0.034304,0.034613,0.034621,0.035005,0.035409,0.036235,0.036529,0.037285,0.037697,0.037845,0.03808,0.039235,0.039677,0.04048,0.040792,0.040898,0.040997,0.041597,0.042572,0.043476,0.043678,0.044648,0.045501,0.046636,0.046883,0.04704,0.047114,0.048054,0.048444,0.048891,0.049643,0.050005,0.050938,0.051163,0.052014,0.052591,0.053466,0.054724,0.055605,0.057282,0.058256,0.058721,0.060107,0.062902,0.063506,0.065871,0.068452,0.074924,0.075672,0.080895,0.083062,3.0
+cate1:rov_day7	100	0.036666,0.042247,0.042418,0.042667,0.043535,0.044368,0.045227,0.046451,0.047062,0.047219,0.04723,0.04767,0.048373,0.049901,0.050948,0.05276,0.054466,0.057854,0.060435,0.06132,0.061887
+cate2:vovd1_day3	100	0.310271,0.323133,0.340636,0.36469,0.364953,0.387506,0.393317,0.39418,0.435147,0.452893,0.453187,0.467103,0.473297,0.479693,0.503452,0.510421,0.516627,0.519987,0.540275,0.569572,0.58257,0.607636
+cate2:ros_day1	100	0.337544,0.398472,0.525643,0.636424,0.737146,0.746133,0.756802,0.823211,0.841549,0.84403,0.850929,0.871358,0.939885,1.03452,1.18473,1.206837,1.217894,1.290502,1.358827,1.369865,1.694856
+cate2:str_day3	100	0.03577,0.039348,0.039679,0.040957,0.04204,0.042171,0.043124,0.043777,0.044224,0.044981,0.045699,0.046297,0.047884,0.052909,0.053765,0.055292,0.066415,0.0721,0.082512,0.088704

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/makedata_dssm/makedata_i2i_03_onehotFile_20241128.scala

@@ -73,7 +73,7 @@ object makedata_i2i_03_onehotFile_20241128 {
               case "category2_2" => result += "cate2:" + value
               case "category2_3" => result += "cate2:" + value
               case "valid_time" => result += "valid_time:" + value
-              case " timeliness" => result += "timeliness:" + value
+              case "timeliness" => result += "timeliness:" + value
               case "sentiment_tendency" => result += "sentiment_tendency:" + value
               case "has_end_credit_guide" => result += "has_end_credit_guide:" + value
               case "background_music_type" => result += "background_music_type:" + value

+ 588 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_dssm/makedata_i2i_05_trainData_20241129.scala

@@ -0,0 +1,588 @@
+package com.aliyun.odps.spark.examples.makedata_dssm
+
+import com.alibaba.fastjson.JSON
+import com.alibaba.fastjson.JSONObject
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import examples.extractor.ExtractorUtils
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+
+object makedata_i2i_05_trainData_20241129 {
+  /**
+   * Identity record transfer: hands back the given record untouched.
+   *
+   * @param record the record to pass through
+   * @param schema the table schema; accepted but not consulted
+   * @return the same `record` instance that was passed in
+   */
+  def func(record: Record, schema: TableSchema): Record = record
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val beginStr = param.getOrElse("beginStr", "2024062008")
+    val endStr = param.getOrElse("endStr", "2024062023")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/52_dssm_i2i_joinfeature/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/55_dssm_i2i_traindata/")
+    val onehotPath = param.getOrElse("onehotPath", "/dw/recommend/model/53_dssm_i2i_onehot/20241128")
+    // The default must include the ".txt" extension: the bundled resource is
+    // 20241128_recsys_i2i_bucket_47.txt, and a name that does not match makes getResource()
+    // return null, which the loading code below turns into an empty bucket map without any error.
+    val bucketFile = param.getOrElse("bucketFile", "20241128_recsys_i2i_bucket_47.txt")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterHours = param.getOrElse("filterHours", "25").split(",").toSet
+    val ifDebug = param.getOrElse("ifDebug", "false").toBoolean
+
+    // 2 读取onehot文件
+    val onehotMap_br = sc.broadcast(
+      sc.textFile(onehotPath).map(r => {
+        val rList = r.split("\t")
+        (rList(0), rList(1))
+      }).collectAsMap()
+    )
+
+    // 3 Read the dense-feature bucket file via the class loader
+    val resourceUrlBucket = this.getClass.getClassLoader.getResource(bucketFile)
+    val buckets =
+      if (resourceUrlBucket != null) {
+        // Keep a handle on the Source that is actually read so that this same stream is the one
+        // closed; opening a second Source just to close it would leave the first one open.
+        val bucketSource = Source.fromURL(resourceUrlBucket)
+        try {
+          bucketSource.getLines().mkString("\n")
+        } finally {
+          bucketSource.close()
+        }
+      } else {
+        // Resource not found: fall back to empty content (yields an empty bucket map below).
+        ""
+      }
+    println(buckets)
+    val bucketsMap_br = sc.broadcast(
+      buckets.split("\n")
+        .map(r => r.replace(" ", "").replaceAll("\n", ""))
+        .filter(r => r.nonEmpty)
+        .map(r => {
+          val rList = r.split("\t")
+          (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+        }).toMap
+    )
+
+    // 4 循环执行数据生产
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    for (dt_hh <- timeRange) {
+      val dt = dt_hh.substring(0, 8)
+      val hh = dt_hh.substring(8, 10)
+      val data = sc.textFile(readPath + "/" + dt_hh).map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val label = rList(1)
+        val vid_left = rList(2)
+        val vid_right = rList(3)
+        val feature_left = rList(4)
+        val feature_right = rList(5)
+        val feature_left_action = rList(6)
+        val feature_right_action = rList(7)
+        val feature_left_cate1 = rList(8)
+        val feature_right_cate1 = rList(9)
+        val feature_left_cate2 = rList(10)
+        val feature_right_cate2 = rList(11)
+        (logKey, label, vid_left, vid_right, feature_left, feature_right, feature_left_action, feature_right_action,
+          feature_left_cate1, feature_right_cate1, feature_left_cate2, feature_right_cate2)
+      }).mapPartitions(row =>{
+        val result = new ArrayBuffer[String]()
+        val onehotMap = onehotMap_br.value
+        val bucketsMap = bucketsMap_br.value
+        row.foreach{
+          case (logKey, label, vid_left, vid_right, feature_left, feature_right, feature_left_action, feature_right_action,
+          feature_left_cate1, feature_right_cate1, feature_left_cate2, feature_right_cate2) =>
+            val left = new ArrayBuffer[String]()
+            val right = new ArrayBuffer[String]()
+            val left_dense1 = new ArrayBuffer[String]()
+            val right_dense1 = new ArrayBuffer[String]()
+            val left_dense2 = new ArrayBuffer[String]()
+            val right_dense2 = new ArrayBuffer[String]()
+            // 1 sparse 特征 16个
+            // vid cate1 cate2 video_style valid_time captions_color audience_age_group
+            // audience_value_type font_size cover_persons_num audience_gender sentiment_tendency
+            // video_type background_music_type captions has_end_credit_guide
+            left += onehotMap.getOrElse("vid:" + vid_left, "0")
+            right += onehotMap.getOrElse("vid:" + vid_right, "0")
+            var jsonLeft = JSON.parseObject(feature_left)
+            left += getOnehotValue(jsonLeft, onehotMap, "category1", "cate1:")
+            left += getOnehotValue(jsonLeft, onehotMap, "category2_1", "cate2:")
+            left += getOnehotValue(jsonLeft, onehotMap, "video_style", "video_style:")
+            left += getOnehotValue(jsonLeft, onehotMap, "valid_time", "valid_time:")
+            left += getOnehotValue(jsonLeft, onehotMap, "captions_color", "captions_color:")
+            left += getOnehotValue(jsonLeft, onehotMap, "audience_age_group", "audience_age_group:")
+            left += getOnehotValue(jsonLeft, onehotMap, "audience_value_type", "audience_value_type:")
+            left += getOnehotValue(jsonLeft, onehotMap, "font_size", "font_size:")
+            left += getOnehotValue(jsonLeft, onehotMap, "cover_persons_num", "cover_persons_num:")
+            left += getOnehotValue(jsonLeft, onehotMap, "audience_gender", "audience_gender:")
+            left += getOnehotValue(jsonLeft, onehotMap, "sentiment_tendency", "sentiment_tendency:")
+            left += getOnehotValue(jsonLeft, onehotMap, "video_type", "video_type:")
+            left += getOnehotValue(jsonLeft, onehotMap, "background_music_type", "background_music_type:")
+            left += getOnehotValue(jsonLeft, onehotMap, "captions", "captions:")
+            left += getOnehotValue(jsonLeft, onehotMap, "has_end_credit_guide", "has_end_credit_guide:")
+            var jsonRight = JSON.parseObject(feature_right)
+            right += getOnehotValue(jsonRight, onehotMap, "category1", "cate1:")
+            right += getOnehotValue(jsonRight, onehotMap, "category2_1", "cate2:")
+            right += getOnehotValue(jsonRight, onehotMap, "video_style", "video_style:")
+            right += getOnehotValue(jsonRight, onehotMap, "valid_time", "valid_time:")
+            right += getOnehotValue(jsonRight, onehotMap, "captions_color", "captions_color:")
+            right += getOnehotValue(jsonRight, onehotMap, "audience_age_group", "audience_age_group:")
+            right += getOnehotValue(jsonRight, onehotMap, "audience_value_type", "audience_value_type:")
+            right += getOnehotValue(jsonRight, onehotMap, "font_size", "font_size:")
+            right += getOnehotValue(jsonRight, onehotMap, "cover_persons_num", "cover_persons_num:")
+            right += getOnehotValue(jsonRight, onehotMap, "audience_gender", "audience_gender:")
+            right += getOnehotValue(jsonRight, onehotMap, "sentiment_tendency", "sentiment_tendency:")
+            right += getOnehotValue(jsonRight, onehotMap, "video_type", "video_type:")
+            right += getOnehotValue(jsonRight, onehotMap, "background_music_type", "background_music_type:")
+            right += getOnehotValue(jsonRight, onehotMap, "captions", "captions:")
+            right += getOnehotValue(jsonRight, onehotMap, "has_end_credit_guide", "has_end_credit_guide:")
+            // 2 dense通过分桶转换成sparse特征 47个 * 3 * 2
+            jsonLeft = JSON.parseObject(feature_left_action)
+            var res = getDenseBucketValue(jsonLeft, bucketsMap, "str_day1", "action:str_day1")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "rov_day1", "action:rov_day1")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "ros_day1", "action:ros_day1")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "str_day7", "action:str_day7")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "rov_day7", "action:rov_day7")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "ros_day7", "action:ros_day7")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "str_day21", "action:str_day21")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "rov_day21", "action:rov_day21")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "ros_day21", "action:ros_day21")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "str_day336", "action:str_day336")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "rov_day336", "action:rov_day336")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "ros_day336", "action:ros_day336")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "vovd1_day7", "action:vovd1_day7")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "vovd1_day21", "action:vovd1_day21")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "vovd1_day336", "action:vovd1_day336")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+
+            jsonRight = JSON.parseObject(feature_right_action)
+            res = getDenseBucketValue(jsonRight, bucketsMap, "str_day1", "action:str_day1")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "rov_day1", "action:rov_day1")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "ros_day1", "action:ros_day1")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "str_day7", "action:str_day7")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "rov_day7", "action:rov_day7")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "ros_day7", "action:ros_day7")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "str_day21", "action:str_day21")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "rov_day21", "action:rov_day21")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "ros_day21", "action:ros_day21")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "str_day336", "action:str_day336")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "rov_day336", "action:rov_day336")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "ros_day336", "action:ros_day336")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "vovd1_day7", "action:vovd1_day7")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "vovd1_day21", "action:vovd1_day21")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "vovd1_day336", "action:vovd1_day336")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+
+            //----------------------cate1-----------------------------cate1---------------------------cate1----------------------
+            jsonLeft = JSON.parseObject(feature_left_cate1)
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "str_day1", "cate1:str_day1")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "rov_day1", "cate1:rov_day1")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "ros_day1", "cate1:ros_day1")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "str_day3", "cate1:str_day3")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "rov_day3", "cate1:rov_day3")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "ros_day3", "cate1:ros_day3")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "str_day7", "cate1:str_day7")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "rov_day7", "cate1:rov_day7")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "ros_day7", "cate1:ros_day7")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "str_day30", "cate1:str_day30")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "rov_day30", "cate1:rov_day30")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "ros_day30", "cate1:ros_day30")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "vovd1_day1", "cate1:vovd1_day1")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "vovd1_day3", "cate1:vovd1_day3")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "vovd1_day7", "cate1:vovd1_day7")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "vovd1_day30", "cate1:vovd1_day30")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+
+            jsonRight = JSON.parseObject(feature_right_cate1)
+            res = getDenseBucketValue(jsonRight, bucketsMap, "str_day1", "cate1:str_day1")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "rov_day1", "cate1:rov_day1")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "ros_day1", "cate1:ros_day1")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "str_day3", "cate1:str_day3")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            // Bucketize rov_day3 with the rov_day3 boundaries, matching the left-side lookup of the same cate1 feature.
+            res = getDenseBucketValue(jsonRight, bucketsMap, "rov_day3", "cate1:rov_day3")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "ros_day3", "cate1:ros_day3")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "str_day7", "cate1:str_day7")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "rov_day7", "cate1:rov_day7")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "ros_day7", "cate1:ros_day7")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "str_day30", "cate1:str_day30")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "rov_day30", "cate1:rov_day30")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "ros_day30", "cate1:ros_day30")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "vovd1_day1", "cate1:vovd1_day1")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "vovd1_day3", "cate1:vovd1_day3")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "vovd1_day7", "cate1:vovd1_day7")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "vovd1_day30", "cate1:vovd1_day30")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+
+          //----------------------cate2-----------------------------cate2---------------------------cate2----------------------
+            jsonLeft = JSON.parseObject(feature_left_cate2)
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "str_day1", "cate2:str_day1")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "rov_day1", "cate2:rov_day1")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "ros_day1", "cate2:ros_day1")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "str_day3", "cate2:str_day3")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "rov_day3", "cate2:rov_day3")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "ros_day3", "cate2:ros_day3")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "str_day7", "cate2:str_day7")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "rov_day7", "cate2:rov_day7")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "ros_day7", "cate2:ros_day7")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "str_day30", "cate2:str_day30")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "rov_day30", "cate2:rov_day30")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "ros_day30", "cate2:ros_day30")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "vovd1_day1", "cate2:vovd1_day1")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "vovd1_day3", "cate2:vovd1_day3")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "vovd1_day7", "cate2:vovd1_day7")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonLeft, bucketsMap, "vovd1_day30", "cate2:vovd1_day30")
+            left += res._1.toString
+            left_dense1 += res._2.toString
+            left_dense2 += res._3.toString
+
+            jsonRight = JSON.parseObject(feature_right_cate2)
+            res = getDenseBucketValue(jsonRight, bucketsMap, "str_day1", "cate2:str_day1")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "rov_day1", "cate2:rov_day1")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "ros_day1", "cate2:ros_day1")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "str_day3", "cate2:str_day3")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            // Bucketize rov_day3 with the rov_day3 boundaries, matching the left-side lookup of the same cate2 feature.
+            res = getDenseBucketValue(jsonRight, bucketsMap, "rov_day3", "cate2:rov_day3")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "ros_day3", "cate2:ros_day3")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "str_day7", "cate2:str_day7")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "rov_day7", "cate2:rov_day7")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "ros_day7", "cate2:ros_day7")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "str_day30", "cate2:str_day30")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "rov_day30", "cate2:rov_day30")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "ros_day30", "cate2:ros_day30")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "vovd1_day1", "cate2:vovd1_day1")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "vovd1_day3", "cate2:vovd1_day3")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "vovd1_day7", "cate2:vovd1_day7")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+            res = getDenseBucketValue(jsonRight, bucketsMap, "vovd1_day30", "cate2:vovd1_day30")
+            right += res._1.toString
+            right_dense1 += res._2.toString
+            right_dense2 += res._3.toString
+
+          // 3 left 和 right 分别 16+47*3=16+141 = 157
+            left ++= left_dense1
+            left ++= left_dense2
+            right ++= right_dense1
+            right ++= right_dense2
+
+            result.add(
+              (logKey, label, vid_left, vid_right, left.mkString(","), right.mkString(",")).productIterator.mkString("\t")
+            )
+        }
+        result.iterator
+      })
+
+      // 4 保存数据到hdfs
+      val hdfsPath = savePath + "/" + dt_hh
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  /**
+   * Resolves the one-hot id of a categorical field.
+   *
+   * @param obj  JSON object holding the raw feature values
+   * @param m    lookup table keyed by `key2` concatenated with the field value
+   * @param key1 field to read from `obj`
+   * @param key2 prefix prepended to the field value to form the lookup key
+   * @return the mapped id, or "0" when the field is absent from `obj` or the
+   *         combined key is not present in `m`
+   */
+  def getOnehotValue(obj: JSONObject, m: scala.collection.Map[String, String], key1: String, key2: String): String = {
+    if (!obj.containsKey(key1)) {
+      "0"
+    } else {
+      // A field that is present but null is looked up under the placeholder value.
+      val category = obj.get(key1) match {
+        case null => "无"
+        case present => present.toString
+      }
+      m.getOrElse(key2 + category, "0")
+    }
+  }
+  /**
+   * Buckets one dense feature read from a JSON feature object.
+   *
+   * @param obj        JSON object holding the raw feature values
+   * @param bucketsMap bucket name mapped to (bucket count, boundary array)
+   * @param key1       field to read from `obj`
+   * @param name       key used to look up the boundaries in `bucketsMap`
+   * @return (bucket index, bucket index multiplied by 1 / bucket count, raw score).
+   *         All zeros when the field is absent, unparsable, or not above 1E-8;
+   *         (0, 0, raw score) when `name` has no entry in `bucketsMap`.
+   */
+  def getDenseBucketValue(obj: JSONObject, bucketsMap: Map[String, (Double, Array[Double])], key1: String, name: String): (Int, Double, Double) = {
+    if (!obj.containsKey(key1)) {
+      (0, 0D, 0D)
+    } else {
+      val rawValue = obj.get(key1)
+      // A null or non-numeric value fails inside the try and is treated as 0.
+      val parsed = try {
+        rawValue.toString.toDouble
+      } catch {
+        case _: Exception => 0D
+      }
+      if (parsed > 1E-8) {
+        bucketsMap.get(name) match {
+          case Some((bucketCount, boundaries)) =>
+            // NOTE(review): presumably findInsertPosition returns a 0-based slot among the
+            // boundaries, making the index 1-based here; confirm against ExtractorUtils.
+            val position = ExtractorUtils.findInsertPosition(boundaries, parsed).toDouble + 1.0
+            val scaled = 1.0 / bucketCount * position
+            (position.toInt, scaled, parsed)
+          case None =>
+            (0, 0D, parsed)
+        }
+      } else {
+        (0, 0D, 0D)
+      }
+    }
+  }
+}

+ 13 - 3
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-I2I

@@ -1,8 +1,8 @@
 nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
 --class com.aliyun.odps.spark.examples.makedata_dssm.makedata_i2i_01_originData_20241127 \
---master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+--master yarn --driver-memory 2G --executor-memory 2G --executor-cores 1 --num-executors 16 \
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
-beginStr:2024112612 endStr:2024112612 negCnt:20 \
+beginStr:2024112700 endStr:2024112723 negCnt:20 \
 tablePart:64 savePath:/dw/recommend/model/51_dssm_i2i_sample/ > p51.log 2>&1 &
 
 nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
@@ -50,4 +50,14 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 readPath:/dw/recommend/model/52_dssm_i2i_joinfeature/20241126* \
 savePath:/dw/recommend/model/54_dssm_i2i_bucketfile/ \
-fileName:47_rate_v1  bucketNum:100 > p54.log 2>&1 &
+fileName:47_rate_v1  bucketNum:100 > p54.log 2>&1 &
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_dssm.makedata_i2i_05_trainData_20241129 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 32 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+beginStr:2024112612 endStr:2024112612 \
+readPath:/dw/recommend/model/52_dssm_i2i_joinfeature/ \
+savePath:/dw/recommend/model/55_dssm_i2i_traindata/ \
+onehotPath:/dw/recommend/model/53_dssm_i2i_onehot/20241128 \
+bucketFile:20241128_recsys_i2i_bucket_47 > p55.log 2>&1 &