liqian 3 years ago
parent
commit
2c04eeca30
2 changed files with 250 additions and 0 deletions
  1. 179 0
      process_feature.py
  2. 71 0
      rov_train.py

+ 179 - 0
process_feature.py

@@ -0,0 +1,179 @@
+# Base feature column names. rov_train.get_rov_feature_table reads each of these
+# from every table record by name ('dt' is instead filled with the partition date),
+# so the spelling of every entry must match the table's column names exactly.
+features_name_list = [
+    'dt',
+    'videoid',
+
+    'day1viewcount',  # views (exposures) within the last 1/3/7/14/30/60 days
+    'day3viewcount',
+    'day7viewcount',
+    'day14viewcount',
+    'day30viewcount',
+    'day60viewcount',
+
+    'day1playcount',  # plays within the last 1/3/7/14/30/60 days
+    'day3playcount',
+    'day7playcount',
+    'day14playcount',
+    'day30playcount',
+    'day60playcount',
+
+    'day1sharecount',  # shares within the last 1/3/7/14/30/60 days, first-level returns
+    'day3sharecount',
+    'day7sharecount',
+    'day14sharecount',
+    'day30sharecount',
+    'day60sharecount',
+
+    'day1returncount',
+    'day3returncount',
+    'day7returncount',
+    'day14returncount',
+    'day30returncount',
+    'day60returncount',
+
+    'videocategory11',
+    'videocategory12',
+    'videocategory45',
+    'videocategory49',
+    'videocategory1',
+    'videocategory2',
+    'videocategory3',
+    'videocategory4',
+    'videocategory5',
+    'videocategory6',
+    'videocategory7',
+    'videocategory8',
+    'videocategory9',
+    'videocategory85',
+    'videocategory10',
+    'videocategory555',
+
+    'usercategory1',
+    'usercategory2',
+    'usercategory3',
+    'usercategory4',
+    'usercategory5',
+    'usercategory6',
+    'usercategory7',
+    'usercategory8',
+    'usercategory9',
+    'usercategory10',
+    'usercategory11',
+    'usercategory12',
+    'usercategory45',
+    'usercategory49',
+    'usercategory85',
+    'usercategory555',
+
+    # NOTE(review): 'tody' looks like a misspelling of 'today', and the original
+    # comment says "5-day" -- presumably the name matches the table column; verify
+    # against the table schema before renaming.
+    'todyviewcount',  # 5-day exposures
+
+    'day5returncount_1_stage',
+    'day5returncount_2_stage',
+    'day5returncount_3_stage',
+    'day5returncount_4_stage',
+
+    # NOTE(review): 'retrn' looks like a misspelling of 'return' -- presumably
+    # matches the table column names; verify before renaming.
+    'stage_one_retrn',
+    'stage_two_retrn',
+    'stage_three_retrn',
+    'stage_four_retrn']
+
+# Additional feature column names appended after features_name_list.
+# The trailing "#N" markers are kept from the original comments.
+add_feature = [
+    'all_return_day1_return_count',  # -- total returns within 1/3/7/14 days  #12
+    'all_return_day3_return_count',
+    'all_return_day7_return_count',
+    'all_return_day14_return_count',
+
+    'three_return_day1_return_count',  # -- returns from the first three levels within 1/3/7/14 days #14
+    'three_return_day3_return_count',
+    'three_return_day7_return_count',
+    'three_return_day14_return_count',
+
+    'four_up_return_day1_return_count',  # -- returns from level four and above within 1/3/7/14 days #15
+    'four_up_return_day3_return_count',
+    'four_up_return_day7_return_count',
+    'four_up_return_day14_return_count',
+
+    'one_return_day1_return_count',  # -- first-level returns within 1/3/7/14 days  #13
+    'one_return_day3_return_count',
+    'one_return_day7_return_count',
+    'one_return_day14_return_count',
+
+    'four_up_return_div_three_return_day1',  # -- level-4+ returns / first-three-level returns within 1/3/7/14 days   #23
+    'four_up_return_div_three_return_day3',
+    'four_up_return_div_three_return_day7',
+    'four_up_return_div_three_return_day14',
+
+    'all_return_day1_view_day1_return_count',  # -- returns within 1/3/7/14 days from exposures within 1/3/7/14 days  #8
+    'all_return_day3_view_day3_return_count',
+    'all_return_day7_view_day7_return_count',
+    'all_return_day14_view_day14_return_count',
+
+    'three_return_day1_view_day1_return_count',  # -- first-three-level returns within 1/3/7/14 days from exposures within 1/3/7/14 days #10
+    'three_return_day3_view_day3_return_count',
+    'three_return_day7_view_day7_return_count',
+    'three_return_day14_view_day14_return_count',
+
+    'four_up_return_day1_view_day1_return_count',  # -- level-4+ returns within 1/3/7/14 days from exposures within 1/3/7/14 days  # 11
+    'four_up_return_day3_view_day3_return_count',
+    'four_up_return_day7_view_day7_return_count',
+    'four_up_return_day14_view_day14_return_count',
+
+    'one_return_day1_view_day1_return_count',  # -- first-level returns within 1/3/7/14 days from exposures within 1/3/7/14 days #9
+    'one_return_day3_view_day3_return_count',
+    'one_return_day7_view_day7_return_count',
+    'one_return_day14_view_day14_return_count',
+
+    # Literal translation of the original note (window definition to be confirmed):
+    # total returns on day1 from exposures in the window from
+    # (day1+1 / day1+3 / day1+7 / day1+14) days ago up to (day1+1) days ago.
+    'all_return_day1_on_day1_return_count',  # see note above   #16
+    'all_return_day3_on_day1_return_count',
+    'all_return_day7_on_day1_return_count',
+    'all_return_day14_on_day1_return_count',
+
+    'four_up_return_day1_view_day1_return_div_three_d1',  # -- (level-4+ returns / first-three-level returns) within 1/3/7/14 days from exposures within 1/3/7/14 days  #22
+    'four_up_return_day3_view_day3_return_div_three_d3',
+    'four_up_return_day7_view_day7_return_div_three_d7',
+    'four_up_return_day14_view_day14_return_div_three_d14',
+
+    'day1ctr',  # -- plays / exposures within 1/3/7/14/30/60 days   #17
+    'day3ctr',
+    'day7ctr',
+    'day14ctr',
+    'day30ctr',
+    'day60ctr',
+
+    'day1sov',  # --  shares / exposures within 1/3/7/14/30/60 days  #18
+    'day3sov',
+    'day7sov',
+    'day14sov',
+    'day30sov',
+    'day60sov',
+
+    'day1rov',  # -- returns from exposures within 1/3/7/14 days / exposures   #19
+    'day3rov',
+    'day7rov',
+    'day14rov',
+
+    'day1soc',  # -- shares / plays within 1/3/7/14/30/60 days  #20
+    'day3soc',
+    'day7soc',
+    'day14soc',
+    'day30soc',
+    'day60soc',
+
+    'day1roc',  # -- returns from exposures within 1/3/7/14 days / plays  #21
+    'day3roc',
+    'day7roc',
+    'day14roc',
+
+    'oneday_day1rov',  # -- today's returns from exposures within 1/3/7/14 days / exposures within 1/3/7/14 days  #24
+    'oneday_day3rov',
+    'oneday_day7rov',
+    'oneday_day14rov',
+
+    # NOTE(review): 'futre' looks like a misspelling of 'future' -- presumably
+    # matches the table column name; verify before renaming.
+    'futre7dayreturn',  # prediction target: returns over the next 7 days
+
+    'todyviewcount_rank',
+    'day1viewcount_rank'
+]
+
+# Two further column names, kept in their own list (presumably text-valued
+# tag/word columns -- confirm against the table schema).
+words = ['videotags', 'words_without_tags']
+
+# Full ordered list of column names; rov_train.get_rov_feature_table iterates
+# this list to build one dict per table record.
+features = features_name_list + add_feature + words

+ 71 - 0
rov_train.py

@@ -0,0 +1,71 @@
+import pandas as pd
+import datetime
+import pickle
+import process_feature
+
+from odps import ODPS
+from datetime import datetime as dt
+
+
+def get_rov_feature_table(date, table):
+    """
+    从DataWorks表中获取对应的特征值
+    :param date: 日期 type-string '%Y%m%d'
+    :param table: 表名 type-string
+    :return: feature_array type-DataFrame
+    """
+    odps = ODPS(
+        access_id='LTAI4FtW5ZzxMvdw35aNkmcp',
+        secret_access_key='0VKnydcaHK3ITjylbgUsLubX6rwiwc',
+        project='usercdm',
+        endpoint='http://service.cn.maxcompute.aliyun.com/api',
+        connect_timeout=3000,
+        read_timeout=500000,
+        pool_maxsize=1000,
+        pool_connections=1000
+    )
+    feature_value_list = []
+    for record in odps.read_table(name=table, partition='dt=%s' % date):
+        feature_value = {}
+        for feature_name in process_feature.features:
+            if feature_name == 'dt':
+                feature_value[feature_name] = date
+            else:
+                feature_value[feature_name] = record[feature_name]
+        feature_value_list.append(feature_value)
+    feature_array = pd.DataFrame(feature_value_list)
+    print(date, table, 'feature table finish')
+    return feature_array
+
+
+def get_data_with_date(date, delta_days, table):
+    """
+    获取某一时间范围的特征数据
+    :param date: 标准日期,delta基准,type-string,'%Y%m%d'
+    :param delta_days: 日期范围(天),type-int,「 >0: date前,<0: date后 」
+    :param table: DataWorks表名,type-string
+    :return: data,type-DataFrame
+    """
+    base_date = dt.strptime(date, '%Y%m%d')
+    data_list = []
+    for days in range(0, delta_days):
+        delta = datetime.timedelta(days=days)
+        delta_date = base_date - delta
+        # 获取特征数据
+        delta_data = get_rov_feature_table(date=delta_date.strftime('%Y%m%d'), table=table)
+        data_list.append(delta_data)
+    data = pd.concat(data_list)
+    # 重新进行索引
+    data.reset_index(inplace=True)
+    # 删除index列
+    data = data.drop(columns=['index'])
+    return data
+
+
+if __name__ == '__main__':
+    dt_test = '20211006'
+    table_test = 'rov_feature_add_v1'
+    # res = get_rov_feature_table(dt_test, table_test)
+    res = get_data_with_date(date=dt_test, delta_days=3, table=table_test)
+    print(res.shape)
+