123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155 |
# -*- coding: utf-8 -*-
- import sys
- import pandas as pd
- from tqdm import tqdm
- from collections import defaultdict
# Module-level mapping whose missing keys default to the empty string
# (``str()`` is the default factory).
# NOTE(review): nothing in the visible part of this file reads or writes it;
# presumably it maps a column name to a type tag -- confirm against callers.
types = defaultdict(str)
# Item-side columns treated as sparse (categorical) features.
# ``get_item_features`` passes this list as ``sparse_conf``, so every
# non-empty value becomes a ``<column>#<value>`` key with weight 1.0.
item_sparse_conf = [
    # --- Basic features: video ---
    'i_id',
    'i_up_id',
    # Disabled columns, kept for reference:
    # 'i_tag',
    # 'i_title',
    'i_title_len',
    'i_play_len',
    'i_days_since_upload',
    # --- Statistical features: video (one row per time window) ---
    'i_1day_exp_cnt', 'i_1day_click_cnt', 'i_1day_share_cnt', 'i_1day_return_cnt',
    'i_3day_exp_cnt', 'i_3day_click_cnt', 'i_3day_share_cnt', 'i_3day_return_cnt',
    'i_7day_exp_cnt', 'i_7day_click_cnt', 'i_7day_share_cnt', 'i_7day_return_cnt',
    'i_3month_exp_cnt', 'i_3month_click_cnt', 'i_3month_share_cnt', 'i_3month_return_cnt',
]
# Item-side columns treated as dense (numeric) features.
# ``get_item_features`` passes this list as ``dense_conf``, so each value is
# parsed with ``float()`` and stored under the column name itself.
item_dense_conf = [
    # One row per time window.
    'i_ctr_1day', 'i_str_1day', 'i_rov_1day', 'i_ros_1day',
    'i_ctr_3day', 'i_str_3day', 'i_rov_3day', 'i_ros_3day',
    'i_ctr_7day', 'i_str_7day', 'i_rov_7day', 'i_ros_7day',
    'i_ctr_3month', 'i_str_3month', 'i_rov_3month', 'i_ros_3month',
]
# User-side columns treated as sparse (categorical) features.
# ``get_user_features`` passes this list as ``sparse_conf``, so every
# non-empty value becomes a ``<column>#<value>`` key with weight 1.0.
user_sparse_conf = [
    'u_brand',
    'u_device',
    'u_system',
    'u_system_ver',
    'ctx_region',
    'ctx_city',
    # --- Statistical features: user ---
    'u_cycle_bucket_7days',
    'u_cycle_bucket_30days',
    'u_share_bucket_30days',
    # Counters, one row per time window.
    'u_1day_exp_cnt', 'u_1day_click_cnt', 'u_1day_share_cnt', 'u_1day_return_cnt',
    'u_3day_exp_cnt', 'u_3day_click_cnt', 'u_3day_share_cnt', 'u_3day_return_cnt',
    'u_7day_exp_cnt', 'u_7day_click_cnt', 'u_7day_share_cnt', 'u_7day_return_cnt',
    'u_3month_exp_cnt', 'u_3month_click_cnt', 'u_3month_share_cnt', 'u_3month_return_cnt',
]
# User-side columns treated as dense (numeric) features.
# ``get_user_features`` passes this list as ``dense_conf``, so each value is
# parsed with ``float()`` and stored under the column name itself.
user_dense_conf = [
    # One row per time window.
    'u_ctr_1day', 'u_str_1day', 'u_rov_1day', 'u_ros_1day',
    'u_ctr_3day', 'u_str_3day', 'u_rov_3day', 'u_ros_3day',
    'u_ctr_7day', 'u_str_7day', 'u_rov_7day', 'u_ros_7day',
    'u_ctr_3month', 'u_str_3month', 'u_rov_3month', 'u_ros_3month',
]
def format_x(x):
    """Return *x* as a token that is safe to embed in a feature string.

    ``None`` becomes the empty string; any other value is passed through
    ``str()``. All space characters are then removed and every ``:`` is
    replaced by ``_``.
    """
    text = '' if x is None else str(x)
    without_spaces = text.replace(' ', '')
    return without_spaces.replace(':', '_')
def sparse_fea_2_feature(v, k):
    """Encode one sparse (categorical) column value as a feature pair.

    Args:
        v: Raw column value; normalised with ``format_x``.
        k: Column name; normalised with ``format_x``.

    Returns:
        ``('<name>#<value>', 1.0)`` when the normalised value is non-empty,
        otherwise ``(None, None)`` so the caller can discard the entry.
    """
    name_token = format_x(k)
    value_token = format_x(v)
    if not value_token:
        # A missing or blank value yields no feature.
        return (None, None)
    return (name_token + '#' + value_token, 1.0)
def dense_fea_2_feature(v, k):
    """Encode one dense (numeric) column value as a feature pair.

    Args:
        v: Raw column value; normalised with ``format_x`` and then parsed
            with ``float()``.
        k: Column name; normalised with ``format_x``.

    Returns:
        ``(name, value)`` with ``value`` a float, or ``(None, None)`` when
        the value is missing (``None``, blank, or NaN) so the caller can
        discard the entry.

    Raises:
        ValueError: If the normalised value is non-empty but is not a valid
            float literal.
    """
    f_k = format_x(k)
    f_v = format_x(v)
    if not f_v:
        return (None, None)
    value = float(f_v)
    # NaN is the only float that compares unequal to itself. A float NaN
    # (e.g. a missing numeric in pandas) formats to 'nan', which passes the
    # emptiness check above; treat it as missing, like None, instead of
    # emitting a NaN feature value.
    if value != value:
        return (None, None)
    return (f_k, value)
def get_features(sparse_conf, dense_conf, row):
    """Collect the sparse and dense features of one row into a single dict.

    Args:
        sparse_conf: Iterable of column names encoded with
            ``sparse_fea_2_feature``.
        dense_conf: Iterable of column names encoded with
            ``dense_fea_2_feature``.
        row: Any object indexable by column name (``row[column]``).

    Returns:
        A dict mapping feature name to value. Entries whose encoder returned
        a ``None`` name are left out. Sparse columns are processed first, so
        a dense feature overwrites a sparse one with the same name.
    """
    features = {}

    for column in sparse_conf:
        name, value = sparse_fea_2_feature(row[column], column)
        if name is not None:
            features[name] = value

    for column in dense_conf:
        name, value = dense_fea_2_feature(row[column], column)
        if name is not None:
            features[name] = value

    return features
def get_item_features(row):
    """Build the item-side feature dict for *row*.

    Delegates to ``get_features`` using the module-level column lists
    ``item_sparse_conf`` and ``item_dense_conf``.
    """
    return get_features(
        sparse_conf=item_sparse_conf,
        dense_conf=item_dense_conf,
        row=row,
    )
def get_user_features(row):
    """Build the user-side feature dict for *row*.

    Delegates to ``get_features`` using the module-level column lists
    ``user_sparse_conf`` and ``user_dense_conf``.
    """
    return get_features(
        sparse_conf=user_sparse_conf,
        dense_conf=user_dense_conf,
        row=row,
    )
-
# Name of the label column.
# NOTE(review): not referenced by the functions visible in this part of the
# file; presumably consumed further down -- confirm.
label_col = 'ui_is_out'

# Additional sparse columns that belong to neither the item nor the user
# list. Commented-out entries are disabled and kept for reference.
sparse_fea_cols = [
    # 'u_id',
    # --- Basic features: context ---
    # 'ctx_day',
    'ctx_apptype',
    'ctx_week',
    'ctx_hour',
    # --- Basic features: user/item cross ---
    # 'ui_is_out',
    # 'playtime',
    # 'ui_root_id',
    # 'ui_share_id',
]

# Additional dense columns; currently none.
dense_fea_cols = []
|