# -*- coding: utf-8 -*-
"""Feature-column configuration and helpers that turn a row into a feature dict."""
import math
import sys

import pandas as pd
from tqdm import tqdm
from collections import defaultdict

types = defaultdict(str)

item_sparse_conf = [
    # Basic features: video
    'i_id',
    'i_up_id',
    # 'i_tag',
    # 'i_title',
    'i_title_len',
    'i_play_len',
    'i_days_since_upload',
    # Statistical features: video
    'i_1day_exp_cnt',
    'i_1day_click_cnt',
    'i_1day_share_cnt',
    'i_1day_return_cnt',
    'i_3day_exp_cnt',
    'i_3day_click_cnt',
    'i_3day_share_cnt',
    'i_3day_return_cnt',
    'i_7day_exp_cnt',
    'i_7day_click_cnt',
    'i_7day_share_cnt',
    'i_7day_return_cnt',
    'i_3month_exp_cnt',
    'i_3month_click_cnt',
    'i_3month_share_cnt',
    'i_3month_return_cnt',
]

item_dense_conf = [
    'i_ctr_1day',
    'i_str_1day',
    'i_rov_1day',
    'i_ros_1day',
    'i_ctr_3day',
    'i_str_3day',
    'i_rov_3day',
    'i_ros_3day',
    'i_ctr_7day',
    'i_str_7day',
    'i_rov_7day',
    'i_ros_7day',
    'i_ctr_3month',
    'i_str_3month',
    'i_rov_3month',
    'i_ros_3month',
]

user_sparse_conf = [
    'u_brand',
    'u_device',
    'u_system',
    'u_system_ver',
    'ctx_region',
    'ctx_city',
    # Statistical features: user
    'u_cycle_bucket_7days',
    'u_cycle_bucket_30days',
    'u_share_bucket_30days',
    'u_1day_exp_cnt',
    'u_1day_click_cnt',
    'u_1day_share_cnt',
    'u_1day_return_cnt',
    'u_3day_exp_cnt',
    'u_3day_click_cnt',
    'u_3day_share_cnt',
    'u_3day_return_cnt',
    'u_7day_exp_cnt',
    'u_7day_click_cnt',
    'u_7day_share_cnt',
    'u_7day_return_cnt',
    'u_3month_exp_cnt',
    'u_3month_click_cnt',
    'u_3month_share_cnt',
    'u_3month_return_cnt',
]

user_dense_conf = [
    'u_ctr_1day',
    'u_str_1day',
    'u_rov_1day',
    'u_ros_1day',
    'u_ctr_3day',
    'u_str_3day',
    'u_rov_3day',
    'u_ros_3day',
    'u_ctr_7day',
    'u_str_7day',
    'u_rov_7day',
    'u_ros_7day',
    'u_ctr_3month',
    'u_str_3month',
    'u_rov_3month',
    'u_ros_3month',
]


def format_x(x):
    """Return ``x`` as a string with spaces removed and ':' replaced by '_'.

    ``None`` and float NaN are both treated as missing and yield ``''``.
    Without the NaN check a missing value would be rendered as the literal
    text ``'nan'`` and leak into the features as if it were real data.
    """
    if x is None or (isinstance(x, float) and math.isnan(x)):
        return ''
    # NOTE(review): ' ' and ':' are presumably delimiters in the downstream
    # serialized feature format -- confirm against the writer.
    return str(x).replace(' ', '').replace(':', '_')


def sparse_fea_2_feature(v, k):
    """Encode a sparse (categorical) value as a ``('<key>#<value>', 1.0)`` pair.

    Args:
        v: Raw value of the column.
        k: Column name.

    Returns:
        ``(name, 1.0)`` where ``name`` is the formatted key and value joined
        by '#', or ``(None, None)`` when the formatted value is empty.
    """
    f_k = format_x(k)
    f_v = format_x(v)
    if not f_v:
        return (None, None)
    return ('#'.join([f_k, f_v]), 1.0)


def dense_fea_2_feature(v, k):
    """Encode a dense (numeric) value as a ``(<key>, float(<value>))`` pair.

    Args:
        v: Raw value of the column.
        k: Column name.

    Returns:
        ``(name, value)`` with ``value`` converted to ``float``, or
        ``(None, None)`` when the formatted value is empty.

    Raises:
        ValueError: If the formatted value is non-empty but not parseable
            as a float.
    """
    f_k = format_x(k)
    f_v = format_x(v)
    if not f_v:
        return (None, None)
    return (f_k, float(f_v))


def get_features(sparse_conf, dense_conf, row):
    """Build a ``{feature_name: value}`` dict from one row.

    Args:
        sparse_conf: Column names to encode with ``sparse_fea_2_feature``.
        dense_conf: Column names to encode with ``dense_fea_2_feature``.
        row: Object indexable by column name (``row[k]``).

    Returns:
        Dict holding the sparse features followed by the dense features.
        Columns whose value is missing are left out. If a dense feature
        name collides with a sparse one, the dense value wins.
    """
    features = {}
    for encode, columns in (
        (sparse_fea_2_feature, sparse_conf),
        (dense_fea_2_feature, dense_conf),
    ):
        for k in columns:
            name, value = encode(row[k], k)
            # Encoders signal "missing" with a (None, None) pair.
            if name is not None:
                features[name] = value
    return features


def get_item_features(row):
    """Return the feature dict for the item columns of ``row``."""
    return get_features(item_sparse_conf, item_dense_conf, row)


def get_user_features(row):
    """Return the feature dict for the user columns of ``row``."""
    return get_features(user_sparse_conf, user_dense_conf, row)


label_col = 'ui_is_out'

sparse_fea_cols = [
    # 'u_id',
    # Basic features: context
    # 'ctx_day',
    'ctx_apptype',
    'ctx_week',
    'ctx_hour',
    # Basic features: cross
    # 'ui_is_out',
    # 'playtime',
    # 'ui_root_id',
    # 'ui_share_id',
]

dense_fea_cols = [
]