import pickle
import warnings

import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split


# 1. Load data
def load_data(file_path):
    """Read the CSV file at ``file_path`` into a DataFrame.

    Cells that hold a backslash followed by a capital N are parsed as
    missing values.
    """
    return pd.read_csv(file_path, na_values='\\N')


# 2. Preprocess data
def preprocess_data(df, features, target, exposure_col, top_k):
    """Keep the ``top_k`` rows with the largest ``exposure_col`` values.

    Args:
        df: Input DataFrame.
        features: Names of the feature columns.
        target: Name of the target column.
        exposure_col: Column used to rank the rows.
        top_k: Number of rows to keep.

    Returns:
        A tuple ``(X, y, exposure_threshold, df_topk)``: the feature
        columns and target column of the selected rows, the smallest
        exposure value among the selected rows, and the selected rows
        themselves.
    """
    # Sort by exposure (descending) and keep the first top_k rows.
    df_topk = df.sort_values(by=exposure_col, ascending=False).head(top_k)
    X = df_topk[features]
    y = df_topk[target]
    # Exposure threshold that corresponds to the top-k selection.
    exposure_threshold = df_topk[exposure_col].min()
    return X, y, exposure_threshold, df_topk


# 3. Compute correlations
def calculate_correlations(df, features, target):
    """Return the absolute correlation of each feature with the target.

    For every feature, rows where either the target or the feature is
    missing are dropped first. A feature with no usable rows, or with an
    undefined (NaN) correlation, gets 0.

    Returns:
        A float Series indexed by feature name, sorted in descending
        order of absolute correlation.
    """
    correlations = {}
    for feature in features:
        # Drop rows where either the target or the feature is missing.
        valid_data = df[[target, feature]].dropna()
        if valid_data.empty:
            corr = 0.0
        else:
            corr = valid_data[target].corr(valid_data[feature])
            if np.isnan(corr):
                corr = 0.0
        correlations[feature] = corr
    # An explicit dtype keeps the result numeric even when `features` is empty.
    corr_series = pd.Series(correlations, dtype=float)
    return corr_series.abs().sort_values(ascending=False)


# 4. Dynamic weighted sum
def dynamic_weighted_sum(features, weights):
    """Return the weighted average of one row over its non-NaN features.

    The weights of the available features are rescaled so that they sum
    to 1 before being applied.

    Args:
        features: 1-D array-like of feature values; may contain NaN.
        weights: 1-D array-like with one weight per feature.

    Returns:
        The weighted average, or NaN when every feature is NaN.
    """
    features = np.asarray(features, dtype=float)
    weights = np.asarray(weights, dtype=float)
    available = ~np.isnan(features)
    if not available.any():
        return np.nan
    active_weights = weights[available]
    normalized_weights = active_weights / np.sum(active_weights)
    return np.sum(features[available] * normalized_weights)


def _weighted_predictions(X, weights):
    """Apply the ``dynamic_weighted_sum`` rule to every row of ``X`` at once.

    ``X`` is a 2-D array-like (rows x features). Rows without any non-NaN
    feature yield NaN. Results agree with the row-by-row function up to
    floating-point rounding.
    """
    values = np.asarray(X, dtype=float)
    weights = np.asarray(weights, dtype=float)
    present = ~np.isnan(values)
    # Per-row sum of the weights that belong to available features.
    weight_totals = np.where(present, weights, 0.0).sum(axis=1)
    # Missing features are replaced by 0 so they add nothing to the sum.
    weighted_sums = np.where(present, values, 0.0) @ weights
    # A zero weight total is tolerated here; it gives inf/NaN, not an error.
    with np.errstate(divide='ignore', invalid='ignore'):
        predictions = weighted_sums / weight_totals
    predictions[~present.any(axis=1)] = np.nan
    return predictions


# 5. Loss function
def mse_loss(y_true, y_pred):
    """Return the mean squared error over positions where both inputs are non-NaN.

    Inputs are compared position by position (any pandas index is
    ignored). Returns NaN when no position has both values.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    valid = ~(np.isnan(y_true) | np.isnan(y_pred))
    if not valid.any():
        # Explicit NaN instead of the mean of an empty slice.
        return np.nan
    return np.mean((y_true[valid] - y_pred[valid]) ** 2)


# 6. Objective function
def objective(weights, X, y_true):
    """Return the MSE between ``y_true`` and the weighted predictions for ``X``.

    Args:
        weights: Candidate weight vector, one entry per feature.
        X: Feature matrix (DataFrame or 2-D array), rows x features.
        y_true: Target values, one per row of ``X``.
    """
    return mse_loss(y_true, _weighted_predictions(X, weights))


# 7. Search for the best weights
def find_best_weights(X, y, initial_weights):
    """Minimise ``objective`` with Nelder-Mead, starting from ``initial_weights``.

    Returns:
        The weight vector found by the optimizer. A ``RuntimeWarning`` is
        emitted when the optimizer does not report success; the last
        iterate is still returned.
    """
    # Convert once so the objective does not redo it on every evaluation.
    X_values = np.asarray(X, dtype=float)
    y_values = np.asarray(y, dtype=float)
    start = np.asarray(initial_weights, dtype=float)
    result = minimize(objective, start, args=(X_values, y_values),
                      method='Nelder-Mead')
    if not result.success:
        warnings.warn(
            f"Nelder-Mead did not converge: {result.message}",
            RuntimeWarning,
            stacklevel=2,
        )
    return result.x
# 8. Evaluate the model
def evaluate_model(X, y, weights):
    """Score a weight vector on ``(X, y)``.

    Args:
        X: Feature matrix (DataFrame or 2-D array), rows x features.
        y: Target values, one per row of ``X``.
        weights: One weight per feature.

    Returns:
        A tuple ``(r2, mse)`` computed over the rows where both the target
        and the prediction are non-NaN. Both values are NaN when no such
        row exists.
    """
    # Plain arrays keep the masking positional, whatever index `y` carries.
    y_true = np.asarray(y, dtype=float)
    rows = np.asarray(X, dtype=float)
    y_pred = np.array([dynamic_weighted_sum(row, weights) for row in rows])
    valid = ~(np.isnan(y_true) | np.isnan(y_pred))
    if not valid.any():
        # Nothing to score; r2_score would raise on empty input.
        return np.nan, np.nan
    r2 = r2_score(y_true[valid], y_pred[valid])
    mse = mse_loss(y_true, y_pred)
    return r2, mse


# 9. Save the model
def save_model(weights, features, exposure_threshold, top_k, file_path):
    """Pickle the model parameters to ``file_path``.

    The file holds a dict with the keys ``'weights'``, ``'features'``,
    ``'exposure_threshold'`` and ``'top_k'``.
    """
    model = {
        'weights': weights,
        'features': features,
        'exposure_threshold': exposure_threshold,
        'top_k': top_k,
    }
    with open(file_path, 'wb') as f:
        pickle.dump(model, f)


# 10. Load the model
def load_model(file_path):
    """Load a model written by ``save_model``.

    Returns:
        A tuple ``(weights, features, exposure_threshold, top_k)``.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files
    # that come from a trusted source.
    with open(file_path, 'rb') as f:
        model = pickle.load(f)
    return (
        model['weights'],
        model['features'],
        model['exposure_threshold'],
        model['top_k'],
    )


# 12. Main function
def main():
    """Fit, evaluate, save and reload the weighted model on the training CSV."""
    # Load data
    df = load_data('train_20240921.csv')

    # Feature columns, target column and exposure column
    features = ['h1_ago_vov', 'h2_ago_vov', 'h3_ago_vov', 'h24_ago_vov',
                'h48_ago_vov', 'd1_ago_vov', 'd2_ago_vov']
    target = 'cur_hour_vov'
    exposure_col = 'h1_ago_view'  # make sure this column exists in your data
    top_k = 1000  # number of top-k data points to use
    model_path = f"top{top_k}_linear_weighted_model.pkl"

    # Preprocess data
    X, y, exposure_threshold, df_topk = preprocess_data(
        df, features, target, exposure_col, top_k)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Compute correlations
    # NOTE(review): computed on all top-k rows, including those that end up
    # in the test split; they only seed the optimizer's starting point.
    correlations = calculate_correlations(df_topk, features, target)
    print("Feature correlations:")
    print(correlations)

    # Use the correlations, reordered to match `features`, as initial weights
    initial_weights = correlations[features].values

    # Search for the best weights
    best_weights = find_best_weights(X_train, y_train, initial_weights)

    # Evaluate the model
    r2_train, mse_train = evaluate_model(X_train, y_train, best_weights)
    r2_test, mse_test = evaluate_model(X_test, y_test, best_weights)

    print(f"\nTrain R² Score: {r2_train:.4f}, MSE: {mse_train:.4f}")
    print(f"Test R² Score: {r2_test:.4f}, MSE: {mse_test:.4f}")

    # Print feature importance
    print("\nFeature importance:")
    for feature, weight in zip(features, best_weights):
        print(f"{feature}: {weight:.4f}")

    # Save the model
    save_model(pd.Series(best_weights, index=features), features,
               exposure_threshold, top_k, model_path)

    # Check that the saved model loads back
    loaded_weights, loaded_features, loaded_threshold, topk = load_model(model_path)
    print("\nLoaded model weights:")
    for feature, weight in loaded_weights.items():
        print(f"{feature}: {weight:.4f}")
    print(f"Exposure threshold: {loaded_threshold}")
    print(f"TopK: {topk}")


if __name__ == "__main__":
    main()