import os
import time

import pandas as pd

predict_data_dir = './data/predict_data'
user_filename = 'user_feature.csv'
video_filename = 'video_feature.csv'

# Columns copied from each video row onto every user row when the two
# feature tables are combined.
video_features = [
    'videoid',
    'video_preview_count_uv_30day',
    'video_preview_count_pv_30day',
    'video_view_count_uv_30day',
    'video_view_count_pv_30day',
    'video_play_count_uv_30day',
    'video_play_count_pv_30day',
    'video_share_count_uv_30day',
    'video_share_count_pv_30day',
    'video_return_count_30day',
    'video_ctr_uv_30day',
    'video_ctr_pv_30day',
    'video_share_rate_uv_30day',
    'video_share_rate_pv_30day',
    'video_return_rate_30day',
]


def read_csv_data(filepath):
    """Read a comma-separated file into a single DataFrame, chunk by chunk.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A DataFrame with a fresh 0..n-1 index holding every row of the file,
        or ``None`` (after printing a message) when ``filepath`` does not
        exist.
    """
    if not os.path.exists(filepath):
        print("Don't have this file!")
        return None

    chunk_size = 1000000
    # Iterating the reader yields DataFrames of at most ``chunk_size`` rows.
    reader = pd.read_csv(filepath, sep=',', engine='python', chunksize=chunk_size)
    return pd.concat(list(reader), ignore_index=True)


def _load_feature_table(filepath, id_column):
    """Load a feature CSV and drop rows whose ``id_column`` equals '-1'."""
    df = read_csv_data(filepath=filepath)
    if df is None:
        # Fail with a clear error instead of a TypeError on ``None[...]``.
        raise FileNotFoundError(f"Feature file not found: {filepath}")
    # NOTE(review): this is a string comparison; if the id column is parsed
    # as a numeric dtype no row will match '-1' -- confirm ids are strings.
    return df[df[id_column] != '-1']


def _append_batch(merge_df_list, out_dir):
    """Concatenate a batch and append it to both per-ad_status CSV files.

    Returns the concatenated (pre-ad_status) DataFrame so the caller can
    report its shape.
    """
    merge_df = pd.concat(merge_df_list, ignore_index=True)
    for ad_status in (0, 1):
        res_df = merge_df.assign(ad_status=ad_status)
        out_path = f"{out_dir}/predict_data_{ad_status}.csv"
        # Only the first write emits the header; later appends must not,
        # otherwise header lines end up in the middle of the data.
        res_df.to_csv(
            out_path,
            index=False,
            mode='a',
            header=not os.path.exists(out_path),
        )
    return merge_df


def main(data_dir=predict_data_dir, batch_size=100):
    """Build the prediction input files from the user and video features.

    Every remaining user row is paired with every remaining video row, and
    the result is written twice: once with ``ad_status`` 0 and once with
    ``ad_status`` 1, to ``predict_data_0.csv`` / ``predict_data_1.csv``
    inside ``data_dir``.

    Args:
        data_dir: Directory holding the input feature CSVs; the output
            files are written to the same directory.
        batch_size: Number of video rows combined in memory before the
            batch is appended to the output files.

    Raises:
        FileNotFoundError: If either feature file is missing.
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    st_time = time.time()

    # 1. Load the user feature data.
    user_df = _load_feature_table(f"{data_dir}/{user_filename}", 'mid')
    print(f"user_df shape: {user_df.shape}")

    # 2. Load the video feature data.
    video_df = _load_feature_table(f"{data_dir}/{video_filename}", 'videoid')
    print(f"video_df shape: {video_df.shape}")

    # Start from clean output files because batches are appended below.
    os.makedirs(data_dir, exist_ok=True)
    for ad_status in (0, 1):
        try:
            os.remove(f"{data_dir}/predict_data_{ad_status}.csv")
        except FileNotFoundError:
            # Nothing left over from a previous run.
            pass

    # 3. Combine user features with video features, one video row at a time.
    merge_df_list = []
    # Count positions rather than index labels: labels have gaps after the
    # '-1' rows were filtered out, which would make batch sizes unpredictable.
    for ind, (_, row) in enumerate(video_df.iterrows()):
        merge_df_temp = user_df.copy()
        for feature in video_features:
            merge_df_temp[feature] = row[feature]
        merge_df_list.append(merge_df_temp)

        if len(merge_df_list) >= batch_size:
            # 4. Add the ad_status feature and append the batch to the CSVs.
            merge_df = _append_batch(merge_df_list, data_dir)
            print(f"ind: {ind}, merge_df shape: {merge_df.shape}")
            merge_df_list = []

    # Flush whatever is left after the last full batch.
    if merge_df_list:
        merge_df = _append_batch(merge_df_list, data_dir)
        print(f"merge_df shape: {merge_df.shape}")

    print(f"{time.time() - st_time}s")


if __name__ == '__main__':
    main()