12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
- import os
- import time
- import pandas as pd
# Directory that holds both the input feature CSVs and the generated
# prediction CSVs.
predict_data_dir = './data/predict_data'
# File names (relative to predict_data_dir) of the user and video feature tables.
user_filename = 'user_feature.csv'
video_filename = 'video_feature.csv'
def read_csv_data(filepath, chunk_size=1000000):
    """Load a comma-separated file into a single DataFrame, reading it in chunks.

    Args:
        filepath: Path of the CSV file to read.
        chunk_size: Maximum number of rows parsed per chunk.

    Returns:
        A DataFrame with a fresh 0..n-1 index containing every row of the
        file, or ``None`` (after printing a notice) when ``filepath`` does
        not exist.
    """
    if not os.path.exists(filepath):
        print("Don't have this file!")
        return None

    reader = pd.read_csv(filepath, sep=',', engine='python', chunksize=chunk_size)
    try:
        # Iterating the reader yields one DataFrame per chunk until the file
        # is exhausted.
        chunks = list(reader)
    finally:
        # Release the underlying file handle even if parsing fails midway.
        reader.close()
    return pd.concat(chunks, ignore_index=True)
# Number of per-video frames buffered in memory before they are flushed to disk.
_BATCH_SIZE = 100


def _cross_with_video(user_df, video_row, video_features):
    """Return a copy of user_df with the video's feature values set on every row."""
    return user_df.assign(
        **{feature: video_row[feature] for feature in video_features}
    )


def _flush_batch(merge_df_list, out_dir, log_prefix=""):
    """Concatenate the buffered frames and append them to both prediction CSVs.

    One file is written per ad_status value (0 and 1); each receives the same
    rows plus an ``ad_status`` column holding that value.
    """
    merge_df = pd.concat(merge_df_list, ignore_index=True)
    print(f"{log_prefix}merge_df shape: {merge_df.shape}")
    os.makedirs(out_dir, exist_ok=True)
    for ad_status in (0, 1):
        out_path = f"{out_dir}/predict_data_{ad_status}.csv"
        # Emit the header only for a new/empty file; otherwise every appended
        # batch would inject a header line into the middle of the data.
        write_header = not os.path.exists(out_path) or os.path.getsize(out_path) == 0
        merge_df.assign(ad_status=ad_status).to_csv(
            out_path, index=False, mode='a', header=write_header
        )


if __name__ == '__main__':
    st_time = time.time()

    # 1. Load the user feature data.
    user_filepath = f"{predict_data_dir}/{user_filename}"
    user_df = read_csv_data(filepath=user_filepath)
    if user_df is None:
        raise SystemExit(f"Missing input file: {user_filepath}")
    # NOTE(review): this comparison only drops rows if 'mid' was parsed as
    # strings - confirm the column dtype.
    user_df = user_df[user_df['mid'] != '-1']
    print(f"user_df shape: {user_df.shape}")

    # 2. Load the video feature data.
    video_filepath = f"{predict_data_dir}/{video_filename}"
    video_df = read_csv_data(filepath=video_filepath)
    if video_df is None:
        raise SystemExit(f"Missing input file: {video_filepath}")
    # The comparison must be applied to the column, not to the column name.
    video_df = video_df[video_df['videoid'] != '-1']
    print(f"video_df shape: {video_df.shape}")

    # 3. Join user features with video features (every user x every video).
    video_features = [
        'videoid',
        'video_preview_count_uv_30day',
        'video_preview_count_pv_30day',
        'video_view_count_uv_30day',
        'video_view_count_pv_30day',
        'video_play_count_uv_30day',
        'video_play_count_pv_30day',
        'video_share_count_uv_30day',
        'video_share_count_pv_30day',
        'video_return_count_30day',
        'video_ctr_uv_30day',
        'video_ctr_pv_30day',
        'video_share_rate_uv_30day',
        'video_share_rate_pv_30day',
        'video_return_rate_30day',
    ]

    # Output is written in append mode, so remove results of a previous run.
    for ad_status in (0, 1):
        stale_path = f"{predict_data_dir}/predict_data_{ad_status}.csv"
        try:
            os.remove(stale_path)
        except FileNotFoundError:
            pass
        except OSError as err:
            print(f"Could not remove {stale_path}: {err}")

    merge_df_list = []
    for ind, row in video_df.iterrows():
        merge_df_list.append(_cross_with_video(user_df, row, video_features))
        # Count buffered frames rather than testing the index label: after
        # filtering, the labels have gaps and may rarely be multiples of 100.
        if len(merge_df_list) >= _BATCH_SIZE:
            # 4. Add the ad feature ad_status and write to CSV.
            _flush_batch(merge_df_list, predict_data_dir, log_prefix=f"ind: {ind}, ")
            merge_df_list = []

    # Flush whatever is left over from the last, partial batch.
    if merge_df_list:
        _flush_batch(merge_df_list, predict_data_dir)

    print(f"{time.time() - st_time}s")
|