1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
from typing import Optional

import numpy as np
import pandas as pd

from client import ODPSClient
# CSV loaded by `_main` and joined with the day-level flags on `cid`.
recall_result = "/Users/zhao/Desktop/20241124_recall.csv"

# CSV loaded by `read_day_recall` / `read_day_recall_v2`; those helpers read
# its `cid`, `type`, `score`, `cpm` and `view_rate` columns.
day_oss = "/Users/zhao/Desktop/20241124_day_oss.csv"

# Client instance created at import time.
# NOTE(review): no function visible in this file references `odps` — confirm
# it is still needed before keeping the import-time side effect.
odps = ODPSClient.ODPSClient()
def read_day_recall_v2(
    source_path: Optional[str] = None,
    dump_path: Optional[str] = "/Users/zhao/Desktop/3.csv",
) -> pd.DataFrame:
    """Flag, per ``cid``, whether the v2 day-level recall rule selects it.

    Rows are ranked inside each ``type`` group (descending, ties broken by
    row order) on three columns: ``score``, ``cpm`` and ``view_rate``.  A row
    is recalled when its ``type`` is one of ``14d`` / ``3d`` / ``1d`` and at
    least one of its three ranks is within that type's cutoff.  Rows of any
    other ``type`` are never recalled.

    Args:
        source_path: CSV to read.  ``None`` (the default) falls back to the
            module-level ``day_oss`` path.
        dump_path: Where to write the row-level frame (ranks plus flag) for
            inspection.  Pass ``None`` to skip the dump.

    Returns:
        A frame with one row per ``cid`` and a boolean ``day_recall_v2``
        column that is True when any row of that ``cid`` was recalled.
    """
    # type -> (score-rank cutoff, cpm-rank cutoff, view_rate-rank cutoff)
    cutoffs = {
        '14d': (30, 20, 30),
        '3d': (50, 30, 50),
        '1d': (80, 50, 100),
    }

    df = pd.read_csv(day_oss if source_path is None else source_path)

    # NOTE: `cpm_rank` is derived from `score`; the rank of the raw `cpm`
    # column is stored as `cpm_rank_2`.
    # NOTE: `astype(int)` raises if a rank is NaN, i.e. when the ranked
    # column (or `type`) has missing values.
    df['cpm_rank'] = df.groupby("type")['score'].rank(method='first', ascending=False).astype(int)
    df['cpm_rank_2'] = df.groupby("type")['cpm'].rank(method='first', ascending=False).astype(int)
    df['view_rank'] = df.groupby("type")['view_rate'].rank(method='first', ascending=False).astype(int)

    # Build the boolean flag directly instead of wrapping it in np.where.
    recalled = pd.Series(False, index=df.index)
    for window, (score_top, cpm_top, view_top) in cutoffs.items():
        within_cutoff = (
            (df['cpm_rank'] <= score_top)
            | (df['cpm_rank_2'] <= cpm_top)
            | (df['view_rank'] <= view_top)
        )
        recalled |= (df['type'] == window) & within_cutoff
    df['day_recall_v2'] = recalled

    if dump_path is not None:
        df.to_csv(dump_path, index=False)

    # A cid is recalled as soon as any one of its rows is recalled.
    grouped_df = (
        df.groupby('cid', as_index=False)
        .agg(day_recall_v2=('day_recall_v2', 'any'))
    )
    return grouped_df
def read_day_recall(
    source_path: Optional[str] = None,
    dump_path: Optional[str] = "/Users/zhao/Desktop/2.csv",
) -> pd.DataFrame:
    """Flag, per ``cid``, whether the v1 day-level recall rule selects it.

    Rows are ranked inside each ``type`` group (descending, ties broken by
    row order) on ``score`` and on ``view_rate``.  A row is recalled when its
    ``type`` is one of ``14d`` / ``3d`` / ``1d`` and either rank is within
    that type's cutoff.  Rows of any other ``type`` are never recalled.

    Args:
        source_path: CSV to read.  ``None`` (the default) falls back to the
            module-level ``day_oss`` path.
        dump_path: Where to write the row-level frame (ranks plus flag) for
            inspection.  Pass ``None`` to skip the dump.

    Returns:
        A frame with one row per ``cid`` and a boolean ``day_recall_v1``
        column that is True when any row of that ``cid`` was recalled.
    """
    # type -> (score-rank cutoff, view_rate-rank cutoff)
    cutoffs = {
        '14d': (30, 20),
        '3d': (50, 30),
        '1d': (80, 50),
    }

    df = pd.read_csv(day_oss if source_path is None else source_path)

    # NOTE: `cpm_rank` is derived from the `score` column.
    # NOTE: `astype(int)` raises if a rank is NaN, i.e. when the ranked
    # column (or `type`) has missing values.
    df['cpm_rank'] = df.groupby("type")['score'].rank(method='first', ascending=False).astype(int)
    df['view_rank'] = df.groupby("type")['view_rate'].rank(method='first', ascending=False).astype(int)

    # Build the boolean flag directly instead of wrapping it in np.where.
    recalled = pd.Series(False, index=df.index)
    for window, (score_top, view_top) in cutoffs.items():
        within_cutoff = (df['cpm_rank'] <= score_top) | (df['view_rank'] <= view_top)
        recalled |= (df['type'] == window) & within_cutoff
    df['day_recall_v1'] = recalled

    if dump_path is not None:
        df.to_csv(dump_path, index=False)

    # A cid is recalled as soon as any one of its rows is recalled.
    grouped_df = (
        df.groupby('cid', as_index=False)
        .agg(day_recall_v1=('day_recall_v1', 'any'))
    )
    return grouped_df
def _main():
    """Join both day-level recall flags onto the recall CSV and export it.

    Computes the v1 and v2 per-``cid`` flags, loads the CSV at
    ``recall_result``, adds a ``base_diff`` column (difference of two
    existing columns), left-joins both flag frames on ``cid``, writes the
    result to a CSV and prints it.
    """
    # Keep this call order: each helper also writes its own debug CSV.
    flags_v1 = read_day_recall()
    flags_v2 = read_day_recall_v2()

    table = pd.read_csv(recall_result)
    table['base_diff'] = table['产品-17-前五组'] - table['产品-35-前五组']

    # Left joins keep every row of the recall table; cids absent from the
    # day-level data end up with missing flags.
    with_v1 = table.merge(flags_v1, on='cid', how='left')
    recall = with_v1.merge(flags_v2, on='cid', how='left')

    recall.to_csv("/Users/zhao/Desktop/1.csv", index=False)
    print(recall)
# Run the comparison only when executed as a script, not on import.
if __name__ == "__main__":
    _main()
|