t.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. import numpy as np
  2. import pandas as pd
  3. from client import ODPSClient
# Input CSV of recall results; read by _main(), which expects a `cid` column
# plus the two product columns it subtracts.
recall_result = "/Users/zhao/Desktop/20241124_recall.csv"
# Input CSV read by read_day_recall() and read_day_recall_v2(); they use its
# `cid`, `type`, `score`, `cpm` and `view_rate` columns.
day_oss = "/Users/zhao/Desktop/20241124_day_oss.csv"
# ODPS client constructed at import time.
# NOTE(review): nothing in the code visible here references `odps` - confirm
# it is still needed before keeping this import-time side effect.
odps = ODPSClient.ODPSClient()
  7. def read_day_recall_v2() -> pd.DataFrame:
  8. df = pd.read_csv(day_oss)
  9. df['cpm_rank'] = df.groupby("type")['score'].rank(method='first', ascending=False).astype(int)
  10. df['cpm_rank_2'] = df.groupby("type")['cpm'].rank(method='first', ascending=False).astype(int)
  11. df['view_rank'] = df.groupby("type")['view_rate'].rank(method='first', ascending=False).astype(int)
  12. df['day_recall_v2'] = np.where(
  13. ((df['type'] == '14d') & ((df['cpm_rank'] <= 30) | (df['cpm_rank_2'] <= 20) | (df['view_rank'] <= 30))) |
  14. ((df['type'] == '3d') & ((df['cpm_rank'] <= 50) | (df['cpm_rank_2'] <= 30) | (df['view_rank'] <= 50))) |
  15. ((df['type'] == '1d') & ((df['cpm_rank'] <= 80) | (df['cpm_rank_2'] <= 50) | (df['view_rank'] <= 100))),
  16. True,
  17. False
  18. )
  19. df.to_csv("/Users/zhao/Desktop/3.csv", index=False)
  20. grouped_df = (
  21. df.groupby('cid', as_index=False) # 按 CID 分组
  22. .agg(day_recall_v2=('day_recall_v2', 'any')) # 只要有一个为 True,就为 True
  23. )
  24. return grouped_df
  25. def read_day_recall() -> pd.DataFrame:
  26. df = pd.read_csv(day_oss)
  27. df['cpm_rank'] = df.groupby("type")['score'].rank(method='first', ascending=False).astype(int)
  28. df['view_rank'] = df.groupby("type")['view_rate'].rank(method='first', ascending=False).astype(int)
  29. df['day_recall_v1'] = np.where(
  30. ((df['type'] == '14d') & ((df['cpm_rank'] <= 30) | (df['view_rank'] <= 20))) |
  31. ((df['type'] == '3d') & ((df['cpm_rank'] <= 50) | (df['view_rank'] <= 30))) |
  32. ((df['type'] == '1d') & ((df['cpm_rank'] <= 80) | (df['view_rank'] <= 50))),
  33. True,
  34. False
  35. )
  36. df.to_csv("/Users/zhao/Desktop/2.csv", index=False)
  37. grouped_df = (
  38. df.groupby('cid', as_index=False) # 按 CID 分组
  39. .agg(day_recall_v1=('day_recall_v1', 'any')) # 只要有一个为 True,就为 True
  40. )
  41. return grouped_df
  42. def _main():
  43. day_recall = read_day_recall()
  44. day_recall_v2 = read_day_recall_v2()
  45. recall = pd.read_csv(recall_result)
  46. recall['base_diff'] = recall['产品-17-前五组'] - recall['产品-35-前五组']
  47. recall = (pd.merge(recall, day_recall, on='cid', how='left')
  48. .merge(day_recall_v2, on='cid', how='left'))
  49. recall.to_csv("/Users/zhao/Desktop/1.csv", index=False)
  50. print(recall)
# Run the comparison only when the file is executed as a script, not on import.
if __name__ == '__main__':
    _main()