analyse_account_user_duplicate.py 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
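#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 StrayWarrior <i@straywarrior.com>
#
# Distributed under terms of the MIT license.
"""
Analyse how much the user sets of different accounts overlap.

Loads the data returned by
get_dataframe_from_odps('loghubods', 'tmp_long_articles_users'), collects the
set of union_id values seen for every account_id, and prints a tab-separated
matrix in which the cell (row A, column B) is the share of A's users that also
appear under B.
"""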
import itertools

import pandas as pd

from my_utils import get_odps_instance, get_dataframe_from_odps
  12. pd.set_option('display.max_rows', None)
  13. odps_df = get_dataframe_from_odps('loghubods',
  14. 'tmp_long_articles_users')
  15. account_user_count = odps_df.groupby('account_id') \
  16. .agg(count=odps_df.union_id.count()) \
  17. .sort('count', ascending=False) \
  18. .to_pandas()
  19. account_ids = account_user_count['account_id'].tolist()
  20. # df = odps_df.to_pandas()
  21. account_user_map = {}
  22. for gh_id in account_ids:
  23. account_user_map[gh_id] = set(
  24. odps_df.filter(odps_df.account_id == gh_id)['union_id'].to_pandas())
  25. print(f'{gh_id}: {len(account_user_map[gh_id])}')
  26. overlap_map = {k: {} for k in account_ids}
  27. for i in range(len(account_ids)):
  28. for j in range(i + 1, len(account_ids)):
  29. gh1 = account_ids[i]
  30. gh2 = account_ids[j]
  31. users1 = account_user_map[gh1]
  32. users2 = account_user_map[gh2]
  33. print((gh1, gh2))
  34. inter_set = users1 & users2
  35. ratio1 = len(inter_set) / len(users1)
  36. ratio2 = len(inter_set) / len(users2)
  37. overlap_map[gh1][gh2] = ratio1
  38. overlap_map[gh2][gh1] = ratio2
  39. print('{}\t{}'.format('gh_id', '\t'.join(account_ids)))
  40. for i in range(len(account_ids)):
  41. gh1 = account_ids[i]
  42. data = []
  43. for j in range(len(account_ids)):
  44. data.append(overlap_map[gh1].get(account_ids[j], 0))
  45. data_str = '\t'.join(['{:.1%}'.format(x * 100) for x in data])
  46. print('{}\t{}'.format(gh1, data_str))