123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657 |
- #! /usr/bin/env python
- # -*- coding: utf-8 -*-
- # vim:fenc=utf-8
- #
- # Copyright © 2024 StrayWarrior <i@straywarrior.com>
- #
- # Distributed under terms of the MIT license.
- """
- """
- from my_utils import get_odps_instance, get_dataframe_from_odps
- import pandas as pd
# Lift the pandas row limit so printed frames are shown in full.
pd.set_option('display.max_rows', None)

# Load the source table through the project helper.
# NOTE(review): the first argument is presumably the ODPS project name and
# the second the table name -- confirm against my_utils.
odps_df = get_dataframe_from_odps('loghubods', 'tmp_long_articles_users')

# Count union_id entries per account_id, order the accounts from the largest
# count to the smallest, then pull the result down via to_pandas().
grouped_by_account = odps_df.groupby('account_id')
users_per_account = grouped_by_account.agg(count=odps_df.union_id.count())
ranked_accounts = users_per_account.sort('count', ascending=False)
account_user_count = ranked_accounts.to_pandas()

# Account ids in the ranked order above; every later step iterates this list.
account_id_column = account_user_count['account_id']
account_ids = account_id_column.tolist()
# Build, for every account id, the set of union_id values found in its rows.
# Each account is fetched with its own filter on the ODPS frame, and its id
# and set size are printed as soon as the set is ready.
account_user_map = {}
for gh_id in account_ids:
    account_rows = odps_df.filter(odps_df.account_id == gh_id)
    union_ids = set(account_rows['union_id'].to_pandas())
    account_user_map[gh_id] = union_ids
    print(f'{gh_id}: {len(union_ids)}')
# overlap_map[a][b] is the fraction of a's users that also appear under b.
# Every unordered pair of accounts is visited once and both directions are
# stored, because the two ratios use different denominators.
overlap_map = {gh_id: {} for gh_id in account_ids}
for idx, gh1 in enumerate(account_ids):
    users1 = account_user_map[gh1]
    # Only pair gh1 with accounts that come after it in the list.
    for gh2 in account_ids[idx + 1:]:
        users2 = account_user_map[gh2]
        print((gh1, gh2))
        shared_count = len(users1 & users2)
        ratio_of_gh1 = shared_count / len(users1)
        ratio_of_gh2 = shared_count / len(users2)
        overlap_map[gh1][gh2] = ratio_of_gh1
        overlap_map[gh2][gh1] = ratio_of_gh2
# Print the overlap matrix as tab-separated text.
# Header row: a 'gh_id' label cell followed by one column per account id.
print('{}\t{}'.format('gh_id', '\t'.join(account_ids)))

# One row per account; each cell is the ratio stored in overlap_map for
# (row account, column account), rendered as a percentage.
for gh1 in account_ids:
    row_ratios = overlap_map[gh1]
    # The '%' presentation type multiplies by 100 itself, so the raw ratio is
    # passed as-is (pre-multiplying would print 0.5 as "5000.0%").
    # Pairs without a stored ratio -- including an account paired with
    # itself -- fall back to 0 and print as "0.0%".
    cells = ['{:.1%}'.format(row_ratios.get(gh2, 0)) for gh2 in account_ids]
    print('{}\t{}'.format(gh1, '\t'.join(cells)))
|