#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 StrayWarrior
#
# Distributed under terms of the MIT license.

"""
Print a pairwise user-overlap matrix between accounts.

The script loads a dataframe through
``get_dataframe_from_odps('loghubods', 'tmp_long_articles_users')``, builds
the set of ``union_id`` values found for every ``account_id`` and prints a
tab-separated matrix.  The cell at (row A, column B) is the share of A's
users that also appear under B, formatted as a percentage.
"""

from itertools import combinations

import pandas as pd

from my_utils import get_odps_instance, get_dataframe_from_odps


def _overlap_share(common, users):
    """Return ``len(common) / len(users)``, or 0.0 when ``users`` is empty."""
    return len(common) / len(users) if users else 0.0


pd.set_option('display.max_rows', None)

odps_df = get_dataframe_from_odps('loghubods', 'tmp_long_articles_users')

# One row per account with its union_id count, largest first.  The resulting
# order of account_ids fixes both the row and the column order of the matrix.
account_user_count = odps_df.groupby('account_id') \
    .agg(count=odps_df.union_id.count()) \
    .sort('count', ascending=False) \
    .to_pandas()
account_ids = account_user_count['account_id'].tolist()

# NOTE(review): this issues one filtered query per account.  Fetching the
# frame once with odps_df.to_pandas() and grouping locally would likely be
# cheaper -- confirm the table fits in memory before switching.
account_user_map = {}
for gh_id in account_ids:
    account_user_map[gh_id] = set(
        odps_df.filter(odps_df.account_id == gh_id)['union_id'].to_pandas())
    print(f'{gh_id}: {len(account_user_map[gh_id])}')

# overlap_map[a][b] is the fraction of a's users that are also users of b.
# Every unordered pair is visited once and both directions are filled in.
overlap_map = {gh_id: {} for gh_id in account_ids}
for gh1, gh2 in combinations(account_ids, 2):
    users1 = account_user_map[gh1]
    users2 = account_user_map[gh2]
    print((gh1, gh2))
    inter_set = users1 & users2
    overlap_map[gh1][gh2] = _overlap_share(inter_set, users1)
    overlap_map[gh2][gh1] = _overlap_share(inter_set, users2)

print('{}\t{}'.format('gh_id', '\t'.join(account_ids)))
for gh1 in account_ids:
    row = overlap_map[gh1]
    # The '%' presentation type already multiplies by 100, so the raw ratio
    # is formatted directly.  The diagonal has no entry and is printed as 0.
    data_str = '\t'.join(
        '{:.1%}'.format(row.get(gh2, 0)) for gh2 in account_ids)
    print('{}\t{}'.format(gh1, data_str))