algorithm
/
rov-offline


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
							#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 StrayWarrior <i@straywarrior.com>
#
# Distributed under terms of the MIT license.

"""
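Compute the pairwise user overlap between accounts.

Loads the ``tmp_long_articles_users`` table from ODPS, collects the set of
``union_id`` values seen for every ``account_id``, and prints a tab-separated
matrix in which row A / column B is the share of A's users that are also
users of B.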
"""

from my_utils import get_odps_instance, get_dataframe_from_odps
import pandas as pd
pd.set_option('display.max_rows', None)

# Load the source table through the project helper.
# NOTE(review): the two arguments look like (project, table) -- confirm
# against my_utils.get_dataframe_from_odps.
odps_df = get_dataframe_from_odps('loghubods', 'tmp_long_articles_users')

# Aggregate the union_id count per account_id, order the accounts by that
# count (largest first) and pull the small result into pandas.
account_user_count = (
    odps_df.groupby('account_id')
    .agg(count=odps_df.union_id.count())
    .sort('count', ascending=False)
    .to_pandas()
)

# Account ids in descending-count order; this order drives every later loop
# and the row/column order of the printed matrix.
account_ids = account_user_count['account_id'].tolist()

# df = odps_df.to_pandas()
# Maps each account id to the set of distinct union_id values found in its
# rows. One filtered fetch is issued per account.
account_user_map = {}

for gh_id in account_ids:
    rows_for_account = odps_df.filter(odps_df.account_id == gh_id)
    users = set(rows_for_account['union_id'].to_pandas())
    account_user_map[gh_id] = users
    # Progress output: account id and its distinct-user count.
    print(f'{gh_id}: {len(users)}')

# overlap_map[a][b] is the fraction of a's users that are also users of b.
# The relation is asymmetric (each side is divided by its own user count),
# so both directions are stored. The diagonal is never filled in.
overlap_map = {k: {} for k in account_ids}

for i, gh1 in enumerate(account_ids):
    users1 = account_user_map[gh1]
    # Visit every unordered pair exactly once: only accounts after gh1.
    for gh2 in account_ids[i + 1:]:
        users2 = account_user_map[gh2]
        print((gh1, gh2))
        shared = len(users1 & users2)
        # An account with no users cannot overlap with anything; report 0.0
        # instead of raising ZeroDivisionError after all data was fetched.
        overlap_map[gh1][gh2] = shared / len(users1) if users1 else 0.0
        overlap_map[gh2][gh1] = shared / len(users2) if users2 else 0.0

# Print the overlap matrix as tab-separated text: a header row of account
# ids, then one row per account holding its overlap with every account.
print('{}\t{}'.format('gh_id', '\t'.join(account_ids)))
for gh1 in account_ids:
    row = overlap_map[gh1]
    # Pairs that were never stored (the diagonal) are shown as 0.
    # The '%' presentation type already multiplies by 100, so the ratio is
    # passed as-is; scaling it first would print 0.5 as "5000.0%".
    data_str = '\t'.join(
        '{:.1%}'.format(row.get(gh2, 0)) for gh2 in account_ids)
    print('{}\t{}'.format(gh1, data_str))