|
@@ -0,0 +1,57 @@
|
|
|
+#! /usr/bin/env python
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+# vim:fenc=utf-8
|
|
|
+#
|
|
|
+# Copyright © 2024 StrayWarrior <i@straywarrior.com>
|
|
|
+#
|
|
|
+# Distributed under terms of the MIT license.
|
|
|
+
|
|
|
+"""
|
|
|
+
|
|
|
+"""
|
|
|
+
|
|
|
+from my_utils import get_odps_instance, get_dataframe_from_odps
|
|
|
+import pandas as pd
|
|
|
# Build a pairwise user-overlap matrix between accounts and print it as
# tab-separated text.  The cell at (row A, column B) is
# |users(A) & users(B)| / |users(A)|, i.e. the share of A's users that
# also appear under B.

# Show all rows whenever a pandas object is printed.
pd.set_option('display.max_rows', None)

# NOTE(review): the two arguments look like an ODPS project and table
# name -- confirm against my_utils.get_dataframe_from_odps.
odps_df = get_dataframe_from_odps('loghubods',
                                  'tmp_long_articles_users')

# Accounts ordered by their union_id count, largest first.  This order is
# reused for both the rows and the columns of the printed matrix.
account_user_count = odps_df.groupby('account_id') \
    .agg(count=odps_df.union_id.count()) \
    .sort('count', ascending=False) \
    .to_pandas()
account_ids = account_user_count['account_id'].tolist()

# Distinct union_id values seen for each account.
# NOTE(review): this fetches one filtered result per account; a single
# fetch grouped locally may be cheaper -- verify data volume first.
account_user_map = {}
for gh_id in account_ids:
    users = set(
        odps_df.filter(odps_df.account_id == gh_id)['union_id'].to_pandas())
    account_user_map[gh_id] = users
    print(f'{gh_id}: {len(users)}')

# overlap_map[a][b] holds the fraction of a's users that b also has.
overlap_map = {gh_id: {} for gh_id in account_ids}

for i, gh1 in enumerate(account_ids):
    users1 = account_user_map[gh1]
    # Visit each unordered pair once and fill both directions.
    for gh2 in account_ids[i + 1:]:
        users2 = account_user_map[gh2]
        print((gh1, gh2))
        shared = len(users1 & users2)
        # Guard the division in case a fetch came back empty.
        overlap_map[gh1][gh2] = shared / len(users1) if users1 else 0.0
        overlap_map[gh2][gh1] = shared / len(users2) if users2 else 0.0

# Header row; str() keeps the join working when ids are not strings.
print('{}\t{}'.format('gh_id', '\t'.join(str(gh_id) for gh_id in account_ids)))
for gh1 in account_ids:
    row = overlap_map[gh1]
    # Pairs that were never computed (the diagonal) print as 0.
    # The '%' presentation type already multiplies by 100, so the raw
    # ratio is formatted directly.
    cells = [f'{row.get(gh2, 0):.1%}' for gh2 in account_ids]
    print('{}\t{}'.format(gh1, '\t'.join(cells)))
|
|
|
+
|
|
|
+
|