浏览代码

Add analyse_account_user_duplicate

StrayWarrior 7 月之前
父节点
当前提交
0ce076746b
共有 1 个文件被更改,包括 57 次插入0 次删除
  1. 57 0
      analyse_account_user_duplicate.py

+ 57 - 0
analyse_account_user_duplicate.py

@@ -0,0 +1,57 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:fenc=utf-8
+#
+# Copyright © 2024 StrayWarrior <i@straywarrior.com>
+#
+# Distributed under terms of the MIT license.
+
+"""
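+Analyse how much the user sets of different accounts overlap.
+
+Loads the ODPS data identified by ('loghubods', 'tmp_long_articles_users'),
+collects the set of ``union_id`` values seen for every ``account_id`` and
+prints a tab-separated matrix whose cell (row A, column B) is the share of
+A's users that also appear under B.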
+"""
+
+from my_utils import get_odps_instance, get_dataframe_from_odps
+import pandas as pd
+pd.set_option('display.max_rows', None)
+
+odps_df = get_dataframe_from_odps('loghubods',
+                                  'tmp_long_articles_users')
+account_user_count = odps_df.groupby('account_id') \
+    .agg(count=odps_df.union_id.count()) \
+    .sort('count', ascending=False) \
+    .to_pandas()
+account_ids = account_user_count['account_id'].tolist()
+
+# df = odps_df.to_pandas()
+account_user_map = {}
+
+for gh_id in account_ids:
+    account_user_map[gh_id] = set(
+        odps_df.filter(odps_df.account_id == gh_id)['union_id'].to_pandas())
+    print(f'{gh_id}: {len(account_user_map[gh_id])}')
+
+overlap_map = {k: {} for k in account_ids}
+
+for i in range(len(account_ids)):
+    for j in range(i + 1, len(account_ids)):
+        gh1 = account_ids[i]
+        gh2 = account_ids[j]
+        users1 = account_user_map[gh1]
+        users2 = account_user_map[gh2]
+        print((gh1, gh2))
+        inter_set = users1 & users2
+        ratio1 = len(inter_set) / len(users1)
+        ratio2 = len(inter_set) / len(users2)
+        overlap_map[gh1][gh2] = ratio1
+        overlap_map[gh2][gh1] = ratio2
+
+print('{}\t{}'.format('gh_id', '\t'.join(account_ids)))
+for i in range(len(account_ids)):
+    gh1 = account_ids[i]
+    data = []
+    for j in range(len(account_ids)):
+        data.append(overlap_map[gh1].get(account_ids[j], 0))
+    data_str = '\t'.join(['{:.1%}'.format(x * 100) for x in data])
+    print('{}\t{}'.format(gh1, data_str))
+
+