2 سال پیش · 7055132980
--- a/cal_24h_score.py
+++ b/cal_24h_score.py
@@ -0,0 +1,85 @@
 
															+# coding utf-8
														
 
															+import sys
														
 
															+import json
														
 
															+import math
														
 
															+import pandas as pd
														
 
															+
														
 
															+
														
 
															+features = [
														
 
															+    'apptype',
														
 
															+    'videoid',
														
 
															+    'preview人数',  # 过去24h预曝光人数
														
 
															+    'view人数',  # 过去24h曝光人数
														
 
															+    'play人数',  # 过去24h播放人数
														
 
															+    'share人数',  # 过去24h分享人数
														
 
															+    '回流人数',  # 过去24h分享，过去24h回流人数
														
 
															+    'preview次数',  # 过去24h预曝光次数
														
 
															+    'view次数',  # 过去24h曝光次数
														
 
															+    'play次数',  # 过去24h播放次数
														
 
															+    'share次数',  # 过去24h分享次数
														
 
															+    'platform_return',
														
 
															+    'platform_preview',
														
 
															+    'platform_preview_total',
														
 
															+    'platform_show',
														
 
															+    'platform_show_total',
														
 
															+    'platform_view',
														
 
															+    'platform_view_total',
														
 
															+]
														
 
															+
														
 
															+
														
 
															+def data_group(data_path):
														
 
															+    """将数据按照videoid聚合（求和）"""
														
 
															+    f = open(data_path)
														
 
															+    index = 0
														
 
															+    data_dict = {}
														
 
															+    while True:
														
 
															+        line = f.readline()
														
 
															+        if not line:
														
 
															+            break
														
 
															+        if index == 0:
														
 
															+            index += 1
														
 
															+            continue
														
 
															+        index += 1
														
 
															+        items = line.strip().split(",")
														
 
															+        # print(items)
														
 
															+        if len(items) < len(features):
														
 
															+            continue
														
 
															+        video_id = items[1]
														
 
															+        if video_id not in data_dict:
														
 
															+            data_dict[video_id] = {'videoid': video_id}
														
 
															+            for i, feature in enumerate(features):
														
 
															+                if feature in ['apptype', 'videoid']:
														
 
															+                    continue
														
 
															+                data_dict[video_id][feature] = int(float(items[i]))
														
 
															+        else:
														
 
															+            for i, feature in enumerate(features):
														
 
															+                if feature in ['apptype', 'videoid']:
														
 
															+                    continue
														
 
															+                data_dict[video_id][feature] = data_dict[video_id][feature] + int(float(items[i]))
														
 
															+    f.close()
														
 
															+    data_list = [item for video_id, item in data_dict.items()]
														
 
															+    data_df = pd.DataFrame(data_list)
														
 
															+    return data_df
														
 
															+
														
 
															+
														
 
															+def cal_score(data_df):
														
 
															+    """计算score"""
														
 
															+    df = data_df.copy()
														
 
															+    # score1 = 回流/(view+10)
														
 
															+    df['24h_score1'] = df['回流人数'] / (df['view人数'] + 10)
														
 
															+    score_df = df[['videoid', '24h_score1']]
														
 
															+    # print(score_df)
														
 
															+    return score_df
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    # 1.load data
														
 
															+    now_date = sys.argv[1]
														
 
															+    print(f"now_date: {now_date}")
														
 
															+    data_path = f"./data/24h_video_data_{now_date}.csv"
														
 
															+    data_df = data_group(data_path=data_path)
														
 
															+    print(f"data_df shape: {data_df.shape}")
														
 
															+    hour_score_path = f"./data/24h_score_{now_date}.csv"
														
 
															+    score_df = cal_score(data_df=data_df)
														
 
															+    score_df.to_csv(hour_score_path, index=False)
														
 
															+    print(f"score_df shape: {score_df.shape}")
														
--- a/cal_hour_score.py
+++ b/cal_hour_score.py
@@ -0,0 +1,129 @@
 
															+# coding utf-8
														
 
															+import sys
														
 
															+import json
														
 
															+import math
														
 
															+import pandas as pd
														
 
															+
														
 
															+
														
 
															+features = [
														
 
															+    'apptype',
														
 
															+    'code',
														
 
															+    'videoid',
														
 
															+    'lastonehour_preview',  # 过去1小时预曝光人数
														
 
															+    'lastonehour_view',  # 过去1小时曝光人数
														
 
															+    'lastonehour_play',  # 过去1小时播放人数
														
 
															+    'lastonehour_share',  # 过去1小时分享人数
														
 
															+    'lastonehour_return',  # 过去1小时分享，过去1小时回流人数
														
 
															+    'lastonehour_preview_total',  # 过去1小时预曝光次数
														
 
															+    'lastonehour_view_total',  # 过去1小时曝光次数
														
 
															+    'lastonehour_play_total',  # 过去1小时播放次数
														
 
															+    'lastonehour_share_total',  # 过去1小时分享次数
														
 
															+    'platform_return',
														
 
															+    'lastonehour_show',  # 不区分地域
														
 
															+    'lastonehour_show_region',  # 地域分组
														
 
															+    'lasttwohour_share',  # h-2小时分享人数
														
 
															+    'lasttwohour_return_now',  # h-2分享，过去1小时回流人数
														
 
															+    'lasttwohour_return',  # h-2分享，h-2回流人数
														
 
															+    'lastthreehour_share',  # h-3小时分享人数
														
 
															+    'lastthreehour_return_now',  # h-3分享，过去1小时回流人数
														
 
															+    'lastthreehour_return',  # h-3分享，h-3回流人数
														
 
															+
														
 
															+    'lastonehour_return_new',  # 过去1小时分享，过去1小时回流人数（回流统计为对应地域分享带回的回流，分享限制地域，回流不限制地域）
														
 
															+    'lasttwohour_return_now_new',  # h-2分享，过去1小时回流人数（回流统计为对应地域分享带回的回流，分享限制地域，回流不限制地域）
														
 
															+    'lasttwohour_return_new',  # h-2分享，h-2回流人数（回流统计为对应地域分享带回的回流，分享限制地域，回流不限制地域）
														
 
															+    'lastthreehour_return_now_new',  # h-3分享，过去1小时回流人数（回流统计为对应地域分享带回的回流，分享限制地域，回流不限制地域）
														
 
															+    'lastthreehour_return_new',  # h-3分享，h-3回流人数（回流统计为对应地域分享带回的回流，分享限制地域，回流不限制地域）
														
 
															+    'platform_return_new',  # 平台分发回流（回流统计为对应地域分享带回的回流，分享限制地域，回流不限制地域）
														
 
															+]
														
 
															+
														
 
															+
														
 
															+def data_group(data_path):
														
 
															+    """将数据按照videoid聚合（求和）"""
														
 
															+    f = open(data_path)
														
 
															+    index = 0
														
 
															+    data_dict = {}
														
 
															+    while True:
														
 
															+        line = f.readline()
														
 
															+        if not line:
														
 
															+            break
														
 
															+        if index == 0:
														
 
															+            index += 1
														
 
															+            continue
														
 
															+        index += 1
														
 
															+        items = line.strip().split(",")
														
 
															+        # print(items)
														
 
															+        if len(items) < len(features):
														
 
															+            continue
														
 
															+        video_id = items[2]
														
 
															+        if video_id not in data_dict:
														
 
															+            data_dict[video_id] = {'videoid': video_id}
														
 
															+            for i, feature in enumerate(features):
														
 
															+                if feature in ['apptype', 'code', 'videoid']:
														
 
															+                    continue
														
 
															+                data_dict[video_id][feature] = int(float(items[i]))
														
 
															+        else:
														
 
															+            for i, feature in enumerate(features):
														
 
															+                if feature in ['apptype', 'code', 'videoid']:
														
 
															+                    continue
														
 
															+                data_dict[video_id][feature] = data_dict[video_id][feature] + int(float(items[i]))
														
 
															+    f.close()
														
 
															+    data_list = [item for video_id, item in data_dict.items()]
														
 
															+    data_df = pd.DataFrame(data_list)
														
 
															+    return data_df
														
 
															+
														
 
															+
														
 
															+def cal_score(data_df):
														
 
															+    """计算score"""
														
 
															+    df = data_df.copy()
														
 
															+    # share_rate_view = (share+1)/(view+1000)
														
 
															+    df['share_rate_view'] = (df['lastonehour_share'] + 1) / (df['lastonehour_view'] + 1000)
														
 
															+
														
 
															+    # back_rate = (return+1)/(share+10)
														
 
															+    df['back_rate'] = (df['lastonehour_return'] + 1) / (df['lastonehour_share'] + 10)
														
 
															+    # back_rate_2h = (lasttwohour_return_now+1)/(share+10)
														
 
															+    df['back_rate_2h'] = (df['lasttwohour_return_now'] + 1) / (df['lasttwohour_share'] + 10)
														
 
															+    # back_rate_3h = (lastthreehour_return_now+1)/(share+10)
														
 
															+    df['back_rate_3h'] = (df['lastthreehour_return_now'] + 1) / (df['lastthreehour_share'] + 10)
														
 
															+
														
 
															+    df['log_back'] = (df['lastonehour_return'] + 1).apply(math.log)
														
 
															+
														
 
															+    # h-2回流留存
														
 
															+    df['return_retention_initial_2h'] = (df['lasttwohour_return_now'] + 1) / (df['lasttwohour_return'] + 5)
														
 
															+    df['return_retention_2h'] = df['return_retention_initial_2h'].apply(lambda x: 1 if x > 1 else x)
														
 
															+    # h-3回流留存
														
 
															+    df['return_retention_initial_3h'] = (df['lastthreehour_return_now'] + 1) / (df['lastthreehour_return'] + 10)
														
 
															+    df['return_retention_3h'] = df['return_retention_initial_3h'].apply(lambda x: 0.8 if x > 0.8 else x)
														
 
															+
														
 
															+    # score1 = 回流/(view+5)
														
 
															+    df['hour_score1'] = df['lastonehour_return'] / (df['lastonehour_view'] + 5)
														
 
															+
														
 
															+    # score2 = (回流 * (1 + h-2回流留存 + h-3回流留存))/(view+1000)
														
 
															+    df['hour_score2'] = (df['lastonehour_return'] * (1 + df['return_retention_2h'] + df['return_retention_3h'])) / \
														
 
															+                   (df['lastonehour_view'] + 1000)
														
 
															+
														
 
															+    # score3 = (lastthreehour_return_now + lasttwohour_return_now + lastonehour_return)/(lastonehour_view+1000)
														
 
															+    df['hour_score3'] = (df['lastthreehour_return_now'] + df['lasttwohour_return_now'] + df['lastonehour_return']) / \
														
 
															+                   (df['lastonehour_view'] + 1000)
														
 
															+
														
 
															+    # score4 = share/view * back_rate * logback
														
 
															+    df['hour_score4'] = df['share_rate_view'] * df['back_rate'] * df['log_back']
														
 
															+
														
 
															+    # score5 = share/view * (back_rate + back_rate_2h + back_rate_3h) * logback
														
 
															+    df['hour_score5'] = df['share_rate_view'] * (df['back_rate'] + df['back_rate_2h'] + df['back_rate_3h']) * df['log_back']
														
 
															+
														
 
															+    score_df = df[['videoid', 'hour_score1', 'hour_score2', 'hour_score3', 'hour_score4', 'hour_score5']]
														
 
															+    # print(score_df)
														
 
															+    return score_df
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    # 1.load data
														
 
															+    now_date = sys.argv[1]
														
 
															+    print(f"now_date: {now_date}")
														
 
															+    data_path = f"./data/hour_video_data_{now_date}.csv"
														
 
															+    data_df = data_group(data_path=data_path)
														
 
															+    print(f"data_df shape: {data_df.shape}")
														
 
															+    hour_score_path = f"./data/hour_score_{now_date}.csv"
														
 
															+    score_df = cal_score(data_df=data_df)
														
 
															+    score_df.to_csv(hour_score_path, index=False)
														
 
															+    print(f"score_df shape: {score_df.shape}")
														
--- a/compose_score.py
+++ b/compose_score.py
@@ -0,0 +1,61 @@
 
															+import sys
														
 
															+import pandas as pd
														
 
															+from db_helper import RedisHelper
														
 
															+
														
 
															+redis_helper = RedisHelper()
														
 
															+
														
 
															+
														
 
															+def cal_compose_score(score_hour_path, score_24h_path):
														
 
															+    """分值合并"""
														
 
															+    score_hour_df = pd.read_csv(score_hour_path)
														
 
															+    score_24h_df = pd.read_csv(score_24h_path)
														
 
															+    print(score_hour_df)
														
 
															+    print(score_24h_df)
														
 
															+    score_hour_df['videoid'] = score_hour_df['videoid'].astype(int)
														
 
															+    score_24h_df['videoid'] = score_24h_df['videoid'].astype(int)
														
 
															+    score_merge_df = pd.merge(score_hour_df, score_24h_df, on='videoid', how='outer')
														
 
															+    score_merge_df.fillna(0, inplace=True)
														
 
															+    print(score_merge_df)
														
 
															+    print(score_hour_df.shape)
														
 
															+    print(score_24h_df.shape)
														
 
															+    print(score_merge_df.shape)
														
 
															+    score_merge_df['score1'] = score_merge_df['24h_score1'] + score_merge_df['hour_score1']
														
 
															+    score_merge_df['score2'] = score_merge_df['24h_score1'] + score_merge_df['hour_score2']
														
 
															+    score_merge_df['score3'] = score_merge_df['24h_score1'] + score_merge_df['hour_score3']
														
 
															+    score_merge_df['score4'] = score_merge_df['24h_score1'] + score_merge_df['hour_score4']
														
 
															+    score_merge_df['score5'] = score_merge_df['24h_score1'] + score_merge_df['hour_score5']
														
 
															+    print(score_merge_df)
														
 
															+    print(score_merge_df.shape)
														
 
															+    score_df = score_merge_df[['videoid', 'score1', 'score2', 'score3', 'score4', 'score5']]
														
 
															+    return score_df
														
 
															+
														
 
															+
														
 
															+def score_to_redis(score_df):
														
 
															+    redis_data = dict()
														
 
															+    rank_score_key_prefix = 'rank:'
														
 
															+    score_name_list = score_df.columns.to_list()[1:]
														
 
															+    for ind, row in score_df.iterrows():
														
 
															+        video_id = int(row['videoid'])
														
 
															+        for score_name in score_name_list:
														
 
															+            score = row[score_name]
														
 
															+            rank_score_key = f"{rank_score_key_prefix}{score_name}:{video_id}"
														
 
															+            redis_data[rank_score_key] = score
														
 
															+            print(rank_score_key, score)
														
 
															+            # redis_helper.set_data_to_redis(key_name=rank_score_key, value=score, expire_time=24*60*60)
														
 
															+            if ind % 1000 == 0:
														
 
															+                if len(redis_data) > 0:
														
 
															+                    print(ind, len(redis_data))
														
 
															+                    redis_helper.update_batch_set_key(data=redis_data, expire_time=24*60*60)
														
 
															+                    redis_data = {}
														
 
															+    if len(redis_data) > 0:
														
 
															+        redis_helper.update_batch_set_key(data=redis_data, expire_time=24 * 60 * 60)
														
 
															+    print(len(redis_data))
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    now_date = sys.argv[1]
														
 
															+    print("now date:", now_date)
														
 
															+    score_hour_path = f"./data/hour_score_{now_date}.csv"
														
 
															+    score_24h_path = f"./data/24h_score_{now_date}.csv"
														
 
															+    score_df = cal_compose_score(score_hour_path=score_hour_path, score_24h_path=score_24h_path)
														
 
															+    score_to_redis(score_df=score_df)
														
--- a/config.py
+++ b/config.py
@@ -2553,8 +2553,8 @@ class ProductionConfig(BaseConfig):
 
															 def set_config():
														
 
															     # 获取环境变量 ROV_OFFLINE_ENV
														
 
															-    env = os.environ.get('ROV_OFFLINE_ENV')
														
 
															-    # env = 'dev'
														
 
															+    # env = os.environ.get('ROV_OFFLINE_ENV')
														
 
															+    env = 'dev'
														
 
															     if env is None:
														
 
															         # log_.error('ENV ERROR: is None!')
														
 
															         return
														
--- a/db_helper.py
+++ b/db_helper.py
@@ -304,6 +304,12 @@ class RedisHelper(object):
 
															         conn = self.connect()
														
 
															         conn.expire(name=key_name, time=int(expire_time))
														
 
															+    def update_batch_set_key(self, data, expire_time=5*60):
														
 
															+        conn = self.connect()
														
 
															+        conn.mset(data)
														
 
															+        for key_name in data:
														
 
															+            conn.expire(name=key_name, time=int(expire_time))
														
 
															+
														
 
															 class HologresHelper(object):
														
 
															     def __init__(self):
														
--- a/export_24h_vid.py
+++ b/export_24h_vid.py
@@ -0,0 +1,54 @@
 
															+import sys
														
 
															+import pandas as pd
														
 
															+from utils import get_data_from_odps
														
 
															+from config import set_config
														
 
															+from log import Log
														
 
															+config_, _ = set_config()
														
 
															+log_ = Log()
														
 
															+
														
 
															+features = [
														
 
															+    'apptype',
														
 
															+    'videoid',
														
 
															+    'preview人数',  # 过去24h预曝光人数
														
 
															+    'view人数',  # 过去24h曝光人数
														
 
															+    'play人数',  # 过去24h播放人数
														
 
															+    'share人数',  # 过去24h分享人数
														
 
															+    '回流人数',  # 过去24h分享，过去24h回流人数
														
 
															+    'preview次数',  # 过去24h预曝光次数
														
 
															+    'view次数',  # 过去24h曝光次数
														
 
															+    'play次数',  # 过去24h播放次数
														
 
															+    'share次数',  # 过去24h分享次数
														
 
															+    'platform_return',
														
 
															+    'platform_preview',
														
 
															+    'platform_preview_total',
														
 
															+    'platform_show',
														
 
															+    'platform_show_total',
														
 
															+    'platform_view',
														
 
															+    'platform_view_total',
														
 
															+]
														
 
															+
														
 
															+
														
 
															+def get_feature_data(project, table, now_date):
														
 
															+    """获取特征数据"""
														
 
															+    # dt = datetime.datetime.strftime(now_date, '%Y%m%d%H')
														
 
															+    # dt = '2022041310'
														
 
															+    records = get_data_from_odps(date=now_date, project=project, table=table)
														
 
															+    feature_data = []
														
 
															+    for record in records:
														
 
															+        item = {}
														
 
															+        for feature_name in features:
														
 
															+            item[feature_name] = record[feature_name]
														
 
															+        feature_data.append(item)
														
 
															+    feature_df = pd.DataFrame(feature_data)
														
 
															+    return feature_df
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    project = config_.PROJECT_24H_APP_TYPE
														
 
															+    table = config_.TABLE_24H_APP_TYPE
														
 
															+    now_date = sys.argv[1]
														
 
															+    print("now date:", now_date)
														
 
															+    data = get_feature_data(project=project, table=table, now_date=now_date)
														
 
															+    data = data.fillna(0)
														
 
															+    data.to_csv(f"./data/24h_video_data_{now_date}.csv", index=False)
														
 
															+    print(f"data shape: {data.shape}")
														
--- a/export_hour_vid.py
+++ b/export_hour_vid.py
@@ -0,0 +1,64 @@
 
															+import sys
														
 
															+import pandas as pd
														
 
															+from utils import get_data_from_odps
														
 
															+from config import set_config
														
 
															+from log import Log
														
 
															+config_, _ = set_config()
														
 
															+log_ = Log()
														
 
															+
														
 
															+features = [
														
 
															+    'apptype',
														
 
															+    'code',
														
 
															+    'videoid',
														
 
															+    'lastonehour_preview',  # 过去1小时预曝光人数
														
 
															+    'lastonehour_view',  # 过去1小时曝光人数
														
 
															+    'lastonehour_play',  # 过去1小时播放人数
														
 
															+    'lastonehour_share',  # 过去1小时分享人数
														
 
															+    'lastonehour_return',  # 过去1小时分享，过去1小时回流人数
														
 
															+    'lastonehour_preview_total',  # 过去1小时预曝光次数
														
 
															+    'lastonehour_view_total',  # 过去1小时曝光次数
														
 
															+    'lastonehour_play_total',  # 过去1小时播放次数
														
 
															+    'lastonehour_share_total',  # 过去1小时分享次数
														
 
															+    'platform_return',
														
 
															+    'lastonehour_show',  # 不区分地域
														
 
															+    'lastonehour_show_region',  # 地域分组
														
 
															+    'lasttwohour_share',  # h-2小时分享人数
														
 
															+    'lasttwohour_return_now',  # h-2分享，过去1小时回流人数
														
 
															+    'lasttwohour_return',  # h-2分享，h-2回流人数
														
 
															+    'lastthreehour_share',  # h-3小时分享人数
														
 
															+    'lastthreehour_return_now',  # h-3分享，过去1小时回流人数
														
 
															+    'lastthreehour_return',  # h-3分享，h-3回流人数
														
 
															+
														
 
															+    'lastonehour_return_new',  # 过去1小时分享，过去1小时回流人数（回流统计为对应地域分享带回的回流，分享限制地域，回流不限制地域）
														
 
															+    'lasttwohour_return_now_new',  # h-2分享，过去1小时回流人数（回流统计为对应地域分享带回的回流，分享限制地域，回流不限制地域）
														
 
															+    'lasttwohour_return_new',  # h-2分享，h-2回流人数（回流统计为对应地域分享带回的回流，分享限制地域，回流不限制地域）
														
 
															+    'lastthreehour_return_now_new',  # h-3分享，过去1小时回流人数（回流统计为对应地域分享带回的回流，分享限制地域，回流不限制地域）
														
 
															+    'lastthreehour_return_new',  # h-3分享，h-3回流人数（回流统计为对应地域分享带回的回流，分享限制地域，回流不限制地域）
														
 
															+    'platform_return_new',  # 平台分发回流（回流统计为对应地域分享带回的回流，分享限制地域，回流不限制地域）
														
 
															+]
														
 
															+
														
 
															+
														
 
															+def get_feature_data(project, table, now_date):
														
 
															+    """获取特征数据"""
														
 
															+    # dt = datetime.datetime.strftime(now_date, '%Y%m%d%H')
														
 
															+    # dt = '2022041310'
														
 
															+    records = get_data_from_odps(date=now_date, project=project, table=table)
														
 
															+    feature_data = []
														
 
															+    for record in records:
														
 
															+        item = {}
														
 
															+        for feature_name in features:
														
 
															+            item[feature_name] = record[feature_name]
														
 
															+        feature_data.append(item)
														
 
															+    feature_df = pd.DataFrame(feature_data)
														
 
															+    return feature_df
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    project = config_.PROJECT_REGION_APP_TYPE
														
 
															+    table = config_.TABLE_REGION_APP_TYPE
														
 
															+    now_date = sys.argv[1]
														
 
															+    print("now date:", now_date)
														
 
															+    data = get_feature_data(project=project, table=table, now_date=now_date)
														
 
															+    data = data.fillna(0)
														
 
															+    data.to_csv(f"./data/hour_video_data_{now_date}.csv", index=False)
														
 
															+    print(f"data shape: {data.shape}")