baichongyang 1 year ago
parent
commit
5eeab7c791
13 changed files with 730 additions and 23 deletions
  1. calCtr.py  +9 -9
  2. calCtr1days.py  +79 -0
  3. calCtr3days.py  +79 -0
  4. calCtr7days.py  +78 -0
  5. calHourCtr.py  +9 -9
  6. calHourData.py  +80 -0
  7. clean.sh  +1 -1
  8. compose_score.py  +9 -4
  9. compose_score_3day.py  +91 -0
  10. export_3_day.py  +53 -0
  11. export_7_day.py  +53 -0
  12. get3HotRecall.py  +95 -0
  13. get7HotRecall.py  +94 -0

+ 9 - 9
calCtr.py

@@ -29,13 +29,13 @@ if __name__=="__main__":
         if len(items)<11:
             continue
         vid = items[1]
-        view_users = items[4] 
-        view_pv = items[5]
-        play_users = items[6]
-        play_pv = items[7]
-        share_users = items[8]
-        share_pv = items[9]
-        return_users = items[10]
+        view_users = int(items[4])
+        view_pv = int(items[5])
+        play_users = int(items[6])
+        play_pv = int(items[7])
+        share_users = int(items[8])
+        share_pv = int(items[9])
+        return_users = int(items[10])
         #print(items)
         if vid not in data_dict:
             data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
@@ -59,13 +59,13 @@ if __name__=="__main__":
         view_users = v[0]
         view_pv = v[1]
         share_pv = v[5]
-        share_users = [4]
+        share_users = v[4]
         play_users = v[2]
         #print("return_users:", return_users) 
         k_score = float(return_users)/(float(view_users)+10)
         #print(k_score)
         share_score = float(share_pv)/(float(view_pv)+5)
-        backrate = float(return_users)/(float(share_pv)+5)
+        backrate = float(return_users)/(float(view_users)+5)
         #print(k, k_score, share_score*backrate, share_score, backrate) 
         score_info = [share_score, share_score*backrate, share_score, backrate]
         k = "k_p:"+k

+ 79 - 0
calCtr1days.py

@@ -0,0 +1,79 @@
+#coding utf-8
+import sys
+from operator import itemgetter
+import json
+import pandas as pd
+from db_help import RedisHelper
+
+if __name__=="__main__":
+    #1.load data
+    nowdate=sys.argv[1]
+    vlog='0'
+    love_live = 4
+    data_path = "./data/video_data_"+nowdate
+    f = open(data_path)
+    #data = pd.read_csv(data_path, encoding="utf-8", sep='\t')
+    #print(data)
+    index = 0
+    data_dict = {}
+    redis_helper = RedisHelper()
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        if index==0:
+            index += 1
+            continue
+        index +=1
+        items = line.strip().split("\t")
+        if len(items)<11:
+            continue
+        vid = int(items[1])
+        view_users = int(items[4])
+        view_pv = int(items[5])
+        play_users = int(items[6])
+        play_pv = int(items[7])
+        share_users = int(items[8])
+        share_pv = int(items[9])
+        return_users = int(items[10])
+        #print(items)
+        if vid not in data_dict:
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+        else:
+            item_info = data_dict[vid]
+            view_users = item_info[0]+view_users
+            view_pv = item_info[1]+view_pv
+            play_users = item_info[2]+play_users
+            play_pv = item_info[3]+play_pv
+            share_users = item_info[4]+share_users
+            share_pv = item_info[5]+share_pv
+            return_users = item_info[6]+return_users
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+    #print(data_dict.items())
+    info_dict = {}
+    data_path = "./data/1_days_sorted_data_"+nowdate
+    f = open(data_path, 'w')
+    for k, v in data_dict.items():
+        #print(v)
+        return_users = v[6]
+        #print(return_users)
+        view_users = v[0]
+        view_pv = v[1]
+        share_pv = v[5]
+        share_users = v[4]
+        play_users = v[2]
+        play_pv = v[3]
+        #print("return_users:", return_users) 
+        k_score = float(return_users)/(float(view_users)+10)
+        #print(k_score)
+        share_score = float(share_pv)/(float(view_pv)+50)
+        backrate = float(return_users)/(float(view_users)+10)
+        ctr_score = float(play_pv)/(float(view_pv)+50)
+        #print(k, k_score, share_score*backrate, share_score, backrate) 
+        score_info = [k_score, share_score*backrate, share_score, backrate, ctr_score, view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users]
+        #k = "k_p3:"+k
+        score_info = json.dumps(score_info)
+        info_dict[k] = score_info
+        f.write(str(k)+"\t"+score_info+"\n")
+    #redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15) 
+    f.close()
+
+     
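
The accumulation loop above (corrected here so all seven counters stay aligned with the stored tuple) boils down to a positional sum per vid; a sketch with illustrative names and numbers:

    from collections import defaultdict

    # (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
    totals = defaultdict(lambda: [0] * 7)

    def accumulate(vid, counters):
        for i, c in enumerate(counters):
            totals[vid][i] += c

    accumulate(42, (10, 12, 8, 9, 2, 2, 1))
    accumulate(42, (5, 6, 4, 4, 1, 1, 0))
    print(totals[42])  # [15, 18, 12, 13, 3, 3, 1]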

+ 79 - 0
calCtr3days.py

@@ -0,0 +1,79 @@
+#coding utf-8
+import sys
+from operator import itemgetter
+import json
+import pandas as pd
+from db_help import RedisHelper
+
+if __name__=="__main__":
+    #1.load data
+    nowdate=sys.argv[1]
+    vlog='0'
+    love_live = 4
+    data_path = "./data/3_days_video_data_"+nowdate
+    f = open(data_path)
+    #data = pd.read_csv(data_path, encoding="utf-8", sep='\t')
+    #print(data)
+    index = 0
+    data_dict = {}
+    redis_helper = RedisHelper()
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        if index==0:
+            index += 1
+            continue
+        index +=1
+        items = line.strip().split("\t")
+        if len(items)<11:
+            continue
+        vid = items[1]
+        view_users = int(items[4])
+        view_pv = int(items[5])
+        play_users = int(items[6])
+        play_pv = int(items[7])
+        share_users = int(items[8])
+        share_pv = int(items[9])
+        return_users = int(items[10])
+        #print(items)
+        if vid not in data_dict:
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+        else:
+            item_info = data_dict[vid]
+            view_users = item_info[0]+view_users
+            view_pv = item_info[1]+view_pv
+            play_users = item_info[2]+play_users
+            play_pv = item_info[3]+play_pv
+            share_users = item_info[4]+share_users
+            share_pv = item_info[5]+share_pv
+            return_users = item_info[6]+return_users
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+    #print(data_dict.items())
+    info_dict = {}
+    data_path = "./data/3_days_sorted_data_"+nowdate
+    f = open(data_path, 'w')
+    for k, v in data_dict.items():
+        #print(v)
+        return_users = v[6]
+        #print(return_users)
+        view_users = v[0]
+        view_pv = v[1]
+        share_pv = v[5]
+        share_users = v[4]
+        play_users = v[2]
+        play_pv = v[3]
+        #print("return_users:", return_users) 
+        k_score = float(return_users)/(float(view_users)+30)
+        #print(k_score)
+        share_score = float(share_pv)/(float(view_pv)+100)
+        backrate = float(return_users)/(float(view_users)+30)
+        ctr_score = float(play_pv)/(float(view_pv)+100)
+        #print(k, k_score, share_score*backrate, share_score, backrate) 
+        score_info = [k_score, share_score*backrate, share_score, backrate, ctr_score, view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users]
+        #k = "k_p3:"+k
+        score_info = json.dumps(score_info)
+        info_dict[k] = score_info
+        f.write(str(k)+"\t"+score_info+"\n")
+    #redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15) 
+    f.close()
+
+     

+ 78 - 0
calCtr7days.py

@@ -0,0 +1,78 @@
+#coding utf-8
+import sys
+from operator import itemgetter
+import json
+import pandas as pd
+from db_help import RedisHelper
+
+if __name__=="__main__":
+    #1.load data
+    nowdate=sys.argv[1]
+    vlog='0'
+    love_live = 4
+    data_path = "./data/7_days_video_data_"+nowdate
+    f = open(data_path)
+    #data = pd.read_csv(data_path, encoding="utf-8", sep='\t')
+    #print(data)
+    index = 0
+    data_dict = {}
+    redis_helper = RedisHelper()
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        if index==0:
+            index += 1
+            continue
+        index +=1
+        items = line.strip().split("\t")
+        if len(items)<11:
+            continue
+        vid = items[1]
+        view_users = int(items[4])
+        view_pv = int(items[5])
+        play_users = int(items[6])
+        play_pv = int(items[7])
+        share_users = int(items[8])
+        share_pv = int(items[9])
+        return_users = int(items[10])
+        #print(items)
+        if vid not in data_dict:
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+        else:
+            item_info = data_dict[vid]
+            view_users = item_info[0]+view_users
+            view_pv = item_info[1]+view_pv
+            play_users = item_info[2]+play_users
+            play_pv = item_info[3]+play_pv
+            share_users = item_info[4]+share_users
+            share_pv = item_info[5]+share_pv
+            return_users = item_info[6]+return_users
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+    #print(data_dict.items())
+    info_dict = {}
+    data_path = "./data/7_days_sorted_data_"+nowdate
+    f = open(data_path, 'w')
+    for k, v in data_dict.items():
+        #print(v)
+        return_users = v[6]
+        #print(return_users)
+        view_users = v[0]
+        view_pv = v[1]
+        share_pv = v[5]
+        share_users = v[4]
+        play_users = v[2]
+        #print("return_users:", return_users) 
+        k_score = float(return_users)/(float(view_users)+10)
+        #print(k_score)
+        share_score = float(share_pv)/(float(view_pv)+5)
+        backrate = float(return_users)/(float(view_users)+5)
+        #print(k, k_score, share_score*backrate, share_score, backrate) 
+        score_info = [share_score, share_score*backrate, share_score, backrate]
+        #k = "k_p4:"+k
+        score_info = json.dumps(score_info)
+        info_dict[k] = score_info
+        #f.write(k+"\t"+score_info+"\n")
+    #redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15) 
+    f.close()
+
+     

+ 9 - 9
calHourCtr.py

@@ -30,13 +30,13 @@ if __name__=="__main__":
         if len(items)<9:
             continue
         vid = items[1]
-        view_users = items[2] 
-        view_pv = items[3]
-        play_users = items[4]
-        play_pv = items[5]
-        share_users = items[6]
-        share_pv = items[7]
-        return_users = items[8]
+        view_users = int(items[2])
+        view_pv = int(items[3])
+        play_users = int(items[4])
+        play_pv = int(items[5])
+        share_users = int(items[6])
+        share_pv = int(items[7])
+        return_users = int(items[8])
         #print(items)
         if vid not in data_dict:
             data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
@@ -61,14 +61,14 @@ if __name__=="__main__":
         view_users = v[0]
         view_pv = v[1]
         share_pv = v[5]
-        share_users = [4]
+        share_users = v[4]
         play_users = v[2]
         #print("return_users:", return_users) 
         k_score = float(return_users)/(float(view_users)+5)
         #k_score2 = float(return_users)/(float(share_pv)+5)
         #print(k_score)
         share_score = float(share_pv)/(float(view_pv)+5)
-        backrate = float(return_users)/(float(share_pv)+5)
+        backrate = float(return_users)/(float(view_users)+5)
         #print(k, k_score, share_score*backrate, share_score, backrate) 
         score_info = [share_score, share_score*backrate, share_score, backrate]
         score_info = json.dumps(score_info)

+ 80 - 0
calHourData.py

@@ -0,0 +1,80 @@
+#coding utf-8
+import sys
+from operator import itemgetter
+import json
+import pandas as pd
+#from db_help import RedisHelper
+
+if __name__=="__main__":
+    #1.load data
+    nowdate=sys.argv[1]
+    vlog='0'
+    love_live = 4
+    data_path = "./data/hour_video_data_"+nowdate
+    f = open(data_path)
+    #data = pd.read_csv(data_path, encoding="utf-8", sep='\t')
+    #print(data)
+    index = 0
+    data_dict = {}
+    #redis_helper = RedisHelper()
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        if index==0:
+            index += 1
+            continue
+        index +=1
+        items = line.strip().split("\t")
+        #print(items)
+        if len(items)<9:
+            continue
+        vid = items[1]
+        view_users = int(items[2])
+        view_pv = int(items[3])
+        play_users = int(items[4])
+        play_pv = int(items[5])
+        share_users = int(items[6])
+        share_pv = int(items[7])
+        return_users = int(items[8])
+        #print(items)
+        if vid not in data_dict:
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+        else:
+            item_info = data_dict[vid]
+            view_users = item_info[0]+view_users
+            view_pv = item_info[1]+view_pv
+            play_users = item_info[2]+play_users
+            play_pv = item_info[3]+play_pv
+            share_users = item_info[4]+share_users
+            share_pv = item_info[5]+share_pv
+            return_users = item_info[6]+return_users
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+    #print(data_dict.items())
+    f.close()
+    info_dict = {}
+    hour_data_path = "./data/sorted_hour_info_"+nowdate
+    f = open(hour_data_path, 'w')
+    for k, v in data_dict.items():
+        #print(v)
+        return_users = v[6]
+        #print(return_users)
+        view_users = v[0]
+        view_pv = v[1]
+        share_pv = v[5]
+        share_users = v[4]
+        play_users = v[2]
+        #print("return_users:", return_users) 
+        k_score = float(return_users)/(float(view_users)+5)
+        #k_score2 = float(return_users)/(float(share_pv)+5)
+        #print(k_score)
+        share_score = float(share_pv)/(float(view_pv)+5)
+        backrate = float(return_users)/(float(view_users)+5)
+        #print(k, k_score, share_score*backrate, share_score, backrate) 
+        score_info = [k_score, share_score*backrate, share_score, backrate, view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users]
+        score_info = json.dumps(score_info)
+        info_dict[k] = score_info
+        f.write(k+"\t"+score_info+"\n")
+    #redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15) 
+    f.close()
+
+     

+ 1 - 1
clean.sh

@@ -8,7 +8,7 @@ merge_path=./data/merge_score_${last3day}'*'
 video_data_path=./data/video_data_${last3day}'*'
 sorted_path=./data/sorted_data_${last3day}'*'
 cls_path=./data/redis_cls_${last3day}'*'
-hour_video_path=./data/hour_video_data_'*'
+hour_video_path=./data/hour_video_data_${last3day}'*'
 sorted_hour_path=./data/sorted_hour_data_'*'
 rec_path=./data/rec_result_'*'
 echo ${merge_path}

+ 9 - 4
compose_score.py

@@ -39,7 +39,9 @@ if __name__=="__main__":
             #print("d:",d_item_info)
             total_info = []
             for i in range(len(item_info)):
-                total_info.append(0.3*item_info[i]+0.7*d_item_info[i])
+                total_info.append(0.001*item_info[i]+d_item_info[i])
+            if len(total_info)>2:
+                total_info[0] = total_info[0]+0.1*total_info[3]
             total_item_info = json.dumps(total_info)
             f3.write(kid2+"\t"+total_item_info+"\n")
             info_dict[kid2] = total_item_info
@@ -47,11 +49,14 @@ if __name__=="__main__":
         else:
             total_info = []
             for i  in range(len(d_item_info)):
-                total_info.append(0.7*d_item_info[i])
+                total_info.append(d_item_info[i])
+            if len(total_info)>2:
+                total_info[0] = total_info[0]+0.1*total_info[3]
             total_item_info = json.dumps(total_info)
             f3.write(kid2+"\t"+total_item_info+"\n")
             info_dict[kid2] = total_item_info
     print(info_dict)
-    redis_helper = RedisHelper()
-    redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15)
+    if len(info_dict)>0:
+        redis_helper = RedisHelper()
+        redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15)
     f2.close()
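
A sketch of the reweighting above, with made-up numbers: the hourly scores drop from a 0.3/0.7 mix to a 0.001-weight nudge on the daily scores, and a tenth of the backrate term (index 3) is folded into the lead score:

    hour_scores = [0.20, 0.05, 0.10, 0.08]
    day_scores = [0.18, 0.04, 0.09, 0.07]
    total_info = [0.001 * h + d for h, d in zip(hour_scores, day_scores)]
    if len(total_info) > 2:
        total_info[0] = total_info[0] + 0.1 * total_info[3]
    print(total_info[0])  # 0.1802 + 0.1 * 0.07008 ≈ 0.18721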

+ 91 - 0
compose_score_3day.py

@@ -0,0 +1,91 @@
+#coding utf-8
+import sys
+import json
+from db_help import RedisHelper
+
+if __name__=="__main__":
+    nowdate = sys.argv[1]
+    f1 = open("./data/sorted_hour_info_"+nowdate)
+    f2 = open("./data/1_days_sorted_data_"+nowdate)
+    f3 = open("./data/3_days_sorted_data_"+nowdate)
+    data_dict = {}
+    while True:
+        line = f1.readline()
+        if not line:
+            break
+        items = line.strip().split("\t")
+        if len(items)<2:
+            continue
+        kid = items[0]
+        #print(items[1])
+        item_info = json.loads(items[1])
+        data_dict[kid] = item_info
+    f1.close()
+    #f3 = open("./data/merge_score_"+nowdate, 'w')
+    info_dict = {}
+    while True:
+        line = f2.readline()
+        if not line:
+            break
+        items = line.split("\t")
+        if len(items)<2:
+            continue
+        kid = items[0]
+        d_item_info = json.loads(items[1])
+        if kid in data_dict:
+            item_info = data_dict[kid]
+            total_info = []
+            for i in range(len(item_info)):
+                total_info.append(0.001*float(item_info[i])+float(d_item_info[i]))
+            info_dict[kid] = total_info
+        else:
+            total_info = []
+            for i  in range(len(d_item_info)):
+                total_info.append(float(d_item_info[i]))
+            #if len(total_info)>2:
+            #    total_info[0] = total_info[1]+0.1*total_info[3]
+            #total_item_info = json.dumps(total_info)
+            #f3.write(kid2+"\t"+total_item_info+"\n")
+            info_dict[kid] = total_info
+    #print(info_dict)
+    print("info:", len(info_dict))
+    day3_dict = {}
+    while True:
+        line = f3.readline()
+        if not line:
+            break
+        items = line.split("\t")
+        if len(items)<2:
+            continue
+        kid = items[0]
+        d_item_info = json.loads(items[1])
+        if kid in info_dict:
+            item_info = info_dict[kid]
+            total_info = []
+            for i in range(len(item_info)):
+                total_info.append(0.1*float(item_info[i])+0.3*float(d_item_info[i]))
+            day3_dict[kid] = total_info
+        else:
+            total_info = []
+            for i  in range(len(d_item_info)):
+                total_info.append(float(d_item_info[i]))
+            day3_dict[kid] = total_info
+    print("info3:", len(day3_dict))
+    f3 = open("./data/merge_3_days_score_"+nowdate, 'w')
+    res_dict = {}
+    for k, v in day3_dict.items():
+        score = v[2]+0.1*v[3] 
+        new_arr = []
+        new_arr.append(score)
+        for i in range(4):
+            new_arr.append(v[i])
+        #print(v)
+        #print(new_arr)
+        total_item_info = json.dumps(new_arr)
+        kid2 = "kp_3:"+k     
+        f3.write(kid2+"\t"+total_item_info+"\n")
+        res_dict[kid2] = total_item_info
+    if len(res_dict)>0:
+        redis_helper = RedisHelper()
+        redis_helper.update_batch_setnx_key(res_dict, 60*60*24*15)
+    f2.close()
+    f3.close()
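
Restating the cascade in this new script with fabricated numbers (assumed semantics: hour stats nudge the 1-day stats at 0.001 weight, that blend meets the 3-day vector at 0.1/0.3, and the stored lead score is share_score plus 0.1 * backrate):

    import json

    blend_1day = [0.12, 0.05, 0.08, 0.06]  # hour + 1-day blend from the first loop
    day3 = [0.10, 0.04, 0.07, 0.05]        # raw 3-day vector
    v = [0.1 * a + 0.3 * b for a, b in zip(blend_1day, day3)]
    score = v[2] + 0.1 * v[3]              # share_score + 0.1 * backrate
    print("kp_3:12345", json.dumps([score] + v[:4]))  # "12345" is a stand-in vid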

+ 53 - 0
export_3_day.py

@@ -0,0 +1,53 @@
+#coding utf-8
+from odps import ODPS
+from config import set_config
+import datetime
+import pandas as pd
+from collections import defaultdict
+import sys
+
+config_ = set_config()
+
+odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project="loghubods",
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'])
+
+
+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
+                       pool_maxsize=1000, pool_connections=1000):
+    """
+    Fetch data from ODPS.
+    :param date: date, string formatted '%Y%m%d'
+    :param project: project name, string
+    :param table: table name, string
+    :param connect_timeout: connection timeout
+    :param read_timeout: read timeout
+    :param pool_maxsize:
+    :param pool_connections:
+    :return: records
+    """
+    records = odps.read_table(name=table, partition='dt=%s' % date)
+    return records
+
+def exe_sql(sql):    
+    data = []
+    with odps.execute_sql(sql).open_reader() as reader:
+        d = defaultdict(list)  # defaultdict: one list per column name
+        for record in reader:
+            for res in record:
+                d[res[0]].append(res[1])  # each element is a (column, value) pair; group values by column
+        data = pd.DataFrame.from_dict(d, orient='index').T  # build a DataFrame and transpose so each record becomes one row
+    return data
+
+
+if __name__=="__main__":
+    project = 'loghubods'
+    now_date=sys.argv[1]
+    print("now date:", now_date)
+    table = 'loghubods.video_data_each_hour_dataset_3days_total_apptype'
+    sql = "select apptype, videoid, preview_users, preview_times, view_users, view_times, play_users, play_times, share_users, share_times, return_users from loghubods.video_data_each_hour_dataset_3days_total_apptype where dt="+now_date
+    print(sql)
+    data = exe_sql(sql)
+    data.to_csv("./data/3_days_video_data_"+now_date, sep='\t', index=None) 

+ 53 - 0
export_7_day.py

@@ -0,0 +1,53 @@
+#coding utf-8
+from odps import ODPS
+from config import set_config
+import datetime
+import pandas as pd
+from collections import defaultdict
+import sys
+
+config_ = set_config()
+
+odps = ODPS(
+        access_id=config_.ODPS_CONFIG['ACCESSID'],
+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
+        project="loghubods",
+        endpoint=config_.ODPS_CONFIG['ENDPOINT'])
+
+
+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
+                       pool_maxsize=1000, pool_connections=1000):
+    """
+    Fetch data from ODPS.
+    :param date: date, string formatted '%Y%m%d'
+    :param project: project name, string
+    :param table: table name, string
+    :param connect_timeout: connection timeout
+    :param read_timeout: read timeout
+    :param pool_maxsize:
+    :param pool_connections:
+    :return: records
+    """
+    records = odps.read_table(name=table, partition='dt=%s' % date)
+    return records
+
+def exe_sql(sql):    
+    data = []
+    with odps.execute_sql(sql).open_reader() as reader:
+        d = defaultdict(list)  # defaultdict: one list per column name
+        for record in reader:
+            for res in record:
+                d[res[0]].append(res[1])  # each element is a (column, value) pair; group values by column
+        data = pd.DataFrame.from_dict(d, orient='index').T  # build a DataFrame and transpose so each record becomes one row
+    return data
+
+
+if __name__=="__main__":
+    project = 'loghubods'
+    now_date=sys.argv[1]
+    print("now date:", now_date)
+    table = 'loghubods.video_data_each_hour_dataset_7days_total_apptype'
+    sql = "select apptype, videoid, preview_users, preview_times, view_users, view_times, play_users, play_times, share_users, share_times, return_users from loghubods.video_data_each_hour_dataset_7days_total_apptype where dt="+now_date
+    print(sql)
+    data = exe_sql(sql)
+    data.to_csv("./data/7_days_video_data_"+now_date, sep='\t', index=None) 

+ 95 - 0
get3HotRecall.py

@@ -0,0 +1,95 @@
+#coding utf-8
+import sys
+from operator import itemgetter
+import json
+import pandas as pd
+from db_help import RedisHelper
+
+if __name__=="__main__":
+    #1.load data
+    nowdate=sys.argv[1]
+    vlog='0'
+    love_live = 4
+    data_path = "./data/3_days_video_data_"+nowdate
+    f = open(data_path)
+    #data = pd.read_csv(data_path, encoding="utf-8", sep='\t')
+    #print(data)
+    index = 0
+    data_dict = {}
+    redis_helper = RedisHelper()
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        if index==0:
+            index += 1
+            continue
+        index +=1
+        items = line.strip().split("\t")
+        if len(items)<11:
+            continue
+        vid = int(items[1])
+        view_users = int(items[4])
+        view_pv = int(items[5])
+        play_users = int(items[6])
+        play_pv = int(items[7])
+        share_users = int(items[8])
+        share_pv = int(items[9])
+        return_users = int(items[10])
+        #print(items)
+        if vid not in data_dict:
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+        else:
+            item_info = data_dict[vid]
+            view_users = item_info[0]+view_users
+            view_pv = item_info[1]+view_pv
+            play_users = item_info[2]+play_users
+            play_pv = item_info[3]+play_pv
+            share_users = item_info[4]+share_users
+            share_pv = item_info[5]+share_pv
+            return_users = item_info[6]+return_users
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+    #print(data_dict.items())
+    info_dict = {}
+    data_path = "./data/3_days_recall_hot_"+nowdate
+    f = open(data_path, 'w')
+    for k, v in data_dict.items():
+        #print(v)
+        return_users = v[6]
+        #print(return_users)
+        view_users = v[0]
+        view_pv = v[1]
+        if view_pv<100 or view_users<10:
+            continue
+        share_pv = v[5]
+        share_users = v[4]
+        play_users = v[2]
+        play_pv = v[3]
+        #print("return_users:", return_users) 
+        k_score = float(return_users)/(float(view_users)+30)
+        #print(k_score)
+        share_score = float(share_pv)/(float(view_pv)+100)
+        backrate = float(return_users)/(float(view_users)+30)
+        ctr_score = float(play_pv)/float(view_pv+100)
+        if ctr_score<=0.5:
+            continue
+        #print(k, k_score, share_score*backrate, share_score, backrate) 
+        score_info = [k_score, share_score*backrate, share_score, backrate, ctr_score, view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users]
+        info_dict[k] = score_info
+    sorted_v = sorted(info_dict.items(), key=lambda s:s[1][1], reverse=True) 
+    print("sorted_v:", sorted_v[:30])
+    recall_name = "hot_3day:"
+    hot_recall_res = []
+    for item in sorted_v[:30]:
+        hot_recall_res.append((item[0], item[1][1]))
+    if len(hot_recall_res)>10:
+        score_info = json.dumps(hot_recall_res)
+        print("score_info:", score_info)
+        redis_helper.set_data_to_redis(recall_name, score_info, 60*60*24*15)
+        f.write(recall_name+"\t"+score_info+"\n")
+    f.close()
+    #info_dict[k] = score_info
+    #f.write(k+"\t"+score_info+"\n")
+    #redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15) 
+    #f.close()
+
+     
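
A compact restatement of the selection above, with fabricated vids and scores: videos are ranked by the share_score*backrate term at index 1, and the top 30 (vid, score) pairs are stored as one JSON blob under the "hot_3day:" key (only when more than 10 survive the traffic and CTR filters):

    import json

    info_dict = {101: [0.10, 0.30, 0.50, 0.60],
                 102: [0.05, 0.45, 0.90, 0.50],
                 103: [0.02, 0.10, 0.40, 0.25]}
    sorted_v = sorted(info_dict.items(), key=lambda s: s[1][1], reverse=True)
    hot_recall_res = [(vid, scores[1]) for vid, scores in sorted_v[:30]]
    print(json.dumps(hot_recall_res))  # [[102, 0.45], [101, 0.3], [103, 0.1]]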

+ 94 - 0
get7HotRecall.py

@@ -0,0 +1,94 @@
+#coding utf-8
+import sys
+from operator import itemgetter
+import json
+import pandas as pd
+from db_help import RedisHelper
+
+if __name__=="__main__":
+    #1.load data
+    nowdate=sys.argv[1]
+    vlog='0'
+    love_live = 4
+    data_path = "./data/7_days_video_data_"+nowdate
+    f = open(data_path)
+    #data = pd.read_csv(data_path, encoding="utf-8", sep='\t')
+    #print(data)
+    index = 0
+    data_dict = {}
+    redis_helper = RedisHelper()
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        if index==0:
+            index += 1
+            continue
+        index +=1
+        items = line.strip().split("\t")
+        if len(items)<11:
+            continue
+        vid = int(items[1])
+        view_users = int(items[4])
+        view_pv = int(items[5])
+        play_users = int(items[6])
+        play_pv = int(items[7])
+        share_users = int(items[8])
+        share_pv = int(items[9])
+        return_users = int(items[10])
+        #print(items)
+        if vid not in data_dict:
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+        else:
+            item_info = data_dict[vid]
+            view_users = item_info[0]+view_users
+            view_pv = item_info[1]+view_pv
+            play_users = item_info[2]+play_users
+            play_pv = item_info[3]+play_pv
+            share_users = item_info[4]+share_users
+            share_pv = item_info[5]+share_pv
+            return_users = item_info[6]+return_users
+            data_dict[vid] = (view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users)
+    #print(data_dict.items())
+    info_dict = {}
+    data_path = "./data/7_days_recall_hot_"+nowdate
+    f = open(data_path, 'w')
+    for k, v in data_dict.items():
+        #print(v)
+        return_users = v[6]
+        #print(return_users)
+        view_users = v[0]
+        view_pv = v[1]
+        if view_pv<100 or view_users<10:
+            continue
+        share_pv = v[5]
+        share_users = v[4]
+        play_users = v[2]
+        play_pv = v[3]
+        #print("return_users:", return_users) 
+        k_score = float(return_users)/(float(view_users)+5)
+        #print(k_score)
+        share_score = float(share_pv)/(float(view_pv)+5)
+        backrate = float(return_users)/(float(view_users)+5)
+        ctr_score = float(play_pv)/float(view_pv+5)
+        if ctr_score<=0.5:
+            continue
+        #print(k, k_score, share_score*backrate, share_score, backrate) 
+        score_info = [k_score, share_score*backrate, share_score, backrate, ctr_score, view_users, view_pv, play_users, play_pv, share_users, share_pv, return_users]
+        info_dict[k] = score_info
+    sorted_v = sorted(info_dict.items(), key=lambda s:s[1][1], reverse=True) 
+    print("sorted_v:", sorted_v[:100])
+    recall_name = "hot_7day:"
+    hot_recall_res = []
+    for item in sorted_v[:100]:
+        hot_recall_res.append((item[0], item[1][1]))
+    if len(hot_recall_res)>10:
+        score_info = json.dumps(hot_recall_res)
+        print("score_info:", score_info)
+        redis_helper.set_data_to_redis(recall_name, score_info, 60*60*24*15)
+        f.write(recall_name+"\t"+score_info+"\n") 
+    #info_dict[k] = score_info
+    #f.write(k+"\t"+score_info+"\n")
+    #redis_helper.update_batch_setnx_key(info_dict, 60*60*24*15) 
+    f.close()
+
+