Pārlūkot izejas kodu

add political videos filter

liqian 2 gadi atpakaļ
vecāks
revīzija
550c010bb5
4 mainītis faili ar 86 papildinājumiem un 15 dzēšanām
  1. 17 4
      config.py
  2. 27 11
      region_rule_rank_h.py
  3. 22 0
      shield_videos.py
  4. 20 0
      utils.py

+ 17 - 4
config.py

@@ -294,10 +294,16 @@ class BaseConfig(object):
             #           'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule2'},
             'rule4': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
                       'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule3'},
+            # 涉政视频过滤
+            'rule4-1': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                        'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule3', 'political_filter': True},
             # 'rule6': {'view_type': 'preview', 'platform_return_rate': 0.001,
             #           'region_24h_rule_key': 'rule3', '24h_rule_key': 'rule2'},
             'rule7': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
                       'region_24h_rule_key': 'rule4', '24h_rule_key': 'rule4', 'merge_func': 2},
+            'rule7-1': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                        'region_24h_rule_key': 'rule4', '24h_rule_key': 'rule4', 'merge_func': 2,
+                        'political_filter': True},
             'rule8': {'view_type': 'preview', 'platform_return_rate': 0.001,
                       'region_24h_rule_key': 'rule5', '24h_rule_key': 'rule4', 'merge_func': 2},
             # 'rule9': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
@@ -318,19 +324,20 @@ class BaseConfig(object):
         },
         'data_params': DATA_PARAMS,
         'params_list': [
-            {'data': 'data1', 'rule': 'rule4'},
+            {'data': 'data1', 'rule': 'rule4'},  # 095 vlog
+            {'data': 'data1', 'rule': 'rule4-1'},  # 095-1
             # {'data': 'data2', 'rule': 'rule4'},
-            {'data': 'data2', 'rule': 'rule7'},
+            {'data': 'data2', 'rule': 'rule7-1'},  # 121 内容精选
             # {'data': 'data3', 'rule': 'rule7'},
             # {'data': 'data4', 'rule': 'rule7'},
             # {'data': 'data6', 'rule': 'rule7'},
-            {'data': 'data7', 'rule': 'rule8'},
+            {'data': 'data7', 'rule': 'rule8'},  # 票圈视频APP 10003.110156
             # {'data': 'data1', 'rule': 'rule9'},
             # {'data': 'data1', 'rule': 'rule10'},
             # {'data': 'data1', 'rule': 'rule11'},
             # {'data': 'data8', 'rule': 'rule7'},
             # {'data': 'data9', 'rule': 'rule7'},
-            {'data': 'data10', 'rule': 'rule7'},
+            {'data': 'data10', 'rule': 'rule7'},  # 144 票圈视频
             # {'data': 'data11', 'rule': 'rule7'},
             # {'data': 'data12', 'rule': 'rule7'},
             # {'data': 'data13', 'rule': 'rule7'},
@@ -575,6 +582,12 @@ class BaseConfig(object):
         CITY_CODE['成都']: [SPECIAL_AREA_LIMIT_KEY_NAME, ],
     }
 
+    # 涉政视频列表,除票圈vlog、票圈视频、票圈视频APP外其他端都屏蔽
+    PROJECT_POLITICAL_VIDEOS = 'loghubods'
+    TABLE_POLITICAL_VIDEOS = 'shielded_politics_videolist'
+    # 涉政视频列表redis存储key
+    POLITICAL_VIDEOS_KEY_NAME = 'political:videos'
+
     # 宗教视频更新使用数据
     RELIGION_VIDEOS_PROJECT = 'loghubods'
     RELIGION_VIDEOS_TABLE = 'religion_video'

+ 27 - 11
region_rule_rank_h.py

@@ -17,7 +17,7 @@ from functools import reduce
 from odps import ODPS
 from threading import Timer, Thread
 from utils import MysqlHelper, RedisHelper, get_data_from_odps, filter_video_status, filter_shield_video, \
-    check_table_partition_exits, filter_video_status_app, send_msg_to_feishu
+    check_table_partition_exits, filter_video_status_app, send_msg_to_feishu, filter_political_videos
 from config import set_config
 from log import Log
 from check_video_limit_distribute import update_limit_video_score
@@ -249,6 +249,13 @@ def video_rank(df, now_date, now_h, rule_key, param, region, data_key, rule_rank
         filtered_videos = filter_shield_video(video_ids=filtered_videos, shield_key_name_list=shield_key_name_list)
         # log_.info(f"shield filtered_videos count = {len(filtered_videos)}")
 
+    # 涉政视频过滤
+    political_filter = param.get('political_filter', None)
+    if political_filter is True:
+        log_.info(f"political filter videos count = {len(filtered_videos)}")
+        filtered_videos = filter_political_videos(video_ids=filtered_videos)
+        log_.info(f"political filtered videos count = {len(filtered_videos)}")
+
     # 写入对应的redis
     h_video_ids = []
     by_30day_rule_key = param.get('30day_rule_key', None)
@@ -283,10 +290,10 @@ def video_rank(df, now_date, now_h, rule_key, param, region, data_key, rule_rank
     # 与其他召回视频池去重,存入对应的redis
     dup_to_redis(h_video_ids=h_video_ids, now_date=now_date, now_h=now_h, rule_key=rule_key,
                  region_24h_rule_key=region_24h_rule_key, by_24h_rule_key=by_24h_rule_key, by_48h_rule_key=by_48h_rule_key,
-                 region=region, data_key=data_key, rule_rank_h_flag=rule_rank_h_flag)
+                 region=region, data_key=data_key, rule_rank_h_flag=rule_rank_h_flag, political_filter=political_filter)
 
 
-def dup_data(h_video_ids, initial_key_name, dup_key_name, region):
+def dup_data(h_video_ids, initial_key_name, dup_key_name, region, political_filter):
     redis_helper = RedisHelper()
     if redis_helper.key_exists(key_name=initial_key_name):
         initial_data = redis_helper.get_all_data_from_zset(key_name=initial_key_name, with_scores=True)
@@ -294,7 +301,11 @@ def dup_data(h_video_ids, initial_key_name, dup_key_name, region):
         initial_video_ids = [int(video_id) for video_id, _ in initial_data]
         shield_key_name_list = config_.SHIELD_CONFIG.get(region, None)
         if shield_key_name_list is not None:
-            initial_video_ids = filter_shield_video(video_ids=initial_video_ids, shield_key_name_list=shield_key_name_list)
+            initial_video_ids = filter_shield_video(video_ids=initial_video_ids,
+                                                    shield_key_name_list=shield_key_name_list)
+        # 涉政视频过滤
+        if political_filter is True:
+            initial_video_ids = filter_political_videos(video_ids=initial_video_ids)
 
         dup_data = {}
         for video_id, score in initial_data:
@@ -309,7 +320,8 @@ def dup_data(h_video_ids, initial_key_name, dup_key_name, region):
     return h_video_ids
 
 
-def dup_to_redis(h_video_ids, now_date, now_h, rule_key, region_24h_rule_key, by_24h_rule_key, by_48h_rule_key, region, data_key, rule_rank_h_flag):
+def dup_to_redis(h_video_ids, now_date, now_h, rule_key, region_24h_rule_key, by_24h_rule_key, by_48h_rule_key,
+                 region, data_key, rule_rank_h_flag, political_filter):
     """将地域分组小时级数据与其他召回视频池去重,存入对应的redis"""
     # ##### 去重更新地域分组小时级24h列表,并另存为redis中
     region_24h_key_name = \
@@ -319,7 +331,7 @@ def dup_to_redis(h_video_ids, now_date, now_h, rule_key, region_24h_rule_key, by
         f"{config_.RECALL_KEY_NAME_PREFIX_DUP1_REGION_24H_H}{region}:{data_key}:{rule_key}:" \
         f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
     h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=region_24h_key_name,
-                           dup_key_name=region_24h_dup_key_name, region=region)
+                           dup_key_name=region_24h_dup_key_name, region=region, political_filter=political_filter)
 
     if rule_rank_h_flag == '48h':
 
@@ -330,7 +342,7 @@ def dup_to_redis(h_video_ids, now_date, now_h, rule_key, region_24h_rule_key, by
             f"{config_.RECALL_KEY_NAME_PREFIX_DUP2_REGION_48H_H}{region}:{data_key}:{rule_key}:" \
             f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
         h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=h_48h_key_name,
-                               dup_key_name=h_48h_dup_key_name, region=region)
+                               dup_key_name=h_48h_dup_key_name, region=region, political_filter=political_filter)
 
         # ##### 去重小程序相对48h 筛选后剩余数据 更新结果,并另存为redis中
         if by_48h_rule_key == 'rule1':
@@ -340,7 +352,8 @@ def dup_to_redis(h_video_ids, now_date, now_h, rule_key, region_24h_rule_key, by
                 f"{config_.RECALL_KEY_NAME_PREFIX_DUP3_REGION_48H_H}{region}:{data_key}:{rule_key}:" \
                 f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
             h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=other_h_48h_key_name,
-                                   dup_key_name=other_h_48h_dup_key_name, region=region)
+                                   dup_key_name=other_h_48h_dup_key_name, region=region,
+                                   political_filter=political_filter)
 
     else:
         # ##### 去重小程序相对24h更新结果,并另存为redis中
@@ -350,7 +363,7 @@ def dup_to_redis(h_video_ids, now_date, now_h, rule_key, region_24h_rule_key, by
             f"{config_.RECALL_KEY_NAME_PREFIX_DUP2_REGION_24H_H}{region}:{data_key}:{rule_key}:" \
             f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
         h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=h_24h_key_name,
-                               dup_key_name=h_24h_dup_key_name, region=region)
+                               dup_key_name=h_24h_dup_key_name, region=region, political_filter=political_filter)
 
         # ##### 去重小程序相对24h 筛选后剩余数据 更新结果,并另存为redis中
         # if by_24h_rule_key in ['rule3', 'rule4']:
@@ -360,7 +373,7 @@ def dup_to_redis(h_video_ids, now_date, now_h, rule_key, region_24h_rule_key, by
             f"{config_.RECALL_KEY_NAME_PREFIX_DUP3_REGION_24H_H}{region}:{data_key}:{rule_key}:" \
             f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
         h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=other_h_24h_key_name,
-                               dup_key_name=other_h_24h_dup_key_name, region=region)
+                               dup_key_name=other_h_24h_dup_key_name, region=region, political_filter=political_filter)
 
     # ##### 去重小程序模型更新结果,并另存为redis中
     # model_key_name = get_rov_redis_key(now_date=now_date)
@@ -681,6 +694,8 @@ def h_rank_bottom(now_date, now_h, rule_params, region_code_list, rule_rank_h_fl
         region_24h_rule_key = rule_param.get('region_24h_rule_key', 'rule1')
         by_24h_rule_key = rule_param.get('24h_rule_key', None)
         by_48h_rule_key = rule_param.get('48h_rule_key', None)
+        # 涉政视频过滤
+        political_filter = param.get('political_filter', None)
         for region in region_code_list:
             log_.info(f"region = {region}")
             key_name = f"{key_prefix}{region}:{data_key}:{rule_key}:{redis_dt}:{redis_h}"
@@ -701,7 +716,8 @@ def h_rank_bottom(now_date, now_h, rule_params, region_code_list, rule_rank_h_fl
             dup_to_redis(h_video_ids=h_video_ids, now_date=now_date, now_h=now_h, rule_key=rule_key,
                          region_24h_rule_key=region_24h_rule_key, region=region,
                          data_key=data_key, by_24h_rule_key=by_24h_rule_key,
-                         by_48h_rule_key=by_48h_rule_key, rule_rank_h_flag=rule_rank_h_flag)
+                         by_48h_rule_key=by_48h_rule_key, rule_rank_h_flag=rule_rank_h_flag,
+                         political_filter=political_filter)
         # 特殊城市视频数据准备
         for region, city_list in config_.REGION_CITY_MAPPING.items():
             t = [

+ 22 - 0
shield_videos.py

@@ -79,12 +79,34 @@ def get_special_area_limit_videos():
         log_.error(traceback.format_exc())
 
 
+def get_political_videos():
+    """获取涉政视频并存入redis"""
+    try:
+        # 获取涉政视频
+        sql = f"SELECT videoid FROM {config_.PROJECT_POLITICAL_VIDEOS}.{config_.TABLE_POLITICAL_VIDEOS};"
+        records = execute_sql_from_odps(project=config_.PROJECT_POLITICAL_VIDEOS, sql=sql)
+        video_id_list = []
+        with records.open_reader() as reader:
+            for record in reader:
+                video_id = int(record['videoid'])
+                video_id_list.append(video_id)
+        log_.info(f"political videos count = {len(video_id_list)}")
+        # 存入redis
+        if len(video_id_list) > 0:
+            redis_helper.del_keys(key_name=config_.POLITICAL_VIDEOS_KEY_NAME)
+            redis_helper.add_data_with_set(key_name=config_.POLITICAL_VIDEOS_KEY_NAME, values=video_id_list,
+                                           expire_time=25 * 3600)
+    except Exception as e:
+        log_.error(traceback.format_exc())
+
+
 def main():  # entry point: run the video-list refresh jobs of this module
     now_h = datetime.datetime.now().hour
     if now_h == 4:  # this job only runs when the current local hour is 4
         get_benshanzhufu_videos()
     get_xng_videos()
     get_special_area_limit_videos()
+    get_political_videos()  # called on every run, regardless of the hour
 
 
 if __name__ == '__main__':

+ 20 - 0
utils.py

@@ -391,6 +391,26 @@ def filter_shield_video(video_ids, shield_key_name_list):
     return video_ids
 
 
+def filter_political_videos(video_ids):
+    """
+    过滤涉政视频
+    :param video_ids: 需过滤的视频列表 type-list
+    :return: filtered_video_ids  过滤后的列表  type-list
+    """
+    if len(video_ids) == 0:
+        return video_ids
+    # 根据Redis缓存中的数据过滤
+    redis_helper = RedisHelper()
+    political_key_name = f""
+    political_videos_list = redis_helper.get_data_from_set(key_name=political_key_name)
+    if not political_videos_list:
+        return video_ids
+    political_videos = [int(video) for video in political_videos_list]
+    filtered_video_ids = [int(video_id) for video_id in video_ids if int(video_id) not in political_videos]
+
+    return filtered_video_ids
+
+
 def update_video_w_h_rate(video_ids, key_name):
     """
     获取横屏视频的宽高比,并存入redis中 (width/height>1)