# luojunhui / LongArticlesJob
"""
@author: luojunhui
"""

import os
import json

from applications.utils import download_gzh_video
from applications.utils import download_toutiao_video
from applications.utils import upload_to_oss

from config import apolloConfig

# Configuration accessor, instantiated once when this module is imported.
my_config = apolloConfig()

# Shared "nothing to return" value handed back by the pipeline below whenever
# an item is skipped. NOTE(review): this is a single mutable dict shared by
# every caller — callers should treat it as read-only.
empty_dict = {}
# Sensitive words, parsed from the JSON text stored under the
# "sensitive_word_list" config key. Presumably a JSON array of strings, since
# it is iterated and substring-matched against titles — TODO confirm.
sensitive_word_list = json.loads(my_config.getConfigValue("sensitive_word_list"))


def whether_title_sensitive(title: str) -> bool:
    """
    Report whether the title contains any configured sensitive word.

    :param title: title text to check
    :return: True if at least one entry of ``sensitive_word_list`` occurs
        as a substring of ``title``, otherwise False
    """
    return any(banned_word in title for banned_word in sensitive_word_list)


def whether_duplicate_video_title(video_title: str, db_client) -> bool:
    """
    Check whether a row with this title already exists in the
    publish_single_video_source table.

    :param video_title: title to look up (compared against article_title)
    :param db_client: object exposing ``fetch(query=..., params=...)``
    :return: True when the lookup yields a truthy result, otherwise False
    """
    # The title is bound as a query parameter, never interpolated into the SQL.
    lookup_query = """
        select id from publish_single_video_source
        where article_title = %s;
    """
    matched_rows = db_client.fetch(query=lookup_query, params=(video_title,))
    return bool(matched_rows)


def scrape_video_entities_process(video_item, db_client) -> dict:
    """
    video crawler pipeline
    """
    video_title = video_item["article_title"]
    # whether title sensitive
    if whether_title_sensitive(video_title):
        return empty_dict

    # whether duplicate video title
    if whether_duplicate_video_title(video_title, db_client):
        return empty_dict

    # download video
    article_url = video_item["article_url"]
    platform = video_item["platform"]

    match platform:
        case "toutiao":
            video_path = download_toutiao_video(article_url)
        case "gzh":
            video_path = download_gzh_video(article_url)
        case "hksp":
            video_path = ""
        case "sph":
            video_path = ""
        case _:
            return empty_dict

    if video_path:
        # upload video to oss
        oss_path = upload_to_oss(video_path)
        video_item["video_oss_path"] = oss_path
        os.remove(video_path)
        return video_item
    else:
        return empty_dict