|
@@ -7,31 +7,34 @@ from pymysql.cursors import DictCursor
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
from applications import log
|
|
|
+from applications.const import SohuVideoCrawlerConst
|
|
|
from applications.db import DatabaseConnector
|
|
|
from applications.pipeline import scrape_video_entities_process
|
|
|
from applications.utils import Item
|
|
|
from applications.utils import str_to_md5
|
|
|
from applications.utils import insert_into_single_video_source_table
|
|
|
|
|
|
+from coldStartTasks.crawler.sohu import get_video_detail
|
|
|
from coldStartTasks.crawler.sohu import get_hot_point_videos
|
|
|
from coldStartTasks.crawler.sohu import get_recommendation_video_list
|
|
|
from coldStartTasks.crawler.sohu import get_user_homepage_videos
|
|
|
|
|
|
from config import long_articles_config
|
|
|
|
|
|
+const = SohuVideoCrawlerConst()
|
|
|
+
|
|
|
|
|
|
class CrawlerSohuVideos:
|
|
|
def __init__(self):
|
|
|
self.db_client = DatabaseConnector(long_articles_config)
|
|
|
self.db_client.connect()
|
|
|
- self.platform = "sohu"
|
|
|
|
|
|
def crawler_each_video(self, video_data):
|
|
|
"""
|
|
|
crawler each video data
|
|
|
"""
|
|
|
video_item = Item()
|
|
|
- unique_id = f"{self.platform}-{video_data['id']}"
|
|
|
+ unique_id = f"{const.PLATFORM}-{video_data['id']}"
|
|
|
|
|
|
# add info into item
|
|
|
video_item.add("content_trace_id", "video{}".format(str_to_md5(unique_id)))
|
|
@@ -40,9 +43,9 @@ class CrawlerSohuVideos:
|
|
|
video_item.add("out_account_id", video_data["authorId"])
|
|
|
video_item.add("out_account_name", video_data["authorName"])
|
|
|
video_item.add("publish_timestamp", video_data["postTime"] / 1000)
|
|
|
- video_item.add("platform", self.platform)
|
|
|
+ video_item.add("platform", const.PLATFORM)
|
|
|
video_item.add("article_url", video_data["videoUrl"])
|
|
|
- video_item.add("source_account", 0)
|
|
|
+ video_item.add("source_account", const.GET_RECOMMEND_INIT_STATUS)
|
|
|
video_item.add("crawler_timestamp", int(time.time()))
|
|
|
|
|
|
# check item before insert
|
|
@@ -61,10 +64,34 @@ class CrawlerSohuVideos:
|
|
|
"error": str(e),
|
|
|
"traceback": traceback.format_exc(),
|
|
|
}
|
|
|
- print(detail)
|
|
|
+ log(
|
|
|
+ task="crawler_sohu_videos",
|
|
|
+ function="crawler_each_video",
|
|
|
+ message="crawler_sohu_videos failed",
|
|
|
+ status="failed",
|
|
|
+ data=detail,
|
|
|
+ )
|
|
|
|
|
|
|
|
|
class CrawlerSohuHotVideos(CrawlerSohuVideos):
|
|
|
+
|
|
|
+    # Normalize a hot-video object into the dict shape crawler_each_video expects
|
|
|
+ def process_hot_video_obj(self, video_obj):
|
|
|
+ """
|
|
|
+ process hot video obj
|
|
|
+ """
|
|
|
+ article_url = f"https://www.sohu.com{video_obj['url']}"
|
|
|
+ video_detail_response = get_video_detail(article_url=article_url)
|
|
|
+ item_obj = {
|
|
|
+ "id": video_obj["id"],
|
|
|
+ "title": video_obj["title"],
|
|
|
+ "authorId": video_detail_response["account_id"],
|
|
|
+ "authorName": video_detail_response["account_name"],
|
|
|
+ "postTime": video_detail_response["publish_timestamp"],
|
|
|
+ "videoUrl": video_detail_response["video_url"],
|
|
|
+ }
|
|
|
+ self.crawler_each_video(item_obj)
|
|
|
+
|
|
|
def deal(self):
|
|
|
"""
|
|
|
crawler sohu hot videos every day
|
|
@@ -75,7 +102,7 @@ class CrawlerSohuHotVideos(CrawlerSohuVideos):
|
|
|
]["list"]
|
|
|
for video in tqdm(hot_point_video_list, desc="crawler sohu hot videos"):
|
|
|
try:
|
|
|
- self.crawler_each_video(video)
|
|
|
+ self.process_hot_video_obj(video)
|
|
|
|
|
|
except Exception as e:
|
|
|
log(
|
|
@@ -100,7 +127,11 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
|
|
|
fetch_query = f"""
|
|
|
select id, out_account_id, url_unique_md5, article_title, score
|
|
|
from publish_single_video_source
|
|
|
- where platform = 'sohu' and source_account = 0 and score > 0.6 and audit_status = 1 and bad_status = 0;
|
|
|
+ where platform = '{const.PLATFORM}'
|
|
|
+ and source_account = {const.GET_RECOMMEND_INIT_STATUS}
|
|
|
+ and score > {const.GET_RECOMMEND_THRESHOLD_SCORE}
|
|
|
+ and audit_status = {const.AUDIT_SUCCESS_STATUS}
|
|
|
+ and bad_status = {const.VIDEO_NOT_BAD_STATUS};
|
|
|
"""
|
|
|
seed_videos = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
|
|
|
return seed_videos
|
|
@@ -112,8 +143,7 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
|
|
|
author_id = seed_video["out_account_id"]
|
|
|
article_id = seed_video["url_unique_md5"]
|
|
|
outside_url = f"https://www.sohu.com/a/{article_id}_{author_id}"
|
|
|
- page_list = [i for i in range(1, 8)]
|
|
|
- for page in page_list:
|
|
|
+ for page in const.PAGE_LIST:
|
|
|
try:
|
|
|
response = get_recommendation_video_list(
|
|
|
outside_url, author_id, article_id, page
|
|
@@ -124,9 +154,17 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
|
|
|
self.crawler_each_video(video)
|
|
|
|
|
|
except Exception as e:
|
|
|
- print(e)
|
|
|
- print(traceback.format_exc())
|
|
|
- continue
|
|
|
+ log(
|
|
|
+ task="crawler_sohu_videos",
|
|
|
+ function="get_each_video_recommendation",
|
|
|
+ message="get_each_video_recommendation failed",
|
|
|
+ status="failed",
|
|
|
+ data={
|
|
|
+ "error": str(e),
|
|
|
+ "traceback": traceback.format_exc(),
|
|
|
+ "page": page,
|
|
|
+ },
|
|
|
+ )
|
|
|
|
|
|
def update_seed_video_status(self, task_id: int) -> int:
|
|
|
"""
|
|
@@ -135,7 +173,14 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
|
|
|
update_query = f"""
|
|
|
update publish_single_video_source set source_account = %s where id = %s and source_account = %s;
|
|
|
"""
|
|
|
- return self.db_client.save(query=update_query, params=(1, task_id, 0))
|
|
|
+ return self.db_client.save(
|
|
|
+ query=update_query,
|
|
|
+ params=(
|
|
|
+ const.GET_RECOMMEND_SUCCESS_STATUS,
|
|
|
+ task_id,
|
|
|
+ const.GET_RECOMMEND_INIT_STATUS,
|
|
|
+ ),
|
|
|
+ )
|
|
|
|
|
|
def deal(self):
|
|
|
task_list = self.fetch_seed_videos()
|
|
@@ -143,6 +188,7 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
|
|
|
try:
|
|
|
self.get_each_video_recommendation(task)
|
|
|
self.update_seed_video_status(task_id=task["id"])
|
|
|
+
|
|
|
except Exception as e:
|
|
|
log(
|
|
|
task="crawler_sohu_videos",
|
|
@@ -158,11 +204,15 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
|
|
|
|
|
|
|
|
|
class CrawlerSohuUserPageVideos(CrawlerSohuVideos):
|
|
|
+ """
|
|
|
+    Personal homepages are mostly articles and the account system is not built yet, so this release skips crawling them; revisit if needed later.
|
|
|
+ """
|
|
|
+
|
|
|
def get_author_list(self):
|
|
|
"""
|
|
|
get author list from database
|
|
|
"""
|
|
|
- return [121644888]
|
|
|
+ return []
|
|
|
|
|
|
def process_each_page(self, response: dict):
|
|
|
"""
|