|
@@ -4,8 +4,6 @@
|
|
|
|
|
|
from __future__ import annotations
|
|
from __future__ import annotations
|
|
|
|
|
|
-import json
|
|
|
|
-import time
|
|
|
|
import datetime
|
|
import datetime
|
|
import traceback
|
|
import traceback
|
|
import numpy as np
|
|
import numpy as np
|
|
@@ -19,6 +17,7 @@ from applications.db import DatabaseConnector
|
|
from applications.pipeline import scrape_account_entities_process
|
|
from applications.pipeline import scrape_account_entities_process
|
|
from applications.utils import Item
|
|
from applications.utils import Item
|
|
from applications.utils import insert_into_associated_recommendation_table
|
|
from applications.utils import insert_into_associated_recommendation_table
|
|
|
|
+from coldStartTasks.crawler.baidu import haokan_search_videos
|
|
from coldStartTasks.crawler.toutiao import get_associated_recommendation
|
|
from coldStartTasks.crawler.toutiao import get_associated_recommendation
|
|
from coldStartTasks.crawler.channels import search_in_wechat_channel
|
|
from coldStartTasks.crawler.channels import search_in_wechat_channel
|
|
from coldStartTasks.crawler.channels import get_channel_account_videos
|
|
from coldStartTasks.crawler.channels import get_channel_account_videos
|
|
@@ -104,10 +103,10 @@ class CrawlerAccounts:
|
|
fetch_query = f"""
|
|
fetch_query = f"""
|
|
select id, account_name, recommend_video_id, title, read_cnt, duration, seed_account, seed_title
|
|
select id, account_name, recommend_video_id, title, read_cnt, duration, seed_account, seed_title
|
|
from video_association
|
|
from video_association
|
|
- where score > 0.5 and platform = '{platform}' and status = 0
|
|
|
|
|
|
+ where score > %s and platform = %s and status = %s
|
|
order by account_name;
|
|
order by account_name;
|
|
"""
|
|
"""
|
|
- fetch_response = self.db_client.fetch(query=fetch_query)
|
|
|
|
|
|
+ fetch_response = self.db_client.fetch(query=fetch_query, params=(0.5, platform, 0))
|
|
return fetch_response
|
|
return fetch_response
|
|
|
|
|
|
def update_video_status(self, video_id_tuple: tuple, ori_status: int, new_status: int) -> int:
|
|
def update_video_status(self, video_id_tuple: tuple, ori_status: int, new_status: int) -> int:
|
|
@@ -129,42 +128,54 @@ class ChannelsAccountCrawler(CrawlerAccounts):
|
|
2. use search api to get accounts
|
|
2. use search api to get accounts
|
|
"""
|
|
"""
|
|
|
|
|
|
- def process_each_video(self, video: dict, seed_title: str):
|
|
|
|
|
|
def process_channels_video(self, video: dict, seed_title: str, account_name: str, account_id: str):
    """
    Build an association item from one channels video and save it.

    Args:
        video: video payload; ``id``, ``objectDesc.description`` and
            ``objectDesc.media[0].VideoPlayLen`` are read from it.
        seed_title: title that was searched to reach this video.
        account_name: name of the account that owns the video.
        account_id: identifier of the account that owns the video.

    Any lookup or validation error propagates to the caller.
    """

    def iter_fields():
        # Generator: every value is looked up only when its turn comes,
        # right before it is added to the item.
        yield "account_name", account_name
        yield "account_id", account_id
        yield "recommend_video_id", video["id"]
        yield "title", video["objectDesc"]["description"]
        yield "duration", video["objectDesc"]["media"][0]["VideoPlayLen"]
        yield "seed_account", "SearchWithOutAccount"
        yield "seed_title", seed_title
        yield "recommend_date", datetime.datetime.today().strftime("%Y-%m-%d")
        yield "platform", "sph"

    record = Item()
    for field_name, field_value in iter_fields():
        record.add(field_name, field_value)

    # validate the assembled item
    record.check(source="association")

    # persist the validated item
    self.insert_video_into_recommend_table(record.item)
|
|
|
|
+
|
|
|
|
+ def process_search_response(self, video: dict, seed_title: str):
|
|
|
|
+ """
|
|
|
|
+ 通过搜索视频的账号名称去搜索账号,并且抓取该账号下的第一页视频
|
|
|
|
+ """
|
|
account_name = video["items"][0]["source"]["title"]
|
|
account_name = video["items"][0]["source"]["title"]
|
|
|
|
+ # search account detail
|
|
search_account_response = search_in_wechat_channel(
|
|
search_account_response = search_in_wechat_channel(
|
|
search_key=account_name, search_type=2
|
|
search_key=account_name, search_type=2
|
|
)
|
|
)
|
|
account_detail = search_account_response["data"]["data"][0]["items"][0]
|
|
account_detail = search_account_response["data"]["data"][0]["items"][0]
|
|
account_id = account_detail["jumpInfo"]["userName"]
|
|
account_id = account_detail["jumpInfo"]["userName"]
|
|
|
|
+
|
|
|
|
+ # fetch account video list
|
|
search_video_response = get_channel_account_videos(account_id)
|
|
search_video_response = get_channel_account_videos(account_id)
|
|
video_list = search_video_response["data"]["object"]
|
|
video_list = search_video_response["data"]["object"]
|
|
- for video in video_list[:5]:
|
|
|
|
- try:
|
|
|
|
- video_item = Item()
|
|
|
|
- video_item.add("account_name", account_name)
|
|
|
|
- video_item.add("account_id", account_id)
|
|
|
|
- video_item.add("recommend_video_id", video["id"])
|
|
|
|
- video_item.add("title", video["objectDesc"]["description"])
|
|
|
|
- video_item.add("duration", video["objectDesc"]["media"][0]["VideoPlayLen"])
|
|
|
|
- video_item.add("seed_account", "SearchWithOutAccount")
|
|
|
|
- video_item.add("seed_title", seed_title)
|
|
|
|
- video_item.add(
|
|
|
|
- "recommend_date", datetime.datetime.today().strftime("%Y-%m-%d")
|
|
|
|
- )
|
|
|
|
- video_item.add("platform", "sph")
|
|
|
|
- # check item
|
|
|
|
- video_item.check(source="association")
|
|
|
|
|
|
|
|
- # save to db
|
|
|
|
- self.insert_video_into_recommend_table(video_item.item)
|
|
|
|
|
|
+ # process and insert each video
|
|
|
|
+ for video in video_list:
|
|
|
|
+ try:
|
|
|
|
+ self.process_channels_video(video, seed_title, account_name, account_id)
|
|
except Exception as e:
|
|
except Exception as e:
|
|
log(
|
|
log(
|
|
- task="channel account crawler",
|
|
|
|
- function="process_each_video",
|
|
|
|
- message="create item and save to db failed",
|
|
|
|
|
|
+ task="crawler_channels_account_videos",
|
|
|
|
+ function="process_channels_video",
|
|
|
|
+ message="process video failed",
|
|
data={
|
|
data={
|
|
"video": video,
|
|
"video": video,
|
|
"error": str(e),
|
|
"error": str(e),
|
|
@@ -172,29 +183,43 @@ class ChannelsAccountCrawler(CrawlerAccounts):
|
|
}
|
|
}
|
|
)
|
|
)
|
|
|
|
|
|
- def search_by_title_from_database(self, title: str) -> None:
|
|
|
|
|
|
def search_video_in_channels(self, title: str) -> None:
    """
    Search channels videos by title and process every hit.

    Each entry under ``subBoxes`` of the first result box is passed to
    ``process_search_response``. A failure on one entry is logged with its
    traceback and does not stop the remaining entries.

    Args:
        title: seed title used as the search key.
    """
    response = search_in_wechat_channel(search_key=title, search_type=1)
    first_box = response["data"]["data"][0]
    for hit in tqdm(first_box["subBoxes"], desc="crawler each video"):
        try:
            self.process_search_response(hit, seed_title=title)
        except Exception as exc:
            failure_detail = {
                "video": hit,
                "error": str(exc),
                "traceback": traceback.format_exc(),
            }
            log(
                task="channels account crawler",
                function="process_search_response",
                message="search by title failed",
                data=failure_detail,
            )
|
|
|
|
|
|
def deal(self):
|
|
def deal(self):
|
|
seed_title_list = self.get_seed_keys()
|
|
seed_title_list = self.get_seed_keys()
|
|
for item in tqdm(seed_title_list, desc="crawler each title"):
|
|
for item in tqdm(seed_title_list, desc="crawler each title"):
|
|
try:
|
|
try:
|
|
- self.search_by_title_from_database(title=item["title"])
|
|
|
|
|
|
+ self.search_video_in_channels(title=item["title"])
|
|
except Exception as e:
|
|
except Exception as e:
|
|
- print(e)
|
|
|
|
|
|
+ log(
|
|
|
|
+ task="channels account crawler",
|
|
|
|
+ function="search_video_in_channels",
|
|
|
|
+ message="search video in channels failed",
|
|
|
|
+ data={
|
|
|
|
+ "title": item["title"],
|
|
|
|
+ "error": str(e),
|
|
|
|
+ "traceback": traceback.format_exc()
|
|
|
|
+ }
|
|
|
|
+ )
|
|
|
|
|
|
# cal similarity score
|
|
# cal similarity score
|
|
video_list = self.get_video_list_without_score()
|
|
video_list = self.get_video_list_without_score()
|
|
@@ -216,7 +241,7 @@ class ToutiaoAccountCrawler(CrawlerAccounts):
|
|
)
|
|
)
|
|
return seed_video_list
|
|
return seed_video_list
|
|
|
|
|
|
- def process_each_video(self, video, seed_account_name, seed_title):
|
|
|
|
|
|
+ def process_toutiao_video(self, video, seed_account_name, seed_title):
|
|
|
|
|
|
# process video item and save to database
|
|
# process video item and save to database
|
|
video_item = Item()
|
|
video_item = Item()
|
|
@@ -247,9 +272,25 @@ class ToutiaoAccountCrawler(CrawlerAccounts):
|
|
recommend_video_list = recommend_response["data"]
|
|
recommend_video_list = recommend_response["data"]
|
|
for video in tqdm(recommend_video_list):
|
|
for video in tqdm(recommend_video_list):
|
|
try:
|
|
try:
|
|
- self.process_each_video(video, seed_account_name, seed_title)
|
|
|
|
|
|
+ self.process_toutiao_video(video, seed_account_name, seed_title)
|
|
|
|
+
|
|
except Exception as e:
|
|
except Exception as e:
|
|
- print(e)
|
|
|
|
|
|
+ log(
|
|
|
|
+ task="toutiao account crawler",
|
|
|
|
+ function="process_toutiao_video",
|
|
|
|
+ message="get recommend video failed",
|
|
|
|
+ data={
|
|
|
|
+ "video": video,
|
|
|
|
+ "error": str(e),
|
|
|
|
+ "traceback": traceback.format_exc()
|
|
|
|
+ }
|
|
|
|
+ )
|
|
|
|
+
|
|
|
|
def get_category_recommend_list(self):
    """
    Placeholder for crawling the per-category recommendation feed.

    Not implemented for now: the category recommendation feed contains
    almost no videos.

    Raises:
        NotImplementedError: always.
    """
    # Raise the exception; returning an instance would hand the caller a
    # truthy object that looks like a successful result.
    raise NotImplementedError()
|
|
|
|
|
|
def deal(self):
|
|
def deal(self):
|
|
|
|
|
|
@@ -277,5 +318,80 @@ class ToutiaoAccountCrawler(CrawlerAccounts):
|
|
|
|
|
|
|
|
|
|
class HaoKanAccountCrawler(CrawlerAccounts):
    """
    Search Haokan videos by seed title and store each hit as an association item.

    Items are written with the platform code ``hksp``.
    """

    # Magnitude suffixes that may appear in the play-count text, largest first.
    _READ_COUNT_UNITS = (("亿", 100_000_000), ("万", 10_000))

    @classmethod
    def _parse_read_count(cls, read_num_text: str) -> int:
        """Convert play-count text such as ``"1.2万次播放"`` into an integer."""
        count_text = read_num_text.replace("次播放", "")
        for suffix, multiplier in cls._READ_COUNT_UNITS:
            if suffix in count_text:
                # round() rather than int(): the float product can land just
                # below the intended integer, and int() would truncate it.
                return round(float(count_text.replace(suffix, "")) * multiplier)
        return int(count_text)

    @staticmethod
    def _parse_duration_seconds(duration_text: str) -> int | None:
        """Return the seconds for ``MM:SS`` text, or None when it has more than two fields."""
        parts = duration_text.split(":")
        if len(parts) > 2:
            return None
        return int(parts[0]) * 60 + int(parts[1])

    def process_haokan_video(self, video: dict, seed_title: str) -> None:
        """
        Build an association item from one Haokan search hit and save it.

        Args:
            video: search hit; the keys ``author``, ``author_id``, ``vid``,
                ``title``, ``read_num`` and ``duration`` are read from it.
            seed_title: title that was searched to reach this video.

        The video is skipped without saving when its duration text has more
        than two ``:``-separated fields. Missing keys or unparsable numbers
        raise and are left for the caller to handle.
        """
        video_item = Item()
        video_item.add("account_name", video["author"])
        video_item.add("account_id", video["author_id"])
        video_item.add("platform", "hksp")
        video_item.add("recommend_video_id", video["vid"])
        video_item.add("title", video["title"])
        video_item.add("read_cnt", self._parse_read_count(video["read_num"]))

        duration = self._parse_duration_seconds(video["duration"])
        if duration is None:
            # video too long
            return
        video_item.add("duration", duration)
        video_item.add("seed_account", "SearchWithOutAccount")
        video_item.add("seed_title", seed_title)
        video_item.add("recommend_date", datetime.datetime.today().strftime("%Y-%m-%d"))

        # check item
        video_item.check(source="association")

        # insert into database
        self.insert_video_into_recommend_table(video_item.item)

    def search_videos_in_haokan_video(self, title: str) -> None:
        """
        Search Haokan for ``title`` and process every returned video.

        A failure on one video is logged with its traceback and does not stop
        the remaining videos.
        """
        search_response = haokan_search_videos(title)
        video_list = search_response["data"]["list"]
        for video in tqdm(video_list, desc="search videos"):
            try:
                self.process_haokan_video(video, seed_title=title)
            except Exception as e:
                log(
                    task="haokan_search_crawler",
                    function="process_haokan_video",
                    message="process haokan video failed",
                    data={
                        "video": video,
                        "error": str(e),
                        "traceback": traceback.format_exc(),
                    },
                )

    def deal(self):
        """
        Run the crawl: search every seed title, then score the stored videos.

        A failure on one seed title is logged and the loop moves on. After the
        loop, the videos that still lack a score are fetched, their similarity
        scores are saved, and the affected row count is printed.
        """
        seed_title_list = self.get_seed_keys()
        for seed_title in tqdm(seed_title_list, desc="crawler each title"):
            try:
                self.search_videos_in_haokan_video(seed_title["title"])
            except Exception as e:
                log(
                    task="haokan_search_crawler",
                    function="search_videos_in_haokan_video",
                    message="search videos in haokan video failed",
                    data={
                        "title": seed_title["title"],
                        "error": str(e),
                        "traceback": traceback.format_exc(),
                    },
                )

        video_list = self.get_video_list_without_score()
        affected_rows = self.save_similarity_score_to_table(video_list)
        print(affected_rows)
|