@@ -37,11 +37,11 @@ class GongzhonghaoAuthor1:
             return None
         token_dict = {
             "token_id": configs[0]["id"],
-            "title": configs[0]["title"],
-            "token": dict(eval(configs[0]["config"]))["token"],
-            "cookie": dict(eval(configs[0]["config"]))["cookie"],
+            "title": configs[0]["title"].strip(),
+            "token": dict(eval(configs[0]["config"]))["token"].strip(),
+            "cookie": dict(eval(configs[0]["config"]))["cookie"].strip(),
             "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(configs[0]["update_time"]/1000))),
-            "operator": configs[0]["operator"]
+            "operator": configs[0]["operator"].strip()
         }
         # for k, v in token_dict.items():
         #     print(f"{k}:{v}")
@@ -59,6 +59,7 @@ class GongzhonghaoAuthor1:
         len_sheet = len(user_sheet)
         if len_sheet >= 101:
             len_sheet = 101
+        Common.logger(log_type, crawler).info(f"len_sheet:{len_sheet}")
         for i in range(1, len_sheet):
         # for i in range(1, 3):
             user_name = user_sheet[i][0]
@@ -76,7 +77,7 @@ class GongzhonghaoAuthor1:
             tag4 = user_sheet[i][10]
             tag5 = user_sheet[i][11]
             tag6 = user_sheet[i][12]
-            Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息")
+            Common.logger(log_type, crawler).info(f"正在更新第{i}个用户:{user_name}")
             if out_uid is None or our_uid is None:
                 # 用来创建our_id的信息
                 user_dict = {
@@ -287,60 +288,60 @@ class GongzhonghaoAuthor1:
                 begin += 5
                 app_msg_list = r.json()['app_msg_list']
                 for article in app_msg_list:
-                    try:
-                        create_time = article.get('create_time', 0)
-                        publish_time_stamp = int(create_time)
-                        publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
-                        article_url = article.get('link', '')
-                        video_dict = {
-                            'video_id': article.get('aid', ''),
-                            'video_title': article.get("title", "").replace(' ', '').replace('"', '').replace("'", ""),
-                            'publish_time_stamp': publish_time_stamp,
-                            'publish_time_str': publish_time_str,
-                            'user_name': user_dict["user_name"],
-                            'play_cnt': 0,
-                            'comment_cnt': 0,
-                            'like_cnt': 0,
-                            'share_cnt': 0,
-                            'user_id': user_dict['user_id'],
-                            'avatar_url': user_dict['avatar_url'],
-                            'cover_url': article.get('cover', ''),
-                            'article_url': article.get('link', ''),
-                            'video_url': cls.get_video_url(article_url, env),
-                            'session': f'gongzhonghao-author1-{int(time.time())}'
-                        }
-                        for k, v in video_dict.items():
-                            Common.logger(log_type, crawler).info(f"{k}:{v}")
+                    # try:
+                    create_time = article.get('create_time', 0)
+                    publish_time_stamp = int(create_time)
+                    publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
+                    article_url = article.get('link', '')
+                    video_dict = {
+                        'video_id': article.get('aid', ''),
+                        'video_title': article.get("title", "").replace(' ', '').replace('"', '').replace("'", ""),
+                        'publish_time_stamp': publish_time_stamp,
+                        'publish_time_str': publish_time_str,
+                        'user_name': user_dict["user_name"],
+                        'play_cnt': 0,
+                        'comment_cnt': 0,
+                        'like_cnt': 0,
+                        'share_cnt': 0,
+                        'user_id': user_dict['user_id'],
+                        'avatar_url': user_dict['avatar_url'],
+                        'cover_url': article.get('cover', ''),
+                        'article_url': article.get('link', ''),
+                        'video_url': cls.get_video_url(article_url, env),
+                        'session': f'gongzhonghao-author1-{int(time.time())}'
+                    }
+                    for k, v in video_dict.items():
+                        Common.logger(log_type, crawler).info(f"{k}:{v}")

-                        if int(time.time()) - publish_time_stamp > 3600 * 24 * int(rule_dict.get('period', {}).get('max', 1000)):
-                            Common.logger(log_type, crawler).info(f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
-                            return
+                    if int(time.time()) - publish_time_stamp > 3600 * 24 * int(rule_dict.get('period', {}).get('max', 1000)):
+                        Common.logger(log_type, crawler).info(f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
+                        return

-                        if video_dict['article_url'] == 0 or video_dict['video_url'] == 0:
-                            Common.logger(log_type, crawler).info("文章涉嫌违反相关法律法规和政策\n")
-                        # 标题敏感词过滤
-                        elif any(str(word) if str(word) in video_dict['video_title'] else False
-                                 for word in get_config_from_mysql(log_type=log_type,
-                                                                   source=crawler,
-                                                                   env=env,
-                                                                   text="filter",
-                                                                   action="")) is True:
-                            Common.logger(log_type, crawler).info("标题已中过滤词\n")
-                        # 已下载判断
-                        elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
-                            Common.logger(log_type, crawler).info("视频已下载\n")
-                        # 标题相似度
-                        elif title_like(log_type, crawler, video_dict['video_title'], cls.platform, env) is True:
-                            Common.logger(log_type, crawler).info(f'标题相似度>=80%:{video_dict["video_title"]}\n')
-                        else:
-                            cls.download_publish(log_type=log_type,
-                                                 crawler=crawler,
-                                                 video_dict=video_dict,
-                                                 rule_dict=rule_dict,
-                                                 user_dict=user_dict,
-                                                 env=env)
-                    except Exception as e:
-                        Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
+                    if video_dict['article_url'] == 0 or video_dict['video_url'] == 0:
+                        Common.logger(log_type, crawler).info("文章涉嫌违反相关法律法规和政策\n")
+                    # 标题敏感词过滤
+                    elif any(str(word) if str(word) in video_dict['video_title'] else False
+                             for word in get_config_from_mysql(log_type=log_type,
+                                                               source=crawler,
+                                                               env=env,
+                                                               text="filter",
+                                                               action="")) is True:
+                        Common.logger(log_type, crawler).info("标题已中过滤词\n")
+                    # 已下载判断
+                    elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
+                        Common.logger(log_type, crawler).info("视频已下载\n")
+                    # 标题相似度
+                    elif title_like(log_type, crawler, video_dict['video_title'], cls.platform, env) is True:
+                        Common.logger(log_type, crawler).info(f'标题相似度>=80%:{video_dict["video_title"]}\n')
+                    else:
+                        cls.download_publish(log_type=log_type,
+                                             crawler=crawler,
+                                             video_dict=video_dict,
+                                             rule_dict=rule_dict,
+                                             user_dict=user_dict,
+                                             env=env)
+                    # except Exception as e:
+                    #     Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
                 Common.logger(log_type, crawler).info('休眠 60 秒\n')
                 time.sleep(60)

@@ -476,23 +477,23 @@ class GongzhonghaoAuthor1:
             Common.logger(log_type, crawler).warning(f"抓取用户列表为空\n")
             return
         for user_dict in user_list:
-            try:
-                Common.logger(log_type, crawler).info(f'获取 {user_dict["user_name"]} 公众号视频\n')
-                cls.get_videoList(log_type=log_type,
-                                  crawler=crawler,
-                                  rule_dict=rule_dict,
-                                  user_dict=user_dict,
-                                  env=env)
-                Common.logger(log_type, crawler).info('休眠 60 秒\n')
-                time.sleep(60)
-            except Exception as e:
-                Common.logger(log_type, crawler).info(f'抓取{user_dict["user_name"]}公众号时异常:{e}\n')
+            # try:
+            Common.logger(log_type, crawler).info(f'获取 {user_dict["user_name"]} 公众号视频\n')
+            cls.get_videoList(log_type=log_type,
+                              crawler=crawler,
+                              rule_dict=rule_dict,
+                              user_dict=user_dict,
+                              env=env)
+            Common.logger(log_type, crawler).info('休眠 60 秒\n')
+            time.sleep(60)
+            # except Exception as e:
+            #     Common.logger(log_type, crawler).info(f'抓取{user_dict["user_name"]}公众号时异常:{e}\n')


 if __name__ == "__main__":
-    # GongzhonghaoAuthor1.get_token("author", "gongzhonghao", "dev")
+    GongzhonghaoAuthor1.get_token("author", "gongzhonghao", "prod")
     # print(GongzhonghaoAuthor1.get_users("author", "gongzhonghao", "Bzv72P", "dev"))
     # print(get_config_from_mysql("author", "gongzhonghao", "dev", "filter", action=""))
     # print(title_like("author", "gongzhonghao", "公众号", "123", "dev"))
-    print(GongzhonghaoAuthor1.get_user_info("author", "gongzhonghao", "幸福花朵", "dev"))
+    # print(GongzhonghaoAuthor1.get_user_info("author", "gongzhonghao", "幸福花朵", "dev"))
     pass