# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/4/17
import ast
import json
import os
import sys
import time
from datetime import date, timedelta
from hashlib import md5
sys.path.append(os.getcwd())
from common.common import Common
from common.feishu import Feishu
from common.scheduling_db import MysqlHelper


class Insert:
    @classmethod
    def get_config(cls, log_type, crawler, text, env):
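        # Read the "benshanzhufu" row(s) from crawler_config and split the
        # comma-separated "title" / "filter" config fields into lists.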
        select_sql = f"""select * from crawler_config where source="benshanzhufu" """
        contents = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
        title_list = []
        filter_list = []
        for content in contents:
            config = content['config']
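            # The config column stores a Python dict literal (written by
            # insert_config below via str()); parse it back into a dict.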
            config_dict = ast.literal_eval(config)  # safer than eval() for text read back from the DB
            for k, v in config_dict.items():
                if k == "title":
                    title_list_config = v.split(",")
                    for title in title_list_config:
                        title_list.append(title)
                if k == "filter":
                    filter_list_config = v.split(",")
                    for filter_word in filter_list_config:
                        filter_list.append(filter_word)
        if text == "title":
            return title_list
        elif text == "filter":
            return filter_list

    @classmethod
    def before_day(cls):
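        # Print the publish-time cutoff of 30 days ago, both as a formatted
        # string and as a Unix timestamp.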
        publish_time_str_rule = (date.today() + timedelta(days=-30)).strftime("%Y-%m-%d %H:%M:%S")
        publish_time_stamp_rule = int(time.mktime(time.strptime(publish_time_str_rule, "%Y-%m-%d %H:%M:%S")))
        print(publish_time_str_rule)
        print(publish_time_stamp_rule)

    @classmethod
    def insert_config(cls, log_type, crawler, env):
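        # Collect filter words from the Feishu sheet and store them as one
        # comma-separated string in the crawler_config table.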
        filter_sheet = Feishu.get_values_batch(log_type, crawler, "DjXfqG")
        # title_sheet = Feishu.get_values_batch(log_type, crawler, "bHSW1p")
        filter_list = []
        # title_list = []
        for x in filter_sheet:
            for y in x:
                if y is None:
                    pass
                else:
                    filter_list.append(y)
        # for x in title_sheet:
        #     for y in x:
        #         if y is None:
        #             pass
        #         else:
        #             title_list.append(y)
        # str_title = ','.join(title_list)
        str_filter = ','.join(filter_list)
        config_dict = {
            # "title": str_title,
            "filter": str_filter
        }
        str_config_dict = str(config_dict)
        # print(f"config_dict:{config_dict}")
        # print(f"str_config_dict:{str_config_dict}")
        insert_sql = f""" insert into crawler_config(title, source, config) values("本山祝福小程序", "benshanzhufu", "{str_config_dict}") """
        MysqlHelper.update_values(log_type, crawler, insert_sql, env)

    @classmethod
    def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env):
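        # Read video rows from the Feishu sheet(s) and insert any that are not
        # already present in crawler_video (deduplicated by platform + video_url).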
        jixiangxingfu_sheetid = ['d9e9b1']
        for sheetid in jixiangxingfu_sheetid:
            xiaoniangao_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
            for i in range(1, len(xiaoniangao_sheet)):
                # for i in range(1, 5):
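                # Skip rows with missing cells (column 7 is used below as the video title).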
                if xiaoniangao_sheet[i][5] is None or xiaoniangao_sheet[i][7] is None:
                    continue
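                # Column 12 holds an admin URL like
                # https://admin.piaoquantv.com/cms/post-detail/<id>/info;
                # strip the prefix and suffix to recover the numeric video id.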
                video_id = xiaoniangao_sheet[i][12].replace(
                    "https://admin.piaoquantv.com/cms/post-detail/", "").replace("/info", "")
                if video_id == "None":
                    continue
                video_id = int(video_id)
                out_user_id = "jixiangxingfu"
                platform = "吉祥幸福"  # platform name: Jixiang Xingfu
                strategy = "推荐榜爬虫策略"  # strategy name: recommendation-list crawler strategy
                video_title = str(xiaoniangao_sheet[i][7])
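                # Play counts in the sheet carry a trailing "万" (×10,000);
                # convert to an absolute integer count.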
                play_cnt = int(xiaoniangao_sheet[i][9].split("万")[0]) * 10000
                duration = str(xiaoniangao_sheet[i][10])
                width = int(xiaoniangao_sheet[i][11].split("*")[0])
                height = int(xiaoniangao_sheet[i][11].split("*")[1])
                cover_url = str(xiaoniangao_sheet[i][13])
                video_url = str(xiaoniangao_sheet[i][14])
                crawler_rule = json.dumps({})
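                # The sheet has no external video id, so use the MD5 of the title.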
                out_video_id = md5(video_title.encode('utf8')).hexdigest()
                # print(f"video_id:{video_id}, type:{type(video_id)}")
                # print(f"out_user_id:{out_user_id}, type:{type(out_user_id)}")
                # print(f"platform:{platform}, type:{type(platform)}")
                # print(f"strategy:{strategy}, type:{type(strategy)}")
                # print(f"video_title:{video_title}, type:{type(video_title)}")
                # print(f"cover_url:{cover_url}, type:{type(cover_url)}")
                # print(f"video_url:{video_url}, type:{type(video_url)}")
                # print(f"crawler_rule:{crawler_rule}, type:{type(crawler_rule)}")
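                # Deduplicate against existing rows before inserting.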
                select_sql = f""" select * from crawler_video where platform="{platform}" and video_url="{video_url}" """
                Common.logger(log_type, crawler).info(f"select_sql:{select_sql}")
                repeat_video = MysqlHelper.get_values(log_type, crawler, select_sql, env)
                Common.logger(log_type, crawler).info(f"repeat_video:{repeat_video}")
                if repeat_video is not None and len(repeat_video) != 0:
                    Common.logger(log_type, crawler).info(f"{video_title} already exists in the database\n")
                else:
                    # Save the video info to the database
                    insert_sql = f""" insert into crawler_video(video_id,
                                                                out_user_id,
                                                                platform,
                                                                strategy,
                                                                out_video_id,
                                                                video_title,
                                                                cover_url,
                                                                video_url,
                                                                duration,
                                                                play_cnt,
                                                                crawler_rule,
                                                                width,
                                                                height)
                                                                values({video_id},
                                                                "{out_user_id}",
                                                                "{platform}",
                                                                "{strategy}",
                                                                "{out_video_id}",
                                                                "{video_title}",
                                                                "{cover_url}",
                                                                "{video_url}",
                                                                {duration},
                                                                {play_cnt},
                                                                '{crawler_rule}',
                                                                {width},
                                                                {height}) """
                    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                    MysqlHelper.update_values(log_type, crawler, insert_sql, env, action='')
                    Common.logger(log_type, crawler).info('Video info inserted into the database successfully!\n')


if __name__ == "__main__":
    # Insert.insert_config("insert", "benshanzhufu", "dev")
    # print(Insert.get_config("insert", "ganggangdouchuan", "filter", "dev"))
    # Insert.insert_video_from_feishu_to_mysql("insert-dev", "ganggangdouchuan", "dev")
    # Insert.insert_video_from_feishu_to_mysql("insert-dev", "jixiangxingfu", "dev")
    Insert.insert_video_from_feishu_to_mysql("insert-prod", "jixiangxingfu", "prod")
    pass