Quellcode durchsuchen

搜索接口上线

罗俊辉 vor 1 Jahr
Ursprung
Commit
e7ab37b107

+ 88 - 0
app/search_app.py

@@ -0,0 +1,88 @@
+"""
+@author: luojunhui
+爬虫搜索服务
+接受 search_keys, 然后在搜索,并且把执行结果发送到 ETL
+"""
+import os
+import sys
+import json
+import asyncio
+
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+
+sys.path.append(os.getcwd())
+
+from application.common import AliyunLogger, get_consumer, ack_message
+from application.common.log import Local
+from spider.crawler_search import *
+
+
+async def search(params):
+    """
+    传入参数,然后根据参数执行爬虫代码
+    :return: None
+    """
+    # await weixin_search(params)
+    try:
+        await weixin_search(params)
+    except Exception as e:
+        print(e)
+
+
+async def consume_search_message():
+    """
+    消费单个消息,若消费成功则启动搜索爬虫
+    """
+    topic = "search_spider_prod"
+    group = "search_spider_prod"
+    mode = "search"
+    platform = "search_platform"
+    logger = AliyunLogger(platform=platform, mode=mode)
+    consumer = get_consumer(topic, group)
+    try:
+        messages = consumer.consume_message(wait_seconds=10, batch_size=1)
+        if messages:
+            # 在这里消费消息,做一些数据处理分析
+            for single_message in messages:
+                Local.logger(platform, mode).info(
+                    "收到一条消息\t{}{}".format(single_message, single_message.message_body))
+                ack_message(
+                    mode=mode, platform=platform, recv_msgs=messages, consumer=consumer
+                )
+                logger.logging(
+                    code=5000,
+                    message="successfully consumed message",
+                    data=single_message.message_body,
+                )
+                message_body = single_message.message_body
+                params = json.loads(message_body)
+                # 创建爬虫task
+                await search(params)
+                logger.logging(code=5001, message="successfully created task")
+        else:
+            logger.logging(code=5003, message="Messages Queue is Empty")
+
+    except MQExceptionBase as err:
+        # Topic中没有消息可消费。
+        if err.type == "MessageNotExist":
+            message = "No new message! RequestId:{}\n".format(err.req_id)
+            logger.logging(code="5004", message=message)
+        else:
+            message = "Consume Message Fail! Exception:{}\n".format(err)
+            logger.logging(code="5004", message=message)
+
+
+async def main():
+    """
+    主函数
+    每隔一秒接受一次消息
+    """
+    while True:
+        await consume_search_message()
+        await asyncio.sleep(5)
+
+
+if __name__ == "__main__":
+    # Run the main event loop
+    asyncio.run(main())

+ 81 - 0
scheduler/search_spider.py

@@ -0,0 +1,81 @@
+"""
+获取热点日历
+"""
+import schedule
+import requests
+import json
+import datetime
+
+
+class PQuanCalendar(object):
+    """
+    热点日历
+    """
+
+    def __init__(self):
+        self.url = "https://www.adguider.com/sv1/calendar/getCalendarAjax"
+        self.c_dict = {}
+
+    def get_calendar(self):
+        """
+        请求日历
+        :return:
+        """
+        today = datetime.datetime.today()
+        tomorrow = today + datetime.timedelta(days=1)
+        tomorrow = tomorrow.strftime("%Y-%m-%d")
+        payload = json.dumps({
+            "startTime": tomorrow,
+            "endTime": tomorrow,
+            "fdIdList": [
+                13,
+                4,
+                3,
+                10,
+                5,
+                8,
+            ]
+        })
+        headers = {
+            'Accept': 'application/json, text/javascript, */*; q=0.01',
+            'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
+            'Connection': 'keep-alive',
+            'Content-Type': 'application/json',
+            'Origin': 'https://www.adguider.com',
+            'Referer': 'https://www.adguider.com/sv1/calendar/getCalendar?mode=1&startDate=2024/03/13',
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
+        }
+        response = requests.post(self.url, headers=headers, data=payload)
+        result = response.json()
+        # print(json.dumps(result, ensure_ascii=False, indent=4))
+        return result
+
+    def process_response(self, response_json):
+        """
+        对获取的数据进行处理, 处理结构为
+        date_info: [festival_obj1, festival_obj2, festival_obj3, festival_obj4 ......]
+        :param response_json:
+        :return: calender_dict
+        """
+        if response_json['data']:
+            for cat_obj in response_json['data']:
+                if cat_obj.get("adFestivalFixedVos"):
+                    festival_list = cat_obj["adFestivalFixedVos"]
+                    category = cat_obj["ftName"]
+                    self.c_dict[category] = [item['ffName'] for item in festival_list]
+            print(json.dumps(self.c_dict, ensure_ascii=False, indent=4))
+        return self.c_dict
+
+
+class SearchSpider(object):
+    """
+    定时从日历中获取明天的节日,通过节日去搜索视频,并且把视频发送至 ETL 下载
+    """
+    def __init__(self, festival_dict):
+        self.key_list = festival_dict
+
+
+if __name__ == '__main__':
+    P = PQuanCalendar()
+    res = P.get_calendar()
+    P.process_response(res)

+ 5 - 0
spider/crawler_search/__init__.py

@@ -0,0 +1,5 @@
+"""
+search function
+"""
+from .baidu_search import hksp_search
+from .weixin_search import weixin_search

+ 1 - 1
spider/crawler_search/baidu_search.py

@@ -51,4 +51,4 @@ def hksp_search(key):
 
 
 if __name__ == '__main__':
-    hksp_search("王雪珂")
+    hksp_search("人类首次从恐龙蛋化石中获得恐龙的遗传物质")

+ 390 - 0
spider/crawler_search/weixin_search.py

@@ -0,0 +1,390 @@
+"""
+@author: luojunhui
+微信 search
+"""
+import os
+import sys
+import json
+import time
+import requests
+
+sys.path.append(os.getcwd())
+
+from application.items import VideoItem
+from application.common.messageQueue import MQ
+from application.common.log import AliyunLogger
+
+# MQ handle for the ETL topic; process_weixin_video_obj sends every built video item through it.
+ETL_MQ = MQ(topic_name="topic_crawler_etl_prod")
+# Module-wide logger, created with platform "weixin_search" and mode "search".
+aliyun_logger = AliyunLogger(platform="weixin_search", mode="search")
+
+
+async def weixin_search(params):
+    """
+    通过搜索爬虫 + search_keys 来获取视频信息,并且以 MQ 的方式发送给 ETL, 正常上传发布
+    只抓一页,不做去重
+    :param params: []
+    :return:
+    """
+    gh_id_dict = {
+        "gh_01f8afd03366": {
+            "uid": 69629493,
+            "nick_name": "非亲非故"
+        },
+        "gh_058e41145a0c": {
+            "uid": 69629452,
+            "nick_name": "甜腻梦话"
+        },
+        "gh_084a485e859a": {
+            "uid": 69629447,
+            "nick_name": "梦星月"
+        },
+        "gh_0921c03402cd": {
+            "uid": 69629504,
+            "nick_name": "你的女友"
+        },
+        "gh_0c89e11f8bf3": {
+            "uid": 69629482,
+            "nick_name": "粟米"
+        },
+        "gh_171cec079b2a": {
+            "uid": 69629475,
+            "nick_name": "海上"
+        },
+        "gh_183d80deffb8": {
+            "uid": 69629465,
+            "nick_name": "论趣"
+        },
+        "gh_1ee2e1b39ccf": {
+            "uid": 69629448,
+            "nick_name": "纵有疾风起"
+        },
+        "gh_234ef02cdee5": {
+            "uid": 69629486,
+            "nick_name": "夹逼"
+        },
+        "gh_26a307578776": {
+            "uid": 69629464,
+            "nick_name": "最宝贝的宝贝"
+        },
+        "gh_29074b51f2b7": {
+            "uid": 69629503,
+            "nick_name": "沉舸"
+        },
+        "gh_2b8c6aa035ae": {
+            "uid": 69629443,
+            "nick_name": "懶得取名"
+        },
+        "gh_34318194fd0e": {
+            "uid": 69629490,
+            "nick_name": "徒四壁"
+        },
+        "gh_3845af6945d0": {
+            "uid": 69629518,
+            "nick_name": "秋水娉婷"
+        },
+        "gh_3ac6d7208961": {
+            "uid": 69629471,
+            "nick_name": "小熊的少女梦"
+        },
+        "gh_3c7d38636846": {
+            "uid": 69629492,
+            "nick_name": "油腻腻"
+        },
+        "gh_3df10391639c": {
+            "uid": 69629514,
+            "nick_name": "六郎娇面"
+        },
+        "gh_40a0ad154478": {
+            "uid": 69629489,
+            "nick_name": "禁止"
+        },
+        "gh_424c8eeabced": {
+            "uid": 69629495,
+            "nick_name": "认命"
+        },
+        "gh_4568b5a7e2fe": {
+            "uid": 69629457,
+            "nick_name": "香腮"
+        },
+        "gh_45beb952dc74": {
+            "uid": 69629462,
+            "nick_name": "毋庸"
+        },
+        "gh_484de412b0ef": {
+            "uid": 69629456,
+            "nick_name": "婪"
+        },
+        "gh_4c058673c07e": {
+            "uid": 69629449,
+            "nick_name": "影帝"
+        },
+        "gh_538f78f9d3aa": {
+            "uid": 69629454,
+            "nick_name": "伤痕"
+        },
+        "gh_56a6765df869": {
+            "uid": 69629487,
+            "nick_name": "风月"
+        },
+        "gh_56ca3dae948c": {
+            "uid": 69629511,
+            "nick_name": "留下太多回忆"
+        },
+        "gh_5e543853d8f0": {
+            "uid": 69629516,
+            "nick_name": "不知春秋"
+        },
+        "gh_5ff48e9fb9ef": {
+            "uid": 69629468,
+            "nick_name": "寻她找他"
+        },
+        "gh_671f460c856c": {
+            "uid": 69629496,
+            "nick_name": "绝不改悔"
+        },
+        "gh_6b7c2a257263": {
+            "uid": 69629501,
+            "nick_name": "奶牙"
+        },
+        "gh_6d205db62f04": {
+            "uid": 69629483,
+            "nick_name": "怕羞"
+        },
+        "gh_6d9f36e3a7be": {
+            "uid": 69629472,
+            "nick_name": "望长安"
+        },
+        "gh_73be0287bb94": {
+            "uid": 69629510,
+            "nick_name": "戏剧"
+        },
+        "gh_744cb16f6e16": {
+            "uid": 69629479,
+            "nick_name": "反駁"
+        },
+        "gh_7b4a5f86d68c": {
+            "uid": 69629453,
+            "nick_name": "我很想你"
+        },
+        "gh_7bca1c99aea0": {
+            "uid": 69629484,
+            "nick_name": "从小就很傲"
+        },
+        "gh_7e5818b2dd83": {
+            "uid": 69629505,
+            "nick_name": "二八佳人"
+        },
+        "gh_89ef4798d3ea": {
+            "uid": 69629506,
+            "nick_name": "彼岸花"
+        },
+        "gh_901b0d722749": {
+            "uid": 69629491,
+            "nick_name": "深情不为我"
+        },
+        "gh_9161517e5676": {
+            "uid": 69629469,
+            "nick_name": "折磨"
+        },
+        "gh_93e00e187787": {
+            "uid": 69629478,
+            "nick_name": "理会"
+        },
+        "gh_9877c8541764": {
+            "uid": 69629481,
+            "nick_name": "我沿着悲伤"
+        },
+        "gh_9cf3b7ff486b": {
+            "uid": 69629466,
+            "nick_name": "hoit"
+        },
+        "gh_9e559b3b94ca": {
+            "uid": 69629444,
+            "nick_name": "我与你相遇"
+        },
+        "gh_9f8dc5b0c74e": {
+            "uid": 69629470,
+            "nick_name": "港口"
+        },
+        "gh_a182cfc94dad": {
+            "uid": 69629512,
+            "nick_name": "四海八荒"
+        },
+        "gh_a2901d34f75b": {
+            "uid": 69629508,
+            "nick_name": "听腻了谎话"
+        },
+        "gh_a307072c04b9": {
+            "uid": 69629494,
+            "nick_name": "踏步"
+        },
+        "gh_a6351b447819": {
+            "uid": 69629513,
+            "nick_name": "七猫酒馆"
+        },
+        "gh_ac43e43b253b": {
+            "uid": 69629473,
+            "nick_name": "一厢情愿"
+        },
+        "gh_adca24a8f429": {
+            "uid": 69629458,
+            "nick_name": "对你何止一句喜欢"
+        },
+        "gh_b15de7c99912": {
+            "uid": 69629509,
+            "nick_name": "糖炒板栗"
+        },
+        "gh_b32125c73861": {
+            "uid": 69629467,
+            "nick_name": "发尾"
+        },
+        "gh_b3ffc1ca3a04": {
+            "uid": 69629519,
+            "nick_name": "主宰你心"
+        },
+        "gh_b8baac4296cb": {
+            "uid": 69629463,
+            "nick_name": "生性"
+        },
+        "gh_b9b99173ff8a": {
+            "uid": 69629497,
+            "nick_name": "养一只月亮"
+        },
+        "gh_bd57b6978e06": {
+            "uid": 69629500,
+            "nick_name": "厌遇"
+        },
+        "gh_be8c29139989": {
+            "uid": 69629476,
+            "nick_name": "不负"
+        },
+        "gh_bfe5b705324a": {
+            "uid": 69629502,
+            "nick_name": "乐极"
+        },
+        "gh_bff0bcb0694a": {
+            "uid": 69629507,
+            "nick_name": "简迷离"
+        },
+        "gh_c69776baf2cd": {
+            "uid": 69629485,
+            "nick_name": "骄纵"
+        },
+        "gh_c91b42649690": {
+            "uid": 69629477,
+            "nick_name": "荟萃"
+        },
+        "gh_d2cc901deca7": {
+            "uid": 69629461,
+            "nick_name": "恶意调笑"
+        },
+        "gh_d5f935d0d1f2": {
+            "uid": 69629474,
+            "nick_name": "青少年哪吒"
+        },
+        "gh_da76772d8d15": {
+            "uid": 69629499,
+            "nick_name": "独揽风月"
+        },
+        "gh_de9f9ebc976b": {
+            "uid": 69629450,
+            "nick_name": "剑出鞘恩怨了"
+        },
+        "gh_e0eb490115f5": {
+            "uid": 69629460,
+            "nick_name": "赋别"
+        },
+        "gh_e24da99dc899": {
+            "uid": 69629459,
+            "nick_name": "恋雨夏季"
+        },
+        "gh_e2576b7181c6": {
+            "uid": 69629488,
+            "nick_name": "满天星"
+        },
+        "gh_e75dbdc73d80": {
+            "uid": 69629515,
+            "nick_name": "情战"
+        },
+        "gh_e9d819f9e147": {
+            "uid": 69629498,
+            "nick_name": "与卿"
+        },
+        "gh_efaf7da157f5": {
+            "uid": 69629520,
+            "nick_name": "心野性子浪"
+        },
+        "gh_f4594783f5b8": {
+            "uid": 69629517,
+            "nick_name": "自缚"
+        },
+        "gh_fe6ef3a65a48": {
+            "uid": 69629455,
+            "nick_name": "风间"
+        }
+    }
+
+    search_keys = params['search_keys']
+    user = gh_id_dict.get(params['ghId'])
+    url = "http://8.217.190.241:8888/crawler/wei_xin/keyword"
+    payload = json.dumps({
+        "keyword": ",".join(search_keys),
+        "cursor": "0",
+        "content_type": "video"
+    })
+    headers = {
+        'Content-Type': 'application/json'
+    }
+    response = requests.request("POST", url, headers=headers, data=payload)
+    data_list = response.json()['data']['data']
+    for item in data_list:
+        video_obj = item['items'][0]
+        # await process_weixin_video_obj(video_obj, user)
+        try:
+            aliyun_logger.logging(
+                code="1001",
+                message="扫描到一条视频",
+                account=user['uid'],
+                data=video_obj
+            )
+            await process_weixin_video_obj(video_obj, user)
+        except Exception as e:
+            aliyun_logger.logging(
+                code="3000",
+                message="有报错信息---{}".format(e),
+                account=user['uid']
+            )
+
+
+async def process_weixin_video_obj(video_obj, user):
+    """
+    异步处理微信 video_obj
+    公众号和站内账号一一对应
+    :param user:
+    :param video_obj:
+    :return:
+    """
+
+    platform = "weixin_search"
+    publish_time_stamp = int(video_obj['pubTime'])
+    title = video_obj['title'].replace('<em class=\"highlight\">', '').replace('</em>', '').replace("#", "")
+    item = VideoItem()
+    item.add_video_info("user_id", user["uid"])
+    item.add_video_info("user_name", user["nick_name"])
+    item.add_video_info("video_id", video_obj['hashDocID'])
+    item.add_video_info("video_title", title)
+    item.add_video_info("publish_time_stamp", int(publish_time_stamp))
+    item.add_video_info("video_url", video_obj["videoUrl"])
+    item.add_video_info("cover_url", video_obj["image"])
+    item.add_video_info("out_video_id", video_obj['hashDocID'])
+    item.add_video_info("platform", platform)
+    item.add_video_info("strategy", "search")
+    item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
+    mq_obj = item.produce_item()
+    ETL_MQ.send_msg(video_dict=mq_obj)
+    aliyun_logger.logging(
+        code="1002",
+        message="成功发送到 ETL",
+        account=user["uid"],
+        data=mq_obj
+    )