Ver código fonte

dev sohu crawler

luojunhui 6 meses atrás
pai
commit
3e4a661d1f

+ 4 - 0
applications/pipeline/crawler_pipeline.py

@@ -54,10 +54,12 @@ def scrape_video_entities_process(video_item, db_client) -> dict:
     video_title = video_item["article_title"]
     # whether title sensitive
     if whether_title_sensitive(video_title):
+        print("title is sensitive")
         return empty_dict
 
     # whether duplicate video title
     if whether_duplicate_video_title(video_title, db_client):
+        print("duplicate video title")
         return empty_dict
 
     # download video
@@ -70,6 +72,8 @@ def scrape_video_entities_process(video_item, db_client) -> dict:
             video_path = ""
         case "sph":
             video_path = ""
+        case "sohu":
+            video_path = download_toutiao_video(article_url)
         case _:
             return empty_dict
 

+ 0 - 0
coldStartTasks/crawler/sohu/__init__.py


+ 74 - 0
coldStartTasks/crawler/sohu/get_recommedation.py

@@ -0,0 +1,74 @@
+import requests
+import json
+
+
+def get_recommendation_video_list(page_id, page_size, timeout=10):
+    """
+    Fetch one page of Sohu's "recommendVideoFeed" block and collect its items.
+
+    Sends a POST to the odin blockdata endpoint using a fixed request
+    template: the page/pv ids, article ids, headers and cookie are hard-coded
+    (presumably captured from a browser session -- they may expire); only the
+    page number and the page size vary per call.
+
+    :param page_id: page number placed in the resource's ``content.page`` field
+    :param page_size: number of items requested, placed in ``content.size``
+    :param timeout: seconds to wait for the HTTP response before giving up
+    :return: the list found at ``data.recommendVideoFeed.list`` in the
+        response; the same items are also appended to the module-level ``L``
+    :raises requests.RequestException: on timeout, connection failure or a
+        non-2xx HTTP status
+    :raises ValueError: when the response body is not valid JSON
+    :raises KeyError: when the response JSON lacks the expected path
+    """
+    url = "https://odin.sohu.com/odin/api/a/blockdata?origin=article"
+    payload = json.dumps({
+        "url": "//odin.sohu.com/odin/api/a/blockdata?origin=article",
+        "pageId": "1744186073720NK8",
+        "pvId": "1744186073657DQHXa2g",
+        "mainContent": {
+            "productId": "",
+            "productType": "",
+            "secureScore": "100",
+            "categoryId": "13",
+            "authorId": "121141867",
+            "articleId": "877216434"
+        },
+        "resourceList": [
+            {
+                "tplCompKey": "recommendVideoFeed",
+                "content": {
+                    "page": page_id,
+                    "requestId": "17441860918364TZ",
+                    "size": page_size,
+                    "productId": 1558,
+                    "productType": 13,
+                    "spm": "smpc.vd-land.end-rec"
+                },
+                "context": {
+                    "page_refer_url": "",
+                    "mkey": "channelId_13--mpid_877216434"
+                },
+                "adInfo": {},
+                "spmCCode": "end-rec",
+                "resourceId": "000000000000000000"
+            }
+        ]
+    })
+    headers = {
+        'Accept': 'application/json, text/plain, */*',
+        'Accept-Language': 'zh,zh-CN;q=0.9',
+        'Connection': 'keep-alive',
+        'Content-Type': 'application/json',
+        'Origin': 'https://www.sohu.com',
+        'Referer': 'https://www.sohu.com/a/877216434_121141867?scm=10001.325_13-109000.0.0.5_32',
+        'Sec-Fetch-Dest': 'empty',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Site': 'same-site',
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
+        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"macOS"',
+        'Cookie': 'IPLOC=CN1200; SUV=250319174303GUDT; cto_bidid=DbraLl80TnZBSiUyRjd5Y3R3d3BPQ2gwNkhCQzFYcTR3cERUelpzdkVsOHIwbUx4VVB2Nm9obktXS1JicW00ZkZJbkY4MWtWTXJHajdKeEdIUEpnQ3ZNM2NNWFJRJTNEJTNE; _cc_id=16be057c82f6c7b9487f8e9de87093f8; cto_bundle=G-1cul95czh2RVh4SnRnZXRlUllFb0hyZFhKRkFiSHpWaU5JZDNNM0pKc25UMlUyQk9FcDYwRVNWcmc4VjdadmxDVyUyQmNhN3NydzJ6NVpJZSUyRjdHSnlhNVViSVUySDl0SCUyQk52blJFNk9VJTJCNTJZaVZHYzdUdUkwcHltWmkzR2d6aTI1TzNheFhkWiUyQjlvaGJkUldEQlElMkJOWTUlMkJTQSUzRCUzRA; gidinf=x099980107ee1a664f21e8892000bfb0cb568460d4f7; FCNEC=%5B%5B%22AKsRol-M9pfdhPRV6WdT0_UpWwGGHATDkhGhu3WhCRwchHNYyaiiIzdgxL07iMyYWnsT_EtmqDWtsEXTVncdSYtqnSPa5geKzsupz1uaDinhxC5vtZ5VYpyP2ce9ihomBxnBnoeGfP_Lbib3u5FF1RQacGdUubuKpg%3D%3D%22%5D%5D; clt=1743410197; cld=20250331163637; _ga=GA1.1.954524343.1743578691; reqtype=pc; _ga_DFBWYFE6Q0=GS1.1.1743578690.1.1.1743578734.16.0.0; cityIpLocation=61.48.133.26; beans_dmp=%7B%2210191%22%3A1744104695%2C%22admaster%22%3A1744104695%2C%22shunfei%22%3A1744104695%2C%22reachmax%22%3A1744104695%2C%22lingji%22%3A1744104695%2C%22yoyi%22%3A1744104695%2C%22ipinyou%22%3A1744104695%2C%22ipinyou_admaster%22%3A1744104695%2C%22miaozhen%22%3A1744104695%2C%22diantong%22%3A1744104695%2C%22huayang%22%3A1744104695%2C%22precisionS%22%3A1744104695%2C%22qunyi%22%3A1744104695%7D; _dfp=q4xXm1uuBqdI3QKRaKHbjDocPoUeGdyjTp29AM1Eak4%3D; hideAddDesktop=true; t=1744186073675'
+    }
+
+    # Without a timeout requests waits indefinitely on a stalled connection.
+    response = requests.request("POST", url, headers=headers, data=payload, timeout=timeout)
+    # Fail on HTTP errors here rather than as an opaque KeyError/JSON error below.
+    response.raise_for_status()
+    recommend_list = response.json()['data']['recommendVideoFeed']['list']
+    # Existing callers read results from the module-level accumulator L.
+    L.extend(recommend_list)
+    return recommend_list
+
+# Module-level accumulator that get_recommendation_video_list() appends to.
+L = []
+# Crawl pages 1..19 with 30 items requested per page; a failing page is
+# reported and skipped so the remaining pages are still fetched.
+for i in range(1, 20):
+    try:
+        get_recommendation_video_list(i, 30)
+    except Exception as e:
+        print(e)
+        print("page: ", i)
+
+# ensure_ascii=False emits non-ASCII text verbatim, so the file encoding is
+# pinned to UTF-8 instead of depending on the platform default.
+with open("877216434.json", "w", encoding="utf-8") as f:
+    f.write(json.dumps(L, ensure_ascii=False, indent=4))
+    print("done")
+

+ 0 - 0
coldStartTasks/crawler/sohu/get_user_homepage.py


+ 33 - 3
coldStartTasks/publish/publish_single_video_pool_videos.py

@@ -13,7 +13,34 @@ from config import long_articles_config, apolloConfig
 config = apolloConfig()
 const = SingleVideoPoolPublishTaskConst()
 
-video_pool_config = json.loads(config.getConfigValue(key="video_pool_publish_config"))
+# video_pool_config = json.loads(config.getConfigValue(key="video_pool_publish_config"))
+video_pool_config = {
+    "sph": {
+        "nick_name": "视频号",
+        "process_num_each_day": 218,
+        "generate_plan_id": "20250325025917853810062"
+    },
+    "gzh": {
+        "nick_name": "公众号",
+        "process_num_each_day": 201,
+        "generate_plan_id": "20250324132413116896899"
+    },
+    "toutiao": {
+        "nick_name": "头条号",
+        "process_num_each_day": 411,
+        "generate_plan_id": "20250324132226090387919"
+    },
+    "hksp": {
+        "nick_name": "好看视频",
+        "process_num_each_day": 165,
+        "generate_plan_id": "20250325025446821867933"
+    },
+    "sohu": {
+        "nick_name": "搜狐",
+        "process_num_each_day": 100,
+        "generate_plan_id": "20250409083938381788492"
+    }
+}
 
 
 class PublishSingleVideoPoolVideos:
@@ -52,7 +79,7 @@ class PublishSingleVideoPoolVideos:
         """
         entrance of this class
         """
-        platform_list = ["sph", "gzh", "toutiao", "hksp"]
+        platform_list = ["sohu"]
         for platform in tqdm(platform_list, desc='process each platform'):
             task_list = self.get_task_list(platform)
             task_id_tuple = tuple([task['id'] for task in task_list])
@@ -114,4 +141,7 @@ class PublishSingleVideoPoolVideos:
                         'msg': '该平台无待发布视频,请关注供给的抓取'
                     },
                     mention=False
-                )
+                )
+
+if __name__ == '__main__':
+    PublishSingleVideoPoolVideos().deal()

+ 2 - 1
coldStartTasks/publish/publish_video_to_pq_for_audit.py

@@ -43,7 +43,8 @@ class PublishVideosForAudit(object):
         sql = f"""
             SELECT id, article_title, video_oss_path 
             FROM publish_single_video_source 
-            WHERE audit_status = {const.VIDEO_AUDIT_INIT_STATUS} and bad_status = {const.TITLE_DEFAULT_STATUS}
+            WHERE audit_status = {const.VIDEO_AUDIT_INIT_STATUS} and bad_status = {const.TITLE_DEFAULT_STATUS} and platform = 'sohu'
+                and score > 0.5
             ORDER BY score DESC
             LIMIT {limit_count};
             """