罗俊辉 10 місяців тому
батько
коміт
8a4982ba23
4 змінених файлів з 274 додано та 1 видалено
  1. 48 0
      applications/ai.py
  2. 130 1
      applications/functions.py
  3. 0 0
      spider/__init__.py
  4. 96 0
      spider/toutiao.py

+ 48 - 0
applications/ai.py

@@ -0,0 +1,48 @@
+"""
+@author: luojunhui
+"""
+import json
+import requests
+from openai import OpenAI
+
+
+def kimi_ai(prompt):
+    """
+    Send one user message to Moonshot's ``moonshot-v1-8k`` chat model.
+
+    The API key is taken from the ``MOONSHOT_API_KEY`` environment variable
+    when it is set and non-empty; otherwise the key committed below is used.
+
+    :param prompt: text sent as the only (user) message of the conversation.
+    :return: the first choice's reply with the Markdown fence markers
+        (```json and ```) removed; an empty string if the reply has no content.
+    """
+    import os
+
+    # NOTE(review): the fallback key is committed to the repository and should
+    # be rotated; the environment override allows that without a code change.
+    api_key = (
+        os.environ.get("MOONSHOT_API_KEY")
+        or 'sk-5DqYCa88kche6nwIWjLE1p4oMm8nXrR9kQMKbBolNAWERu7q'
+    )
+    client = OpenAI(
+        api_key=api_key,
+        base_url="https://api.moonshot.cn/v1"
+    )
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ],
+        model="moonshot-v1-8k",
+    )
+    # The SDK types ``content`` as optional; treat a missing reply as empty
+    # text instead of failing on ``None.replace``.
+    content = chat_completion.choices[0].message.content or ""
+    return content.replace('```json', '').replace('```', '')
+
+
+def metaSo(prompt, mode="深入", timeout=None):
+    """
+    Forward a prompt to the internal metaso crawler endpoint.
+
+    :param prompt: query text, sent as the ``prompt`` field of the JSON body.
+    :param mode: value of the ``mode`` field; defaults to "深入".
+    :param timeout: optional ``requests`` timeout (seconds, or a
+        ``(connect, read)`` tuple). ``None`` waits indefinitely, which is the
+        behaviour callers had before this parameter existed.
+    :return: the ``data`` member of the JSON response.
+    :raises requests.HTTPError: if the service answers with a 4xx/5xx status.
+    """
+    url = "http://8.217.190.241:8888/crawler/metaso/meta_ai"
+    body = {
+        "prompt": prompt,
+        "mode": mode
+    }
+    header = {
+        "Content-Type": "application/json"
+    }
+    response = requests.post(url=url, json=body, headers=header, timeout=timeout)
+    # Report HTTP failures as such rather than as a confusing JSON/KeyError
+    # raised while unpacking an error page.
+    response.raise_for_status()
+    return response.json()['data']

+ 130 - 1
applications/functions.py

@@ -4,6 +4,9 @@
 from datetime import datetime, timedelta
 
 import requests
+import json
+import oss2
+from uuid import uuid4
 
 
 def generate_daily_strings(start_date, end_date):
@@ -62,4 +65,130 @@ def get_text(video_id):
         json=body,
         headers=header
     )
-    return response.json()
+    return response.json()
+
+
+def auto_upload_aigc():
+    """
+    POST a fixed crawler plan to the AIGC ``plan/save`` endpoint.
+
+    Every value in the request (plan name, sample cover/image OSS keys,
+    token) is hard-coded. The response body is printed, not returned.
+
+    :return: None
+    """
+    group_id = "fa9557a13208975a893777188f9e4b28"
+
+    # Key order inside each dict is kept as-is so the serialized JSON matches
+    # what this function has always sent.
+    cover_field = {
+        "inputValue": [
+            {
+                "fileName": "pqzf.png",
+                "ossKey": "upload/03bf695277827c2387133a1ac9290fd2.png",
+                "type": "image/png",
+                "size": 2978
+            }
+        ],
+        "fieldName": "cover",
+        "fieldType": 1,
+        "groupId": group_id
+    }
+    title_field = {
+        "fieldName": "title",
+        "fieldType": 0,
+        "groupId": group_id,
+        "inputValue": "412412412"
+    }
+    content_field = {
+        "fieldName": "content",
+        "fieldType": 0,
+        "groupId": group_id,
+        "inputValue": "12312442"
+    }
+    image_field = {
+        "fieldName": "image",
+        "fieldType": 1,
+        "groupId": group_id,
+        "inputValue": [
+            {
+                "fileName": "lehuo.png",
+                "ossKey": "upload/4bf6db57ccd1629909e070833aab8878.png",
+                "type": "image/png",
+                "size": 5085
+            }
+        ]
+    }
+    video_field = {"fieldName": "video", "fieldType": 2, "groupId": group_id}
+    audio_field = {"fieldName": "audio", "fieldType": 3, "groupId": group_id}
+    tag_field = {"fieldName": "tag", "fieldType": 0, "groupId": group_id}
+
+    plan_params = {
+        "contentFilters": [],
+        "accountFilters": [],
+        "filterAccountMatchMode": 1,
+        "filterContentMatchMode": 1,
+        "selectModeValues": [],
+        "imageSearchModeValues": [],
+        "contentModal": 3,
+        "analyze": {},
+        "crawlerComment": 0,
+        "inputGroup": [
+            [
+                cover_field,
+                title_field,
+                content_field,
+                image_field,
+                video_field,
+                audio_field,
+                tag_field,
+            ]
+        ],
+        "inputSourceGroups": [],
+        "modePublishTime": [],
+        "name": "junhui测试自动上传_by_python",
+        "frequencyType": 3,
+        "planType": 2
+    }
+    base_info = {
+        "token": "af54cdc404c3464d896745df389b2dce",
+        "appType": 9,
+        "platform": "pc",
+        "appVersionCode": 1000,
+        "clientTimestamp": 1,
+        "fid": 1,
+        "loginUid": 1,
+        "pageSource": 1,
+        "requestId": 1,
+        "rid": 1,
+        "uid": 1
+    }
+    request_headers = {
+        'Accept': 'application/json',
+        'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
+        'Content-Type': 'application/json',
+        'Origin': 'http://aigc-admin.cybertogether.net',
+        'Proxy-Connection': 'keep-alive',
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
+    }
+
+    serialized = json.dumps({"params": plan_params, "baseInfo": base_info})
+    response = requests.post(
+        "http://aigc-api.cybertogether.net/aigc/crawler/plan/save",
+        headers=request_headers,
+        data=serialized,
+    )
+    print(response.text)
+
+
+def upload_to_oss(local_path):
+    """
+    Upload a local file to the ``art-pubbucket`` OSS bucket.
+
+    The object key is a freshly generated UUID4 string. Credentials come from
+    ``OSS_ACCESS_KEY_ID`` / ``OSS_ACCESS_KEY_SECRET`` when those environment
+    variables are set and non-empty; otherwise the pair committed below is used.
+
+    :param local_path: path of the file to upload.
+    :return: the object key the file was stored under.
+    """
+    import os
+
+    oss_video_key = str(uuid4())
+    # NOTE(review): the fallback credentials are committed to the repository
+    # and should be rotated; the environment override allows that without a
+    # code change.
+    access_key_id = os.environ.get("OSS_ACCESS_KEY_ID") or "LTAIP6x1l3DXfSxm"
+    access_key_secret = (
+        os.environ.get("OSS_ACCESS_KEY_SECRET")
+        or "KbTaM9ars4OX3PMS6Xm7rtxGr1FLon"
+    )
+    endpoint = "oss-cn-hangzhou.aliyuncs.com"
+    bucket_name = "art-pubbucket"
+    bucket = oss2.Bucket(
+        oss2.Auth(access_key_id, access_key_secret), endpoint, bucket_name
+    )
+    bucket.put_object_from_file(key=oss_video_key, filename=local_path)
+    return oss_video_key

+ 0 - 0
spider/__init__.py


+ 96 - 0
spider/toutiao.py

@@ -0,0 +1,96 @@
+"""
+@author: luojunhui
+"""
+import urllib.parse
+import requests
+
+from fake_useragent import FakeUserAgent
+from lxml import etree
+
+
+def tunnel_proxies():
+    """
+    Build the ``proxies`` mapping for the Kuaidaili (kdltps.com) tunnel proxy.
+
+    Host and credentials default to the values committed below; each can be
+    overridden with ``KDL_PROXY_TUNNEL``, ``KDL_PROXY_USERNAME`` and
+    ``KDL_PROXY_PASSWORD`` (used when set and non-empty).
+
+    :return: dict with ``"http"`` and ``"https"`` keys, both holding the same
+        ``http://user:password@host:port/`` proxy URL.
+    """
+    import os
+
+    # NOTE(review): the fallback credentials are committed to the repository
+    # and should be rotated.
+    tunnel = os.environ.get("KDL_PROXY_TUNNEL") or "q796.kdltps.com:15818"
+    username = os.environ.get("KDL_PROXY_USERNAME") or "t17772369458618"
+    password = os.environ.get("KDL_PROXY_PASSWORD") or "5zqcjkmy"
+    # Percent-encode the userinfo so characters such as "@", ":" or "/" in a
+    # credential cannot change how the URL is split; plain alphanumerics
+    # (the defaults) pass through unchanged.
+    proxy_url = "http://%s:%s@%s/" % (
+        urllib.parse.quote(username, safe=""),
+        urllib.parse.quote(password, safe=""),
+        tunnel,
+    )
+    return {"http": proxy_url, "https": proxy_url}
+
+
+def search_article(title, timeout=30):
+    """
+    Search Toutiao's news tab for a title and collect the result links.
+
+    :param title: search keyword, sent as the ``keyword`` query parameter.
+    :param timeout: ``requests`` timeout in seconds (or a ``(connect, read)``
+        tuple) so a stalled connection cannot block the caller forever.
+    :return: list of result URLs with the ``/search/jump?url=`` prefix removed,
+        percent-decoded, and ``http://www.toutiao.com/a`` rewritten to
+        ``https://www.toutiao.com/article/``; empty if nothing could be parsed.
+    """
+    url = "https://so.toutiao.com/search"
+    params = {
+        "dvpf": "pc",
+        "source": "search_subtab_switch",
+        "keyword": title,
+        "page_num": 0,
+        "pd": "information",
+        "action_type": "search_subtab_switch",
+        "search_id": "",
+        "from": "news",
+        "cur_tab_title": "news"
+    }
+    # NOTE(review): the cookie below is a captured browser session and will
+    # presumably expire — confirm how it is meant to be refreshed.
+    headers = {
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
+        'Cache-Control': 'max-age=0',
+        'Connection': 'keep-alive',
+        'Cookie': '__ac_nonce=06656db2c0044df592020; __ac_signature=_02B4Z6wo00f01f7SgiAAAIDA3giITC7VZDn-8oaAABnk69; msToken=2BrMcxLg3_PsS1iKcjuLpU1GS9iZsZ-51KSQTgUSRRLhGQqsQV3zKuJR49smQ7f8hQ8fahWtYCKC6TKJO3kR8022S-NsNfdHXu7X7mPM; ttwid=1%7C9b5sTIuwZxZKt0wFsvE-2t5OoFxH_Q5VIpVNWEREbAo%7C1716968863%7C3459387c0d736a3410ba2c5dbdaeb61a7f85f161bbf6cb1a4f646c2a2edc9aa5;',
+        'Sec-Fetch-Dest': 'document',
+        'Sec-Fetch-Mode': 'navigate',
+        'Sec-Fetch-Site': 'none',
+        'Sec-Fetch-User': '?1',
+        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': FakeUserAgent().random,
+    }
+    response = requests.get(url=url, headers=headers, params=params, timeout=timeout)
+    html = etree.HTML(response.text)
+    if html is None:
+        # Guard for the case where the parser produces no root element (for
+        # example an empty body); report "no results" instead of failing on
+        # ``None.xpath``.
+        return []
+    xpath = '//div[@class="cs-view cs-view-block cs-card-content"]//a/@href'
+    jump_links = html.xpath(xpath)
+    decoded_links = [
+        urllib.parse.unquote(link.replace("/search/jump?url=", ""))
+        for link in jump_links
+    ]
+    return [
+        link.replace("http://www.toutiao.com/a", "https://www.toutiao.com/article/")
+        for link in decoded_links
+    ]
+
+
+def parse_detail(url, timeout=30):
+    """
+    Fetch a Toutiao article page through the tunnel proxy and extract its parts.
+
+    :param url: article URL; also sent as the ``referer`` header.
+    :param timeout: ``requests`` timeout in seconds (or a ``(connect, read)``
+        tuple) so a stalled proxy connection cannot block the caller forever.
+    :return: dict with ``article_url`` (the input URL), ``text`` (paragraph
+        texts joined with newlines), ``img_urls`` (list of image ``src``
+        values) and ``title`` (the list of matched ``h1`` text nodes — a
+        list, not a string). The extracted fields are empty when the page
+        could not be parsed.
+    """
+    # NOTE(review): the cookie below is a captured browser session and will
+    # presumably expire — confirm how it is meant to be refreshed.
+    headers = {
+        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'accept-language': 'en',
+        'cookie': '__ac_signature=_02B4Z6wo00f01uL1Y5QAAIDDwi9p-RyULmbi1WcAAN8B5b; tt_webid=7371293454351697471; ttcid=a7499fc4f17243e1a6f1d47fc054799e16; _ga=GA1.1.1771235425.1716434457; s_v_web_id=verify_lwiorbpi_iEdehlbX_70wd_4ldg_Bctq_iI8TdJIzXVnI; csrftoken=ee756af695a449eeb73b5a3fc78978b2; _S_DPR=2.200000047683716; _S_IPAD=0; notRedShot=1; local_city_cache=%E5%8C%97%E4%BA%AC; gfkadpd=24,6457; _S_WIN_WH=1336_726; ttwid=1%7C9b5sTIuwZxZKt0wFsvE-2t5OoFxH_Q5VIpVNWEREbAo%7C1717060308%7C4203558278ecaad696c5c5d3ff2bbab682c3ae0f89b876ab618b4646f04c81ef; tt_scid=GNVu.6vOxpT8JE14FSVyGcUl7IItTWcIP.qGQ.gRujo02xjV1BqV1tKznH7NlULMc251; msToken=dG4Y0x3XtlJGBx5WEqYilfjCRWJq629eYRg-hLWdfiTavnI5szx-h9KwUUAggGcG2i03AksGmCfQ034JCbnfWnTnhuguXU1yISZMr6YE; _ga_QEHZPBE5HH=GS1.1.1717057829.7.1.1717060394.0.0.0',
+        'priority': 'u=0, i',
+        'referer': url,
+        'upgrade-insecure-requests': '1',
+        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
+    }
+    # The empty ``data`` payload previously passed here added no request body,
+    # so it is dropped.
+    response = requests.get(
+        url,
+        headers=headers,
+        proxies=tunnel_proxies(),
+        timeout=timeout,
+    )
+    html = etree.HTML(response.text)
+    if html is None:
+        # Guard for the case where the parser produces no root element; keep
+        # the usual result shape with empty fields instead of failing on
+        # ``None.xpath``.
+        paragraphs, img_list, title = [], [], []
+    else:
+        paragraphs = html.xpath('//div[@class="article-content"]//p/text()')
+        img_list = html.xpath('//div[@class="pgc-img"]/img/@src')
+        title = html.xpath('//div[@class="article-content"]/h1/text()')
+    return {
+        "article_url": url,
+        "text": "\n".join(paragraphs),
+        "img_urls": img_list,
+        "title": title
+    }