فهرست منبع

文章生成字数: 1000+
公众号修改为: 无忧潮生活

罗俊辉 11 ماه پیش
والد
کامیت
c1f70866aa

+ 2 - 2
applications/functions.py

@@ -146,7 +146,7 @@ def title_filter(title_list):
     """
     url = "http://61.48.133.26:8179/score_list"
     body = {
-        "account_nickname_list": ["生活良读"],
+        "account_nickname_list": ["无忧潮生活"],
         "text_list": title_list,
         "max_time": None,
         "min_time": None,
@@ -155,6 +155,6 @@ def title_filter(title_list):
         "rate": 0.1
     }
     response = requests.post(url=url, headers={}, json=body).json()
-    score_list = response['生活良读']['score_list']
+    score_list = response['无忧潮生活']['score_list']
     # title_score_list = list(zip(title_list, score_list))
     return score_list

+ 1 - 1
applications/migrate.py

@@ -26,7 +26,7 @@ def migrate_daily(dt):
     print("{} successfully insert {} rows, totally cost {} seconds".format(dt, len(data), b - a))
 
 
-dt_list = generate_daily_strings("20240626", "20240702")
+dt_list = generate_daily_strings("20240705", "20240707")
 for dt in dt_list:
     print(dt)
     migrate_daily(dt)

+ 7 - 2
deal/videos_deal.py

@@ -82,10 +82,15 @@ class VideoDeal(object):
         """
         result_list = await self.mysql_client.select(sql)
         title_list = [i[1] for i in result_list]
-        socre_list = title_filter(title_list)
+        score_list = title_filter(title_list)
+        # print(score_list)
+        res = list(zip(score_list, title_list))
+        s_r = sorted(res, key=lambda x:x[0], reverse=True)
+        for i in s_r:
+            print(i)
         result_list_final = []
         for index, item in enumerate(result_list):
-            if socre_list[index] > 0.45:
+            if score_list[index] > 0.5:
                 result_list_final.append(item)
         return self.response_obj(result_list_final)
 

+ 133 - 0
spider/baijiahao_article.py

@@ -0,0 +1,133 @@
+"""
+@author: luojunhui
+"""
+import time
+import requests
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from webdriver_manager.chrome import ChromeDriverManager
+
+
+def tunnel_proxies():
+    """
+    Build the requests-style proxy mapping for the Kuaidaili tunnel proxy.
+    :return: dict with "http" and "https" keys, both set to the same authenticated tunnel URL
+    """
+    # Tunnel host:port
+    tunnel = "l901.kdltps.com:15818"
+
+    # Username/password auth. NOTE(review): credentials are hard-coded in source -- consider loading them from config.
+    username = "t11983523373311"
+    password = "mtuhdr2z"
+    proxies = {
+        "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
+        "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
+    }
+    return proxies
+
+
+def bjh_url_list(search_title):
+    """
+    Query the lab.magiconch.com Baidu image API with search_title and collect the Baijiahao URLs found in the results.
+    :return: list of result URLs containing "baijiahao.baidu.com", each cut at its first "&"
+    """
+    url = "https://lab.magiconch.com/api/baidu/images"
+    params = {
+        "text": search_title,
+        "index": 0,
+        "size": 60
+    }
+    headers = {
+        'accept': '*/*',
+        'accept-language': 'en,zh;q=0.9,zh-CN;q=0.8',
+        'content-type': 'application/json',
+        'cookie': 'Hm_lvt_f4e477c61adf5c145ce938a05611d5f0=1718784293; Hm_lpvt_f4e477c61adf5c145ce938a05611d5f0=1718784293',
+        'if-none-match': 'W/"5e03-9dK2z/6rD0/7aX0R6HraLuFnLjI"',
+        'priority': 'u=1, i',
+        'referer': 'https://lab.magiconch.com/baidu-images/',
+        'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"macOS"',
+        'sec-fetch-dest': 'empty',
+        'sec-fetch-mode': 'cors',
+        'sec-fetch-site': 'same-origin',
+        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
+    }
+    response = requests.request(
+        "GET",
+        url,
+        headers=headers,
+        params=params,
+        proxies=tunnel_proxies()  # send the request through the tunnel proxy
+    )
+    res = response.json()  # iterated below as a sequence of dicts that each carry a 'url' key
+    url_list = []
+    for item in res:
+        if "baijiahao.baidu.com" in item['url']:
+            url_list.append(item['url'].split("&")[0])  # keep only the part before the first "&"
+    return url_list
+
+
+def bjh_article(content_url):
+    """
+    Load a Baijiahao article in headless Chrome and extract its title, text and images.
+    :param content_url: URL of the article page to open
+    :return: tuple (title, page_text, img_url_list)
+    """
+    print(content_url)
+    # Configure headless Chrome
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")
+    chrome_options.add_argument("--disable-gpu")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+
+    # Install (if needed) and start the Chrome driver
+    service = Service(ChromeDriverManager().install())
+    driver = webdriver.Chrome(service=service, options=chrome_options)
+
+    def scroll_page():
+        # Scroll to the bottom repeatedly until the page height stops growing
+        last_height = driver.execute_script("return document.body.scrollHeight")
+        while True:
+            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            # Give newly loaded content time to render before measuring again
+            time.sleep(2)
+            new_height = driver.execute_script("return document.body.scrollHeight")
+            if new_height == last_height:
+                break
+            last_height = new_height
+
+    # try/finally: always shut the browser down, even when an element lookup
+    # raises -- callers invoke this in a loop and would otherwise leak Chrome processes
+    try:
+        driver.get(content_url)
+        # Sets the timeout used by later element lookups; it does not block here
+        driver.implicitly_wait(5)
+        scroll_page()
+        title_element = driver.find_element(By.XPATH, '//div[@class="sKHSJ"]')
+        title = title_element.text
+        page_text_elements = driver.find_elements(By.XPATH, '//div[@data-testid="article"]//p')
+        page_text = '\n'.join([element.text for element in page_text_elements])
+        img_elements = driver.find_elements(By.XPATH, '//div[@class="_1NCGf"]/img')
+        img_url_list = [element.get_attribute("src") for element in img_elements]
+    finally:
+        driver.quit()
+    # Echo the extracted content for debugging
+    print(title)
+    print(page_text)
+    print(img_url_list)
+    return title, page_text, img_url_list
+
+
+if __name__ == '__main__':
+    title = "老祖宗说“寿有三不过,子孙福气多”!原来话里藏着大秘密!"
+    url_list = bjh_url_list(title)
+    # Manual smoke run: fetch every Baijiahao hit for one sample title
+    for url in url_list:
+        try:
+            bjh_article(url)
+        except Exception as exc:  # keep going after a bad page, but report it; Ctrl-C still stops the run
+            print("bjh_article failed for {}: {}".format(url, exc))

+ 0 - 15
spider/tencent_news.py

@@ -1,15 +0,0 @@
-"""
-@author: luojunhui
-"""
-import requests
-
-url = "https://new.qq.com/search?query=%E5%8C%97%E5%A4%A7%E6%95%99%E6%8E%88%E5%A4%AB%E4%BA%BA96%E5%B2%81%E9%80%9D%E4%B8%96%EF%BC%8C%E7%94%9F%E5%89%8D%E6%9B%BE%E6%96%AD%E9%A3%9F%E6%96%AD%E6%B0%B44%E5%A4%A9%EF%BC%8C%E7%95%99%E4%B8%8B%E4%B8%80%E5%8F%A5%E8%AF%9D%E4%BB%A4%E4%BA%BA%E6%B7%B1%E6%80%9D&page=1"
-
-
-headers = {
-    "Content-Type": "application/json",
-    "cookie": "RK=kreEdgt2YJ; ptcz=988b2dee721fc7f396a696a31bcfaca33cdb372f1b881ee5affbce5e5d978e8c; _qimei_uuid42=186031009051009d7cd1945011a64a99cb68d2482e; _qimei_q36=; _qimei_h38=428c111f7cd1945011a64a990300000ca18603; pgv_pvid=2616476048; pgv_pvi=2160320512; pgv_si=s1462014976; pgv_info=ssid=s1768029950; pac_uid=0_ddQwmCn3ZjrMh; _qimei_fingerprint=6326615306fcfb00937ca380512eb6b7; current-city-name=bj; ad_play_index=9; suid=0_ddQwmCn3ZjrMh; lcad_o_minduid=U_QlocTufXCe5zVOsVXp6pMQbmOA_IyY; lcad_appuser=603532C44F72F827; lcad_Lturn=937; lcad_LKBturn=178; lcad_LPVLturn=762; lcad_LPLFturn=93"
-}
-response = requests.get(url, headers=headers)
-
-print(response.text)

+ 0 - 96
spider/wechatSogou.py

@@ -1,96 +0,0 @@
-"""
-@author: luojunhui
-"""
-import time
-import random
-import requests
-from lxml import etree
-from fake_useragent import FakeUserAgent
-from gne import GeneralNewsExtractor
-from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
-from webdriver_manager.chrome import ChromeDriverManager
-
-
-def selenium_text(url):
-    # 配置 Chrome 选项
-    chrome_options = Options()
-    chrome_options.add_argument('--headless')  # 无头模式
-    # chrome_options.add_argument('--disable-gpu')
-    # chrome_options.add_argument('--no-sandbox')
-    # chrome_options.add_argument('--disable-dev-shm-usage')
-    # chrome_options.add_argument('--disable-blink-features=AutomationControlled')
-    # chrome_options.add_argument(
-    #     f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
-    # chrome_options.add_argument('--incognito')
-    window_width = random.randint(800, 1200)
-    window_height = random.randint(600, 800)
-
-
-    # chrome_options.add_argument('--proxy-server=http://t17772369458618:5zqcjkmy@q796.kdltps.com:15818/')
-    service = Service(ChromeDriverManager().install())
-    driver = webdriver.Chrome(service=service, options=chrome_options)
-    driver.set_window_size(window_width, window_height)
-    driver.get(url)
-    page_text = driver.page_source
-    driver.quit()
-    return page_text
-
-
-def tunnel_proxies():
-    # 隧道域名:端口号
-    tunnel = "q796.kdltps.com:15818"
-    # 用户名密码方式
-    username = "t17772369458618"
-    password = "5zqcjkmy"
-    proxies = {
-        "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
-        "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
-    }
-    return proxies
-
-
-def extract(url):
-    """
-    ttt
-    :param url:
-    :return:
-    """
-    html_text = selenium_text(url)
-    extractor = GeneralNewsExtractor()
-    result = extractor.extract(html_text)
-    print(result)
-
-
-def sogou_wechat(keyword):
-    """
-    :param keyword:
-    :return:
-    """
-    url = "https://weixin.sogou.com/weixin?type=2&query={}".format(keyword)
-    print(url)
-    headers = {
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-        'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
-        'Cache-Control': 'max-age=0',
-        'Connection': 'keep-alive',
-        'Cookie': '',
-        'Referer': 'https://weixin.sogou.com/weixin',
-        'Upgrade-Insecure-Requests': '1',
-        'User-Agent': FakeUserAgent().chrome
-    }
-
-    response = requests.request("GET", url, headers=headers, proxies=tunnel_proxies())
-    e_tree = etree.HTML(response.text)
-
-    xpath = r'//ul[@class="news-list"]/li/div/a/@href'
-
-    url_list = e_tree.xpath(xpath)
-    url_list = ["https://weixin.sogou.com/" + i for i in url_list]
-    for url in url_list:
-        print(url)
-        extract(url)
-
-
-sogou_wechat("人民日报")

+ 3 - 0
task/__init__.py

@@ -0,0 +1,3 @@
+"""
+@author: luojunhui
+"""

+ 95 - 0
task/baidu_search_task.py

@@ -0,0 +1,95 @@
+"""
+@author: luojunhui
+"""
+import json
+import time
+
+import requests
+from uuid import uuid4
+
+from applications.config import db_config
+from applications.functions import whisper
+from applications.ai import kimi_ai
+from spider.baijiahao_article import bjh_article, bjh_url_list
+
+
+class BaiduGenerateTask(object):
+    """
+    Video-to-article matching workflow that uses Baijiahao search results.
+    Steps:
+    1. Take the video id, title and related info to match an account
+    """
+
+    def __init__(self, mysql_client):
+        """
+        :param mysql_client: MySQL client pool used for every query in this task
+        """
+        self.mysql_client = mysql_client
+
+    async def whisper_task(self):
+        """
+        Scheduled task: transcribe stored videos to text (the query picks one status_code = 0 row per run).
+        :return: None
+        """
+        select_sql = f"""SELECT video_id FROM {db_config} WHERE status_code = 0 ORDER BY id ASC limit 1;"""
+        video_list = await self.mysql_client.select(select_sql)
+
+        async def whisper_and_update(video_id, mysql_client):
+            """
+            Run whisper on one video and write the transcript back to the MySQL table.
+            :param video_id: id of the video to transcribe
+            :param mysql_client: client used to run the UPDATE
+            :return: None
+            """
+            try:
+                w_response = whisper(video_id)
+            except Exception:  # not a bare except: task cancellation and Ctrl-C must propagate
+                w_response = {"text": "whisper failed"}
+            print(w_response)
+            text = w_response['text'].replace("'", "")
+            update_sql = f"""
+            UPDATE {db_config}
+            SET 
+                video_text = %s,
+                status_code = %s
+            WHERE video_id = %s;
+            """
+            print(update_sql)
+            await mysql_client.async_insert(sql=update_sql, params=(text, 1, video_id))
+
+        for vid in video_list:
+            await whisper_and_update(video_id=vid[0], mysql_client=self.mysql_client)
+
+    async def materials_task(self):
+        """
+        Collect article materials for a task: search Baijiahao by video title and store what was found.
+        :return: None
+        """
+        select_sql = f"""SELECT task_id, video_title, video_text FROM {db_config} WHERE status_code = 1 ORDER BY id ASC limit 1;"""
+        task_list = await self.mysql_client.select(select_sql)
+
+        async def baidu_search(task_tuple, mysql_client):
+            """
+            :param task_tuple: (task_id, video_title, video_text) row from the select above
+            :param mysql_client: client used to run the UPDATE
+            """
+            task_id, title, text = task_tuple
+            url_list = bjh_url_list(title)
+            materials = []
+            for url in url_list:
+                _a_title, a_text, img_list = bjh_article(url.split("&")[0])  # bjh_article returns (title, text, images)
+                obj = {
+                    "text": a_text,
+                    "img_list": img_list
+                }
+                materials.append(obj)
+            update_sql = f"""
+            UPDATE {db_config}
+            SET materials = %s, status_code = %s
+            WHERE task_id = %s;
+            """
+            print(update_sql)  # a list of dicts is not a bindable SQL value, so it is stored as a JSON string below
+            await mysql_client.async_insert(sql=update_sql, params=(json.dumps(materials, ensure_ascii=False), 2, task_id))
+
+        for task in task_list:
+            await baidu_search(task, self.mysql_client)

+ 114 - 0
task/wx_search_task.py

@@ -0,0 +1,114 @@
+"""
+@author: luojunhui
+"""
+import json
+import time
+
+import requests
+from uuid import uuid4
+
+from applications.config import db_config
+from applications.functions import whisper
+
+
+class wxGenerateTask(object):
+    """
+    Video-to-article matching workflow that uses WeChat article search.
+    Steps:
+    1. Take the video id, title and related info to match an account
+    """
+
+    def __init__(self, mysql_client):
+        """
+        :param mysql_client: MySQL client pool used for every query in this task
+        """
+        self.mysql_client = mysql_client
+
+    async def whisper_task(self):
+        """
+        Scheduled task: transcribe stored videos to text (the query picks one status_code = 0 row per run).
+        :return: None
+        """
+        select_sql = f"""SELECT video_id FROM {db_config} WHERE status_code = 0 ORDER BY id ASC limit 1;"""
+        video_list = await self.mysql_client.select(select_sql)
+
+        async def whisper_and_update(video_id, mysql_client):
+            """
+            Run whisper on one video and write the transcript back to the MySQL table.
+            :param video_id: id of the video to transcribe
+            :param mysql_client: client used to run the UPDATE
+            :return: None
+            """
+            try:
+                w_response = whisper(video_id)
+            except Exception:  # not a bare except: task cancellation and Ctrl-C must propagate
+                w_response = {"text": "whisper failed"}
+            print(w_response)
+            text = w_response['text'].replace("'", "")
+            update_sql = f"""
+            UPDATE {db_config}
+            SET 
+                video_text = %s,
+                status_code = %s
+            WHERE video_id = %s;
+            """
+            print(update_sql)
+            await mysql_client.async_insert(sql=update_sql, params=(text, 1, video_id))
+
+        for vid in video_list:
+            await whisper_and_update(video_id=vid[0], mysql_client=self.mysql_client)
+
+    @classmethod
+    def search_articles(cls, title):
+        """
+        Search WeChat articles by keyword through the crawler service.
+        :return: the service's JSON response, decoded
+        """
+        url = "http://8.217.190.241:8888/crawler/wei_xin/keyword"
+        payload = json.dumps({
+            "keyword": title,
+            "cursor": "1"
+        })
+        headers = {
+            'Content-Type': 'application/json'
+        }
+
+        response = requests.request("POST", url, headers=headers, data=payload)
+        return response.json()
+
+    @classmethod
+    def get_article_text(cls, content_link):
+        """
+        Fetch one WeChat article's detail through the crawler service.
+        :param content_link: URL of the article
+        :return: the service's JSON response, decoded
+        """
+        url = "http://8.217.190.241:8888/crawler/wei_xin/detail"
+        payload = json.dumps({
+            "content_link": content_link,
+            "is_count": False,
+            "is_ad": False
+        })
+        headers = {
+            'Content-Type': 'application/json'
+        }
+        response = requests.request("POST", url, headers=headers, data=payload)
+        return response.json()
+
+
+if __name__ == '__main__':
+    wgt = wxGenerateTask(mysql_client="client")
+    text = wgt.get_article_text("https://mp.weixin.qq.com/s/BzLrY7QD_XzLzvCq2ScodA")
+    img_list = text['data']['data']['image_url_list']
+    img_list = [i['image_url'] for i in img_list]
+    cover = img_list[0] if img_list else None  # an article without images must not crash the demo
+    title = text['data']['data']['title']
+    res = {
+        "text": text['data']['data']['body_text'],
+        "title": title,
+        "cover": cover,
+        "img_list": img_list
+    }
+    print(json.dumps(res, ensure_ascii=False))
+
+

+ 0 - 11
test/ai_dev.py

@@ -1,11 +0,0 @@
-"""
-@author: luojunhui
-"""
-from applications.ai import tencent_ai
-
-
-prompt = "易中天对退休人的忠告,看了才没白活, 通过这个标题搜索相关文章"
-
-res = tencent_ai(prompt)
-
-print(res)

+ 41 - 0
test/baidu_img_test.py

@@ -0,0 +1,41 @@
+import json
+
+import requests
+
+
+def get_img_list(search_title):
+    """
+    Query the lab.magiconch.com Baidu image API with search_title.
+    :return: the decoded JSON response, unfiltered
+    """
+    url = "https://lab.magiconch.com/api/baidu/images"
+    params = {
+        "text": search_title,
+        "index": 0,
+        "size": 60
+    }
+    headers = {
+        'accept': '*/*',
+        'accept-language': 'en,zh;q=0.9,zh-CN;q=0.8',
+        'content-type': 'application/json',
+        'cookie': 'Hm_lvt_f4e477c61adf5c145ce938a05611d5f0=1718784293; Hm_lpvt_f4e477c61adf5c145ce938a05611d5f0=1718784293',
+        'if-none-match': 'W/"5e03-9dK2z/6rD0/7aX0R6HraLuFnLjI"',
+        'priority': 'u=1, i',
+        'referer': 'https://lab.magiconch.com/baidu-images/',
+        'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"macOS"',
+        'sec-fetch-dest': 'empty',
+        'sec-fetch-mode': 'cors',
+        'sec-fetch-site': 'same-origin',
+        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
+    }
+    response = requests.request("GET", url, headers=headers, params=params)  # direct request, no proxy
+    res = response.json()
+    return res
+
+
+title = "周总理巧解十二生肖!"  # sample search title for this manual check
+r = get_img_list(title)
+for i in r:  # pretty-print every returned item, keeping non-ASCII text readable
+    print(json.dumps(i, ensure_ascii=False, indent=4))

تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 4 - 0
test/mysql.py


+ 0 - 0
test/t.py


تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 19 - 195
test/test4.py


+ 8 - 4
test/videos_dev.py

@@ -14,9 +14,9 @@ url = "http://localhost:8888/videos"
 
 body = {
     "cate": "video_return",
-    "start_date": "2024-07-02",
-    "end_date": "2024-07-03",
-    "topN": 500
+    "start_date": "2024-06-01",
+    "end_date": "2024-07-06",
+    "topN": 800
 }
 a = time.time()
 header = {
@@ -26,4 +26,8 @@ header = {
 response = requests.post(url, json=body, headers=header, timeout=600)
 b = time.time()
 print(b - a)
-print(json.dumps(response.json(), ensure_ascii=False, indent=4))
+print(len(response.json()['data']))
+for i in response.json()['data']:
+    print(i['title'])
+    print(i['video_url'])
+    print("\n")

برخی فایل ها در این مقایسه diff نمایش داده نمی شوند زیرا تعداد فایل ها بسیار زیاد است