
Merge branch '2025-02-06-filter-title-while-crawling' of luojunhui/LongArticlesJob into master

luojunhui 2 months ago
parent
commit
020b4cfd76

+ 2 - 2
applications/llm_sensitivity.py

@@ -4,11 +4,11 @@
 
 import json
 from openai import OpenAI
-import pandas as pd
+
 
 def request_llm_api(prompt, text):
     client = OpenAI(
-        api_key='sk-30d00642c8b643cab3f54e5672f651c9',
+        api_key='sk-c1b18099dadc4dd1b48239bdde184f6c',
         base_url="https://api.deepseek.com"
     )
     chat_completion = client.chat.completions.create(
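
The hunk above swaps one hardcoded DeepSeek key for another. A safer pattern is to read the key from the environment so rotations never touch the source tree; a minimal sketch, assuming the key is exported as DEEPSEEK_API_KEY and that prompt and text are simply concatenated (both assumptions, since the full function body is not shown):

import os
from openai import OpenAI

def request_llm_api(prompt, text):
    client = OpenAI(
        # assumed variable name; not part of the repo
        api_key=os.environ["DEEPSEEK_API_KEY"],
        base_url="https://api.deepseek.com"
    )
    chat_completion = client.chat.completions.create(
        model="deepseek-chat",
        # assumed message format; the original call is truncated above
        messages=[{"role": "user", "content": f"{prompt}\n{text}"}],
    )
    return chat_completion.choices[0].message.content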

+ 11 - 2
coldStartTasks/crawler/toutiao_recommend_crawler.py

@@ -15,6 +15,7 @@ from applications import log
 from applications import Functions
 from applications.db import DatabaseConnector
 from config import long_articles_config
+from coldStartTasks.filter import article_crawler_duplicate_filter
 
 functions = Functions()
 
@@ -91,11 +92,19 @@ class ToutiaoRecommendCrawler(object):
         :param category
         :return:
         """
+        title = item['title']
+        if article_crawler_duplicate_filter(new_article_title=title, db_client=self.db_client):
+            log(
+                function='toutiao_recommend_crawler',
+                task='toutiao_recommend',
+                message='duplicate title'
+            )
+            return
+
         item_id = item.get('item_id')
         article_url = item['article_url']
         like_count = item['like_count']
         read_count = item['read_count']
-        title = item['title']
         user_info = item['user_info']
         user_id = user_info.get('user_id')
         abstract = item['Abstract']
@@ -125,7 +134,7 @@ class ToutiaoRecommendCrawler(object):
             )
         )
 
-    def process_recommendation(self, category, recommendation) -> Dict:
+    def process_recommendation(self, category, recommendation):
         """
        Process an article from the recommendation feed
         :param recommendation
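
The early return above is the whole integration surface of the new filter: any db_client exposing a select(sql) method works. A minimal sketch of the expected behavior with a stub client (the stub is illustrative, not part of the repo):

from coldStartTasks.filter import article_crawler_duplicate_filter

class StubDBClient:
    """Pretends every queried title already exists in crawler_meta_article."""
    def select(self, sql):
        return [(1,)]  # any non-empty result means "duplicate found"

# A non-empty select result makes the filter report a duplicate.
assert article_crawler_duplicate_filter(
    new_article_title="some title", db_client=StubDBClient()
) is True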

+ 13 - 1
coldStartTasks/crawler/weixinCategoryCrawler.py

@@ -6,7 +6,8 @@
 import time
 
 from tqdm import tqdm
-from applications import WeixinSpider, Functions, llm_sensitivity
+from applications import WeixinSpider, Functions, llm_sensitivity, log
+from coldStartTasks.filter import article_crawler_duplicate_filter
 
 # 常量
 ACCOUNT_GOOD_STATUS = 1
@@ -60,6 +61,17 @@ class weixinCategory(object):
             detail_article_list = article_obj["AppMsg"]["DetailInfo"]
             for obj in detail_article_list:
                 try:
+                    # check whether an article with the same title already exists
+                    if article_crawler_duplicate_filter(
+                        new_article_title=obj["Title"], db_client=self.db_client_lam
+                    ):
+                        log(
+                            function="weixinCategory",
+                            task="weixinCategory",
+                            message="duplicate article",
+                            data={"title": obj["Title"]}
+                        )
+                        continue
                     show_stat = self.function.show_desc_to_sta(obj["ShowDesc"])
                     show_view_count = show_stat.get("show_view_count", DEFAULT_VIEW_COUNT)
                     show_like_count = show_stat.get("show_like_count", DEFAULT_LIKE_COUNT)
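
The filter-log-skip sequence above is the same pattern the Toutiao crawler adds, just with continue instead of return. If the pattern spreads further it could be consolidated into a small helper; a hypothetical sketch (the helper name and task argument are illustrative, not part of the repo):

from applications import log
from coldStartTasks.filter import article_crawler_duplicate_filter

def skip_duplicate_title(title, db_client, task):
    """Return True (after logging) when `title` already exists in crawler_meta_article."""
    if article_crawler_duplicate_filter(new_article_title=title, db_client=db_client):
        log(function=task, task=task, message="duplicate title", data={"title": title})
        return True
    return False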

+ 12 - 0
coldStartTasks/crawler/weixin_video_crawler.py

@@ -17,6 +17,7 @@ from applications import Functions
 from applications import WeixinSpider
 from applications import longArticlesMySQL
 from applications.const import WeixinVideoCrawlerConst
+from coldStartTasks.filter import video_crawler_duplicate_filter
 
 spider = WeixinSpider()
 functions = Functions()
@@ -158,6 +159,17 @@ class WeixinVideoCrawler(object):
                    # check whether this video link was already downloaded; skip it if so
                     if self.is_downloaded(url_unique):
                         continue
+
+                    # check whether the title is a duplicate
+                    if video_crawler_duplicate_filter(article_url, self.db_client):
+                        log(
+                            task='weixin_video_crawler',
+                            function="insert_msg_list",
+                            message="duplicate title",
+                            data={"url": article_url}
+                        )
+                        continue
+
                     try:
                         download_path = functions.download_gzh_video(article_url)
                         if download_path:
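
Note that video_crawler_duplicate_filter is called with article_url here, while the filter (added below) compares against article_title, so the lookup can never match. A sketch of the presumably intended call, assuming the video's title is available in a local variable named title at this point in the loop (it is not shown in the hunk):

# pass the title, which is what the filter's SQL actually compares against
if video_crawler_duplicate_filter(new_video_title=title, db_client=self.db_client):
    log(
        task='weixin_video_crawler',
        function="insert_msg_list",
        message="duplicate title",
        data={"url": article_url}
    )
    continue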

+ 38 - 0
coldStartTasks/filter/__init__.py

@@ -0,0 +1,38 @@
+"""
+@author: luojunhui
+"""
+
+
+def article_crawler_duplicate_filter(new_article_title, db_client) -> bool:
+    """
+    filter out articles whose title already exists in crawler_meta_article
+    :param new_article_title: candidate article title
+    :return: True if a duplicate exists, otherwise False
+    """
+    select_sql = f"""
+        select article_id from crawler_meta_article where title = '{new_article_title}';
+    """
+    response = db_client.select(select_sql)
+    if response:
+        return True
+    else:
+        return False
+
+
+def video_crawler_duplicate_filter(new_video_title, db_client) -> bool:
+    """
+    filter out videos whose title already exists in publish_single_video_source
+    :param new_video_title: candidate video title
+    :return: True if a duplicate exists, otherwise False
+    """
+    select_sql = f"""
+        select article_title from publish_single_video_source where article_title = '{new_video_title}';
+    """
+    response = db_client.select(select_sql)
+    if response:
+        return True
+    else:
+        return False
+
+
+
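
Both filters interpolate the title straight into the SQL string, which breaks on titles containing a single quote and is SQL-injection-prone. A parameterized sketch, assuming db_client.select accepts a params tuple the way pymysql-style wrappers do (if it does not, the driver's escaping facilities are the alternative):

def article_crawler_duplicate_filter(new_article_title, db_client) -> bool:
    """
    filter out articles whose title already exists in crawler_meta_article
    :param new_article_title: candidate article title
    :return: True if a duplicate exists, otherwise False
    """
    select_sql = """
        select article_id from crawler_meta_article where title = %s;
    """
    # let the driver quote and escape the title instead of an f-string
    response = db_client.select(select_sql, (new_article_title,))
    return bool(response)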

+ 4 - 1
coldStartTasks/publish/publishCategoryArticles.py

@@ -121,11 +121,13 @@ class CategoryColdStartTask(object):
                 WHERE
                     title in %s and status = %s;
                 """
-                self.db_client.update(
+                affected_rows = self.db_client.update(
                     sql=update_sql,
                     params=(self.PUBLISHED_STATUS, tuple(title_list), self.INIT_STATUS)
                 )
+                print(affected_rows)
         else:
+            print("failed to get plan id")
             return
 
     def change_article_status_while_publishing(self, article_id_list):
@@ -361,3 +363,4 @@ class CategoryColdStartTask(object):
                         "traceback": traceback.format_exc()
                     }
                 )
+
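
The two print calls added to publishCategoryArticles.py are debugging output; elsewhere in this same changeset the repo's log helper is used for that purpose. A sketch of the equivalent structured logging (task and function names here are illustrative, not part of the repo):

from applications import log

affected_rows = self.db_client.update(
    sql=update_sql,
    params=(self.PUBLISHED_STATUS, tuple(title_list), self.INIT_STATUS)
)
# replaces print(affected_rows) with a structured record
log(
    task="publish_category_articles",
    function="publish_articles",
    message="article status updated after publishing",
    data={"affected_rows": affected_rows}
)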