Ver código fonte

Merge branch '2025-04-29-schedule-improve' of luojunhui/LongArticlesJob into master

luojunhui 5 meses atrás
pai
commit
9710cd7c33

+ 6 - 27
account_cold_start_daily.py

@@ -8,29 +8,12 @@ from argparse import ArgumentParser
 
 from applications import longArticlesMySQL, bot
 from coldStartTasks.crawler.weixinCategoryCrawler import weixinCategory
-from coldStartTasks.publish.publish_single_video_pool_videos import PublishSingleVideoPoolVideos
 from coldStartTasks.publish.publishCategoryArticles import CategoryColdStartTask
 from coldStartTasks.filter.title_similarity_task import ColdStartTitleSimilarityTask
 
 DEFAULT_CATEGORY_LIST = ['1030-手动挑号', 'account_association']
 
 
-def publish_single_video_task():
-    """
-    从视频内容池获取抓取
-    """
-    try:
-        publish_single_video_pool_videos = PublishSingleVideoPoolVideos()
-        publish_single_video_pool_videos.deal()
-    except Exception as e:
-        bot(
-            title="视频内容池任务创建失败",
-            detail={
-                "error": str(e),
-                "error_msg": traceback.format_exc()
-            }
-        )
-
 
 class AccountColdStartDailyTask(object):
     """
@@ -127,21 +110,17 @@ def main(date_str, category_list=None, article_source=None):
     main job, use crontab to do job daily
     :return:
     """
-    # 首先发布视频内容池
-    publish_single_video_task()
-
-    # 再处理文章内容池
     if not category_list:
         category_list = DEFAULT_CATEGORY_LIST
     if not article_source:
         article_source = 'weixin'
     task = AccountColdStartDailyTask()
     if task.init_db():
-        # if article_source == 'weixin':
-        #     task.crawler_task(category_list=category_list, date_str=date_str)
-
         task.publish_article_task(category_list=category_list, article_source=article_source)
 
+        if article_source == 'weixin':
+            task.crawler_task(category_list=category_list, date_str=date_str)
+
 
 if __name__ == '__main__':
     parser = ArgumentParser()
@@ -153,9 +132,6 @@ if __name__ == '__main__':
     else:
         run_date = datetime.date.today().isoformat()
 
-    # 执行微信抓取发布
-    main(date_str=run_date)
-
     # 执行头条发布
     main(
         date_str=run_date,
@@ -163,4 +139,7 @@ if __name__ == '__main__':
         article_source='toutiao'
     )
 
+    # 执行微信抓取发布
+    main(date_str=run_date)
+
 

+ 37 - 0
coldStartTasks/publish/publish_article_pool_articles.py

@@ -0,0 +1,37 @@
+import datetime
+import json
+import time
+import traceback
+
+from pandas import DataFrame
+
+from applications import aiditApi, log, bot
+from applications.db import DatabaseConnector
+from config import long_articles_config
+
+
+class CategoryColdStartTask:  # opens a DatabaseConnector(long_articles_config) connection on construction
+    def __init__(self):
+        self.db_client = DatabaseConnector(long_articles_config)
+        self.db_client.connect()
+
+    def insert_crawler_plan(self, crawler_plan_id,crawler_plan_name, create_timestamp):  # insert one row into article_crawler_plan
+        insert_query = f"""
+            insert into article_crawler_plan (crawler_plan_id, name, create_timestamp) values (%s, %s, %s);
+        """
+        try:
+            self.db_client.save(
+                query=insert_query,
+                params=(crawler_plan_id, crawler_plan_name, create_timestamp)
+            )
+        except Exception as e:  # best-effort: the failure is reported through bot() and not re-raised
+            bot(
+                title="品类冷启任务,记录抓取计划id失败",
+                detail={
+                    "error": str(e),
+                    "error_msg": traceback.format_exc(),
+                    "crawler_plan_id": crawler_plan_id,
+                    "crawler_plan_name": crawler_plan_name
+                }
+            )
+

+ 6 - 0
cold_start_publish_to_aigc.py

@@ -0,0 +1,6 @@
+from tasks.publish_tasks.cold_start_publish_daily import ColdStartPublishDailyTask
+
+
+if __name__ == '__main__':  # script entry point: build the task and run the video-pool publish step once
+    cold_start_publish_daily_task = ColdStartPublishDailyTask()
+    cold_start_publish_daily_task.publish_articles_from_video_pool()

+ 0 - 14
crawler_sohu_videos_task.py

@@ -1,14 +0,0 @@
-from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuHotVideos
-from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuRecommendVideos
-
-def main():
-    # step1, crawl sohu hot videos
-    crawler_sohu_hot_videos = CrawlerSohuHotVideos()
-    crawler_sohu_hot_videos.deal()
-
-    # step2, crawl sohu recommend videos
-    crawler_sohu_recommend_videos = CrawlerSohuRecommendVideos()
-    crawler_sohu_recommend_videos.deal()
-
-if __name__ == '__main__':
-    main()

+ 0 - 0
article_association_task.py → not_used_tasks/article_association_task.py


+ 26 - 0
sh/run_cold_start_publish.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Current date, formatted as YYYY-MM-DD
+CURRENT_DATE=$(date +%F)
+
+# Log file path; the file name includes the date
+LOG_FILE="/root/luojunhui/logs/cold_start_publish_log_$CURRENT_DATE.txt"
+
+# Redirect all output of this script (stdout and stderr) to the dated log file
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 cold_start_publish_to_aigc.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - cold_start_publish_to_aigc.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart cold_start_publish_to_aigc.py"
+    # Change to the project directory
+    cd /root/luojunhui/LongArticlesJob
+
+    # Activate the Conda environment
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # Run the Python script in the background, appending its output to the log (NOTE(review): the message below is printed without checking the launch)
+    nohup python3 cold_start_publish_to_aigc.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted cold_start_publish_to_aigc.py"
+fi

+ 6 - 6
sh/run_article_association.sh → sh/run_schedule_app.sh

@@ -4,15 +4,15 @@
 CURRENT_DATE=$(date +%F)
 
 # 日志文件路径,包含日期
-LOG_FILE="/root/luojunhui/logs/article_association_crawler_log_$CURRENT_DATE.txt"
+LOG_FILE="/root/luojunhui/logs/schedule_app_log_$CURRENT_DATE.txt"
 
 # 重定向整个脚本的输出到带日期的日志文件
 exec >> "$LOG_FILE" 2>&1
-if pgrep -f "python3 article_association_task.py" > /dev/null
+if pgrep -f "python3 schedule_app.py" > /dev/null
 then
-    echo "$(date '+%Y-%m-%d %H:%M:%S') - article_association_task.py is running"
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - schedule_app.py is running"
 else
-    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart article_association_task.py"
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart schedule_app.py"
     # 切换到指定目录
     cd /root/luojunhui/LongArticlesJob
 
@@ -21,6 +21,6 @@ else
     conda activate tasks
 
     # 在后台运行 Python 脚本并重定向日志输出
-    nohup python3 article_association_task.py >> "${LOG_FILE}" 2>&1 &
-    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted article_association_task.py"
+    nohup python3 schedule_app.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted schedule_app.py"
 fi

+ 47 - 0
tasks/publish_tasks/cold_start_publish_daily.py

@@ -0,0 +1,47 @@
+import json
+import traceback
+
+from applications import bot
+from applications.db import DatabaseConnector
+from config import long_articles_config
+from coldStartTasks.publish.publish_single_video_pool_videos import PublishSingleVideoPoolVideos
+
+
+class ColdStartPublishDailyTask:
+
+    def __init__(self):
+        self.db_client = DatabaseConnector(long_articles_config)
+        self.db_client.connect()
+
+
+    def publish_articles_from_article_pool(self):
+        """
+        Fetch articles from the meta_article_pool table and publish them to the AIGC platform (placeholder: the body is currently just `pass`).
+        """
+        # publish_article_task = CategoryColdStartTask(db_client=self.db_client)
+        # Run the Toutiao category publishing here (NOTE(review): translated from an ambiguous original note; confirm intent)
+
+
+        pass
+
+
+    def publish_articles_from_video_pool(self):
+        """
+        Fetch videos from the meta_video_pool table and publish them to the AIGC platform; any exception is reported through bot() instead of being raised.
+        """
+        try:
+            publish_single_video_pool_videos = PublishSingleVideoPoolVideos()
+            publish_single_video_pool_videos.deal()
+        except Exception as e:
+            bot(
+                title="视频内容池任务创建失败",
+                detail={
+                    "error": str(e),
+                    "error_msg": traceback.format_exc()
+                }
+            )
+
+
+
+
+

+ 0 - 34
update_published_articles_v2.py

@@ -1,34 +0,0 @@
-"""
-@author: luojunhui
-"""
-from argparse import ArgumentParser
-
-from tasks.update_published_articles_read_detail import UpdatePublishedArticlesReadDetail
-
-
-def main():
-    """
-    update mini program detail main
-    :return:
-    """
-    parser = ArgumentParser()
-    parser.add_argument("--run-date",
-                        help="Run only once for date in format of %Y-%m-%d. \
-                            If no specified, run as daily jobs.")
-    args = parser.parse_args()
-
-    update_publish_articles_task = UpdatePublishedArticlesReadDetail()
-    update_publish_articles_task.init_database()
-
-    if args.run_date:
-        update_publish_articles_task.update_job(args.run_date)
-        update_publish_articles_task.check_job(args.run_date)
-    else:
-        update_publish_articles_task.update_job()
-        update_publish_articles_task.check_job()
-
-    update_publish_articles_task.get_article_detail_job()
-
-
-if __name__ == '__main__':
-    main()