Przeglądaj źródła

Merge branch '2024-10-29-luojunhui-update-official-articles-v2-add-publish-time' of luojunhui/LongArticlesJob into master

luojunhui 7 miesięcy temu
rodzic
commit
3ca81111ee
1 zmienionych plików z 105 dodań i 2 usunięć
  1. 105 2
      updatePublishedMsgDaily.py

+ 105 - 2
updatePublishedMsgDaily.py

@@ -7,12 +7,21 @@ import time
 import json
 import traceback
 
+import urllib.parse
 from tqdm import tqdm
 from datetime import datetime
 
+
 from applications import PQMySQL, WeixinSpider, Functions, log, bot, aiditApi
 
 ARTICLE_TABLE = "official_articles_v2"
+ARTICLE_DELETE_CODE = 25005  # response code that update_publish_timestamp maps to DELETE_STATUS
+ARTICLE_SUCCESS_CODE = 0  # response code for which the article payload is parsed
+
+DEFAULT_STATUS = 0  # publish_timestamp value picked up by get_articles (presumably "not fetched yet" - confirm)
+REQUEST_FAIL_STATUS = -1  # written when fetching/parsing raised; get_articles selects it again
+DELETE_STATUS = -2  # written when the response code equals ARTICLE_DELETE_CODE
+UNKNOWN_STATUS = -3  # written for any response code other than success/delete
 
 
 def get_accounts_v1():
@@ -446,15 +455,109 @@ def check_job():
         )
 
 
+def get_articles(db_client):
+    """
+    Select ContentUrl and wx_sn for every row of official_articles_v2 whose
+    publish_timestamp is DEFAULT_STATUS or REQUEST_FAIL_STATUS.
+    :param db_client: database client exposing ``select(sql)``
+    :return: whatever ``db_client.select`` returns for the query
+    """
+    # The tuple's repr, e.g. "(0, -1)", doubles as the SQL IN list.
+    pending_statuses = (DEFAULT_STATUS, REQUEST_FAIL_STATUS)
+    query = f"""
+    SELECT ContentUrl, wx_sn 
+    FROM official_articles_v2 
+    WHERE publish_timestamp in {pending_statuses};"""
+    return db_client.select(query)
+
+
+def _extract_root_source_id(path):
+    """
+    Pull the ``rootSourceId`` query parameter out of a mini-program path.
+    :param path: raw (possibly percent-encoded) ``path`` value of a mini-program entry
+    :return: the first ``rootSourceId`` value, or None when the parameter is absent
+    """
+    decoded_path = urllib.parse.unquote(path)
+    # parse_qs only splits on '&', so a parameter sitting right after a '?'
+    # would be keyed as "page?rootSourceId". Treating '?' as a separator
+    # makes the lookup independent of the parameter's position.
+    query = urllib.parse.parse_qs(decoded_path.replace('?', '&'))
+    values = query.get('rootSourceId')
+    return values[0] if values else None
+
+
+def update_publish_timestamp(db_client, row):
+    """
+    Update the publish timestamp and the mini-program rootSourceId list of one article.
+
+    The article is fetched through ``WeixinSpider().get_article_text`` and the
+    value written to ``publish_timestamp`` depends on the outcome:
+    - response code ARTICLE_SUCCESS_CODE: the article's publish timestamp
+      (``publish_timestamp / 1000``, presumably milliseconds to seconds)
+    - response code ARTICLE_DELETE_CODE: DELETE_STATUS
+    - any other response code: UNKNOWN_STATUS
+    - any exception while fetching or parsing: REQUEST_FAIL_STATUS
+    :param db_client: database client exposing ``update(sql=..., params=...)``
+    :param row: sequence whose first item is the article url and second item is wx_sn
+    :return:
+    """
+    url = row[0]
+    wx_sn = row[1]
+    root_source_id_list = []
+    try:
+        response = WeixinSpider().get_article_text(url)
+        response_code = response['code']
+
+        if response_code == ARTICLE_DELETE_CODE:
+            publish_timestamp_s = DELETE_STATUS
+        elif response_code == ARTICLE_SUCCESS_CODE:
+            data = response['data']['data']
+            publish_timestamp_ms = data['publish_timestamp']
+            publish_timestamp_s = int(publish_timestamp_ms / 1000)
+            # Links without a rootSourceId are skipped instead of raising, so
+            # one odd link no longer discards the timestamp fetched above.
+            extracted_ids = (
+                _extract_root_source_id(item.get('path') or '')
+                for item in data.get('mini_program') or []
+            )
+            root_source_id_list = [
+                root_source_id
+                for root_source_id in extracted_ids
+                if root_source_id is not None
+            ]
+        else:
+            publish_timestamp_s = UNKNOWN_STATUS
+    except Exception as e:
+        # Best-effort boundary around the remote call: record the failure so
+        # the row is selected again on the next run.
+        publish_timestamp_s = REQUEST_FAIL_STATUS
+        root_source_id_list = []
+        error_msg = traceback.format_exc()
+        print(e, error_msg)
+
+    update_sql = """
+            UPDATE official_articles_v2
+            SET publish_timestamp = %s, root_source_id_list = %s
+            WHERE wx_sn = %s;
+        """
+    db_client.update(
+        sql=update_sql,
+        params=(
+            publish_timestamp_s,
+            json.dumps(root_source_id_list, ensure_ascii=False),
+            wx_sn
+        ))
+
+
+def get_article_detail_job():
+    """
+    Fetch details for published articles: open a database connection, load the
+    rows returned by get_articles and run update_publish_timestamp on each one.
+    A connection failure is reported through ``bot`` and ends the job; a failure
+    on a single article is printed and the loop continues.
+    :return:
+    """
+    try:
+        client = PQMySQL()
+    except Exception as connect_error:
+        bot(
+            title="获取文章详情任务连接数据库失败",
+            detail={
+                "job": "get_article_detail_job",
+                "error": connect_error,
+                "msg": traceback.format_exc()
+            }
+        )
+        return
+
+    for article_row in tqdm(get_articles(client)):
+        try:
+            update_publish_timestamp(db_client=client, row=article_row)
+        except Exception as update_error:
+            print(update_error)
+            print(traceback.format_exc())
+
+
 def main():
     """
     Run the daily jobs in order: update_job, check_job, then get_article_detail_job.
     :return:
     """
     update_job()
-    time.sleep(60)
     check_job()
+    get_article_detail_job()
 
 
 if __name__ == '__main__':  # run the jobs only when executed as a script
-    main()
+    main()