Преглед изворни кода

Merge branch '2024-11-21-luojunhui-updatePublishedMsg-Improve' of luojunhui/LongArticlesJob into master

luojunhui пре 11 месеци
родитељ
комит
9043dc8698
3 измењених фајлова са 185 додато и 37 уклоњено
  1. 127 0
      applications/aiditApi.py
  2. 38 0
      applications/const.py
  3. 20 37
      updatePublishedMsgDaily.py

+ 127 - 0
applications/aiditApi.py

@@ -30,7 +30,13 @@ PERSON_COOKIE = {
     "uid": 1
 }
 
+
 def get_generated_article_list(plan_id):
+    """
+    自动生成计划 id 获取该生成计划已经生成过的文章列表
+    :param plan_id:
+    :return:
+    """
     db = DeNetMysql()
     sql = f"""
         SELECT 
@@ -56,6 +62,7 @@ def get_generated_article_list(plan_id):
     article_list = db.select(sql)
     return article_list
 
+
 def get_generated_article_title(generate_task_id):
     """
     生成计划 id 获取该生成计划已经生成过的文章标题
@@ -117,6 +124,7 @@ def get_publish_account_from_aigc():
     ]
     return info_list
 
+
 def auto_create_crawler_task(plan_id, plan_name, plan_tag, url_list):
     """
     通过 url 自动创建抓取计划
@@ -236,3 +244,122 @@ def get_generate_task_detail(generate_task_id):
     else:
         return {}
 
+
+@retryOnTimeout()
+def get_publish_task_detail(publish_task_id):
+    """
+    通过发布计划的 id,获取该发布计划已有的抓取计划 list
+    :param publish_task_id:
+    :param generate_task_id:
+    :return:
+    """
+    url = "http://aigc-api.cybertogether.net/aigc/publish/plan/detail"
+
+    payload = json.dumps({
+        "params": {
+            "id": publish_task_id
+        },
+        "baseInfo": PERSON_COOKIE
+    })
+    response = requests.request("POST", url, headers=HEADERS, data=payload)
+    return response.json()
+
+
+def bind_crawler_task_to_publish_task(target_publish_task_id, crawler_task_name, crawler_task_id):
+    """
+    将抓取计划绑定至发布计划
+    发布计划已经存在
+    :param crawler_task_id: 抓取计划ID
+    :param crawler_task_name: 抓取计划名称
+    :param target_publish_task_id: 目标发布计划 id
+    :return: response
+    """
+    publish_task_detail = get_publish_task_detail(target_publish_task_id)
+    publish_task_detail_data = publish_task_detail.get("data")
+    already_exist_crawler_task_list = publish_task_detail_data.get("inputGroups")[0].get("inputSources")
+    new_crawler_task_list = [
+        {
+            "sourceCategory": 1,
+            "inputSourceValueType": 1,
+            "inputSourceValue": crawler_task_id,
+            "inputSourceLabel": crawler_task_name
+        }
+    ]
+    new_input_source_group = already_exist_crawler_task_list + new_crawler_task_list
+    if publish_task_detail_data:
+        url = "http://aigc-api.cybertogether.net/aigc/publish/plan/save"
+        payload = json.dumps({
+            "params": {
+                "accountIds": [i['id'] for i in publish_task_detail_data.get("accountIds")],
+                "inputGroups": [
+                    {
+                        "groupId": "e40cd06daeb5345ed26256c8744f7a33",
+                        "groupName": None,
+                        "channel": None,
+                        "contentModal": None,
+                        "groupIndex": 1,
+                        "filterMatchMode": 2,
+                        "inputSources": new_input_source_group,
+                        "inputFilters": [],
+                        "inputOrders": [],
+                        "label": "input1"
+                    }
+                ],
+                "inputSources": [],
+                "inputFilters": [],
+                "activeManualReview": publish_task_detail_data.get("activeManualReview"),
+                "channel": publish_task_detail_data.get("channel"),
+                "contentAllocationRules": publish_task_detail_data.get("contentAllocationRules"),
+                "contentModal": publish_task_detail_data.get("contentModal"),
+                "contentSortingRules": publish_task_detail_data.get("contentSortingRules"),
+                "douyinPublishAccoutSetting": publish_task_detail_data.get("douyinPublishAccoutSetting"),
+                "filterMatchMode": 1,
+                "name": publish_task_detail_data.get("name"),
+                "publishAccoutJson": "",
+                "publishBgmType": publish_task_detail_data.get("publishBgmType"),
+                "publishDate": publish_task_detail_data.get("publishDate"),
+                "publishLocation": publish_task_detail_data.get("publishLocation"),
+                "publishNum": publish_task_detail_data.get("publishNum"),
+                "publishPushTime": publish_task_detail_data.get("publishPushTime"),
+                "publishRate": publish_task_detail_data.get("publishRate"),
+                "publishTimeInterval": publish_task_detail_data.get("publishTimeInterval"),
+                "publishWindowEnd": publish_task_detail_data.get("publishWindowEnd"),
+                "publishWindowStart": publish_task_detail_data.get("publishWindowStart"),
+                "wxContentInsert":  publish_task_detail_data.get("wxContentInsert"),
+                "wxVideoPublishAccountSetting": publish_task_detail_data.get("wxVideoPublishAccountSetting"),
+                "scoreJudgeFlag": publish_task_detail_data.get("scoreJudgeFlag"),
+                "scoreJudgeTasks": publish_task_detail_data.get("scoreJudgeTasks"),
+                "machineReviewMatchMode": publish_task_detail_data.get("machineReviewMatchMode"),
+                "id": publish_task_detail_data.get("id"),
+                "planType": publish_task_detail_data.get("planType"),
+                "planTag": publish_task_detail_data.get("planTag"),
+                "tagPenetrateFlag": publish_task_detail_data.get("tagPenetrateFlag"),
+                "actionObjects": publish_task_detail_data.get("actionObjects"),
+                "actionContents": publish_task_detail_data.get("actionContents"),
+                "accountFrom": publish_task_detail_data.get("accountFrom"),
+                "actionContentAllocationRule": publish_task_detail_data.get("actionContentAllocationRule"),
+                "publishPerNum": publish_task_detail_data.get("publishPerNum"),
+                "publishPerMinNum": publish_task_detail_data.get("publishPerMinNum"),
+                "pushType": publish_task_detail_data.get("pushType"),
+                "triggerEvent": publish_task_detail_data.get("triggerEvent"),
+                "pushContentSortingRules": publish_task_detail_data.get("pushContentSortingRules"),
+                "biliDistrict": publish_task_detail_data.get("biliDistrict"),
+                "firstItemScoreJudgeTaskId": publish_task_detail_data.get("firstItemScoreJudgeTaskId"),
+                "secondItemScoreJudgeTaskId": publish_task_detail_data.get("secondItemScoreJudgeTaskId"),
+                "otherItemScoreJudgeTaskId": publish_task_detail_data.get("otherItemScoreJudgeTaskId"),
+                "gzhArticleSortFlag": publish_task_detail_data.get("gzhArticleSortFlag"),
+                "gzhArticleSortTask": publish_task_detail_data.get("gzhArticleSortTask"),
+                "miniprogramInsertFlag": publish_task_detail_data.get("miniprogramInsertFlag"),
+                "miniprogramInsertTasks": publish_task_detail_data.get("miniprogramInsertTasks"),
+                "machineReviewConditions": publish_task_detail_data.get("machineReviewConditions"),
+                "gzhTriggerSyncFrequency": publish_task_detail_data.get("gzhTriggerSyncFrequency"),
+                "gzhTriggerSendContentType": publish_task_detail_data.get("gzhTriggerSendContentType"),
+                "longArticleSystemHost": publish_task_detail_data.get("longArticleSystemHost"),
+            },
+            "baseInfo": PERSON_COOKIE
+        })
+        response = requests.request("POST", url, headers=HEADERS, data=payload)
+        print(response.json())
+    else:
+        return
+

+ 38 - 0
applications/const.py

@@ -0,0 +1,38 @@
+"""
+@author: luojunhui
+任务常量配置文件
+"""
+
+
+class coldStartTaskConst:
+    """
+    Constants for the cold-start task.
+    """
+    PUBLISHED_STATUS = 2  # article has been published
+    INIT_STATUS = 1  # article is in its initial state
+    BAD_STATUS = 0  # article is flagged as low quality
+
+
+class updatePublishedMsgTaskConst:
+    """
+    Constants for the task that updates published-article messages.
+    """
+    # Codes returned by the crawler article-detail API
+    ARTICLE_DELETE_CODE = 25005
+    ARTICLE_SUCCESS_CODE = 0
+
+    # Status codes recording the outcome of a crawler detail request
+    # (updatePublishedMsgDaily.py stores them in the publish_timestamp column)
+    # Default status of a record
+    DEFAULT_STATUS = 0
+    # The request to the API failed
+    REQUEST_FAIL_STATUS = -1
+    # The article has been deleted
+    DELETE_STATUS = -2
+    # No information was returned, reason unknown
+    UNKNOWN_STATUS = -3
+
+    # Official-account types (subscription account or service account)
+    # Subscription account
+    SUBSCRIBE_TYPE_SET = {0, 1}
+    # Service account
+    SERVICE_TYPE = 2

+ 20 - 37
updatePublishedMsgDaily.py

@@ -13,27 +13,10 @@ from datetime import datetime
 
 
 from applications import PQMySQL, WeixinSpider, Functions, log, bot, aiditApi
+from applications.const import updatePublishedMsgTaskConst
 
-ARTICLE_TABLE = "official_articles_v2"
-ARTICLE_DELETE_CODE = 25005
-ARTICLE_SUCCESS_CODE = 0
-
-DEFAULT_STATUS = 0
-REQUEST_FAIL_STATUS = -1
-DELETE_STATUS = -2
-UNKNOWN_STATUS = -3
-
-
-def get_accounts_v1():
-    """
-    获取账号信息
-    :return: [{}, {},...], [{}, {}, {}...]
-    """
-    with open("config/accountInfoV0914.json", encoding="utf-8") as f:
-        account_list = json.loads(f.read())
-    subscription_account = [i for i in account_list if i['type'] == '订阅号']
-    server_account = [i for i in account_list if i['type'] == '服务号']
-    return subscription_account, server_account
+ARTICLE_TABLE = "official_articles"
+const = updatePublishedMsgTaskConst()
 
 
 def get_account_using_status():
@@ -57,7 +40,7 @@ def get_accounts():
     "ghId": line[1],
     "follower_count": line[2],
     "account_init_time": int(line[3] / 1000),
-    "account_type": line[4],
+    "account_type": line[4], # 订阅号 or 服务号
     "account_auth": line[5]
     """
     using_account_set = get_account_using_status()
@@ -69,8 +52,8 @@ def get_accounts():
         else:
             item['using_status'] = 0
         account_list.append(item)
-    subscription_account = [i for i in account_list if i['account_type'] in {0, 1}]
-    server_account = [i for i in account_list if i['account_type'] == 2]
+    subscription_account = [i for i in account_list if i['account_type'] in const.SUBSCRIBE_TYPE_SET]
+    server_account = [i for i in account_list if i['account_type'] == const.SERVICE_TYPE]
     return subscription_account, server_account
 
 
@@ -313,7 +296,7 @@ def check_single_account(db_client, account_item):
     try:
         latest_update_time = db_client.select(sql)[0][0]
         # 判断该账号当天发布的文章是否被收集
-        if account_type in {0, 1}:
+        if account_type in const.SUBSCRIBE_TYPE_SET:
             if int(latest_update_time) > int(today_timestamp):
                 return True
             else:
@@ -462,8 +445,8 @@ def get_articles(db_client):
     """
     sql = f"""
     SELECT ContentUrl, wx_sn 
-    FROM official_articles_v2 
-    WHERE publish_timestamp in {(DEFAULT_STATUS, REQUEST_FAIL_STATUS)};"""
+    FROM {ARTICLE_TABLE}
+    WHERE publish_timestamp in {(const.DEFAULT_STATUS, const.REQUEST_FAIL_STATUS)};"""
     response = db_client.select(sql)
     return response
 
@@ -481,10 +464,10 @@ def update_publish_timestamp(db_client, row):
         response = WeixinSpider().get_article_text(url)
         response_code = response['code']
 
-        if response_code == ARTICLE_DELETE_CODE:
-            publish_timestamp_s = DELETE_STATUS
+        if response_code == const.ARTICLE_DELETE_CODE:
+            publish_timestamp_s = const.DELETE_STATUS
             root_source_id_list = []
-        elif response_code == ARTICLE_SUCCESS_CODE:
+        elif response_code == const.ARTICLE_SUCCESS_CODE:
             data = response['data']['data']
             publish_timestamp_ms = data['publish_timestamp']
             publish_timestamp_s = int(publish_timestamp_ms / 1000)
@@ -499,16 +482,16 @@ def update_publish_timestamp(db_client, row):
             else:
                 root_source_id_list = []
         else:
-            publish_timestamp_s = UNKNOWN_STATUS
+            publish_timestamp_s = const.UNKNOWN_STATUS
             root_source_id_list = []
     except Exception as e:
-        publish_timestamp_s = REQUEST_FAIL_STATUS
+        publish_timestamp_s = const.REQUEST_FAIL_STATUS
         root_source_id_list = []
         error_msg = traceback.format_exc()
         print(e, error_msg)
 
     update_sql = f"""
-            UPDATE official_articles_v2
+            UPDATE {ARTICLE_TABLE}
             SET publish_timestamp = %s, root_source_id_list = %s
             WHERE wx_sn = %s;
         """
@@ -519,7 +502,7 @@ def update_publish_timestamp(db_client, row):
             json.dumps(root_source_id_list, ensure_ascii=False),
             wx_sn
         ))
-    if publish_timestamp_s == REQUEST_FAIL_STATUS:
+    if publish_timestamp_s == const.REQUEST_FAIL_STATUS:
         return row
     else:
         return None
@@ -566,10 +549,10 @@ def get_article_detail_job():
 
     # 通过msgId 来修改publish_timestamp
     update_sql = f"""
-        UPDATE official_articles_v2 oav 
+        UPDATE {ARTICLE_TABLE} oav 
         JOIN (
             SELECT appMsgId, MAX(publish_timestamp) AS publish_timestamp 
-            FROM official_articles_v2 
+            FROM {ARTICLE_TABLE} 
             WHERE publish_timestamp > %s 
             GROUP BY appMsgId
             ) vv 
@@ -584,8 +567,8 @@ def get_article_detail_job():
 
     # 若还是无 publish_timestamp,用update_time当作 publish_timestamp
     update_sql_2 = f"""
-        UPDATE official_articles_v2
-        SET publish_timestamp = update_time
+        UPDATE {ARTICLE_TABLE}
+        SET publish_timestamp = updateTime
         WHERE publish_timestamp < %s;
     """
     db_client.update(