罗俊辉 9 mēneši atpakaļ
vecāks
revīzija
3edd2052ec
5 mainītis faili ar 258 papildinājumiem un 46 dzēšanām
  1. 11 27
      develop/t.py
  2. 20 7
      tasks/migrate.py
  3. 120 0
      tasks/migrate_articles2.py
  4. 103 0
      tasks/migrate_file_to_db.py
  5. 4 12
      tasks/update_rootSourceId.py

+ 11 - 27
develop/t.py

@@ -1,27 +1,11 @@
-"""
-@author: luojunhui
-"""
-import requests
-
-url = "https://admin.piaoquantv.com/manager/video/audit/v2/updateAuditStatus"
-
-payload = "videoId=21486692&auditStatus=2&updateReasonJson=&rejectReasonJson=%5B%7B%22reason%22%3A%22%E5%85%B6%E4%BB%96%22%2C%22reasonId%22%3A-1%7D%5D&adminUid=206"
-headers = {
-  'accept': 'application/json',
-  'accept-language': 'zh,zh-CN;q=0.9',
-  'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
-  'cookie': 'SESSION=YjIyYTIxOWUtYTAyNC00YTMxLWFiZGEtZmFlODY2MTVkOGYx',
-  'origin': 'https://admin.piaoquantv.com',
-  'priority': 'u=1, i',
-  'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
-  'sec-ch-ua-mobile': '?0',
-  'sec-ch-ua-platform': '"macOS"',
-  'sec-fetch-dest': 'empty',
-  'sec-fetch-mode': 'cors',
-  'sec-fetch-site': 'same-origin',
-  'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
-}
-
-response = requests.request("POST", url, headers=headers, data=payload)
-
-print(response.text)
+# Root-source ids of the long-article channels. They are collapsed into a
+# set and printed as a quick look at the distinct values.
+source_list = [
+    "longArticles_3f4d2a1c1cece4cf88d348f46fa3c20d",
+    "longArticles_2872ec9931e405b04c70b5b35d64fa07",
+    "longArticles_8ceadda6dfd935c9f85c5f3b5abd32a0",
+    "longArticles_6b1f49e1bc19e22b1a2c2c252722f099",
+    "longArticles_480827356b0eabc4b03b21acf4c2e664",
+    "longArticles_60ac70dc7cbf8bf40fac1d51c007213e"
+]
+
+w = {*source_list}
+print(w)

+ 20 - 7
tasks/migrate.py

@@ -55,7 +55,7 @@ def migrate_data_to_mysql(video_id, title, view_, return_, video_url):
             view_,
             view_,
             return_,
             return_,
             video_url,
             video_url,
-            "20240710",
+            "20240715",
             rov
             rov
         )
         )
     )
     )
@@ -64,16 +64,29 @@ def migrate_data_to_mysql(video_id, title, view_, return_, video_url):
 
 
 def process(line):
 def process(line):
     title = line[0]
     title = line[0]
-    video_id = line[1]
-    view = line[3]
-    return_count = line[4]
+    video_id = line[1].replace('"', '')
+    view = int(line[3])
+    return_count = int(line[4])
     video_url = request_for_info(video_id)['data'][0]['videoPath']
     video_url = request_for_info(video_id)['data'][0]['videoPath']
     migrate_data_to_mysql(video_id, title, view, return_count, video_url)
     migrate_data_to_mysql(video_id, title, view, return_count, video_url)
 
 
 
 
-df = pd.read_excel("/Users/luojunhui/Downloads/top_return_data.xlsx")
+path = "/Users/luojunhui/Downloads/2022-top10000.csv"
 
 
-data_list = df.values.tolist()
+with open(path, encoding="gbk", errors='ignore') as f:
+    data = f.readlines()
+
+L = []
+for line in data:
+    temp = line.replace("\n", "").split(",")
+    # print(len(temp))
+    if len(temp) == 5:
+        L.append(temp)
+# for line in L:
+#     print(line)
+
+
+# data_list = df.values.tolist()
 
 
 with ThreadPoolExecutor(max_workers=10) as pool:
 with ThreadPoolExecutor(max_workers=10) as pool:
-    pool.map(process, data_list)
+    pool.map(process, L)

+ 120 - 0
tasks/migrate_articles2.py

@@ -0,0 +1,120 @@
+"""
+@author: luojunhui
+"""
+import os
+import json
+import pymysql
+from tqdm import tqdm
+
+from concurrent.futures.thread import ThreadPoolExecutor
+
+def insert_into_mysql(path):
+    """
+    Load one crawled message JSON file and insert every article listed in
+    its ``AppMsg.DetailInfo`` into the ``official_articles`` table.
+
+    The account identity is parsed from the third-from-last "/"-separated
+    component of ``path``, read as ``<ghId>_<accountName>``.
+
+    A single database connection is used for the whole file; every article
+    is committed on its own and the connection is always closed, even when
+    an insert fails.
+
+    :param path: path of the JSON file to import
+    :return: None
+    """
+    with open(path, encoding="utf-8") as f:
+        info = json.loads(f.read())
+
+    # NOTE(review): splitting on "/" assumes POSIX-style separators in path.
+    gzh_info = path.split("/")[-3]
+    accountName = gzh_info.split("_")[-1]
+    ghId = gzh_info.replace("_" + accountName, "")
+
+    baseInfo = info.get("BaseInfo", {})
+    app_msg = info.get("AppMsg", {})
+    app_msg_base = app_msg.get("BaseInfo", {})
+    appMsgId = app_msg_base.get("AppMsgId", None)
+    createTime = app_msg_base.get("CreateTime", None)
+    updateTime = app_msg_base.get("UpdateTime", None)
+    Type = app_msg_base.get("Type", None)
+
+    detail_article_list = app_msg.get("DetailInfo", [])
+    if not detail_article_list:
+        # Nothing to insert, so do not open a database connection at all.
+        return
+
+    # NOTE(review): the column is spelled CoverImgUrl_255_1 while the value
+    # comes from the JSON key "CoverImgUrl_235_1" -- confirm against the
+    # table schema which spelling is intended.
+    insert_sql = """
+    INSERT INTO official_articles
+    (ghId, accountName, appMsgId, title, Type, createTime, updateTime, Digest, ItemIndex, ContentUrl, SourceUrl, CoverImgUrl, CoverImgUrl_1_1, CoverImgUrl_255_1, ItemShowType, IsOriginal, ShowDesc, ori_content, show_view_count, show_like_count, show_zs_count, show_pay_count, wx_sn, baseInfo)
+    values
+    (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+    """
+
+    # NOTE(review): credentials are hard-coded in source; they should live in
+    # configuration / environment variables and be rotated.
+    connection = pymysql.connect(
+        host='rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com',
+        port=3306,
+        user='crawler',
+        password='crawler123456@',
+        db='piaoquan-crawler',
+        charset='utf8mb4'
+    )
+    try:
+        with connection.cursor() as cursor:
+            for article in detail_article_list:
+                ContentUrl = article.get("ContentUrl", None)
+                # The "sn" query parameter of the article URL is stored as wx_sn.
+                wx_sn = ContentUrl.split("&sn=")[1].split("&")[0] if ContentUrl else None
+                info_tuple = (
+                    ghId,
+                    accountName,
+                    appMsgId,
+                    article.get("Title", None),
+                    Type,
+                    createTime,
+                    updateTime,
+                    article.get("Digest", None),
+                    article.get("ItemIndex", None),
+                    ContentUrl,
+                    article.get("SourceUrl", None),
+                    article.get("CoverImgUrl", None),
+                    article.get("CoverImgUrl_1_1", None),
+                    article.get("CoverImgUrl_235_1", None),
+                    article.get("ItemShowType", None),
+                    article.get("IsOriginal", None),
+                    article.get("ShowDesc", None),
+                    article.get("ori_content", None),
+                    article.get("show_view_count", 0),
+                    article.get("show_like_count", 0),
+                    article.get("show_zs_count", 0),
+                    article.get("show_pay_count", 0),
+                    wx_sn,
+                    json.dumps(baseInfo, ensure_ascii=False)
+                )
+                cursor.execute(insert_sql, info_tuple)
+                # Commit per article so rows already written survive a
+                # failure on a later article.
+                connection.commit()
+    finally:
+        # Previously a connection was opened per article and never closed.
+        connection.close()
+
+
+def get_file_list():
+    """
+    Collect the files found under every ``account/<name>/msg`` directory.
+
+    Entries of ``account`` whose ``msg`` sub-directory cannot be listed
+    (missing, not a directory, not readable) are skipped.
+
+    :return: list of lists -- one inner list of file paths per ``msg``
+             directory that could be listed
+    """
+    path = 'account'
+    grouped_files = []
+    for name in os.listdir(path):
+        sub_dir = os.path.join(path, name, "msg")
+        try:
+            file_names = os.listdir(sub_dir)
+        except OSError:
+            # Best effort: skip entries without a listable "msg" directory,
+            # but let KeyboardInterrupt / SystemExit and real bugs propagate
+            # (the former bare ``except`` swallowed those as well).
+            continue
+        grouped_files.append([os.path.join(sub_dir, i) for i in file_names])
+    return grouped_files
+
+
+if __name__ == '__main__':
+    file_list = get_file_list()
+    L_files = []
+    # Only the total number of files is reported here.
+    c = sum(len(files) for files in tqdm(file_list))
+    print(c)
+    # Disabled migration step, kept for reference (it was written to run
+    # inside a loop over file_list):
+    #     with ThreadPoolExecutor(max_workers=10) as pool:
+    #         pool.map(insert_into_mysql, files)

+ 103 - 0
tasks/migrate_file_to_db.py

@@ -0,0 +1,103 @@
+"""
+@author: luojunhui
+"""
+import os
+import json
+import pymysql
+from tqdm import tqdm
+
+
+def insert_into_mysql(path):
+    """
+    Read one ``account.json`` file and write its ``update_timestamp`` into
+    the ``updateTimestamp`` column of the ``official_accounts`` row whose
+    ``ghId`` equals the file's ``wx_gh``.
+
+    Despite its name, this function currently issues an UPDATE; the
+    original INSERT statement is disabled (see the comment in the body).
+    The connection is always closed, even when the statement fails.
+
+    Shape of the input file (abridged)::
+
+        {
+            "channel": 5,
+            "account_link": "https://mp.weixin.qq.com/mp/profile_ext?...",
+            "biz_info": "MzkwMTQ0NDYwNg==",
+            "wx_gh": "gh_5cc284077cda",
+            "update_timestamp": 1720577540593
+        }
+
+    :param path: path of the ``account.json`` file
+    :return: None
+    """
+    with open(path, encoding="utf-8") as f:
+        info = json.loads(f.read())
+    ghId = info.get("wx_gh", None)
+    updateTimestamp = info.get("update_timestamp", None)
+    print(updateTimestamp)
+
+    # Disabled full insert, kept for reference:
+    #   INSERT INTO official_accounts
+    #   (accountName, ghId, bizInfo, accountLink, avatarUrl, description, updateTimestamp)
+    #   values (%s, %s, %s, %s, %s, %s, %s);
+    update_sql = """
+    UPDATE official_accounts
+    SET updateTimestamp = %s
+    WHERE ghId = %s
+    """
+
+    # NOTE(review): credentials are hard-coded in source; they should live in
+    # configuration / environment variables and be rotated.
+    connection = pymysql.connect(
+        host='rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com',
+        port=3306,
+        user='crawler',
+        password='crawler123456@',
+        db='piaoquan-crawler',
+        charset='utf8mb4'
+    )
+    try:
+        with connection.cursor() as cursor:
+            cursor.execute(update_sql, (updateTimestamp, ghId))
+        connection.commit()
+    finally:
+        # Previously the connection was left open after every call.
+        connection.close()
+
+
+def read_account_info():
+    """
+    Walk the ``account`` directory tree and gather the path of every file
+    named ``account.json``.
+
+    :return: list of paths, in ``os.walk`` traversal order
+    """
+    path = 'account'
+    return [
+        os.path.join(parent, file)
+        for parent, _dirs, files in os.walk(path)
+        for file in files
+        if file == "account.json"
+    ]
+
+
+if __name__ == '__main__':
+    # Process every discovered account.json sequentially, with a progress bar.
+    for fp in tqdm(read_account_info()):
+        insert_into_mysql(fp)
+
+

+ 4 - 12
tasks/update_rootSourceId.py

@@ -11,12 +11,6 @@ import schedule
 from concurrent.futures.thread import ThreadPoolExecutor
 from concurrent.futures.thread import ThreadPoolExecutor
 
 
 source_list = [
 source_list = [
-    "longArticles_3f4d2a1c1cece4cf88d348f46fa3c20d",
-    "longArticles_2872ec9931e405b04c70b5b35d64fa07",
-    "longArticles_8ceadda6dfd935c9f85c5f3b5abd32a0",
-    "longArticles_6b1f49e1bc19e22b1a2c2c252722f099",
-    "longArticles_480827356b0eabc4b03b21acf4c2e664",
-    "longArticles_60ac70dc7cbf8bf40fac1d51c007213e",
     "touliu_tencentGzhArticle_cc284926a7d1c19f9a4e6abe5520468b",
     "touliu_tencentGzhArticle_cc284926a7d1c19f9a4e6abe5520468b",
     "touliu_tencentGzhArticle_2e4c21de3707f3b368b0cc4500d120f0",
     "touliu_tencentGzhArticle_2e4c21de3707f3b368b0cc4500d120f0",
     "touliu_tencentGzhArticle_a18c11dd294df014334f7db72830221a",
     "touliu_tencentGzhArticle_a18c11dd294df014334f7db72830221a",
@@ -26,12 +20,10 @@ source_list = [
 ]
 ]
 
 
 source_id_list = {
 source_id_list = {
-    "longArticles_3f4d2a1c1cece4cf88d348f46fa3c20d": 1,
-    "longArticles_2872ec9931e405b04c70b5b35d64fa07": 1,
-    "longArticles_8ceadda6dfd935c9f85c5f3b5abd32a0": 1,
-    "longArticles_6b1f49e1bc19e22b1a2c2c252722f099": 1,
-    "longArticles_480827356b0eabc4b03b21acf4c2e664": 1,
-    "longArticles_60ac70dc7cbf8bf40fac1d51c007213e": 1,
+    'longArticles_2d311f88a9c1bd5a90ce88339ae93e78': 1,
+    'longArticles_8d9fd0553c988e7a6bf3a6198f78d890': 1,
+    'longArticles_99763b3ad92c781194dbd3eb3321542c': 1,
+    'longArticles_2a27f501ef0d758c35dd3b70cf3bbfa3': 1,
     "touliu_tencentGzhArticle_cc284926a7d1c19f9a4e6abe5520468b": 1,
     "touliu_tencentGzhArticle_cc284926a7d1c19f9a4e6abe5520468b": 1,
     "touliu_tencentGzhArticle_2e4c21de3707f3b368b0cc4500d120f0": 1,
     "touliu_tencentGzhArticle_2e4c21de3707f3b368b0cc4500d120f0": 1,
     "touliu_tencentGzhArticle_a18c11dd294df014334f7db72830221a": 1,
     "touliu_tencentGzhArticle_a18c11dd294df014334f7db72830221a": 1,