luojunhui 1 week ago
parent
commit
c6e2d8aef9
4 changed files with 214 additions and 0 deletions
  1. 116 0
      applications/api/es_api.py
  2. 31 0
      applications/api/es_certs.crt
  3. 31 0
      config/es_certs.crt
  4. 36 0
      config/es_mappings.py

+ 116 - 0
applications/api/es_api.py

@@ -0,0 +1,116 @@
+import datetime
+import json
+import os
+import ssl
+import time
+
+import requests
+from elasticsearch import Elasticsearch, ApiError
+from elasticsearch import helpers
+from pymysql.cursors import DictCursor
+
+from applications.db import DatabaseConnector
+from config import long_articles_config
+from config.es_mappings import index_name, mappings, settings
+
+# Shared MySQL connection used by get_articles(); opened at import time.
+db_client = DatabaseConnector(long_articles_config)
+db_client.connect()
+
+# Resolve the CA certificate relative to this module rather than the process's
+# current working directory, so the TLS context can be built no matter where
+# the script is launched from.
+_ca_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "es_certs.crt")
+ctx = ssl.create_default_context(cafile=_ca_file)
+
+# SECURITY NOTE(review): the credential below was committed to version control
+# and should be treated as leaked. Prefer supplying ES_PASSWORD through the
+# environment; the literal fallback only keeps existing deployments working
+# and should be removed once the password has been rotated.
+es_password = os.environ.get("ES_PASSWORD", 'nkvvASQuQ0XUGRq5OLvm')
+es = Elasticsearch(
+    "https://192.168.205.85:9200",
+    # Only needed when username/password authentication is enabled
+    basic_auth=("elastic", es_password),
+    ssl_context=ctx
+)
+
+
+def create_index():
+    """(Re)create the index ``index_name`` with the configured settings and mappings.
+
+    An existing index with the same name is deleted first, so every document
+    stored in it is discarded. An ``ApiError`` raised while creating the index
+    is reported on stdout instead of being propagated to the caller.
+    """
+    # Defensive delete of the old index so creation starts from a clean slate.
+    if es.indices.exists(index=index_name):
+        es.indices.delete(index=index_name)
+
+    # Create the index.
+    try:
+        es.indices.create(
+            index=index_name,
+            settings=settings,
+            mappings=mappings
+        )
+        print(f"✅ Index <{index_name}> created.")
+    except ApiError as e:
+        # The error payload returned by Elasticsearch is exposed on ``e.body``;
+        # ``e.meta`` only carries transport metadata (status, headers, node),
+        # so reading ``e.meta.error`` would itself fail inside this handler.
+        error = e.body.get("error") if isinstance(e.body, dict) else None
+        if isinstance(error, dict):
+            err_type = error.get("type")
+            reason = error.get("reason")
+        else:
+            # Non-JSON or string-only error bodies: fall back to what we have.
+            err_type = e.message
+            reason = error if error is not None else e.body
+        print(f"❌ Failed: {err_type} – {reason}")
+
+
+def get_max_article_id():
+    """Return the largest ``article_id`` currently searchable in the index.
+
+    The index is queried for a single document sorted by ``article_id``
+    descending, fetching only the ``article_id`` field.
+
+    Returns:
+        The ``article_id`` of the top hit, or ``0`` when the search returns no
+        hits (for example on a freshly created, still empty index).
+    """
+    response = es.search(
+        index=index_name,
+        size=1,
+        sort="article_id:desc",
+        _source=["article_id"]
+    )
+    hits = response['hits']['hits']
+    if not hits:
+        # Empty index: indexing hits[0] would raise IndexError.
+        return 0
+    return hits[0]['_source']['article_id']
+
+
+def get_articles(id_):
+    """Load the next batch of articles after ``id_`` as bulk-index actions.
+
+    Selects up to 10000 rows from ``crawler_meta_article`` with ``status = 1``
+    and ``article_id`` greater than ``id_``, ordered by ``article_id``.
+
+    Args:
+        id_: Exclusive lower bound for ``article_id``.
+
+    Returns:
+        A list of action dicts, one per row, each targeting ``index_name`` and
+        using the row's ``article_id`` as the document ``_id``.
+    """
+    fetch_query = """
+        select article_id, platform, out_account_id, title
+        from crawler_meta_article
+        where status = 1 and article_id > %s
+        order by article_id limit 10000;
+    """
+    # Run the query; rows are accessed by column name below.
+    rows = db_client.fetch(fetch_query, cursor_type=DictCursor, params=(id_,))
+
+    source_fields = ("article_id", "platform", "out_account_id", "title")
+    actions = []
+    for row in rows:
+        actions.append({
+            "_index": index_name,
+            "_id": row["article_id"],
+            "_source": {field: row[field] for field in source_fields},
+        })
+    return actions
+
+
+def search(keyword="刘伯承元帅"):
+    """Run a ``match`` query on ``title`` and print each hit plus the elapsed time.
+
+    Args:
+        keyword: Text matched against the ``title`` field. Defaults to the
+            sample phrase that used to be hard-coded in the query.
+
+    Prints the ``_source`` (``article_id`` and ``title``) of every returned hit,
+    followed by the duration of the search call in seconds.
+    """
+    query = {
+        "query": {
+            "match": {
+                "title": keyword
+            }
+        },
+        "_source": ["article_id", "title"]
+    }
+
+    # perf_counter() is monotonic, so the measured duration cannot be skewed
+    # by wall-clock adjustments the way time.time() can.
+    started = time.perf_counter()
+    resp = es.search(index=index_name, body=query)
+    elapsed = time.perf_counter() - started
+    for hit in resp["hits"]["hits"]:
+        print(hit["_source"])
+
+    print(elapsed)
+
+def get_cluster_docs_stats():
+    """Print the statistics returned by ``es.nodes.stats()`` as indented JSON."""
+    node_stats = es.nodes.stats()
+    rendered = json.dumps(node_stats.body, indent=4, ensure_ascii=False)
+    print(rendered)
+
+
+if __name__ == "__main__":
+    # Backfill the index from MySQL in batches, resuming from the highest
+    # article_id already indexed, until the target id is reached.
+    target_max_id = 27492350
+    max_id = get_max_article_id()
+    i = 0
+    while int(max_id) < target_max_id:
+        articles = get_articles(max_id)
+        if not articles:
+            # No more matching rows below the target: without this guard the
+            # cursor never advances and the loop would spin forever.
+            break
+        helpers.bulk(es, articles)
+        print(es.count(index=index_name))
+        # Advance from the batch itself (rows are ordered by article_id and
+        # "_id" is the article_id). Re-querying the index right after a bulk
+        # write can return a stale maximum until the next refresh.
+        max_id = articles[-1]["_id"]
+        i += 1
+        print(i)

+ 31 - 0
applications/api/es_certs.crt

@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFaTCCA1GgAwIBAgIUWHH9T8PVfiSyvT6S6NrAQ9iSLeEwDQYJKoZIhvcNAQEL
+BQAwPDE6MDgGA1UEAxMxRWxhc3RpY3NlYXJjaCBzZWN1cml0eSBhdXRvLWNvbmZp
+Z3VyYXRpb24gSFRUUCBDQTAeFw0yNTA3MDcwNzIwNTRaFw0yODA3MDYwNzIwNTRa
+MDwxOjA4BgNVBAMTMUVsYXN0aWNzZWFyY2ggc2VjdXJpdHkgYXV0by1jb25maWd1
+cmF0aW9uIEhUVFAgQ0EwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQCb
+Y8E68+7S+hGKQX6vhyOxuCe3QyBHYlsxiSqGhi+WFx953u4SEMqrbqiyg2QquB9/
+ynjKo3Tvhn0OPjuJRytteKn9OZkVhUT1D5P6PFo0j8x1LIJZm551XRCnQUZ8jC0C
+REHy/JoKdT4YSCRIuXVTM5iM66vQ1t5Du4sb70mTygtc2DyXwgE4LkVnrHcwr2BZ
+3/O69WvF7Zd7WP93yEfUsLsAAQStaCYMeYyaY5K8UwIVcFyWKJ9lnDGbR9KmuXb9
+ipWqGw6aAYhmSs5gL+6xJ5dBpgMOqoBTvZpNniLA/phkelq9W2nAhBLFpRGRof8K
+5iKwjAN8gnBXeSVklBoL23QD5zfoVjz+5eaXWO4qP+90jbwf+vEg/duncDRONGtk
+TQd0Vr9NeO3Aye8PZsmmhKAaciaPWYyQO30omUq9kPsSUzZPu4k+CYb8qwVQCHpn
+Za19NkvERQ8hCQks08/ly5qDM+5lBxJQFQjhjtzSDQ/ybbarMmgaBxpCexiksRmP
+CQqVLW6IaLxUGEkIJqXRx8nmKUfK43vTBitOBFt5UcKob6+ikZLrqZ6xLY/jklE8
+Z1wt9I8ZdQ3L3X9EORgmQ+4KIu/JQfBdfAYtLaS6MYWhiZSaKaIhgfXiZQTO9YuW
+KrI5g+d2Yu2BYgIioLKo9LFWK1eTG2gNAGUI/+rqswIDAQABo2MwYTAdBgNVHQ4E
+FgQUab2kAtPlJHLirQvbThvIwJ7hbLwwHwYDVR0jBBgwFoAUab2kAtPlJHLirQvb
+ThvIwJ7hbLwwDwYDVR0TAQH/BAUwAwEB/zAOBgNVHQ8BAf8EBAMCAQYwDQYJKoZI
+hvcNAQELBQADggIBAF+wJ598Krfai5Br6Vq0Z1jj0JsU8Kij4t9D+89QPgI85/Mv
+zwj8xRgxx9RinKYdnzFJWrD9BITG2l3D0zcJhXfYUpq5HLP+c3zMwEMGzTLbgi70
+cpYqkTJ+g/Ah5WRYZRHJIMF6BVK6izCOO0J49eYC6AONNxG2HeeUvEL4cNnxpw8T
+NUe7v0FXe2iPLeE713h99ray0lBgI6J9QZqc/oEM47gHy+ByfWCv6Yw9qLlprppP
+taHz2VWnCAACDLzbDnYhemQDji86yrUTEdCT8at1jAwHSixgkm88nEBgxPHDuq8t
+thmiS6dELvXVUbyeWO7A/7zVde0Kndxe003OuYcX9I2IX7aIpC8sW/yY+alRhklq
+t9vF6g1qvsN69xXfW5yI5G31TYMUw/3ng0aVJfRFaXkEV2SWEZD+4sWoYC/GU7kK
+zlfaF22jTeul5qCKkN1k+i8K2lheEE3ZBC358W0RyvsrDwtXOra3VCpZ7qrez8OA
+/HeY6iISZQ7g0s209KjqOPqVGcI8B0p6KMh00AeWisU6E/wy1LNTxkf2IS9b88n6
+a3rj0TCycwhKOPTPB5pwlfbZNI00tGTFjqqi07SLqO9ZypsVkyR32G16JPJzk8Zw
+kngBZt6y9LtCMRVbyDuIDNq+fjtDjgxMI9bQXtve4bOuq8cZzcMjC6khz/Ja
+-----END CERTIFICATE-----

+ 31 - 0
config/es_certs.crt

@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFaTCCA1GgAwIBAgIUWHH9T8PVfiSyvT6S6NrAQ9iSLeEwDQYJKoZIhvcNAQEL
+BQAwPDE6MDgGA1UEAxMxRWxhc3RpY3NlYXJjaCBzZWN1cml0eSBhdXRvLWNvbmZp
+Z3VyYXRpb24gSFRUUCBDQTAeFw0yNTA3MDcwNzIwNTRaFw0yODA3MDYwNzIwNTRa
+MDwxOjA4BgNVBAMTMUVsYXN0aWNzZWFyY2ggc2VjdXJpdHkgYXV0by1jb25maWd1
+cmF0aW9uIEhUVFAgQ0EwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQCb
+Y8E68+7S+hGKQX6vhyOxuCe3QyBHYlsxiSqGhi+WFx953u4SEMqrbqiyg2QquB9/
+ynjKo3Tvhn0OPjuJRytteKn9OZkVhUT1D5P6PFo0j8x1LIJZm551XRCnQUZ8jC0C
+REHy/JoKdT4YSCRIuXVTM5iM66vQ1t5Du4sb70mTygtc2DyXwgE4LkVnrHcwr2BZ
+3/O69WvF7Zd7WP93yEfUsLsAAQStaCYMeYyaY5K8UwIVcFyWKJ9lnDGbR9KmuXb9
+ipWqGw6aAYhmSs5gL+6xJ5dBpgMOqoBTvZpNniLA/phkelq9W2nAhBLFpRGRof8K
+5iKwjAN8gnBXeSVklBoL23QD5zfoVjz+5eaXWO4qP+90jbwf+vEg/duncDRONGtk
+TQd0Vr9NeO3Aye8PZsmmhKAaciaPWYyQO30omUq9kPsSUzZPu4k+CYb8qwVQCHpn
+Za19NkvERQ8hCQks08/ly5qDM+5lBxJQFQjhjtzSDQ/ybbarMmgaBxpCexiksRmP
+CQqVLW6IaLxUGEkIJqXRx8nmKUfK43vTBitOBFt5UcKob6+ikZLrqZ6xLY/jklE8
+Z1wt9I8ZdQ3L3X9EORgmQ+4KIu/JQfBdfAYtLaS6MYWhiZSaKaIhgfXiZQTO9YuW
+KrI5g+d2Yu2BYgIioLKo9LFWK1eTG2gNAGUI/+rqswIDAQABo2MwYTAdBgNVHQ4E
+FgQUab2kAtPlJHLirQvbThvIwJ7hbLwwHwYDVR0jBBgwFoAUab2kAtPlJHLirQvb
+ThvIwJ7hbLwwDwYDVR0TAQH/BAUwAwEB/zAOBgNVHQ8BAf8EBAMCAQYwDQYJKoZI
+hvcNAQELBQADggIBAF+wJ598Krfai5Br6Vq0Z1jj0JsU8Kij4t9D+89QPgI85/Mv
+zwj8xRgxx9RinKYdnzFJWrD9BITG2l3D0zcJhXfYUpq5HLP+c3zMwEMGzTLbgi70
+cpYqkTJ+g/Ah5WRYZRHJIMF6BVK6izCOO0J49eYC6AONNxG2HeeUvEL4cNnxpw8T
+NUe7v0FXe2iPLeE713h99ray0lBgI6J9QZqc/oEM47gHy+ByfWCv6Yw9qLlprppP
+taHz2VWnCAACDLzbDnYhemQDji86yrUTEdCT8at1jAwHSixgkm88nEBgxPHDuq8t
+thmiS6dELvXVUbyeWO7A/7zVde0Kndxe003OuYcX9I2IX7aIpC8sW/yY+alRhklq
+t9vF6g1qvsN69xXfW5yI5G31TYMUw/3ng0aVJfRFaXkEV2SWEZD+4sWoYC/GU7kK
+zlfaF22jTeul5qCKkN1k+i8K2lheEE3ZBC358W0RyvsrDwtXOra3VCpZ7qrez8OA
+/HeY6iISZQ7g0s209KjqOPqVGcI8B0p6KMh00AeWisU6E/wy1LNTxkf2IS9b88n6
+a3rj0TCycwhKOPTPB5pwlfbZNI00tGTFjqqi07SLqO9ZypsVkyR32G16JPJzk8Zw
+kngBZt6y9LtCMRVbyDuIDNq+fjtDjgxMI9bQXtve4bOuq8cZzcMjC6khz/Ja
+-----END CERTIFICATE-----

+ 36 - 0
config/es_mappings.py

@@ -0,0 +1,36 @@
+# Name of the index that stores article metadata.
+index_name = "meta_articles_v1"
+
+# Analyzer definitions registered on the index; each entry simply maps a name
+# to the analyzer type of the same name (presumably provided by the IK
+# analysis plugin -- confirm it is installed on the cluster).
+_ik_analyzers = {name: {"type": name} for name in ("ik_smart", "ik_max_word")}
+
+# Index-level settings passed to ``indices.create``.
+settings = {
+    "number_of_shards": 3,
+    "number_of_replicas": 1,
+    "analysis": {"analyzer": _ik_analyzers},
+}
+
+# ``title`` is analysed with ik_max_word at index time and ik_smart at search
+# time, and also exposes a ``keyword`` sub-field that ignores values longer
+# than 256 characters.
+_title_field = {
+    "type": "text",
+    "analyzer": "ik_max_word",
+    "search_analyzer": "ik_smart",
+    "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
+}
+
+# Field mappings passed to ``indices.create``.
+mappings = {
+    "properties": {
+        "auto_id": {"type": "long", "doc_values": True},
+        "article_id": {"type": "long"},
+        "platform": {"type": "keyword"},
+        "out_account_id": {"type": "keyword"},
+        "title": _title_field,
+        "created_at": {
+            "type": "date",
+            "format": "yyyy-MM-dd HH:mm:ss||epoch_millis",
+        },
+    }
+}