Browse source code

Optimize keyword and entity extraction; optimize the tokenization tool

luojunhui 2 weeks ago
parent
commit
6cda09e574

+ 10 - 8
applications/async_task/build_graph.py

@@ -33,7 +33,7 @@ class BuildGraph(AsyncNeo4jRepository):
         acquire_lock = await self.chunk_manager.update_graph_status(
             doc_id, chunk_id, self.INIT_STATUS, self.PROCESSING_STATUS
         )
-        if acquire_lock:
+        if not acquire_lock:
             print(f"while building graph, acquire lock for chunk {chunk_id}")
             return
 
@@ -68,10 +68,12 @@ class BuildGraph(AsyncNeo4jRepository):
     async def deal(self, doc_id):
         """async process single chunk"""
         chunk_list = await self.get_chunk_list_from_es(doc_id)
-        await run_tasks_with_asyncio_task_group(
-            task_list=chunk_list,
-            handler=self.add_single_chunk,
-            description="build graph",
-            unit="chunk",
-            max_concurrency=10,
-        )
+        for chunk in chunk_list:
+            await self.add_single_chunk(chunk)
+        # await run_tasks_with_asyncio_task_group(
+        #     task_list=chunk_list,
+        #     handler=self.add_single_chunk,
+        #     description="build graph",
+        #     unit="chunk",
+        #     max_concurrency=10,
+        # )

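The deal() change above trades the bounded-concurrency helper for a plain sequential loop over chunks, presumably to make failures easier to trace. If bounded concurrency is wanted again without the commented-out run_tasks_with_asyncio_task_group helper, a minimal standard-library sketch could look like this (method names follow the diff; max_concurrency is illustrative):

import asyncio

async def deal(self, doc_id, max_concurrency: int = 10):
    # sketch: process chunks concurrently, at most max_concurrency at a time
    chunk_list = await self.get_chunk_list_from_es(doc_id)
    semaphore = asyncio.Semaphore(max_concurrency)

    async def guarded(chunk):
        async with semaphore:  # limits how many chunks are in flight
            await self.add_single_chunk(chunk)

    await asyncio.gather(*(guarded(chunk) for chunk in chunk_list))
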
+ 2 - 1
applications/utils/chunks/topic_aware_chunking.py

@@ -32,7 +32,8 @@ class TopicAwareChunker(BoundaryDetector, SplitTextIntoSentences):
         return np.stack(embs)
 
     async def _raw_chunk(self, text: str) -> Dict[str, Any]:
-        sentence_list = self.jieba_sent_tokenize(text)
+        # sentence_list = self.jieba_sent_tokenize(text)
+        sentence_list = self.lang_chain_tokenize(text)
         if not sentence_list:
             return {}
 

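The tokenizer swap changes the granularity of sentence_list: jieba_sent_tokenize yields punctuation-delimited sentences, while lang_chain_tokenize (added further below) yields character-bounded spans of at most 100 characters with a 10-character overlap, so boundary detection now runs over fixed-size windows rather than sentences. A minimal comparison sketch, assuming SplitTextIntoSentences can be instantiated without arguments:

from applications.utils.nlp.split_text_into_sentences import SplitTextIntoSentences

splitter = SplitTextIntoSentences()  # assumption: no constructor arguments required
text = "知识图谱构建依赖高质量分词。分词质量直接影响实体抽取的召回。" * 8

old_units = splitter.jieba_sent_tokenize(text)   # punctuation-delimited sentences
new_units = splitter.lang_chain_tokenize(text)   # <=100-char spans, 10-char overlap
print(len(old_units), len(new_units))
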
+ 1 - 1
applications/utils/mysql/content_chunks.py

@@ -187,4 +187,4 @@ class ContentChunks(BaseMySQLClient):
             "page": page_num,
             "page_size": page_size,
             "total_pages": total_pages,
-        }
+        }

+ 2 - 1
applications/utils/mysql/contents.py

@@ -1,5 +1,6 @@
 from .base import BaseMySQLClient
 
+
 class Contents(BaseMySQLClient):
     async def insert_content(self, doc_id, text, text_type, title, dataset_id):
         query = """
@@ -128,4 +129,4 @@ class Contents(BaseMySQLClient):
             "page": page_num,
             "page_size": page_size,
             "total_pages": total_pages,
-        }
+        }

+ 7 - 0
applications/utils/nlp/split_text_into_sentences.py

@@ -3,6 +3,7 @@ import nltk
 import jieba
 
 from typing import List
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 class SplitTextIntoSentences:
@@ -26,3 +27,9 @@ class SplitTextIntoSentences:
         if buf.strip():
             sentence_list.append(buf.strip())
         return sentence_list
+
+    @staticmethod
+    def lang_chain_tokenize(text: str) -> List[str]:
+        splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
+        docs = splitter.split_text(text)
+        return docs

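One caveat with lang_chain_tokenize: RecursiveCharacterTextSplitter's default separators do not include CJK punctuation, so 100-character chunks of Chinese text can be cut mid-sentence. If sentence-aligned boundaries matter downstream, passing an explicit separators list is one option; the list below is illustrative and not part of this commit:

from langchain.text_splitter import RecursiveCharacterTextSplitter

# prefer breaking on CJK sentence punctuation before whitespace and raw characters
splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=10,
    separators=["\n\n", "\n", "。", "！", "？", "；", "，", " ", ""],
)
spans = splitter.split_text("知识图谱构建依赖高质量分词。分词质量直接影响实体抽取。" * 10)
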
+ 5 - 1
routes/buleprint.py

@@ -560,6 +560,10 @@ async def delete_task():
         return jsonify({"status_code": 500, "detail": "docId not found", "data": {}})
 
     resource = get_resource_manager()
-    build_graph_task = BuildGraph(neo4j=resource.graph_client, es_client=resource.es_client)
+    build_graph_task = BuildGraph(
+        neo4j=resource.graph_client,
+        es_client=resource.es_client,
+        mysql_client=resource.mysql_client,
+    )
     await build_graph_task.deal(doc_id)
     return jsonify({"status_code": 200, "detail": "success", "data": {}})