|
@@ -1,20 +1,4 @@
|
|
-import json
|
|
|
|
-
|
|
|
|
-from applications.config import Chunk
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-class TaskConst:
|
|
|
|
- INIT_STATUS = 0
|
|
|
|
- PROCESSING_STATUS = 1
|
|
|
|
- FINISHED_STATUS = 2
|
|
|
|
- FAILED_STATUS = 3
|
|
|
|
-
|
|
|
|
- CHUNK_USEFUL_STATUS = 1
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-class BaseMySQLClient(TaskConst):
|
|
|
|
- def __init__(self, pool):
|
|
|
|
- self.pool = pool
|
|
|
|
|
|
+from .base import BaseMySQLClient
|
|
|
|
|
|
|
|
|
|
class Dataset(BaseMySQLClient):
|
|
class Dataset(BaseMySQLClient):
|
|
@@ -45,304 +29,6 @@ class Dataset(BaseMySQLClient):
|
|
return await self.pool.async_fetch(query=query, params=(id_, status))
|
|
return await self.pool.async_fetch(query=query, params=(id_, status))
|
|
|
|
|
|
|
|
|
|
-class Contents(BaseMySQLClient):
|
|
|
|
- async def insert_content(self, doc_id, text, text_type, title, dataset_id):
|
|
|
|
- query = """
|
|
|
|
- INSERT IGNORE INTO contents
|
|
|
|
- (doc_id, text, text_type, title, dataset_id)
|
|
|
|
- VALUES (%s, %s, %s, %s, %s);
|
|
|
|
- """
|
|
|
|
- return await self.pool.async_save(
|
|
|
|
- query=query, params=(doc_id, text, text_type, title, dataset_id)
|
|
|
|
- )
|
|
|
|
-
|
|
|
|
- async def update_content_info(self, doc_id, text, text_type, title, dataset_id):
|
|
|
|
- query = """
|
|
|
|
- UPDATE contents
|
|
|
|
- SET text = %s, text_type = %s, title = %s, dataset_id = %s, status = %s
|
|
|
|
- WHERE doc_id = %s;
|
|
|
|
- """
|
|
|
|
- return await self.pool.async_save(
|
|
|
|
- query=query,
|
|
|
|
- params=(text, text_type, title, dataset_id, self.INIT_STATUS, doc_id),
|
|
|
|
- )
|
|
|
|
-
|
|
|
|
- async def update_content_status(self, doc_id, ori_status, new_status):
|
|
|
|
- query = """
|
|
|
|
- UPDATE contents
|
|
|
|
- SET status = %s
|
|
|
|
- WHERE doc_id = %s AND status = %s;
|
|
|
|
- """
|
|
|
|
- return await self.pool.async_save(
|
|
|
|
- query=query, params=(new_status, doc_id, ori_status)
|
|
|
|
- )
|
|
|
|
-
|
|
|
|
- async def update_dataset_status(self, dataset_id, ori_status, new_status):
|
|
|
|
- query = """
|
|
|
|
- UPDATE contents
|
|
|
|
- SET status = %s
|
|
|
|
- WHERE dataset_id = %s AND status = %s;
|
|
|
|
- """
|
|
|
|
- return await self.pool.async_save(
|
|
|
|
- query=query, params=(new_status, dataset_id, ori_status)
|
|
|
|
- )
|
|
|
|
-
|
|
|
|
- async def update_doc_status(self, doc_id, ori_status, new_status):
|
|
|
|
- """
|
|
|
|
- this function is to change the using status of each document
|
|
|
|
- :param doc_id:
|
|
|
|
- :param ori_status:
|
|
|
|
- :param new_status:
|
|
|
|
- :return:
|
|
|
|
- """
|
|
|
|
- query = """
|
|
|
|
- UPDATE contents SET doc_status = %s WHERE doc_id = %s AND doc_status = %s;
|
|
|
|
- """
|
|
|
|
- return await self.pool.async_save(
|
|
|
|
- query=query, params=(new_status, doc_id, ori_status)
|
|
|
|
- )
|
|
|
|
-
|
|
|
|
- async def select_count(self, dataset_id, doc_status=1):
|
|
|
|
- query = """
|
|
|
|
- SELECT count(*) AS count FROM contents WHERE dataset_id = %s AND doc_status = %s;
|
|
|
|
- """
|
|
|
|
- rows = await self.pool.async_fetch(query=query, params=(dataset_id, doc_status))
|
|
|
|
- return rows[0]["count"] if rows else 0
|
|
|
|
-
|
|
|
|
- async def select_content_by_doc_id(self, doc_id):
|
|
|
|
- query = """SELECT * FROM contents WHERE doc_id = %s;"""
|
|
|
|
- return await self.pool.async_fetch(query=query, params=(doc_id,))
|
|
|
|
-
|
|
|
|
- async def select_contents(
|
|
|
|
- self,
|
|
|
|
- page_num: int,
|
|
|
|
- page_size: int,
|
|
|
|
- order_by=None,
|
|
|
|
- dataset_id: int = None,
|
|
|
|
- doc_status: int = 1,
|
|
|
|
- ):
|
|
|
|
- """
|
|
|
|
- 分页查询 contents 表,并返回分页信息
|
|
|
|
- :param page_num: 页码,从 1 开始
|
|
|
|
- :param page_size: 每页数量
|
|
|
|
- :param order_by: 排序条件,例如 {"id": "desc"} 或 {"created_at": "asc"}
|
|
|
|
- :param dataset_id: 数据集 ID
|
|
|
|
- :param doc_status: 文档状态(默认 1)
|
|
|
|
- :return: dict,包含 entities、total_count、page、page_size、total_pages
|
|
|
|
- """
|
|
|
|
- if order_by is None:
|
|
|
|
- order_by = {"id": "desc"}
|
|
|
|
- offset = (page_num - 1) * page_size
|
|
|
|
-
|
|
|
|
- # 动态拼接 where 条件
|
|
|
|
- where_clauses = ["doc_status = %s"]
|
|
|
|
- params = [doc_status]
|
|
|
|
-
|
|
|
|
- if dataset_id:
|
|
|
|
- where_clauses.append("dataset_id = %s")
|
|
|
|
- params.append(dataset_id)
|
|
|
|
-
|
|
|
|
- where_sql = " AND ".join(where_clauses)
|
|
|
|
-
|
|
|
|
- # 动态拼接 order by
|
|
|
|
- order_field, order_direction = list(order_by.items())[0]
|
|
|
|
- order_sql = f"ORDER BY {order_field} {order_direction.upper()}"
|
|
|
|
-
|
|
|
|
- # 查询总数
|
|
|
|
- count_query = f"SELECT COUNT(*) as total_count FROM contents WHERE {where_sql};"
|
|
|
|
- count_result = await self.pool.async_fetch(
|
|
|
|
- query=count_query, params=tuple(params)
|
|
|
|
- )
|
|
|
|
- total_count = count_result[0]["total_count"] if count_result else 0
|
|
|
|
-
|
|
|
|
- # 查询分页数据
|
|
|
|
- query = f"""
|
|
|
|
- SELECT * FROM contents
|
|
|
|
- WHERE {where_sql}
|
|
|
|
- {order_sql}
|
|
|
|
- LIMIT %s OFFSET %s;
|
|
|
|
- """
|
|
|
|
- params.extend([page_size, offset])
|
|
|
|
- entities = await self.pool.async_fetch(query=query, params=tuple(params))
|
|
|
|
-
|
|
|
|
- total_pages = (total_count + page_size - 1) // page_size # 向上取整
|
|
|
|
-
|
|
|
|
- return {
|
|
|
|
- "entities": entities,
|
|
|
|
- "total_count": total_count,
|
|
|
|
- "page": page_num,
|
|
|
|
- "page_size": page_size,
|
|
|
|
- "total_pages": total_pages,
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-class ContentChunks(BaseMySQLClient):
|
|
|
|
- async def insert_chunk(self, chunk: Chunk) -> int:
|
|
|
|
- query = """
|
|
|
|
- INSERT IGNORE INTO content_chunks
|
|
|
|
- (chunk_id, doc_id, text, tokens, topic_purity, text_type, dataset_id, status)
|
|
|
|
- VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
|
|
|
|
- """
|
|
|
|
- return await self.pool.async_save(
|
|
|
|
- query=query,
|
|
|
|
- params=(
|
|
|
|
- chunk.chunk_id,
|
|
|
|
- chunk.doc_id,
|
|
|
|
- chunk.text,
|
|
|
|
- chunk.tokens,
|
|
|
|
- chunk.topic_purity,
|
|
|
|
- chunk.text_type,
|
|
|
|
- chunk.dataset_id,
|
|
|
|
- chunk.status,
|
|
|
|
- ),
|
|
|
|
- )
|
|
|
|
-
|
|
|
|
- async def update_chunk_status(self, doc_id, chunk_id, ori_status, new_status):
|
|
|
|
- query = """
|
|
|
|
- UPDATE content_chunks
|
|
|
|
- SET chunk_status = %s
|
|
|
|
- WHERE doc_id = %s AND chunk_id = %s AND chunk_status = %s and status = %s;
|
|
|
|
- """
|
|
|
|
- return await self.pool.async_save(
|
|
|
|
- query=query,
|
|
|
|
- params=(new_status, doc_id, chunk_id, ori_status, self.CHUNK_USEFUL_STATUS),
|
|
|
|
- )
|
|
|
|
-
|
|
|
|
- async def update_embedding_status(self, doc_id, chunk_id, ori_status, new_status):
|
|
|
|
- query = """
|
|
|
|
- UPDATE content_chunks
|
|
|
|
- SET embedding_status = %s
|
|
|
|
- WHERE doc_id = %s AND chunk_id = %s AND embedding_status = %s;
|
|
|
|
- """
|
|
|
|
- return await self.pool.async_save(
|
|
|
|
- query=query, params=(new_status, doc_id, chunk_id, ori_status)
|
|
|
|
- )
|
|
|
|
-
|
|
|
|
- async def set_chunk_result(self, chunk: Chunk, ori_status, new_status):
|
|
|
|
- query = """
|
|
|
|
- UPDATE content_chunks
|
|
|
|
- SET summary = %s, topic = %s, domain = %s, task_type = %s, concepts = %s,
|
|
|
|
- keywords = %s, questions = %s, chunk_status = %s, entities = %s
|
|
|
|
- WHERE doc_id = %s AND chunk_id = %s AND chunk_status = %s;
|
|
|
|
- """
|
|
|
|
- return await self.pool.async_save(
|
|
|
|
- query=query,
|
|
|
|
- params=(
|
|
|
|
- chunk.summary,
|
|
|
|
- chunk.topic,
|
|
|
|
- chunk.domain,
|
|
|
|
- chunk.task_type,
|
|
|
|
- json.dumps(chunk.concepts),
|
|
|
|
- json.dumps(chunk.keywords),
|
|
|
|
- json.dumps(chunk.questions),
|
|
|
|
- new_status,
|
|
|
|
- json.dumps(chunk.entities),
|
|
|
|
- chunk.doc_id,
|
|
|
|
- chunk.chunk_id,
|
|
|
|
- ori_status,
|
|
|
|
- ),
|
|
|
|
- )
|
|
|
|
-
|
|
|
|
- async def update_es_status(self, doc_id, chunk_id, ori_status, new_status):
|
|
|
|
- query = """
|
|
|
|
- UPDATE content_chunks SET es_status = %s
|
|
|
|
- WHERE doc_id = %s AND chunk_id = %s AND es_status = %s;
|
|
|
|
- """
|
|
|
|
- return await self.pool.async_save(
|
|
|
|
- query=query, params=(new_status, doc_id, chunk_id, ori_status)
|
|
|
|
- )
|
|
|
|
-
|
|
|
|
- async def update_doc_chunk_status(self, doc_id, chunk_id, ori_status, new_status):
|
|
|
|
- query = """
|
|
|
|
- UPDATE content_chunks set status = %s
|
|
|
|
- WHERE doc_id = %s AND chunk_id = %s AND status = %s;
|
|
|
|
- """
|
|
|
|
- return await self.pool.async_save(
|
|
|
|
- query=query, params=(new_status, doc_id, chunk_id, ori_status)
|
|
|
|
- )
|
|
|
|
-
|
|
|
|
- async def update_doc_status(self, doc_id, ori_status, new_status):
|
|
|
|
- query = """
|
|
|
|
- UPDATE content_chunks set status = %s
|
|
|
|
- WHERE doc_id = %s AND status = %s;
|
|
|
|
- """
|
|
|
|
- return await self.pool.async_save(
|
|
|
|
- query=query, params=(new_status, doc_id, ori_status)
|
|
|
|
- )
|
|
|
|
-
|
|
|
|
- async def update_dataset_status(self, dataset_id, ori_status, new_status):
|
|
|
|
- query = """
|
|
|
|
- UPDATE content_chunks set status = %s
|
|
|
|
- WHERE dataset_id = %s AND status = %s;
|
|
|
|
- """
|
|
|
|
- return await self.pool.async_save(
|
|
|
|
- query=query, params=(new_status, dataset_id, ori_status)
|
|
|
|
- )
|
|
|
|
-
|
|
|
|
- async def select_chunk_content(self, doc_id, chunk_id):
|
|
|
|
- query = """
|
|
|
|
- SELECT * FROM content_chunks WHERE doc_id = %s AND chunk_id = %s;
|
|
|
|
- """
|
|
|
|
- return await self.pool.async_fetch(query=query, params=(doc_id, chunk_id))
|
|
|
|
-
|
|
|
|
- async def select_chunk_contents(
|
|
|
|
- self,
|
|
|
|
- page_num: int,
|
|
|
|
- page_size: int,
|
|
|
|
- order_by: dict = {"chunk_id": "asc"},
|
|
|
|
- doc_id: str = None,
|
|
|
|
- doc_status: int = None,
|
|
|
|
- ):
|
|
|
|
- offset = (page_num - 1) * page_size
|
|
|
|
-
|
|
|
|
- # 动态拼接 where 条件
|
|
|
|
- where_clauses = []
|
|
|
|
- params = []
|
|
|
|
-
|
|
|
|
- if doc_id:
|
|
|
|
- where_clauses.append("doc_id = %s")
|
|
|
|
- params.append(doc_id)
|
|
|
|
-
|
|
|
|
- if doc_status:
|
|
|
|
- where_clauses.append("doc_status = %s")
|
|
|
|
- params.append(doc_status)
|
|
|
|
-
|
|
|
|
- where_sql = " AND ".join(where_clauses)
|
|
|
|
-
|
|
|
|
- # 动态拼接 order by
|
|
|
|
- order_field, order_direction = list(order_by.items())[0]
|
|
|
|
- order_sql = f"ORDER BY {order_field} {order_direction.upper()}"
|
|
|
|
-
|
|
|
|
- # 查询总数
|
|
|
|
- count_query = (
|
|
|
|
- f"SELECT COUNT(*) as total_count FROM content_chunks WHERE {where_sql};"
|
|
|
|
- )
|
|
|
|
- count_result = await self.pool.async_fetch(
|
|
|
|
- query=count_query, params=tuple(params)
|
|
|
|
- )
|
|
|
|
- total_count = count_result[0]["total_count"] if count_result else 0
|
|
|
|
-
|
|
|
|
- # 查询分页数据
|
|
|
|
- query = f"""
|
|
|
|
- SELECT * FROM content_chunks
|
|
|
|
- WHERE {where_sql}
|
|
|
|
- {order_sql}
|
|
|
|
- LIMIT %s OFFSET %s;
|
|
|
|
- """
|
|
|
|
- params.extend([page_size, offset])
|
|
|
|
- entities = await self.pool.async_fetch(query=query, params=tuple(params))
|
|
|
|
-
|
|
|
|
- total_pages = (total_count + page_size - 1) // page_size # 向上取整
|
|
|
|
- print(total_pages)
|
|
|
|
- return {
|
|
|
|
- "entities": entities,
|
|
|
|
- "total_count": total_count,
|
|
|
|
- "page": page_num,
|
|
|
|
- "page_size": page_size,
|
|
|
|
- "total_pages": total_pages,
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
-
|
|
|
|
class ChatResult(BaseMySQLClient):
|
|
class ChatResult(BaseMySQLClient):
|
|
async def insert_chat_result(
|
|
async def insert_chat_result(
|
|
self, query_text, dataset_ids, search_res, chat_res, score, has_answer
|
|
self, query_text, dataset_ids, search_res, chat_res, score, has_answer
|