topic_aware_chunking.py

  1. """
  2. 主题感知分块
  3. """
from __future__ import annotations

import re, uuid, math
from dataclasses import dataclass, field, asdict
from typing import List, Dict, Any, Tuple, Optional

import optuna
import numpy as np
from sentence_transformers import SentenceTransformer, util


# ---------- Utilities ----------
def simple_sent_tokenize(text: str) -> List[str]:
    """Split text into sentences on CJK/Latin terminal punctuation and newlines."""
    text = re.sub(r"\n{2,}", "\n", text)
    parts = re.split(r"([。!?!?;;]+)\s*|\n+", text)
    sents, buf = [], ""
    for p in parts:
        if p is None:
            continue
        if re.match(r"[。!?!?;;]+", p or ""):
            buf += p or ""
            if buf.strip():
                sents.append(buf.strip())
                buf = ""
        elif p.strip() == "":
            if buf.strip():
                sents.append(buf.strip())
                buf = ""
        else:
            buf += p or ""
    if buf.strip():
        sents.append(buf.strip())
    # Merge very short fragments into a neighbor to avoid tiny "sentences".
    merged = []
    for s in sents:
        if merged and (len(s) < 10 or len(merged[-1]) < 10):
            merged[-1] += " " + s
        else:
            merged.append(s)
    return [s for s in merged if s.strip()]

def approx_tokens(text: str) -> int:
    """Cheap token estimator (≈2.5 chars/token for zh, ≈0.75 words/token for en)."""
    # This is a heuristic; replace with tiktoken if desired.
    cjk = re.findall(r"[\u4e00-\u9fff]", text)
    others = re.sub(r"[\u4e00-\u9fff]", " ", text).split()
    return max(1, int(len(cjk) / 2.5 + len(others) / 0.75))
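
# Worked example for the heuristic above (shown for intuition only): the string
# "检索增强生成 improves recall" has 6 CJK characters and 2 Latin words, so
# approx_tokens returns max(1, int(6 / 2.5 + 2 / 0.75)) = int(2.4 + 2.67) = 5.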

# ---------- Knowledge Graph Stub ----------
class KGClassifier:
    """
    Hierarchical topic classifier using embedding prototypes per node.
    Replace `kg_spec` with your KG; each node keeps a centroid embedding.
    """

    def __init__(self, model: SentenceTransformer, kg_spec: Dict[str, Any]):
        """
        kg_spec example:
        {
            "root": {
                "name": "root",
                "children": [
                    {"name": "Computer Science", "children": [
                        {"name": "NLP", "children": [{"name": "RAG", "children": []}]}]},
                    {"name": "Finance", "children": [{"name": "AP/AR", "children": []}]}]}
        }
        """
        self.model = model
        self.root = kg_spec["root"]
        self._embed_cache = {}  # name -> vector

        def build_centroid(node):
            name = node["name"]
            if name not in self._embed_cache:
                self._embed_cache[name] = self.model.encode(
                    name, normalize_embeddings=True
                )
            for ch in node.get("children", []):
                build_centroid(ch)

        build_centroid(self.root)

    def classify(self, text_emb: np.ndarray, topk: int = 3) -> Tuple[List[str], float]:
        """
        Return (topic_path, purity). Purity is the sigmoid-squashed margin between the
        best and second-best child at each level, averaged over the path.
        Note: `topk` is accepted for API compatibility but currently unused; only the
        single best child is followed at each level.
        """
        path, purities = [], []
        node = self.root
        while True:
            # score the children of the current node
            children = node.get("children", [])
            if not children:
                break
            scores = []
            for ch in children:
                vec = self._embed_cache[ch["name"]]
                scores.append((ch, float(util.cos_sim(text_emb, vec).item())))
            scores.sort(key=lambda x: x[1], reverse=True)
            best, second = scores[0], (scores[1] if len(scores) > 1 else (None, -1.0))
            path.append(best[0]["name"])
            margin = max(0.0, best[1] - max(second[1], -1.0))
            purities.append(1 / (1 + math.exp(-5 * margin)))  # squash to (0, 1)
            node = best[0]
        purity = float(np.mean(purities)) if purities else 1.0
        return path, purity
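
# Standalone usage sketch for KGClassifier (hypothetical two-branch spec; the node
# names and the example sentence are illustrative only):
#
#     model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
#     kg = KGClassifier(model, {"root": {"name": "root", "children": [
#         {"name": "NLP", "children": []}, {"name": "Finance", "children": []}]}})
#     emb = model.encode("Retrieval-augmented generation pipelines", normalize_embeddings=True)
#     path, purity = kg.classify(emb)
#     # path lists node names from the root downward, e.g. ["NLP"]; purity is higher
#     # when each level has a clear winner.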

# ---------- Core Chunker ----------
@dataclass
class Chunk:
    id: str
    text: str
    tokens: int
    topics: List[str] = field(default_factory=list)
    topic_purity: float = 1.0
    meta: Dict[str, Any] = field(default_factory=dict)


@dataclass
class ChunkerConfig:
    model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    target_tokens: int = 80
    max_tokens: int = 80
    overlap_ratio: float = 0.12
    boundary_threshold: float = 0.50  # similarity drop boundary (lower -> more cuts)
    min_sent_per_chunk: int = 1
    max_sent_per_chunk: int = 8
    enable_adaptive_boundary: bool = True
    enable_kg: bool = True
    topic_purity_floor: float = 0.65
    kg_topk: int = 3
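
# Minimal configuration sketch (values are illustrative, not tuned): override only the
# fields you care about and let the dataclass defaults fill in the rest, e.g.
#
#     cfg = ChunkerConfig(target_tokens=120, max_tokens=160, boundary_threshold=0.55)
#     chunker = TopicAwareChunker(cfg)   # no kg_spec -> the KG refinement stage is skipped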

class TopicAwareChunker:
    def __init__(self, cfg: ChunkerConfig, kg_spec: Optional[Dict[str, Any]] = None):
        self.cfg = cfg
        self.model = SentenceTransformer(
            cfg.model_name, device="cpu"
        )  # set gpu if available
        self.model.max_seq_length = 512
        self.kg = (
            KGClassifier(self.model, kg_spec) if (cfg.enable_kg and kg_spec) else None
        )

    # ---------- Public API ----------
    def chunk(self, text: str) -> List[Chunk]:
        sents = simple_sent_tokenize(text)
        if not sents:
            return []
        sent_embs = self.model.encode(sents, normalize_embeddings=True)
        boundaries = self._detect_boundaries(sents, sent_embs)
        raw_chunks = self._pack_by_boundaries(sents, sent_embs, boundaries)
        final_chunks = self._classify_and_refine(raw_chunks)
        return final_chunks

    # ---------- Boundary detection ----------
    def _detect_boundaries(self, sents: List[str], embs: np.ndarray) -> List[int]:
        if len(sents) < 2:
            return []
        # adjacent-pair similarities: entry i compares sents[i] with sents[i + 1]
        sims = util.cos_sim(embs[:-1], embs[1:]).cpu().numpy().diagonal()
        cut_scores = 1 - sims  # higher means more likely boundary
        # use np.ptp instead of ndarray.ptp (NumPy 2.0 compatibility)
        rng = np.ptp(cut_scores) if np.ptp(cut_scores) > 0 else 1e-6
        cut_scores = (cut_scores - cut_scores.min()) / (rng + 1e-6)
        boundaries = []
        for i, score in enumerate(cut_scores):
            # score i corresponds to the sentence pair (i, i + 1); check sents[i]
            # for discourse cues near its end
            sent_to_check = sents[i]
            snippet = sent_to_check[-20:] if sent_to_check else ""
            turn = (
                0.1
                if re.search(
                    r"(因此|但是|综上|然而|另一方面|In conclusion|However|Therefore)",
                    snippet,
                )
                else 0.0
            )
            fig = (
                0.1
                if re.search(
                    r"(见下图|如表|表\s*\d+|图\s*\d+|Figure|Table)", sent_to_check
                )
                else 0.0
            )
            adj_score = score + turn + fig
            if adj_score >= self.cfg.boundary_threshold:
                boundaries.append(i)
        return boundaries
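
    # Boundary semantics sketch (hypothetical scores, threshold 0.5): a returned index
    # i means "prefer a cut between sents[i] and sents[i + 1]". For example,
    #     adj_scores = [0.2, 0.7, 0.4, 0.9]  ->  boundaries = [1, 3]
    # i.e. packing will tend to break after the 2nd and the 4th sentence.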

    # ---------- Packing ----------
    def _pack_by_boundaries(
        self, sents: List[str], embs: np.ndarray, boundaries: List[int]
    ) -> List[Chunk]:
        """Greedy pack around boundaries to meet target length & sentence counts."""
        boundary_set = set(boundaries)
        chunks: List[Chunk] = []
        start = 0
        n = len(sents)
        while start < n:
            end = start
            cur_tokens = 0
            sent_count = 0
            while end < n and sent_count < self.cfg.max_sent_per_chunk:
                cur_tokens = approx_tokens(" ".join(sents[start : end + 1]))
                sent_count += 1
                if cur_tokens >= self.cfg.target_tokens:
                    # try to cut at the nearest boundary at or before 'end'
                    cut = end
                    # search backward to the nearest boundary within the window
                    for b in range(end, start - 1, -1):
                        if b in boundary_set:
                            cut = b
                            break
                    # avoid too-small chunks
                    if cut - start + 1 >= self.cfg.min_sent_per_chunk:
                        end = cut
                    break
                end += 1
            # finalize chunk
            text = " ".join(sents[start : end + 1]).strip()
            tokens = approx_tokens(text)
            chunk = Chunk(id=str(uuid.uuid4()), text=text, tokens=tokens)
            chunks.append(chunk)
            # soft overlap: repeat tail sentences of this chunk at the head of the next
            next_start = end + 1
            if self.cfg.overlap_ratio > 0 and end + 1 < n:
                overlap_tokens = int(tokens * self.cfg.overlap_ratio)
                # approximate the overlap by whole sentences, counted from the tail
                t = 0
                overlap_count = 0
                for s in reversed(sents[start : end + 1]):
                    t += approx_tokens(s)
                    overlap_count += 1
                    if t >= overlap_tokens:
                        break
                # start the next chunk a few sentences back, but always move forward
                next_start = max(start + 1, end + 1 - overlap_count)
            start = next_start
        return chunks

    # ---------- KG classify & refine ----------
    def _classify_and_refine(self, chunks: List[Chunk]) -> List[Chunk]:
        if not self.kg:
            return chunks
        refined: List[Chunk] = []
        for ch in chunks:
            emb = self.model.encode(ch.text, normalize_embeddings=True)
            topics, purity = self.kg.classify(emb, topk=self.cfg.kg_topk)
            ch.topics, ch.topic_purity = topics, purity
            # If purity is low, try a secondary split inside the chunk
            if purity < self.cfg.topic_purity_floor:
                sub = self._refine_chunk_by_topic(ch)
                refined.extend(sub)
            else:
                refined.append(ch)
        return refined

    def _refine_chunk_by_topic(self, chunk: Chunk) -> List[Chunk]:
        """Second-pass split inside a low-purity chunk."""
        sents = simple_sent_tokenize(chunk.text)
        if len(sents) <= self.cfg.min_sent_per_chunk * 2:
            return [chunk]
        embs = self.model.encode(sents, normalize_embeddings=True)
        # force more boundaries by lowering threshold a bit
        orig = self.cfg.boundary_threshold
        try:
            self.cfg.boundary_threshold = max(0.3, orig - 0.1)
            boundaries = self._detect_boundaries(sents, embs)
            sub_chunks = self._pack_by_boundaries(sents, embs, boundaries)
            # inherit topics again
            final = []
            for ch in sub_chunks:
                emb = self.model.encode(ch.text, normalize_embeddings=True)
                topics, purity = self.kg.classify(emb, topk=self.cfg.kg_topk)
                ch.topics, ch.topic_purity = topics, purity
                final.append(ch)
            return final
        finally:
            self.cfg.boundary_threshold = orig

# ---------- Auto-tuning (unsupervised objective) ----------
class UnsupervisedEvaluator:
    """
    Build a score: higher is better.
    - Intra-chunk coherence (avg similarity of neighboring sentences)
    - Inter-chunk separation (low similarity of chunk medoids to neighbors)
    - Length penalty (deviation from target_tokens)
    - Topic purity reward (if KG is enabled)
    """

    def __init__(
        self, model: SentenceTransformer, target_tokens: int, kg_weight: float = 0.5
    ):
        self.model = model
        self.target = target_tokens
        self.kg_weight = kg_weight

    def score(self, chunks: List[Chunk], kg_present: bool = True) -> float:
        if not chunks:
            return -1e6
        # Intra coherence: reward high similarity between neighboring sentences
        intra = []
        for ch in chunks:
            sents = simple_sent_tokenize(ch.text)
            if len(sents) < 2:
                continue
            embs = self.model.encode(sents, normalize_embeddings=True)
            sims = util.cos_sim(embs[:-1], embs[1:]).cpu().numpy().diagonal()
            intra.append(float(np.mean(sims)))
        intra_score = float(np.mean(intra)) if intra else 0.0
        # Inter separation: penalize adjacent chunk similarity
        if len(chunks) > 1:
            reps = self.model.encode(
                [c.text for c in chunks], normalize_embeddings=True
            )
            adj = []
            for i in range(len(chunks) - 1):
                adj.append(float(util.cos_sim(reps[i], reps[i + 1]).item()))
            inter_penalty = float(np.mean(adj))
        else:
            inter_penalty = 0.0
        # Length penalty
        dev = [abs(c.tokens - self.target) / max(1, self.target) for c in chunks]
        len_penalty = float(np.mean(dev))
        # Topic purity
        if kg_present:
            pur = [c.topic_purity for c in chunks]
            purity = float(np.mean(pur))
        else:
            purity = 0.0
        # Final score
        return (
            intra_score
            - 0.6 * inter_penalty
            - 0.4 * len_penalty
            + self.kg_weight * purity
        )
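
# Worked example of the objective above (hypothetical numbers): with intra_score = 0.62,
# inter_penalty = 0.35, len_penalty = 0.20, purity = 0.80 and kg_weight = 0.5, the score
# is 0.62 - 0.6 * 0.35 - 0.4 * 0.20 + 0.5 * 0.80 = 0.73, so coherent, well-separated,
# on-length, topically pure chunks score higher.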

def auto_tune_params(
    raw_texts: List[str],
    kg_spec: Optional[Dict[str, Any]] = None,
    n_trials: int = 20,
    seed: int = 42,
) -> ChunkerConfig:
    """Bayesian-like search with Optuna to find a good config on your corpus."""

    def objective(trial: optuna.Trial):
        cfg = ChunkerConfig(
            target_tokens=trial.suggest_int("target_tokens", 30, 400, step=10),
            max_tokens=trial.suggest_int("max_tokens", 30, 520, step=10),
            overlap_ratio=trial.suggest_float("overlap_ratio", 0.05, 0.25, step=0.05),
            boundary_threshold=trial.suggest_float(
                "boundary_threshold", 0.45, 0.75, step=0.05
            ),
            min_sent_per_chunk=trial.suggest_int("min_sent_per_chunk", 2, 4),
            max_sent_per_chunk=trial.suggest_int("max_sent_per_chunk", 8, 16),
            enable_adaptive_boundary=True,
            enable_kg=(kg_spec is not None),
            topic_purity_floor=trial.suggest_float(
                "topic_purity_floor", 0.55, 0.8, step=0.05
            ),
        )
        chunker = TopicAwareChunker(cfg, kg_spec=kg_spec)
        evaluator = UnsupervisedEvaluator(
            chunker.model, cfg.target_tokens, kg_weight=0.5 if kg_spec else 0.0
        )
        # Evaluate across a small sample
        scores = []
        for t in raw_texts:
            chunks = chunker.chunk(t)
            s = evaluator.score(chunks, kg_present=(kg_spec is not None))
            scores.append(s)
        return float(np.mean(scores))

    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
    best_params = study.best_params
    return ChunkerConfig(
        target_tokens=best_params["target_tokens"],
        max_tokens=best_params["max_tokens"],
        overlap_ratio=best_params["overlap_ratio"],
        boundary_threshold=best_params["boundary_threshold"],
        min_sent_per_chunk=best_params["min_sent_per_chunk"],
        max_sent_per_chunk=best_params["max_sent_per_chunk"],
        enable_adaptive_boundary=True,
        enable_kg=(kg_spec is not None),
        topic_purity_floor=best_params["topic_purity_floor"],
    )

# ---------- Example usage ----------
if __name__ == "__main__":
    # Mixed-topic Chinese sample text: a short passage about RAG followed by a long
    # passage about basketball shooting form, which exercises both boundary detection
    # and KG topic classification.
    sample_text = """
    RAG(Retrieval-Augmented Generation)是一种增强生成的技术。
    在复杂知识问答中,RAG 通过检索相关文档片段来改善答案质量。
    然而,分块策略会显著影响检索召回与可引用性。
    因此,我们提出一种主题感知的分块方法,结合 Transformer 边界探测与知识图谱层次分类。
    然后,我们讲一个新的主题,篮球
    这个也就是罚球动作。一般原地动作分为两种。
    第一种原地投篮动作是先下蹲,做好投篮的发力前上举动作,然后竖直向上伸直身体,右臂顺势在身体向上的过程中竖直向上将球向上投出。这种原地投篮的好处是,发力轻松,可以借助身体向上竖直的这个力度的趋势,帮助投篮发力,会让投篮的力气减少很多。尤其是在比赛后半程体力不好的时候,依然可以做到很高的命中略。这种投篮的要领是:主动的竖直向上的意识。我们以前就经常强调竖直起跳和竖直的概念,但是,同样看起来是竖直,但是用出来的效果却很不同,这主要就是技巧的关系了。这个技巧的精髓就在于“主动意识”。在你练习这种投篮的时候,每一次,都要在下蹲以后,明确的在脑子里想着,要竖直向上发力。双腿要竖直向上用力,整个身体也是这样,而且,最为重要的是,你一定要在练习的时候每次都要主动的去想,然后刻意的去竖直向上。这样,长久下去,养成习惯,你的这种投篮才会稳定。这里我们要顺便强调之前的一篇文章,就是录像纠错法,我们这里之所以一再强调要主动意识的竖直上起,就是因为,在录像上,未必能看得出来这个问题。也就是说,你的录像虽然看起来你是竖直起跳的,但是你没有一个主动的也就是刻意的竖直起跳的意识的话,这个球也不是竖直起跳。另外,相反的,如果你在视频上看到自己不是竖直起跳,但是实际上这个球是你使用了竖直起跳的主动意识来发力的。那么,尽管看起来不是很竖直,却依然可以很稳定。也就是说,眼睛会欺骗你,一定要注重你的意识。
    """
    kg_spec = {
        "root": {
            "name": "root",
            "children": [
                {
                    "name": "Computer Science",
                    "children": [
                        {"name": "NLP", "children": [{"name": "RAG", "children": []}]}
                    ],
                },
                {"name": "Finance", "children": [{"name": "AP/AR", "children": []}]},
                {
                    "name": "体育",
                    "children": [
                        {"name": "篮球", "children": [{"name": "投篮", "children": []}]}
                    ],
                },
            ],
        }
    }
    cfg = auto_tune_params([sample_text], kg_spec=kg_spec, n_trials=10, seed=42)
    chunker = TopicAwareChunker(cfg, kg_spec=kg_spec)
    chunks = chunker.chunk(sample_text)
    for i, ch in enumerate(chunks, 1):
        print(f"\n== Chunk {i} ==")
        print("Tokens:", ch.tokens)
        print("Topics:", " / ".join(ch.topics), "Purity:", round(ch.topic_purity, 3))
        print(ch.text)
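
    # Optional: persist the chunks for downstream indexing. A minimal sketch using
    # dataclasses.asdict (imported above); the output filename is illustrative.
    import json

    with open("chunks_preview.json", "w", encoding="utf-8") as f:
        json.dump([asdict(c) for c in chunks], f, ensure_ascii=False, indent=2)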