from __future__ import annotations import re from typing import List import numpy as np from sklearn.preprocessing import minmax_scale from applications.config import ChunkerConfig class BoundaryDetector(ChunkerConfig): def __init__(self): self.signal_boost_turn = 0.20 self.signal_boost_fig = 0.20 self.min_gap = 1 @staticmethod def cosine_sim(u: np.ndarray, v: np.ndarray) -> float: """计算余弦相似度""" return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-8)) def turn_signal(self, text: str) -> float: pattern = r"(因此|但是|综上所述?|然而|另一方面|总之|结论是|In conclusion\b|To conclude\b|However\b|Therefore\b|Thus\b|On the other hand\b)" if re.search(pattern, text, flags=re.IGNORECASE): return self.signal_boost_turn return 0.0 def figure_signal(self, text: str) -> float: pattern = r"(见下图|如下图所示|如表所示|如下表所示|表\s*\d+[::]?|图\s*\d+[::]?|Figure\s*\d+|Table\s*\d+)" if re.search(pattern, text, flags=re.IGNORECASE): return self.signal_boost_fig return 0.0 def detect_boundaries( self, sentence_list: List[str], embs: np.ndarray, debug: bool = False ) -> List[int]: sims = np.array( [self.cosine_sim(embs[i], embs[i + 1]) for i in range(len(embs) - 1)] ) cut_scores = 1 - sims cut_scores = minmax_scale(cut_scores) if len(cut_scores) > 0 else [] boundaries = [] last_boundary = -999 for index, base_score in enumerate(cut_scores): sent_to_check = ( sentence_list[index] if index < len(sentence_list) else sentence_list[-1] ) snippet = sent_to_check[-20:] if sent_to_check else "" adj_score = ( base_score + self.turn_signal(snippet) + self.figure_signal(sent_to_check) ) if adj_score >= self.boundary_threshold and ( index - last_boundary >= self.min_gap ): boundaries.append(index) last_boundary = index # Debug 输出 if debug: print( f"[{index}] sim={sims[index]:.3f}, cut={base_score:.3f}, adj={adj_score:.3f}, boundary={index in boundaries}" ) return boundaries