|
@@ -70,7 +70,7 @@ class BoundaryDetector(ChunkerConfig):
|
|
return boundaries
|
|
return boundaries
|
|
|
|
|
|
def detect_boundaries_v2(
|
|
def detect_boundaries_v2(
|
|
- self, sentence_list: List[str], embs: np.ndarray, debug: bool = False
|
|
|
|
|
|
+ self, sentence_list: List[str], embs: np.ndarray, debug: bool = False
|
|
) -> List[int]:
|
|
) -> List[int]:
|
|
"""
|
|
"""
|
|
约束:相邻 boundary(含开头到第一个 boundary)之间的句子数 ∈ [3, 10]
|
|
约束:相邻 boundary(含开头到第一个 boundary)之间的句子数 ∈ [3, 10]
|
|
@@ -89,11 +89,11 @@ class BoundaryDetector(ChunkerConfig):
|
|
adj_scores = np.zeros_like(cut_scores)
|
|
adj_scores = np.zeros_like(cut_scores)
|
|
for i in range(len(cut_scores)):
|
|
for i in range(len(cut_scores)):
|
|
sent_to_check = sentence_list[i] if i < n else sentence_list[-1]
|
|
sent_to_check = sentence_list[i] if i < n else sentence_list[-1]
|
|
- snippet = (sent_to_check[-20:] if sent_to_check else "")
|
|
|
|
|
|
+ snippet = sent_to_check[-20:] if sent_to_check else ""
|
|
adj_scores[i] = (
|
|
adj_scores[i] = (
|
|
- cut_scores[i]
|
|
|
|
- + self.turn_signal(snippet)
|
|
|
|
- + self.figure_signal(sent_to_check)
|
|
|
|
|
|
+ cut_scores[i]
|
|
|
|
+ + self.turn_signal(snippet)
|
|
|
|
+ + self.figure_signal(sent_to_check)
|
|
)
|
|
)
|
|
|
|
|
|
# --- 3-10 句强约束切分 ---
|
|
# --- 3-10 句强约束切分 ---
|
|
@@ -158,12 +158,14 @@ class BoundaryDetector(ChunkerConfig):
|
|
|
|
|
|
if lower <= upper:
|
|
if lower <= upper:
|
|
# 在允许区间里找 adj_scores 最高的位置
|
|
# 在允许区间里找 adj_scores 最高的位置
|
|
- window = adj_scores[lower: upper + 1]
|
|
|
|
|
|
+ window = adj_scores[lower : upper + 1]
|
|
j = int(np.argmax(window)) + lower
|
|
j = int(np.argmax(window)) + lower
|
|
if j != boundaries[-1]:
|
|
if j != boundaries[-1]:
|
|
boundaries[-1] = j
|
|
boundaries[-1] = j
|
|
if debug:
|
|
if debug:
|
|
- print(f"[fix-tail] move last boundary -> {j}, tail_len={n - 1 - j}")
|
|
|
|
|
|
+ print(
|
|
|
|
+ f"[fix-tail] move last boundary -> {j}, tail_len={n - 1 - j}"
|
|
|
|
+ )
|
|
else:
|
|
else:
|
|
# 没有可行区间:退化为合并尾段(删掉最后一个 boundary)
|
|
# 没有可行区间:退化为合并尾段(删掉最后一个 boundary)
|
|
dropped = boundaries.pop()
|
|
dropped = boundaries.pop()
|
|
@@ -171,4 +173,3 @@ class BoundaryDetector(ChunkerConfig):
|
|
print(f"[fix-tail] drop last boundary {dropped} to avoid tiny tail")
|
|
print(f"[fix-tail] drop last boundary {dropped} to avoid tiny tail")
|
|
|
|
|
|
return boundaries
|
|
return boundaries
|
|
-
|
|
|