|
|
@@ -179,6 +179,16 @@ SUBORDINATORS_TO_FUNCTION = {
|
|
|
}
|
|
|
FINITE_VERB_TAGS = {"VBD", "VBP", "VBZ"}
|
|
|
NONFINITE_VERB_TAGS = {"VBG", "VBN"}
|
|
|
+CLAUSE_PREDICATE_DEPS = {
|
|
|
+ "advcl",
|
|
|
+ "ccomp",
|
|
|
+ "xcomp",
|
|
|
+ "acl",
|
|
|
+ "relcl",
|
|
|
+ "csubj",
|
|
|
+ "csubjpass",
|
|
|
+ "parataxis",
|
|
|
+}
|
|
|
FIXED_MULTIWORD_PHRASES: Tuple[Tuple[re.Pattern, str], ...] = tuple(
|
|
|
(
|
|
|
re.compile(pattern, re.IGNORECASE),
|
|
|
@@ -562,7 +572,7 @@ def _is_finite_predicate_head(token: SpacyToken) -> bool:
|
|
|
return True
|
|
|
verb_forms = set(token.morph.get("VerbForm"))
|
|
|
if "Inf" in verb_forms:
|
|
|
- return False
|
|
|
+ return _has_finite_auxiliary(token)
|
|
|
if verb_forms & {"Part", "Ger"}:
|
|
|
return _has_finite_auxiliary(token)
|
|
|
if token.tag_ in NONFINITE_VERB_TAGS:
|
|
|
@@ -602,6 +612,24 @@ def _predicate_heads(sentence: SpacySpan) -> List[SpacyToken]:
|
|
|
return ordered
|
|
|
|
|
|
|
|
|
+def _is_clause_predicate(token: SpacyToken) -> bool:
|
|
|
+    """Return True if the predicate originates inside a subordinate clause."""
|
|
|
+ if token.dep_ in CLAUSE_PREDICATE_DEPS:
|
|
|
+ return True
|
|
|
+ if token.dep_ != "conj":
|
|
|
+ return False
|
|
|
+ ancestor = token.head
|
|
|
+ safety = 0
|
|
|
+ while ancestor is not None and safety < 10:
|
|
|
+ if ancestor.dep_ in CLAUSE_PREDICATE_DEPS:
|
|
|
+ return True
|
|
|
+ if ancestor.dep_ != "conj" or ancestor.head is ancestor:
|
|
|
+ break
|
|
|
+ ancestor = ancestor.head
|
|
|
+ safety += 1
|
|
|
+ return False
|
|
|
+
|
|
|
+
|
|
|
def _add_fixed_phrases(
|
|
|
sentence: SpacySpan,
|
|
|
mapping: Dict[int, int],
|
|
|
@@ -652,7 +680,10 @@ def annotate_sentence(
|
|
|
|
|
|
for head in _predicate_heads(sentence):
|
|
|
start_char, end_char = _predicate_span_bounds(head)
|
|
|
- add_char_based_span(spans, start_char, end_char, "role-predicate", mapping)
|
|
|
+ cls = "role-predicate"
|
|
|
+ if _is_clause_predicate(head):
|
|
|
+ cls = "role-predicate role-predicate-clause"
|
|
|
+ add_char_based_span(spans, start_char, end_char, cls, mapping)
|
|
|
predicate_text = sentence.doc.text[start_char:end_char].strip()
|
|
|
if summary is not None:
|
|
|
summary.predicates.append(predicate_text or head.text)
|
|
|
@@ -906,6 +937,7 @@ def highlight_text_with_spacy(
|
|
|
text: str,
|
|
|
paragraph_meta: Optional[List[Dict[str, str]]] = None,
|
|
|
include_helper: bool = False,
|
|
|
+ paragraph_ranges: Optional[List[Tuple[int, int]]] = None,
|
|
|
) -> str:
|
|
|
if NLP is None:
|
|
|
raise RuntimeError(f"spaCy pipeline unavailable: {NLP_LOAD_ERROR}")
|
|
|
@@ -923,12 +955,22 @@ def highlight_text_with_spacy(
|
|
|
)
|
|
|
doc = _run_pipeline_without_benepar(text)
|
|
|
|
|
|
- paragraph_ranges = _split_paragraph_ranges(text)
|
|
|
- paragraph_counters = [0 for _ in paragraph_ranges]
|
|
|
+ ranges = None
|
|
|
+ if paragraph_ranges:
|
|
|
+ valid = True
|
|
|
+ for start, end in paragraph_ranges:
|
|
|
+ if start < 0 or end < start or end > len(text):
|
|
|
+ valid = False
|
|
|
+ break
|
|
|
+ if valid:
|
|
|
+ ranges = list(paragraph_ranges)
|
|
|
+ if not ranges:
|
|
|
+ ranges = _split_paragraph_ranges(text)
|
|
|
+ paragraph_counters = [0 for _ in ranges]
|
|
|
paragraph_idx = 0
|
|
|
paragraph_spans: List[Span] = []
|
|
|
- paragraph_attrs = paragraph_meta if paragraph_meta and len(paragraph_meta) == len(paragraph_ranges) else None
|
|
|
- for idx, (start, end) in enumerate(paragraph_ranges):
|
|
|
+ paragraph_attrs = paragraph_meta if paragraph_meta and len(paragraph_meta) == len(ranges) else None
|
|
|
+ for idx, (start, end) in enumerate(ranges):
|
|
|
attrs = None
|
|
|
if paragraph_attrs:
|
|
|
attrs = paragraph_attrs[idx] or None
|
|
|
@@ -937,9 +979,9 @@ def highlight_text_with_spacy(
|
|
|
spans: List[Span] = list(paragraph_spans)
|
|
|
|
|
|
for sent in doc.sents:
|
|
|
- while paragraph_idx < len(paragraph_ranges) and paragraph_ranges[paragraph_idx][1] <= sent.start_char:
|
|
|
+ while paragraph_idx < len(ranges) and ranges[paragraph_idx][1] <= sent.start_char:
|
|
|
paragraph_idx += 1
|
|
|
- current_idx = min(paragraph_idx, len(paragraph_ranges) - 1)
|
|
|
+ current_idx = min(paragraph_idx, len(ranges) - 1)
|
|
|
paragraph_counters[current_idx] += 1
|
|
|
sentence_label = _circled_number(paragraph_counters[current_idx])
|
|
|
|
|
|
@@ -1032,8 +1074,20 @@ async def proxy(url: Optional[str] = None, show_images: bool = False):
|
|
|
if not url:
|
|
|
return HTMLResponse(_render_proxy_page(show_images=show_images))
|
|
|
try:
|
|
|
- normalized_url, title, page_text, images, code_blocks, paragraph_meta = await _fetch_remote_plaintext(url)
|
|
|
- highlighted_fragment = highlight_text_with_spacy(page_text, paragraph_meta=paragraph_meta or None)
|
|
|
+ (
|
|
|
+ normalized_url,
|
|
|
+ title,
|
|
|
+ page_text,
|
|
|
+ images,
|
|
|
+ code_blocks,
|
|
|
+ paragraph_meta,
|
|
|
+ paragraph_ranges,
|
|
|
+ ) = await _fetch_remote_plaintext(url)
|
|
|
+ highlighted_fragment = highlight_text_with_spacy(
|
|
|
+ page_text,
|
|
|
+ paragraph_meta=paragraph_meta or None,
|
|
|
+ paragraph_ranges=paragraph_ranges or None,
|
|
|
+ )
|
|
|
if code_blocks:
|
|
|
highlighted_fragment = _inject_proxy_codeblocks(highlighted_fragment, code_blocks)
|
|
|
image_notice = None
|
|
|
@@ -1392,9 +1446,30 @@ function handlePauseResumeToggle() {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+function normalizeTtsLine(rawLine) {
|
|
|
+ if (typeof rawLine !== 'string') {
|
|
|
+ return '';
|
|
|
+ }
|
|
|
+ let trimmed = rawLine.replace(/\\r/g, '').trim();
|
|
|
+ if (!trimmed) {
|
|
|
+ return '';
|
|
|
+ }
|
|
|
+ if (trimmed.startsWith('data:')) {
|
|
|
+ trimmed = trimmed.slice(5).trim();
|
|
|
+ }
|
|
|
+ if (!trimmed || trimmed === '[DONE]') {
|
|
|
+ return '';
|
|
|
+ }
|
|
|
+ return trimmed;
|
|
|
+}
|
|
|
+
|
|
|
function parseTtsLine(line) {
|
|
|
+ const normalized = normalizeTtsLine(line);
|
|
|
+ if (!normalized) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
try {
|
|
|
- const parsed = JSON.parse(line);
|
|
|
+ const parsed = JSON.parse(normalized);
|
|
|
if (parsed && parsed.audio) {
|
|
|
enqueueAudioChunk(parsed.audio);
|
|
|
return true;
|
|
|
@@ -1408,9 +1483,7 @@ function parseTtsLine(line) {
|
|
|
async function consumeTtsResponse(response) {
|
|
|
let chunkCount = 0;
|
|
|
const handleLine = rawLine => {
|
|
|
- const trimmed = rawLine.replace(/\\r/g, '').trim();
|
|
|
- if (!trimmed) return;
|
|
|
- if (parseTtsLine(trimmed)) {
|
|
|
+ if (parseTtsLine(rawLine)) {
|
|
|
chunkCount += 1;
|
|
|
}
|
|
|
};
|
|
|
@@ -1768,9 +1841,30 @@ $source_text_script
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ function normalizeProxyTtsLine(rawLine) {
|
|
|
+ if (typeof rawLine !== 'string') {
|
|
|
+ return '';
|
|
|
+ }
|
|
|
+ var trimmed = rawLine.replace(/\\r/g, '').trim();
|
|
|
+ if (!trimmed) {
|
|
|
+ return '';
|
|
|
+ }
|
|
|
+ if (trimmed.indexOf('data:') === 0) {
|
|
|
+ trimmed = trimmed.slice(5).trim();
|
|
|
+ }
|
|
|
+ if (!trimmed || trimmed === '[DONE]') {
|
|
|
+ return '';
|
|
|
+ }
|
|
|
+ return trimmed;
|
|
|
+ }
|
|
|
+
|
|
|
function parseTtsLine(line) {
|
|
|
+ var normalized = normalizeProxyTtsLine(line);
|
|
|
+ if (!normalized) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
try {
|
|
|
- var parsed = JSON.parse(line);
|
|
|
+ var parsed = JSON.parse(normalized);
|
|
|
if (parsed && parsed.audio) {
|
|
|
enqueueAudioChunk(parsed.audio);
|
|
|
return true;
|
|
|
@@ -1784,9 +1878,7 @@ $source_text_script
|
|
|
async function consumeTtsResponse(response) {
|
|
|
var chunkCount = 0;
|
|
|
var handleLine = function(rawLine) {
|
|
|
- var trimmed = rawLine.replace(/\\r/g, '').trim();
|
|
|
- if (!trimmed) return;
|
|
|
- if (parseTtsLine(trimmed)) {
|
|
|
+ if (parseTtsLine(rawLine)) {
|
|
|
chunkCount += 1;
|
|
|
}
|
|
|
};
|
|
|
@@ -2476,7 +2568,9 @@ class SimpleHTMLStripper(HTMLParser):
|
|
|
blocks = self._selected_blocks()
|
|
|
if not blocks:
|
|
|
return ""
|
|
|
- return "\n\n".join(block["text"] for block in blocks)
|
|
|
+ # Keep natural paragraphs contiguous with a single newline instead of
|
|
|
+ # injecting blank lines that did not exist in the source.
|
|
|
+ return "\n".join(block["text"] for block in blocks)
|
|
|
|
|
|
def _selected_blocks(self) -> List[Dict[str, Any]]:
|
|
|
if not self._blocks:
|
|
|
@@ -2571,6 +2665,24 @@ def _build_paragraph_metadata(blocks: List[Dict[str, Any]]) -> List[Dict[str, st
|
|
|
return paragraph_meta
|
|
|
|
|
|
|
|
|
+def _build_paragraph_ranges(blocks: List[Dict[str, Any]]) -> List[Tuple[int, int]]:
|
|
|
+ """Map each stripped block to its char span within the joined plain text."""
|
|
|
+ if not blocks:
|
|
|
+ return []
|
|
|
+ ranges: List[Tuple[int, int]] = []
|
|
|
+ cursor = 0
|
|
|
+ for idx, block in enumerate(blocks):
|
|
|
+ text = block.get("text") or ""
|
|
|
+ start = cursor
|
|
|
+ end = start + len(text)
|
|
|
+ ranges.append((start, end))
|
|
|
+ cursor = end
|
|
|
+ # Plain text joins blocks with a single newline; skip trailing newline.
|
|
|
+ if idx < len(blocks) - 1:
|
|
|
+ cursor += 1
|
|
|
+ return ranges
|
|
|
+
|
|
|
+
|
|
|
def _decode_html_bytes(raw_content: bytes, encoding_hint: Optional[str]) -> str:
|
|
|
encoding_candidates: List[str] = []
|
|
|
if encoding_hint:
|
|
|
@@ -2643,7 +2755,15 @@ async def _download_html_with_fallback(url: str) -> str:
|
|
|
|
|
|
async def _fetch_remote_plaintext(
|
|
|
url: str,
|
|
|
-) -> Tuple[str, str, str, List[Dict[str, str]], List[Dict[str, str]], List[Dict[str, str]]]:
|
|
|
+) -> Tuple[
|
|
|
+ str,
|
|
|
+ str,
|
|
|
+ str,
|
|
|
+ List[Dict[str, str]],
|
|
|
+ List[Dict[str, str]],
|
|
|
+ List[Dict[str, str]],
|
|
|
+ List[Tuple[int, int]],
|
|
|
+]:
|
|
|
normalized = _normalize_target_url(url)
|
|
|
html_body = await _download_html_with_fallback(normalized)
|
|
|
stripper = SimpleHTMLStripper()
|
|
|
@@ -2653,6 +2773,7 @@ async def _fetch_remote_plaintext(
|
|
|
code_blocks = stripper.get_code_blocks()
|
|
|
plain_text = stripper.get_text()
|
|
|
block_info = stripper.get_blocks()
|
|
|
+ paragraph_ranges = _build_paragraph_ranges(block_info)
|
|
|
if not plain_text:
|
|
|
plain_text = _fallback_html_to_text(html_body)
|
|
|
if not plain_text:
|
|
|
@@ -2662,8 +2783,9 @@ async def _fetch_remote_plaintext(
|
|
|
images = []
|
|
|
code_blocks = []
|
|
|
block_info = []
|
|
|
+ paragraph_ranges = []
|
|
|
paragraph_meta = _build_paragraph_metadata(block_info)
|
|
|
- return normalized, title, plain_text, images, code_blocks, paragraph_meta
|
|
|
+ return normalized, title, plain_text, images, code_blocks, paragraph_meta, paragraph_ranges
|
|
|
|
|
|
|
|
|
def _render_proxy_page(
|