|
|
@@ -4,16 +4,16 @@
|
|
|
import asyncio
|
|
|
import html
|
|
|
import json
|
|
|
+import os
|
|
|
import re
|
|
|
from collections import Counter
|
|
|
from dataclasses import dataclass, field
|
|
|
from html.parser import HTMLParser
|
|
|
from string import Template
|
|
|
-from typing import Any, Dict, List, Optional, Tuple
|
|
|
+from typing import Any, Dict, List, Optional, Set, Tuple
|
|
|
from urllib import error as urllib_error, request as urllib_request
|
|
|
from urllib.parse import urlparse, urlunparse
|
|
|
|
|
|
-import benepar
|
|
|
import httpx
|
|
|
import spacy
|
|
|
from fastapi import FastAPI, HTTPException
|
|
|
@@ -47,7 +47,7 @@ def _load_spacy_pipeline(
|
|
|
except OSError:
|
|
|
try:
|
|
|
spacy_download(model_name)
|
|
|
- nlp = spacy.load(model_name)
|
|
|
+ nlp = spacy.load(model_name, disable=["tagger", "lemmatizer"])
|
|
|
except Exception as exc: # pragma: no cover - install helper
|
|
|
raise RuntimeError(
|
|
|
f"spaCy model '{model_name}' is required. Install via `python -m spacy download {model_name}`."
|
|
|
@@ -57,17 +57,34 @@ def _load_spacy_pipeline(
|
|
|
pipe_names = set(nlp.pipe_names)
|
|
|
if not ({"parser", "senter", "sentencizer"} & pipe_names):
|
|
|
try:
|
|
|
- nlp.add_pipe("sentencizer")
|
|
|
+            nlp.add_pipe("sentencizer")
|
|
|
except Exception:
|
|
|
pass # if already present or unavailable, ignore
|
|
|
|
|
|
+ enable_benepar = os.getenv("ENABLE_BENEPAR", "0").strip().lower() in {
|
|
|
+ "1",
|
|
|
+ "true",
|
|
|
+ "yes",
|
|
|
+ "on",
|
|
|
+ }
|
|
|
+
|
|
|
+ if not enable_benepar:
|
|
|
+ BENE_PAR_WARNING = (
|
|
|
+ "Benepar is disabled by ENABLE_BENEPAR. Using dependency-based spans."
|
|
|
+ )
|
|
|
+ return nlp
|
|
|
+
|
|
|
# Try to add benepar
|
|
|
if "benepar" not in nlp.pipe_names:
|
|
|
try:
|
|
|
+ import benepar
|
|
|
+
|
|
|
nlp.add_pipe("benepar", config={"model": benepar_model}, last=True)
|
|
|
HAS_BENEPAR = True
|
|
|
except ValueError:
|
|
|
try:
|
|
|
+ import benepar
|
|
|
+
|
|
|
benepar.download(benepar_model)
|
|
|
nlp.add_pipe("benepar", config={"model": benepar_model}, last=True)
|
|
|
HAS_BENEPAR = True
|
|
|
@@ -179,6 +196,16 @@ SUBORDINATORS_TO_FUNCTION = {
|
|
|
}
|
|
|
FINITE_VERB_TAGS = {"VBD", "VBP", "VBZ"}
|
|
|
NONFINITE_VERB_TAGS = {"VBG", "VBN"}
|
|
|
+CLAUSE_PREDICATE_DEPS = {
|
|
|
+ "advcl",
|
|
|
+ "ccomp",
|
|
|
+ "xcomp",
|
|
|
+ "acl",
|
|
|
+ "relcl",
|
|
|
+ "csubj",
|
|
|
+ "csubjpass",
|
|
|
+ "parataxis",
|
|
|
+}
|
|
|
FIXED_MULTIWORD_PHRASES: Tuple[Tuple[re.Pattern, str], ...] = tuple(
|
|
|
(
|
|
|
re.compile(pattern, re.IGNORECASE),
|
|
|
@@ -370,6 +397,35 @@ def add_span(spans: List[Span], start_token: int, end_token: int, cls: str, attr
|
|
|
spans.append(Span(start_token=start_token, end_token=end_token, cls=cls, attrs=attrs))
|
|
|
|
|
|
|
|
|
+def _prune_adverbial_spans(spans: List[Span], sentence_token_bounds: Tuple[int, int]) -> None:
|
|
|
+    """Drop redundant/oversized adverbial spans that would underline entire sentences."""
|
|
|
+ sent_start, sent_end = sentence_token_bounds
|
|
|
+ if sent_start < 0 or sent_end <= sent_start:
|
|
|
+ return
|
|
|
+ sent_length = sent_end - sent_start
|
|
|
+ filtered: List[Span] = []
|
|
|
+ seen_ranges: Set[Tuple[int, int]] = set()
|
|
|
+ for span in spans:
|
|
|
+ classes = span.cls.split()
|
|
|
+ if "role-adverbial" not in classes:
|
|
|
+ filtered.append(span)
|
|
|
+ continue
|
|
|
+ span_length = span.end_token - span.start_token
|
|
|
+ # Skip single-token adverbs and spans that swallow the whole sentence.
|
|
|
+ if span_length <= 1:
|
|
|
+ continue
|
|
|
+ coverage_start = max(span.start_token, sent_start)
|
|
|
+ coverage_end = min(span.end_token, sent_end)
|
|
|
+ if coverage_end - coverage_start >= sent_length:
|
|
|
+ continue
|
|
|
+ range_key = (coverage_start, coverage_end)
|
|
|
+ if range_key in seen_ranges:
|
|
|
+ continue
|
|
|
+ seen_ranges.add(range_key)
|
|
|
+ filtered.append(span)
|
|
|
+ spans[:] = filtered
|
|
|
+
|
|
|
+
|
|
|
def subtree_char_span(token: SpacyToken) -> Tuple[int, int]:
|
|
|
subtree = list(token.subtree)
|
|
|
if not subtree:
|
|
|
@@ -562,7 +618,7 @@ def _is_finite_predicate_head(token: SpacyToken) -> bool:
|
|
|
return True
|
|
|
verb_forms = set(token.morph.get("VerbForm"))
|
|
|
if "Inf" in verb_forms:
|
|
|
- return False
|
|
|
+ return _has_finite_auxiliary(token)
|
|
|
if verb_forms & {"Part", "Ger"}:
|
|
|
return _has_finite_auxiliary(token)
|
|
|
if token.tag_ in NONFINITE_VERB_TAGS:
|
|
|
@@ -602,6 +658,24 @@ def _predicate_heads(sentence: SpacySpan) -> List[SpacyToken]:
|
|
|
return ordered
|
|
|
|
|
|
|
|
|
+def _is_clause_predicate(token: SpacyToken) -> bool:
|
|
|
+    """Return True if the predicate originates inside a subordinate clause."""
|
|
|
+ if token.dep_ in CLAUSE_PREDICATE_DEPS:
|
|
|
+ return True
|
|
|
+ if token.dep_ != "conj":
|
|
|
+ return False
|
|
|
+ ancestor = token.head
|
|
|
+ safety = 0
|
|
|
+ while ancestor is not None and safety < 10:
|
|
|
+ if ancestor.dep_ in CLAUSE_PREDICATE_DEPS:
|
|
|
+ return True
|
|
|
+ if ancestor.dep_ != "conj" or ancestor.head is ancestor:
|
|
|
+ break
|
|
|
+ ancestor = ancestor.head
|
|
|
+ safety += 1
|
|
|
+ return False
|
|
|
+
|
|
|
+
|
|
|
def _add_fixed_phrases(
|
|
|
sentence: SpacySpan,
|
|
|
mapping: Dict[int, int],
|
|
|
@@ -652,7 +726,10 @@ def annotate_sentence(
|
|
|
|
|
|
for head in _predicate_heads(sentence):
|
|
|
start_char, end_char = _predicate_span_bounds(head)
|
|
|
- add_char_based_span(spans, start_char, end_char, "role-predicate", mapping)
|
|
|
+ cls = "role-predicate"
|
|
|
+ if _is_clause_predicate(head):
|
|
|
+ cls = "role-predicate role-predicate-clause"
|
|
|
+ add_char_based_span(spans, start_char, end_char, cls, mapping)
|
|
|
predicate_text = sentence.doc.text[start_char:end_char].strip()
|
|
|
if summary is not None:
|
|
|
summary.predicates.append(predicate_text or head.text)
|
|
|
@@ -693,12 +770,14 @@ def annotate_sentence(
|
|
|
if tok.dep_ in {"amod", "poss", "compound", "nummod"}:
|
|
|
add_token(tok, "role-modifier")
|
|
|
|
|
|
- adverbial_ranges = set()
|
|
|
- for tok in sentence:
|
|
|
- if tok.dep_ in ADVERBIAL_DEPS:
|
|
|
- adverbial_ranges.add(subtree_char_span(tok))
|
|
|
- for start_char, end_char in adverbial_ranges:
|
|
|
- add_char_based_span(spans, start_char, end_char, "role-adverbial", mapping)
|
|
|
+ # Dependency-based adverbial spans are a fallback when constituency data is unavailable.
|
|
|
+ if not HAS_BENEPAR or BENE_PAR_WARNING:
|
|
|
+ adverbial_ranges = set()
|
|
|
+ for tok in sentence:
|
|
|
+ if tok.dep_ in ADVERBIAL_DEPS:
|
|
|
+ adverbial_ranges.add(subtree_char_span(tok))
|
|
|
+ for start_char, end_char in adverbial_ranges:
|
|
|
+ add_char_based_span(spans, start_char, end_char, "role-adverbial", mapping)
|
|
|
|
|
|
for tok in sentence:
|
|
|
if tok.dep_ == "appos":
|
|
|
@@ -738,6 +817,7 @@ def annotate_sentence(
|
|
|
summary,
|
|
|
)
|
|
|
_add_fixed_phrases(sentence, mapping, spans, summary)
|
|
|
+ _prune_adverbial_spans(spans, sent_bounds)
|
|
|
|
|
|
return spans, summary
|
|
|
|
|
|
@@ -906,6 +986,7 @@ def highlight_text_with_spacy(
|
|
|
text: str,
|
|
|
paragraph_meta: Optional[List[Dict[str, str]]] = None,
|
|
|
include_helper: bool = False,
|
|
|
+ paragraph_ranges: Optional[List[Tuple[int, int]]] = None,
|
|
|
) -> str:
|
|
|
if NLP is None:
|
|
|
raise RuntimeError(f"spaCy pipeline unavailable: {NLP_LOAD_ERROR}")
|
|
|
@@ -923,12 +1004,22 @@ def highlight_text_with_spacy(
|
|
|
)
|
|
|
doc = _run_pipeline_without_benepar(text)
|
|
|
|
|
|
- paragraph_ranges = _split_paragraph_ranges(text)
|
|
|
- paragraph_counters = [0 for _ in paragraph_ranges]
|
|
|
+ ranges = None
|
|
|
+ if paragraph_ranges:
|
|
|
+ valid = True
|
|
|
+ for start, end in paragraph_ranges:
|
|
|
+ if start < 0 or end < start or end > len(text):
|
|
|
+ valid = False
|
|
|
+ break
|
|
|
+ if valid:
|
|
|
+ ranges = list(paragraph_ranges)
|
|
|
+ if not ranges:
|
|
|
+ ranges = _split_paragraph_ranges(text)
|
|
|
+ paragraph_counters = [0 for _ in ranges]
|
|
|
paragraph_idx = 0
|
|
|
paragraph_spans: List[Span] = []
|
|
|
- paragraph_attrs = paragraph_meta if paragraph_meta and len(paragraph_meta) == len(paragraph_ranges) else None
|
|
|
- for idx, (start, end) in enumerate(paragraph_ranges):
|
|
|
+ paragraph_attrs = paragraph_meta if paragraph_meta and len(paragraph_meta) == len(ranges) else None
|
|
|
+ for idx, (start, end) in enumerate(ranges):
|
|
|
attrs = None
|
|
|
if paragraph_attrs:
|
|
|
attrs = paragraph_attrs[idx] or None
|
|
|
@@ -937,9 +1028,9 @@ def highlight_text_with_spacy(
|
|
|
spans: List[Span] = list(paragraph_spans)
|
|
|
|
|
|
for sent in doc.sents:
|
|
|
- while paragraph_idx < len(paragraph_ranges) and paragraph_ranges[paragraph_idx][1] <= sent.start_char:
|
|
|
+ while paragraph_idx < len(ranges) and ranges[paragraph_idx][1] <= sent.start_char:
|
|
|
paragraph_idx += 1
|
|
|
- current_idx = min(paragraph_idx, len(paragraph_ranges) - 1)
|
|
|
+ current_idx = min(paragraph_idx, len(ranges) - 1)
|
|
|
paragraph_counters[current_idx] += 1
|
|
|
sentence_label = _circled_number(paragraph_counters[current_idx])
|
|
|
|
|
|
@@ -1032,8 +1123,20 @@ async def proxy(url: Optional[str] = None, show_images: bool = False):
|
|
|
if not url:
|
|
|
return HTMLResponse(_render_proxy_page(show_images=show_images))
|
|
|
try:
|
|
|
- normalized_url, title, page_text, images, code_blocks, paragraph_meta = await _fetch_remote_plaintext(url)
|
|
|
- highlighted_fragment = highlight_text_with_spacy(page_text, paragraph_meta=paragraph_meta or None)
|
|
|
+ (
|
|
|
+ normalized_url,
|
|
|
+ title,
|
|
|
+ page_text,
|
|
|
+ images,
|
|
|
+ code_blocks,
|
|
|
+ paragraph_meta,
|
|
|
+ paragraph_ranges,
|
|
|
+ ) = await _fetch_remote_plaintext(url)
|
|
|
+ highlighted_fragment = highlight_text_with_spacy(
|
|
|
+ page_text,
|
|
|
+ paragraph_meta=paragraph_meta or None,
|
|
|
+ paragraph_ranges=paragraph_ranges or None,
|
|
|
+ )
|
|
|
if code_blocks:
|
|
|
highlighted_fragment = _inject_proxy_codeblocks(highlighted_fragment, code_blocks)
|
|
|
image_notice = None
|
|
|
@@ -1392,9 +1495,30 @@ function handlePauseResumeToggle() {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+function normalizeTtsLine(rawLine) {
|
|
|
+ if (typeof rawLine !== 'string') {
|
|
|
+ return '';
|
|
|
+ }
|
|
|
+ let trimmed = rawLine.replace(/\\r/g, '').trim();
|
|
|
+ if (!trimmed) {
|
|
|
+ return '';
|
|
|
+ }
|
|
|
+ if (trimmed.startsWith('data:')) {
|
|
|
+ trimmed = trimmed.slice(5).trim();
|
|
|
+ }
|
|
|
+ if (!trimmed || trimmed === '[DONE]') {
|
|
|
+ return '';
|
|
|
+ }
|
|
|
+ return trimmed;
|
|
|
+}
|
|
|
+
|
|
|
function parseTtsLine(line) {
|
|
|
+ const normalized = normalizeTtsLine(line);
|
|
|
+ if (!normalized) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
try {
|
|
|
- const parsed = JSON.parse(line);
|
|
|
+ const parsed = JSON.parse(normalized);
|
|
|
if (parsed && parsed.audio) {
|
|
|
enqueueAudioChunk(parsed.audio);
|
|
|
return true;
|
|
|
@@ -1408,9 +1532,7 @@ function parseTtsLine(line) {
|
|
|
async function consumeTtsResponse(response) {
|
|
|
let chunkCount = 0;
|
|
|
const handleLine = rawLine => {
|
|
|
- const trimmed = rawLine.replace(/\\r/g, '').trim();
|
|
|
- if (!trimmed) return;
|
|
|
- if (parseTtsLine(trimmed)) {
|
|
|
+ if (parseTtsLine(rawLine)) {
|
|
|
chunkCount += 1;
|
|
|
}
|
|
|
};
|
|
|
@@ -1768,9 +1890,30 @@ $source_text_script
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ function normalizeProxyTtsLine(rawLine) {
|
|
|
+ if (typeof rawLine !== 'string') {
|
|
|
+ return '';
|
|
|
+ }
|
|
|
+ var trimmed = rawLine.replace(/\\r/g, '').trim();
|
|
|
+ if (!trimmed) {
|
|
|
+ return '';
|
|
|
+ }
|
|
|
+ if (trimmed.indexOf('data:') === 0) {
|
|
|
+ trimmed = trimmed.slice(5).trim();
|
|
|
+ }
|
|
|
+ if (!trimmed || trimmed === '[DONE]') {
|
|
|
+ return '';
|
|
|
+ }
|
|
|
+ return trimmed;
|
|
|
+ }
|
|
|
+
|
|
|
function parseTtsLine(line) {
|
|
|
+ var normalized = normalizeProxyTtsLine(line);
|
|
|
+ if (!normalized) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
try {
|
|
|
- var parsed = JSON.parse(line);
|
|
|
+ var parsed = JSON.parse(normalized);
|
|
|
if (parsed && parsed.audio) {
|
|
|
enqueueAudioChunk(parsed.audio);
|
|
|
return true;
|
|
|
@@ -1784,9 +1927,7 @@ $source_text_script
|
|
|
async function consumeTtsResponse(response) {
|
|
|
var chunkCount = 0;
|
|
|
var handleLine = function(rawLine) {
|
|
|
- var trimmed = rawLine.replace(/\\r/g, '').trim();
|
|
|
- if (!trimmed) return;
|
|
|
- if (parseTtsLine(trimmed)) {
|
|
|
+ if (parseTtsLine(rawLine)) {
|
|
|
chunkCount += 1;
|
|
|
}
|
|
|
};
|
|
|
@@ -2476,7 +2617,9 @@ class SimpleHTMLStripper(HTMLParser):
|
|
|
blocks = self._selected_blocks()
|
|
|
if not blocks:
|
|
|
return ""
|
|
|
- return "\n\n".join(block["text"] for block in blocks)
|
|
|
+ # Keep natural paragraphs contiguous with a single newline instead of
|
|
|
+ # injecting blank lines that did not exist in the source.
|
|
|
+ return "\n".join(block["text"] for block in blocks)
|
|
|
|
|
|
def _selected_blocks(self) -> List[Dict[str, Any]]:
|
|
|
if not self._blocks:
|
|
|
@@ -2571,6 +2714,24 @@ def _build_paragraph_metadata(blocks: List[Dict[str, Any]]) -> List[Dict[str, st
|
|
|
return paragraph_meta
|
|
|
|
|
|
|
|
|
+def _build_paragraph_ranges(blocks: List[Dict[str, Any]]) -> List[Tuple[int, int]]:
|
|
|
+ """Map each stripped block to its char span within the joined plain text."""
|
|
|
+ if not blocks:
|
|
|
+ return []
|
|
|
+ ranges: List[Tuple[int, int]] = []
|
|
|
+ cursor = 0
|
|
|
+ for idx, block in enumerate(blocks):
|
|
|
+ text = block.get("text") or ""
|
|
|
+ start = cursor
|
|
|
+ end = start + len(text)
|
|
|
+ ranges.append((start, end))
|
|
|
+ cursor = end
|
|
|
+ # Plain text joins blocks with a single newline; skip trailing newline.
|
|
|
+ if idx < len(blocks) - 1:
|
|
|
+ cursor += 1
|
|
|
+ return ranges
|
|
|
+
|
|
|
+
|
|
|
def _decode_html_bytes(raw_content: bytes, encoding_hint: Optional[str]) -> str:
|
|
|
encoding_candidates: List[str] = []
|
|
|
if encoding_hint:
|
|
|
@@ -2643,7 +2804,15 @@ async def _download_html_with_fallback(url: str) -> str:
|
|
|
|
|
|
async def _fetch_remote_plaintext(
|
|
|
url: str,
|
|
|
-) -> Tuple[str, str, str, List[Dict[str, str]], List[Dict[str, str]], List[Dict[str, str]]]:
|
|
|
+) -> Tuple[
|
|
|
+ str,
|
|
|
+ str,
|
|
|
+ str,
|
|
|
+ List[Dict[str, str]],
|
|
|
+ List[Dict[str, str]],
|
|
|
+ List[Dict[str, str]],
|
|
|
+ List[Tuple[int, int]],
|
|
|
+]:
|
|
|
normalized = _normalize_target_url(url)
|
|
|
html_body = await _download_html_with_fallback(normalized)
|
|
|
stripper = SimpleHTMLStripper()
|
|
|
@@ -2653,6 +2822,7 @@ async def _fetch_remote_plaintext(
|
|
|
code_blocks = stripper.get_code_blocks()
|
|
|
plain_text = stripper.get_text()
|
|
|
block_info = stripper.get_blocks()
|
|
|
+ paragraph_ranges = _build_paragraph_ranges(block_info)
|
|
|
if not plain_text:
|
|
|
plain_text = _fallback_html_to_text(html_body)
|
|
|
if not plain_text:
|
|
|
@@ -2662,8 +2832,9 @@ async def _fetch_remote_plaintext(
|
|
|
images = []
|
|
|
code_blocks = []
|
|
|
block_info = []
|
|
|
+ paragraph_ranges = []
|
|
|
paragraph_meta = _build_paragraph_metadata(block_info)
|
|
|
- return normalized, title, plain_text, images, code_blocks, paragraph_meta
|
|
|
+ return normalized, title, plain_text, images, code_blocks, paragraph_meta, paragraph_ranges
|
|
|
|
|
|
|
|
|
def _render_proxy_page(
|