Gogs il y a 1 mois
Parent
commit
045cc2bd36
3 fichiers modifiés avec 191 ajouts et 113 suppressions
  1. 1 2
      README.md
  2. 143 21
      spacyback/mainspacy.py
  3. 47 90
      spacyback/style_config.py

+ 1 - 2
README.md

@@ -4,8 +4,7 @@
 
 ## 功能特性
 - **句法角色高亮**:识别主语、谓语、宾语、补语、同位语、状语、固定搭配等多种结构,同时对括号说明、绝对结构、固定短语等给予特殊样式。
-- **benepar Constituency 支持**:在成功加载 benepar 时会标注状语从句、名词性从句、非限定结构、关系从句等;若模型缺失自动回退到依存句法并给出警告。
-- **句子分析摘要**:每个句子带中文提示(可在 `style_config.py` 中通过 `SENTENCE_HELPER_ENABLED` 开关控制),说明句型、主谓宾、从句功能、连接词等。
+
 - **网页代理模式**:`/proxy` 路由用 httpx 拉取远程页面,提取正文、列表、代码块与图片占位符,随后复用同一高亮流程,可按需重新注入图片或代码片段。
 - **TTS 朗读**:UI 中的“朗读高亮文本/朗读选中文本”按钮会将文本发送到 `TTS_ENDPOINT`(默认 `http://141.140.15.30:8028/generate`),逐段播放流式音频。
 - **健壮的降级策略**:自动补全 spaCy 句子切分、禁用 benepar 时继续运行;`/health` 接口会暴露 benepar 状态与任何加载警告。

+ 143 - 21
spacyback/mainspacy.py

@@ -179,6 +179,16 @@ SUBORDINATORS_TO_FUNCTION = {
 }
 FINITE_VERB_TAGS = {"VBD", "VBP", "VBZ"}
 NONFINITE_VERB_TAGS = {"VBG", "VBN"}
+CLAUSE_PREDICATE_DEPS = {  # dep_ labels that _is_clause_predicate treats as marking a clause-level predicate
+    "advcl",  # adverbial clause modifier
+    "ccomp",  # clausal complement
+    "xcomp",  # open clausal complement
+    "acl",  # clausal modifier of a noun
+    "relcl",  # relative clause modifier
+    "csubj",  # clausal subject
+    "csubjpass",  # clausal subject (passive)
+    "parataxis",  # parataxis
+}
 FIXED_MULTIWORD_PHRASES: Tuple[Tuple[re.Pattern, str], ...] = tuple(
     (
         re.compile(pattern, re.IGNORECASE),
@@ -562,7 +572,7 @@ def _is_finite_predicate_head(token: SpacyToken) -> bool:
         return True
     verb_forms = set(token.morph.get("VerbForm"))
     if "Inf" in verb_forms:
-        return False
+        return _has_finite_auxiliary(token)
     if verb_forms & {"Part", "Ger"}:
         return _has_finite_auxiliary(token)
     if token.tag_ in NONFINITE_VERB_TAGS:
@@ -602,6 +612,24 @@ def _predicate_heads(sentence: SpacySpan) -> List[SpacyToken]:
     return ordered
 
 
+def _is_clause_predicate(token: SpacyToken) -> bool:
+    """Return True if the predicate belongs to a clause: its dep_ is in CLAUSE_PREDICATE_DEPS, or it is a `conj` whose head chain reaches such a token."""
+    if token.dep_ in CLAUSE_PREDICATE_DEPS:
+        return True
+    if token.dep_ != "conj":
+        return False
+    ancestor = token.head  # conjunct: decide from the token(s) it is coordinated with
+    safety = 0  # hop counter; caps the upward walk at 10 heads
+    while ancestor is not None and safety < 10:
+        if ancestor.dep_ in CLAUSE_PREDICATE_DEPS:
+            return True
+        if ancestor.dep_ != "conj" or ancestor.head is ancestor:  # NOTE(review): identity test for a self-headed token - confirm spaCy returns the same object here; comparing `.i` may be safer
+            break
+        ancestor = ancestor.head
+        safety += 1
+    return False
+
+
 def _add_fixed_phrases(
     sentence: SpacySpan,
     mapping: Dict[int, int],
@@ -652,7 +680,10 @@ def annotate_sentence(
 
     for head in _predicate_heads(sentence):
         start_char, end_char = _predicate_span_bounds(head)
-        add_char_based_span(spans, start_char, end_char, "role-predicate", mapping)
+        cls = "role-predicate"
+        if _is_clause_predicate(head):
+            cls = "role-predicate role-predicate-clause"
+        add_char_based_span(spans, start_char, end_char, cls, mapping)
         predicate_text = sentence.doc.text[start_char:end_char].strip()
         if summary is not None:
             summary.predicates.append(predicate_text or head.text)
@@ -906,6 +937,7 @@ def highlight_text_with_spacy(
     text: str,
     paragraph_meta: Optional[List[Dict[str, str]]] = None,
     include_helper: bool = False,
+    paragraph_ranges: Optional[List[Tuple[int, int]]] = None,
 ) -> str:
     if NLP is None:
         raise RuntimeError(f"spaCy pipeline unavailable: {NLP_LOAD_ERROR}")
@@ -923,12 +955,22 @@ def highlight_text_with_spacy(
         )
         doc = _run_pipeline_without_benepar(text)
 
-    paragraph_ranges = _split_paragraph_ranges(text)
-    paragraph_counters = [0 for _ in paragraph_ranges]
+    ranges = None
+    if paragraph_ranges:
+        valid = True
+        for start, end in paragraph_ranges:
+            if start < 0 or end < start or end > len(text):
+                valid = False
+                break
+        if valid:
+            ranges = list(paragraph_ranges)
+    if not ranges:
+        ranges = _split_paragraph_ranges(text)
+    paragraph_counters = [0 for _ in ranges]
     paragraph_idx = 0
     paragraph_spans: List[Span] = []
-    paragraph_attrs = paragraph_meta if paragraph_meta and len(paragraph_meta) == len(paragraph_ranges) else None
-    for idx, (start, end) in enumerate(paragraph_ranges):
+    paragraph_attrs = paragraph_meta if paragraph_meta and len(paragraph_meta) == len(ranges) else None
+    for idx, (start, end) in enumerate(ranges):
         attrs = None
         if paragraph_attrs:
             attrs = paragraph_attrs[idx] or None
@@ -937,9 +979,9 @@ def highlight_text_with_spacy(
     spans: List[Span] = list(paragraph_spans)
 
     for sent in doc.sents:
-        while paragraph_idx < len(paragraph_ranges) and paragraph_ranges[paragraph_idx][1] <= sent.start_char:
+        while paragraph_idx < len(ranges) and ranges[paragraph_idx][1] <= sent.start_char:
             paragraph_idx += 1
-        current_idx = min(paragraph_idx, len(paragraph_ranges) - 1)
+        current_idx = min(paragraph_idx, len(ranges) - 1)
         paragraph_counters[current_idx] += 1
         sentence_label = _circled_number(paragraph_counters[current_idx])
 
@@ -1032,8 +1074,20 @@ async def proxy(url: Optional[str] = None, show_images: bool = False):
     if not url:
         return HTMLResponse(_render_proxy_page(show_images=show_images))
     try:
-        normalized_url, title, page_text, images, code_blocks, paragraph_meta = await _fetch_remote_plaintext(url)
-        highlighted_fragment = highlight_text_with_spacy(page_text, paragraph_meta=paragraph_meta or None)
+        (
+            normalized_url,
+            title,
+            page_text,
+            images,
+            code_blocks,
+            paragraph_meta,
+            paragraph_ranges,
+        ) = await _fetch_remote_plaintext(url)
+        highlighted_fragment = highlight_text_with_spacy(
+            page_text,
+            paragraph_meta=paragraph_meta or None,
+            paragraph_ranges=paragraph_ranges or None,
+        )
         if code_blocks:
             highlighted_fragment = _inject_proxy_codeblocks(highlighted_fragment, code_blocks)
         image_notice = None
@@ -1392,9 +1446,30 @@ function handlePauseResumeToggle() {
   }
 }
 
+function normalizeTtsLine(rawLine) { // Reduce one raw response line to its payload text; '' means there is nothing to parse.
+  if (typeof rawLine !== 'string') {
+    return '';
+  }
+  let trimmed = rawLine.replace(/\\r/g, '').trim(); // NOTE(review): doubled backslash presumably because this JS is emitted from a Python string - confirm
+  if (!trimmed) {
+    return '';
+  }
+  if (trimmed.startsWith('data:')) { // drop a leading "data:" prefix (5 chars), as used by SSE-style framing
+    trimmed = trimmed.slice(5).trim();
+  }
+  if (!trimmed || trimmed === '[DONE]') { // an empty payload or the literal '[DONE]' marker is not handed to JSON.parse
+    return '';
+  }
+  return trimmed;
+}
+
 function parseTtsLine(line) {
+  const normalized = normalizeTtsLine(line);
+  if (!normalized) {
+    return false;
+  }
   try {
-    const parsed = JSON.parse(line);
+    const parsed = JSON.parse(normalized);
     if (parsed && parsed.audio) {
       enqueueAudioChunk(parsed.audio);
       return true;
@@ -1408,9 +1483,7 @@ function parseTtsLine(line) {
 async function consumeTtsResponse(response) {
   let chunkCount = 0;
   const handleLine = rawLine => {
-    const trimmed = rawLine.replace(/\\r/g, '').trim();
-    if (!trimmed) return;
-    if (parseTtsLine(trimmed)) {
+    if (parseTtsLine(rawLine)) {
       chunkCount += 1;
     }
   };
@@ -1768,9 +1841,30 @@ $source_text_script
     }
   }
 
+  function normalizeProxyTtsLine(rawLine) { // Reduce one raw response line to its payload text; '' means there is nothing to parse.
+    if (typeof rawLine !== 'string') {
+      return '';
+    }
+    var trimmed = rawLine.replace(/\\r/g, '').trim(); // NOTE(review): doubled backslash presumably because this JS is emitted from a Python string - confirm
+    if (!trimmed) {
+      return '';
+    }
+    if (trimmed.indexOf('data:') === 0) { // drop a leading "data:" prefix (5 chars), as used by SSE-style framing
+      trimmed = trimmed.slice(5).trim();
+    }
+    if (!trimmed || trimmed === '[DONE]') { // an empty payload or the literal '[DONE]' marker is not handed to JSON.parse
+      return '';
+    }
+    return trimmed;
+  }
+
   function parseTtsLine(line) {
+    var normalized = normalizeProxyTtsLine(line);
+    if (!normalized) {
+      return false;
+    }
     try {
-      var parsed = JSON.parse(line);
+      var parsed = JSON.parse(normalized);
       if (parsed && parsed.audio) {
         enqueueAudioChunk(parsed.audio);
         return true;
@@ -1784,9 +1878,7 @@ $source_text_script
   async function consumeTtsResponse(response) {
     var chunkCount = 0;
     var handleLine = function(rawLine) {
-      var trimmed = rawLine.replace(/\\r/g, '').trim();
-      if (!trimmed) return;
-      if (parseTtsLine(trimmed)) {
+      if (parseTtsLine(rawLine)) {
         chunkCount += 1;
       }
     };
@@ -2476,7 +2568,9 @@ class SimpleHTMLStripper(HTMLParser):
         blocks = self._selected_blocks()
         if not blocks:
             return ""
-        return "\n\n".join(block["text"] for block in blocks)
+        # Keep natural paragraphs contiguous with a single newline instead of
+        # injecting blank lines that did not exist in the source.
+        return "\n".join(block["text"] for block in blocks)
 
     def _selected_blocks(self) -> List[Dict[str, Any]]:
         if not self._blocks:
@@ -2571,6 +2665,24 @@ def _build_paragraph_metadata(blocks: List[Dict[str, Any]]) -> List[Dict[str, st
     return paragraph_meta
 
 
+def _build_paragraph_ranges(blocks: List[Dict[str, Any]]) -> List[Tuple[int, int]]:
+    """Map each stripped block to its (start, end) char span within the newline-joined plain text; `end` is exclusive."""
+    if not blocks:
+        return []
+    ranges: List[Tuple[int, int]] = []
+    cursor = 0  # offset in the joined text where the next block begins
+    for idx, block in enumerate(blocks):
+        text = block.get("text") or ""  # a missing or None "text" counts as an empty block
+        start = cursor
+        end = start + len(text)
+        ranges.append((start, end))
+        cursor = end
+        # Blocks are joined with a single newline: step over it after every block but the last.
+        if idx < len(blocks) - 1:
+            cursor += 1
+    return ranges  # NOTE(review): offsets only line up if `blocks` is exactly what the joined text was built from - confirm at the call site
+
+
 def _decode_html_bytes(raw_content: bytes, encoding_hint: Optional[str]) -> str:
     encoding_candidates: List[str] = []
     if encoding_hint:
@@ -2643,7 +2755,15 @@ async def _download_html_with_fallback(url: str) -> str:
 
 async def _fetch_remote_plaintext(
     url: str,
-) -> Tuple[str, str, str, List[Dict[str, str]], List[Dict[str, str]], List[Dict[str, str]]]:
+) -> Tuple[
+    str,
+    str,
+    str,
+    List[Dict[str, str]],
+    List[Dict[str, str]],
+    List[Dict[str, str]],
+    List[Tuple[int, int]],
+]:
     normalized = _normalize_target_url(url)
     html_body = await _download_html_with_fallback(normalized)
     stripper = SimpleHTMLStripper()
@@ -2653,6 +2773,7 @@ async def _fetch_remote_plaintext(
     code_blocks = stripper.get_code_blocks()
     plain_text = stripper.get_text()
     block_info = stripper.get_blocks()
+    paragraph_ranges = _build_paragraph_ranges(block_info)
     if not plain_text:
         plain_text = _fallback_html_to_text(html_body)
         if not plain_text:
@@ -2662,8 +2783,9 @@ async def _fetch_remote_plaintext(
         images = []
         code_blocks = []
         block_info = []
+        paragraph_ranges = []
     paragraph_meta = _build_paragraph_metadata(block_info)
-    return normalized, title, plain_text, images, code_blocks, paragraph_meta
+    return normalized, title, plain_text, images, code_blocks, paragraph_meta, paragraph_ranges
 
 
 def _render_proxy_page(

+ 47 - 90
spacyback/style_config.py

@@ -41,10 +41,7 @@ STYLE_RULES: List[StyleRule] = [
         selector=".sentence-scope",
         target="句子外层容器",
         description="包裹整句,方便显示序号与复杂度指示。",
-        css=(
-            "position:relative;display:inline;padding:0;margin:0;"
-            "box-decoration-break:clone"
-        ),
+        css=("position:relative;display:inline;padding:0;margin:0;box-decoration-break:clone"),
     ),
     StyleRule(
         selector=".sentence-scope::before",
@@ -114,24 +111,7 @@ STYLE_RULES: List[StyleRule] = [
         description="使用原编号恢复有序列表样式。",
         css="content:attr(data-list-index) '. ';",
     ),
-    # StyleRule(
-    #     selector=".sentence-scope[data-complex='1']",
-    #     target="复杂句提示",
-    #     description="复杂句底部加淡橙色阴影,以提示结构较复杂。",
-    #     css="box-shadow:inset 0 -0.2rem 0 rgba(250,209,155,.6)",
-    # ),
-    # StyleRule(
-    #     selector=".analysis[data-helper='on'] .sentence-scope::after",
-    #     target="句子辅助说明",
-    #     description="在句后输出中文提示,解释成分与从句情况。",
-    #     css="content:attr(data-note);display:block;font-size:.85rem;color:#64748b;margin:.2rem 0 .45rem 1.5rem;line-height:1.4",
-    # ),
-    # StyleRule(
-    #     selector=".analysis[data-helper='off'] .sentence-scope::after",
-    #     target="关闭辅助说明",
-    #     description="当 helper 关闭时隐藏说明,避免额外占位。",
-    #     css="content:'';display:none",
-    # ),
+
     StyleRule(
         selector=".role-subject",
         target="主语",
@@ -142,58 +122,64 @@ STYLE_RULES: List[StyleRule] = [
         selector=".role-predicate",
         target="谓语动词",
         description="字体加粗,强调谓语中心。",
-        css="color:#000000!important;font-weight:700;",
+        css="color:#000000;font-weight:700;",
+    ),
+    StyleRule(
+        selector=".role-predicate-clause",
+        target="从句谓语动词",
+        description="使用深蓝色加粗以凸显从句谓语。",
+        css="color:#0d1c5e;font-weight:700;font-style:italic;",
     ),
     StyleRule(
         selector=".role-object-do",
         target="直接宾语",
-        description="浅绿底色显示直接宾语。",
+        description="浅绿字体显示直接宾语。",
         # css="background-color:#e5ffcc",
-        css ="border-bottom:2px solid #e5ffcc; color:#2a5700"
+        css ="color:#2a5700"
     ),
     StyleRule(
         selector=".role-object-io",
         target="间接宾语",
-        description="黄绿底色区分间接宾语。",
+        description="黄绿字体区分间接宾语。",
         # css="background-color:#cef0a3",
-        css ="border-bottom:2px solid #120d4a; color:#120d4a"
-    ),
-    StyleRule(
-        selector=".role-complement",
-        target="表语/主补语",
-        description="实线下划线指示补语区域。",
-        css="border-bottom:2px solid #e6a04c",
-    ),
-    StyleRule(
-        selector=".role-object-complement",
-        target="宾补",
-        description="虚线下划线提示补充说明的宾补。",
-        css="border-bottom:2px dashed #e6a04c",
-    ),
-    StyleRule(
-        selector=".role-apposition",
-        target="同位语",
-        description="蓝色立线和缩进强调同位语说明。",
-        css="border-left:2px solid #63a4d4;padding-left:.15rem",
+        css ="color:#0b6779"
     ),
+    # StyleRule(
+    #     selector=".role-complement",
+    #     target="表语/主补语",
+    #     description="实线下划线指示补语区域。",
+    #     css="border-bottom:2px dotted #af6a18",
+    # ),
+    # StyleRule(
+    #     selector=".role-object-complement",
+    #     target="宾补",
+    #     description="虚线下划线提示补充说明的宾补。",
+    #     css="border-bottom:2px dotted #92252c",
+    # ),
+    # StyleRule(
+    #     selector=".role-apposition",
+    #     target="同位语",
+    #     description="蓝色立线和缩进强调同位语说明。",
+    #     css="border-left:2px dotted #63a4d4;padding-left:.15rem",
+    # ),
     StyleRule(
         selector=".role-adverbial",
         target="状语短语",
-        description="黄绿底色突出状语信息。",
+        description="深绿实线突出状语信息。",
         # css="background-color:#f6fef8",
-        css="border-bottom:2px solid #f6fef8",
+        css="border-bottom:1.5px dotted #c8f9d4",
     ),
     StyleRule(
         selector=".verbal-infinitive",
         target="不定式结构",
-        description="虚线下划线提示 to+动词的不定式短语。",
-        css="border-bottom:2px dashed #c084fc;color:#581c87",
+        description="颜色提示 to+动词的不定式短语。",
+        css="color:#200d72",
     ),
     StyleRule(
         selector=".verbal-gerund",
         target="动名词结构",
         description="淡紫底纹提示 V-ing 充当名词的结构。",
-        css="border-bottom:2px dashed #c084fc;color:#581c87",
+        css="color:#3f033d",
     ),
     StyleRule(
         selector=".role-connector",
@@ -207,12 +193,12 @@ STYLE_RULES: List[StyleRule] = [
     #     description="更浅的背景温和提示限定词。",
     #     css="background-color:#f8fafc;color:#475569",
     # ),
-    StyleRule(
-        selector=".role-modifier",
-        target="形容词或并列修饰",
-        description="虚线下划线标出修饰信息,保证主体和修饰对比。",
-        css="border-bottom:1px dotted #93c5fd",
-    ),
+    # StyleRule(
+    #     selector=".role-modifier",
+    #     target="形容词或并列修饰",
+    #     description="虚线下划线标出修饰信息,保证主体和修饰对比。",
+    #     css="border-bottom:1px dotted #93c5fd",
+    # ),
     StyleRule(
         selector=".role-parenthetical",
         target="插入语",
@@ -258,41 +244,12 @@ STYLE_RULES: List[StyleRule] = [
     
     
     # StyleRule(
-    #     selector=".analysis[data-helper='on'] .clause-relative[data-modifies]::before,.analysis[data-helper='on'] .clause-adverbial[data-modifies]::before",
-    #     target="从句修饰箭头",
-    #     description="在辅助开启时显示“→”指向被修饰的成分。",
-    #     css="content:'→'attr(data-modifies)' ';color:#666;font-size:.85em",
-    # ),
-    # StyleRule(
-    #     selector=".analysis[data-helper='on'] .clause-adverbial[data-function]::after",
-    #     target="状语从句功能标签",
-    #     description="在尾部追加方括号说明(时间/原因等)。",
-    #     css="content:' ['attr(data-function)']';color:#1b5e20;font-size:.85em",
-    # ),
-    # StyleRule(
-    #     selector=".analysis[data-helper='on'] .clause-noun[data-clause-role]::after",
-    #     target="名词从句句法角色",
-    #     description="括号提示该名词从句在句中的角色(主语/宾语)。",
-    #     css="content:' ('attr(data-clause-role)')';color:#3f6212;font-size:.78em",
-    # ),
-    StyleRule(
-        selector=".phrase-fixed",
-        target="固定搭配",
-        description="米色底与虚线强调固定表达或习语。",
-        css="background-color:#fff8f0;border-bottom:1px dashed #c28150",
-    ),
-    # StyleRule(
-    #     selector=".role-residual",
-    #     target="未分类成分",
-    #     description="浅灰背景提示未归类成分,并通过 data-role 提供中文标签。",
-    #     css="background-color:#f6f8fa;color:#475569;border-bottom:1px dotted #cbd5e1",
-    # ),
-    # StyleRule(
-    #     selector=".lex-rare",
-    #     target="低频词",
-    #     description="深蓝色字体提示低频或重点词汇。",
-    #     css="color:#000080",
+    #     selector=".phrase-fixed",
+    #     target="固定搭配",
+    #     description="米色底与虚线强调固定表达或习语。",
+    #     css="background-color:#fff8f0;border-bottom:1px dashed #c28150",
     # ),
+
 ]
 
 STYLE_BLOCK = build_style_block(STYLE_RULES)