mainspacy.py 96 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886
  1. # -*- coding: utf-8 -*-
  2. """Grammar highlighter powered by spaCy + benepar constituency parsing."""
  3. import asyncio
  4. import html
  5. import json
  6. import os
  7. import re
  8. from collections import Counter
  9. from dataclasses import dataclass, field
  10. from html.parser import HTMLParser
  11. from string import Template
  12. from typing import Any, Dict, List, Optional, Set, Tuple
  13. from urllib import error as urllib_error, request as urllib_request
  14. from urllib.parse import urlparse, urlunparse
  15. import httpx
  16. import spacy
  17. from fastapi import FastAPI, HTTPException
  18. from fastapi.middleware.cors import CORSMiddleware
  19. from fastapi.responses import HTMLResponse
  20. from pydantic import BaseModel, Field
  21. from spacy.cli import download as spacy_download
  22. from spacy.language import Language
  23. from spacy.tokens import Span as SpacySpan, Token as SpacyToken
  24. from style_config import STYLE_BLOCK
# Latest benepar-related warning message; None until a problem is recorded.
BENE_PAR_WARNING: Optional[str] = None
HAS_BENEPAR: bool = False  # new: track whether benepar was successfully attached
  27. def _ensure_benepar_warning(message: str) -> None:
  28. """Record a warning once when benepar annotations are unavailable."""
  29. global BENE_PAR_WARNING
  30. if not BENE_PAR_WARNING:
  31. BENE_PAR_WARNING = message
  32. def _load_spacy_pipeline(
  33. model_name: str = "en_core_web_sm", benepar_model: str = "benepar_en3"
  34. ) -> Language:
  35. global BENE_PAR_WARNING, HAS_BENEPAR
  36. BENE_PAR_WARNING = None
  37. HAS_BENEPAR = False
  38. try:
  39. nlp = spacy.load(model_name)
  40. except OSError:
  41. try:
  42. spacy_download(model_name)
  43. nlp = spacy.load(model_name, disable=["tagger", "lemmatizer"])
  44. except Exception as exc: # pragma: no cover - install helper
  45. raise RuntimeError(
  46. f"spaCy model '{model_name}' is required. Install via `python -m spacy download {model_name}`."
  47. ) from exc
  48. # Ensure we have sentence segmentation available
  49. pipe_names = set(nlp.pipe_names)
  50. if not ({"parser", "senter", "sentencizer"} & pipe_names):
  51. try:
  52. nlp.add_pipe("sentencizer", disable=["tagger", "lemmatizer"])
  53. except Exception:
  54. pass # if already present or unavailable, ignore
  55. enable_benepar = os.getenv("ENABLE_BENEPAR", "0").strip().lower() in {
  56. "1",
  57. "true",
  58. "yes",
  59. "on",
  60. }
  61. if not enable_benepar:
  62. BENE_PAR_WARNING = (
  63. "Benepar is disabled by ENABLE_BENEPAR. Using dependency-based spans."
  64. )
  65. return nlp
  66. # Try to add benepar
  67. if "benepar" not in nlp.pipe_names:
  68. try:
  69. import benepar
  70. nlp.add_pipe("benepar", config={"model": benepar_model}, last=True)
  71. HAS_BENEPAR = True
  72. except ValueError:
  73. try:
  74. import benepar
  75. benepar.download(benepar_model)
  76. nlp.add_pipe("benepar", config={"model": benepar_model}, last=True)
  77. HAS_BENEPAR = True
  78. except Exception as exc: # pragma: no cover - install helper
  79. HAS_BENEPAR = False
  80. BENE_PAR_WARNING = (
  81. "Benepar model '{model}' unavailable ({err}). Falling back to dependency-based spans."
  82. ).format(model=benepar_model, err=exc)
  83. except Exception as exc:
  84. HAS_BENEPAR = False
  85. BENE_PAR_WARNING = (
  86. "Failed to attach benepar parser to spaCy pipeline. Falling back to dependency-based spans ({err})."
  87. ).format(err=exc)
  88. else:
  89. HAS_BENEPAR = True
  90. return nlp
# Load the pipeline eagerly at import time so request handlers can use it;
# record any failure instead of letting the module import crash.
try:
    NLP: Optional[Language] = _load_spacy_pipeline()
    NLP_LOAD_ERROR: Optional[Exception] = None
except Exception as exc:  # pragma: no cover - import-time diagnostics
    NLP = None
    NLP_LOAD_ERROR = exc
class AnalyzeRequest(BaseModel):
    """Request payload for the analysis endpoint."""
    text: str = Field(..., description="Raw English text to highlight")
class AnalyzeResponse(BaseModel):
    """Response payload carrying the rendered highlight markup."""
    highlighted_html: str
@dataclass
class Token:
    """A contiguous slice of the source text with [start, end) char offsets."""
    text: str
    start: int
    end: int
    kind: str  # 'word' | 'space' | 'punct'
@dataclass
class Span:
    """A highlight over token indices [start_token, end_token) with a CSS class."""
    start_token: int
    end_token: int
    cls: str
    attrs: Optional[Dict[str, str]] = None  # extra HTML data-* attributes
@dataclass
class SentenceSummary:
    """Per-sentence inventory of grammatical elements collected while annotating."""
    subjects: List[str] = field(default_factory=list)
    predicates: List[str] = field(default_factory=list)
    objects: List[str] = field(default_factory=list)
    complements: List[str] = field(default_factory=list)
    clauses: List[str] = field(default_factory=list)
    clause_functions: List[str] = field(default_factory=list)
    connectors: List[str] = field(default_factory=list)
    residual_roles: List[str] = field(default_factory=list)
    sentence_length: int = 0  # number of spaCy tokens in the sentence
# Longest-match tokenizer: whitespace runs, numbers (with '.'/',' group
# separators), words joined by '-' or "'", then any single leftover char.
TOKEN_REGEX = re.compile(
    r"""
    (?:\s+)
    |(?:\d+(?:[\.,]\d+)*)
    |(?:\w+(?:[-']\w+)*)
    |(?:.)
    """,
    re.VERBOSE | re.UNICODE,
)
# Full-string probes used to classify a segment as word-like.
WORD_LIKE_RE = re.compile(r"\w+(?:[-']\w+)*\Z", re.UNICODE)
NUMBER_RE = re.compile(r"\d+(?:[\.,]\d+)*\Z", re.UNICODE)
# Two or more line breaks (optionally padded with blanks) split paragraphs.
PARAGRAPH_BREAK_RE = re.compile(r"(?:\r?\n[ \t]*){2,}")
# spaCy dependency labels grouped by the grammatical role they signal.
SUBJECT_DEPS = {"nsubj", "nsubjpass", "csubj", "csubjpass"}
DIRECT_OBJECT_DEPS = {"dobj", "obj"}
INDIRECT_OBJECT_DEPS = {"iobj", "dative"}
COMPLEMENT_DEPS = {"attr", "oprd", "acomp", "ccomp", "xcomp"}
ADVERBIAL_DEPS = {"advmod", "npadvmod", "advcl", "obl", "prep", "pcomp"}
# Words that can introduce a relative clause.
RELATIVE_PRONOUNS = {"which", "that", "who", "whom", "whose", "where", "when"}
# Subordinating conjunction (lowercased) -> adverbial-clause function code.
SUBORDINATORS_TO_FUNCTION = {
    "when": "TIME",
    "while": "TIME",
    "after": "TIME",
    "before": "TIME",
    "until": "TIME",
    "as": "TIME",
    "once": "TIME",
    "since": "TIME",
    "because": "REASON",
    "now that": "REASON",
    "if": "CONDITION",
    "unless": "CONDITION",
    "provided": "CONDITION",
    "provided that": "CONDITION",
    "although": "CONCESSION",
    "though": "CONCESSION",
    "even though": "CONCESSION",
    "whereas": "CONCESSION",
    "so that": "RESULT",
    "so": "RESULT",
    "lest": "PURPOSE",
    "in order that": "PURPOSE",
}
# Penn Treebank tags for finite vs. non-finite verb forms.
FINITE_VERB_TAGS = {"VBD", "VBP", "VBZ"}
NONFINITE_VERB_TAGS = {"VBG", "VBN"}
# Dependency labels whose heads start an embedded clause predicate.
CLAUSE_PREDICATE_DEPS = {
    "advcl",
    "ccomp",
    "xcomp",
    "acl",
    "relcl",
    "csubj",
    "csubjpass",
    "parataxis",
}
# Pre-compiled (pattern, canonical label) pairs for fixed multiword phrases.
FIXED_MULTIWORD_PHRASES: Tuple[Tuple[re.Pattern, str], ...] = tuple(
    (
        re.compile(pattern, re.IGNORECASE),
        label,
    )
    for pattern, label in [
        (r"\bas well as\b", "as well as"),
        (r"\brather than\b", "rather than"),
        (r"\bin addition to\b", "in addition to"),
        (r"\bin spite of\b", "in spite of"),
        (r"\baccording to\b", "according to"),
        (r"\bas soon as\b", "as soon as"),
    ]
)
# Chinese display names for adverbial-clause function codes.
CLAUSE_FUNCTION_LABELS = {
    "TIME": "时间",
    "REASON": "原因",
    "CONDITION": "条件",
    "CONCESSION": "让步",
    "RESULT": "结果",
    "PURPOSE": "目的",
}
  200. def _iter_infinitive_markers(token: SpacyToken) -> List[SpacyToken]:
  201. """Collect 'to' markers attached to a verb head."""
  202. markers = []
  203. for child in token.children:
  204. if child.lower_ == "to" and child.tag_ == "TO":
  205. markers.append(child)
  206. return markers
  207. def _token_is_infinitive(token: SpacyToken) -> bool:
  208. if token.pos_ not in {"VERB", "AUX"}:
  209. return False
  210. verb_forms = set(token.morph.get("VerbForm"))
  211. if "Inf" not in verb_forms and token.tag_ != "VB":
  212. return False
  213. return bool(_iter_infinitive_markers(token))
  214. def _token_is_gerund(token: SpacyToken) -> bool:
  215. if token.pos_ not in {"VERB", "AUX"}:
  216. return False
  217. verb_forms = set(token.morph.get("VerbForm"))
  218. if "Ger" in verb_forms:
  219. return True
  220. return token.tag_ == "VBG"
  221. def _annotate_nonfinite_verbals(
  222. sentence: SpacySpan,
  223. spans: List[Span],
  224. mapping: Dict[int, int],
  225. ) -> None:
  226. """Highlight infinitive和gerund短语,帮助识别非限定动词。"""
  227. for token in sentence:
  228. if _token_is_infinitive(token):
  229. start_char, end_char = subtree_char_span(token)
  230. markers = _iter_infinitive_markers(token)
  231. if markers:
  232. start_char = min(start_char, min(child.idx for child in markers))
  233. add_char_based_span(
  234. spans,
  235. start_char,
  236. end_char,
  237. "verbal-infinitive",
  238. mapping,
  239. attrs={"data-form": "不定式"},
  240. )
  241. seen_gerunds = set()
  242. for token in sentence:
  243. if token.i in seen_gerunds:
  244. continue
  245. if _token_is_gerund(token):
  246. start_char, end_char = subtree_char_span(token)
  247. add_char_based_span(
  248. spans,
  249. start_char,
  250. end_char,
  251. "verbal-gerund",
  252. mapping,
  253. attrs={"data-form": "动名词"},
  254. )
  255. seen_gerunds.add(token.i)
# Chinese display labels for residual tokens, keyed by dependency relation.
RESIDUAL_DEP_LABELS = {
    "det": "限定词",
    "prep": "介词",
    "case": "介词标记",
    "cc": "并列连词",
    "mark": "从属连词",
    "poss": "所有格标记",
    "nummod": "数量修饰语",
    "aux": "助动词",
    "prt": "小品词",
}
# Fallback Chinese labels keyed by coarse part-of-speech tag.
RESIDUAL_POS_LABELS = {
    "ADJ": "形容词修饰语",
    "ADV": "副词",
    "NUM": "数词",
    "PRON": "代词",
}
  273. def _classify_segment(seg: str) -> str:
  274. if not seg:
  275. return "punct"
  276. if seg.isspace():
  277. return "space"
  278. if NUMBER_RE.fullmatch(seg) or WORD_LIKE_RE.fullmatch(seg):
  279. return "word"
  280. return "punct"
  281. def _append_fallback_tokens(text: str, start: int, end: int, tokens: List[Token]) -> None:
  282. for idx in range(start, end):
  283. ch = text[idx]
  284. if ch.isspace():
  285. kind = "space"
  286. elif ch.isalnum() or ch == "_":
  287. kind = "word"
  288. else:
  289. kind = "punct"
  290. tokens.append(Token(ch, idx, idx + 1, kind))
  291. def tokenize_preserve(text: str) -> List[Token]:
  292. tokens: List[Token] = []
  293. if not text:
  294. return tokens
  295. last_end = 0
  296. for match in TOKEN_REGEX.finditer(text):
  297. if match.start() > last_end:
  298. _append_fallback_tokens(text, last_end, match.start(), tokens)
  299. seg = text[match.start() : match.end()]
  300. tokens.append(Token(seg, match.start(), match.end(), _classify_segment(seg)))
  301. last_end = match.end()
  302. if last_end < len(text):
  303. _append_fallback_tokens(text, last_end, len(text), tokens)
  304. if not tokens and text:
  305. tokens = [Token(text, 0, len(text), "word" if text[0].isalnum() else "punct")]
  306. return tokens
  307. def build_char_to_token_map(tokens: List[Token]) -> Dict[int, int]:
  308. mapping: Dict[int, int] = {}
  309. for idx, tok in enumerate(tokens):
  310. for pos in range(tok.start, tok.end):
  311. mapping[pos] = idx
  312. return mapping
  313. def char_span_to_token_span(
  314. char_start: int, char_end: int, mapping: Dict[int, int]
  315. ) -> Tuple[int, int]:
  316. if char_end <= char_start:
  317. return -1, -1
  318. start_idx = mapping.get(char_start)
  319. end_idx = mapping.get(char_end - 1)
  320. if start_idx is None or end_idx is None:
  321. return -1, -1
  322. return start_idx, end_idx + 1
  323. def add_char_based_span(
  324. spans: List[Span],
  325. char_start: int,
  326. char_end: int,
  327. cls: str,
  328. mapping: Dict[int, int],
  329. attrs: Optional[Dict[str, str]] = None,
  330. ) -> None:
  331. s_tok, e_tok = char_span_to_token_span(char_start, char_end, mapping)
  332. if s_tok < 0 or e_tok < 0:
  333. return
  334. safe_attrs = None
  335. if attrs:
  336. safe_attrs = {k: html.escape(v, quote=True) for k, v in attrs.items() if v}
  337. spans.append(Span(start_token=s_tok, end_token=e_tok, cls=cls, attrs=safe_attrs))
  338. def add_span(spans: List[Span], start_token: int, end_token: int, cls: str, attrs: Optional[Dict[str, str]] = None):
  339. if start_token < 0 or end_token < 0 or end_token <= start_token:
  340. return
  341. spans.append(Span(start_token=start_token, end_token=end_token, cls=cls, attrs=attrs))
  342. def _prune_adverbial_spans(spans: List[Span], sentence_token_bounds: Tuple[int, int]) -> None:
  343. """Drop redundant/oversized adverbial spans that make entire sentences underline."""
  344. sent_start, sent_end = sentence_token_bounds
  345. if sent_start < 0 or sent_end <= sent_start:
  346. return
  347. sent_length = sent_end - sent_start
  348. filtered: List[Span] = []
  349. seen_ranges: Set[Tuple[int, int]] = set()
  350. for span in spans:
  351. classes = span.cls.split()
  352. if "role-adverbial" not in classes:
  353. filtered.append(span)
  354. continue
  355. span_length = span.end_token - span.start_token
  356. # Skip single-token adverbs and spans that swallow the whole sentence.
  357. if span_length <= 1:
  358. continue
  359. coverage_start = max(span.start_token, sent_start)
  360. coverage_end = min(span.end_token, sent_end)
  361. if coverage_end - coverage_start >= sent_length:
  362. continue
  363. range_key = (coverage_start, coverage_end)
  364. if range_key in seen_ranges:
  365. continue
  366. seen_ranges.add(range_key)
  367. filtered.append(span)
  368. spans[:] = filtered
  369. def subtree_char_span(token: SpacyToken) -> Tuple[int, int]:
  370. subtree = list(token.subtree)
  371. if not subtree:
  372. return token.idx, token.idx + len(token.text)
  373. return subtree[0].idx, subtree[-1].idx + len(subtree[-1].text)
  374. def _subtree_text(token: SpacyToken) -> str:
  375. span = token.doc[token.left_edge.i : token.right_edge.i + 1]
  376. return span.text
  377. def _find_antecedent_word(sentence: SpacySpan, clause_start_char: int) -> Optional[str]:
  378. candidate = None
  379. for tok in sentence:
  380. if tok.idx >= clause_start_char:
  381. break
  382. if tok.pos_ in {"NOUN", "PROPN", "PRON"}:
  383. candidate = tok.text
  384. return candidate
  385. def _is_nonfinite_clause(span: SpacySpan) -> bool:
  386. tags = {tok.tag_ for tok in span if tok.tag_}
  387. if tags & FINITE_VERB_TAGS:
  388. return False
  389. if "TO" in tags or tags & NONFINITE_VERB_TAGS:
  390. return True
  391. return False
  392. def _classify_noun_clause(span: SpacySpan) -> Optional[str]:
  393. deps = {tok.dep_ for tok in span}
  394. if deps & {"csubj", "csubjpass"}:
  395. return "subject"
  396. if deps & {"ccomp", "xcomp"}:
  397. return "complement"
  398. if deps & {"dobj", "obj"}:
  399. return "object"
  400. return None
  401. def _split_paragraph_ranges(text: str) -> List[Tuple[int, int]]:
  402. """Return inclusive paragraph ranges, keeping separators intact."""
  403. if not text:
  404. return [(0, 0)]
  405. ranges: List[Tuple[int, int]] = []
  406. start = 0
  407. for match in PARAGRAPH_BREAK_RE.finditer(text):
  408. ranges.append((start, match.start()))
  409. start = match.end()
  410. ranges.append((start, len(text)))
  411. # Ensure at least one range and sorted order
  412. if not ranges:
  413. ranges = [(0, len(text))]
  414. return ranges
  415. def _circled_number(value: int) -> str:
  416. """Return the circled number style for sentence numbering."""
  417. if value <= 0:
  418. return ""
  419. if value <= 20:
  420. return chr(ord("\u2460") + value - 1)
  421. if 21 <= value <= 35:
  422. return chr(ord("\u3251") + value - 21)
  423. if 36 <= value <= 50:
  424. return chr(ord("\u32B1") + value - 36)
  425. return f"({value})"
def annotate_constituents(
    sentence: SpacySpan,
    spans: List[Span],
    mapping: Dict[int, int],
    sentence_start_char: int,
    sentence_end_char: int,
    summary: Optional[SentenceSummary] = None,
) -> None:
    """Add clause/phrase highlight spans from benepar constituency labels.

    Records a fallback warning and returns early whenever benepar output is
    unavailable; in that case only dependency-based spans are produced elsewhere.
    """
    # If benepar is not attached or a previous warning indicates fallback, skip.
    if not HAS_BENEPAR or BENE_PAR_WARNING:
        _ensure_benepar_warning(
            "Benepar component missing or unavailable. Using dependency-based spans."
        )
        return
    # If the extension is not present, skip
    if not SpacySpan.has_extension("constituents"):
        _ensure_benepar_warning(
            "Benepar component missing from spaCy pipeline. Falling back to dependency spans."
        )
        return
    try:
        constituents = sentence._.constituents
    except Exception as exc:
        # Catch any error while accessing benepar results and fallback safely
        _ensure_benepar_warning(
            f"Benepar constituency parse unavailable: {exc}. Falling back to dependency spans."
        )
        return
    seen_ranges = set()
    for const in constituents:
        label = getattr(const, "label_", None)
        if not label:
            continue
        start_char, end_char = const.start_char, const.end_char
        if start_char == sentence_start_char and end_char == sentence_end_char:
            continue  # skip the entire sentence span itself
        key = (start_char, end_char, label)
        is_relative = False
        if label in {"PP", "ADVP"}:
            # Prepositional/adverb phrases become adverbial role spans (deduped).
            if key in seen_ranges:
                continue
            seen_ranges.add(key)
            add_char_based_span(spans, start_char, end_char, "role-adverbial", mapping)
            continue
        if label == "SBAR" and const:
            first_token = const[0]
            lowered = first_token.text.lower()
            if lowered in RELATIVE_PRONOUNS:
                # Relative clause: link back to the nearest preceding noun-like word.
                antecedent = _find_antecedent_word(sentence, start_char)
                attrs = {"data-modifies": antecedent} if antecedent else None
                add_char_based_span(spans, start_char, end_char, "clause-relative", mapping, attrs)
                if summary:
                    summary.clauses.append("定语从句")
                is_relative = True
            else:
                # Adverbial clause: tag its function (time/reason/…) when known.
                # NOTE(review): `function` may be None here; add_char_based_span
                # filters falsy attr values, so no data-function attr is emitted.
                function = SUBORDINATORS_TO_FUNCTION.get(lowered)
                attrs = {"data-function": function}
                add_char_based_span(spans, start_char, end_char, "clause-adverbial", mapping, attrs)
                if summary:
                    summary.clauses.append("状语从句")
                    if function:
                        summary.clause_functions.append(function)
            continue
        if label in {"S", "VP"}:
            if _is_nonfinite_clause(const):
                add_char_based_span(spans, start_char, end_char, "clause-nonfinite", mapping)
                if summary:
                    summary.clauses.append("非限定结构")
                continue
        if label == "S" and not is_relative:
            role = _classify_noun_clause(const)
            if role:
                attrs = {"data-clause-role": role}
                add_char_based_span(spans, start_char, end_char, "clause-noun", mapping, attrs)
                if summary:
                    summary.clauses.append(f"名词性从句({role})")
  502. def _predicate_span_bounds(head: SpacyToken) -> Tuple[int, int]:
  503. """Return a character range covering predicate head + functional dependents."""
  504. tokens = [head]
  505. for child in head.children:
  506. if child.dep_ in {"aux", "auxpass", "prt", "cop", "neg"}:
  507. tokens.append(child)
  508. start_char = min(tok.idx for tok in tokens)
  509. end_char = max(tok.idx + len(tok.text) for tok in tokens)
  510. return start_char, end_char
  511. def _token_is_finite(token: SpacyToken) -> bool:
  512. """Return True if token carries finite verb morphology."""
  513. if token.pos_ not in {"VERB", "AUX"}:
  514. return False
  515. verb_forms = set(token.morph.get("VerbForm"))
  516. if "Fin" in verb_forms or "Imp" in verb_forms:
  517. return True
  518. if token.tag_ in FINITE_VERB_TAGS or token.tag_ == "MD":
  519. return True
  520. return False
  521. def _has_finite_auxiliary(token: SpacyToken) -> bool:
  522. """Detect whether the verb head has a finite auxiliary helper."""
  523. for child in token.children:
  524. if child.dep_ in {"aux", "auxpass", "cop"} and _token_is_finite(child):
  525. return True
  526. return False
  527. def _is_finite_predicate_head(token: SpacyToken) -> bool:
  528. """Filter predicate heads to exclude bare infinitives/participles."""
  529. if _token_is_finite(token):
  530. return True
  531. verb_forms = set(token.morph.get("VerbForm"))
  532. if "Inf" in verb_forms:
  533. return _has_finite_auxiliary(token)
  534. if verb_forms & {"Part", "Ger"}:
  535. return _has_finite_auxiliary(token)
  536. if token.tag_ in NONFINITE_VERB_TAGS:
  537. return _has_finite_auxiliary(token)
  538. if token.tag_ == "VB":
  539. has_to_marker = any(
  540. child.dep_ == "mark" and child.lower_ == "to" for child in token.children
  541. )
  542. if has_to_marker:
  543. return False
  544. return token.dep_ == "ROOT"
  545. return False
  546. def _predicate_heads(sentence: SpacySpan) -> List[SpacyToken]:
  547. """Collect predicate heads including coordinated verbs."""
  548. candidates: List[SpacyToken] = []
  549. for tok in sentence:
  550. if tok.pos_ not in {"VERB", "AUX"} and tok.tag_ not in FINITE_VERB_TAGS:
  551. continue
  552. if tok.dep_ == "ROOT":
  553. candidates.append(tok)
  554. continue
  555. if tok.dep_ == "conj" and tok.head.pos_ in {"VERB", "AUX"}:
  556. candidates.append(tok)
  557. continue
  558. if tok.dep_ in {"ccomp", "xcomp", "advcl", "acl", "relcl", "parataxis"}:
  559. candidates.append(tok)
  560. seen = set()
  561. ordered: List[SpacyToken] = []
  562. for tok in sorted(candidates, key=lambda t: t.i):
  563. if tok.i in seen:
  564. continue
  565. seen.add(tok.i)
  566. if _is_finite_predicate_head(tok):
  567. ordered.append(tok)
  568. return ordered
  569. def _is_clause_predicate(token: SpacyToken) -> bool:
  570. """Return True if predicate originates inside从句."""
  571. if token.dep_ in CLAUSE_PREDICATE_DEPS:
  572. return True
  573. if token.dep_ != "conj":
  574. return False
  575. ancestor = token.head
  576. safety = 0
  577. while ancestor is not None and safety < 10:
  578. if ancestor.dep_ in CLAUSE_PREDICATE_DEPS:
  579. return True
  580. if ancestor.dep_ != "conj" or ancestor.head is ancestor:
  581. break
  582. ancestor = ancestor.head
  583. safety += 1
  584. return False
  585. def _add_fixed_phrases(
  586. sentence: SpacySpan,
  587. mapping: Dict[int, int],
  588. spans: List[Span],
  589. summary: Optional[SentenceSummary] = None,
  590. ) -> None:
  591. base = sentence.start_char
  592. text = sentence.text
  593. for pattern, label in FIXED_MULTIWORD_PHRASES:
  594. for match in pattern.finditer(text):
  595. start_char = base + match.start()
  596. end_char = base + match.end()
  597. add_char_based_span(
  598. spans,
  599. start_char,
  600. end_char,
  601. "phrase-fixed",
  602. mapping,
  603. attrs={"data-phrase": label},
  604. )
  605. if summary is not None:
  606. summary.connectors.append(label.lower())
def annotate_sentence(
    tokens: List[Token],
    sentence: SpacySpan,
    mapping: Dict[int, int],
    collect_summary: bool = True,
) -> Tuple[List[Span], Optional[SentenceSummary]]:
    """Build all highlight spans for one sentence.

    Runs a fixed sequence of passes over the sentence (subjects, predicates,
    objects, complements, connectors/determiners/modifiers, adverbials,
    appositions, parentheticals, absolute constructions, nonfinite verbals,
    constituents, fixed phrases) and returns the spans plus an optional
    per-sentence summary used for the helper note.
    """
    spans: List[Span] = []
    summary = SentenceSummary(sentence_length=len(sentence)) if collect_summary else None
    sent_bounds = char_span_to_token_span(sentence.start_char, sentence.end_char, mapping)
    sent_start_tok, sent_end_tok = sent_bounds

    def add_subtree(token: SpacyToken, cls: str):
        # Highlight the token's full dependency subtree.
        start_char, end_char = subtree_char_span(token)
        add_char_based_span(spans, start_char, end_char, cls, mapping)

    def add_token(token: SpacyToken, cls: str):
        # Highlight only the token itself.
        add_char_based_span(spans, token.idx, token.idx + len(token.text), cls, mapping)

    # Subjects: every subject-dependency subtree.
    for tok in sentence:
        if tok.dep_ in SUBJECT_DEPS:
            add_subtree(tok, "role-subject")
            if summary is not None:
                summary.subjects.append(_subtree_text(tok))
    # Predicates: clause predicates get an extra CSS class.
    for head in _predicate_heads(sentence):
        start_char, end_char = _predicate_span_bounds(head)
        cls = "role-predicate"
        if _is_clause_predicate(head):
            cls = "role-predicate role-predicate-clause"
        add_char_based_span(spans, start_char, end_char, cls, mapping)
        predicate_text = sentence.doc.text[start_char:end_char].strip()
        if summary is not None:
            summary.predicates.append(predicate_text or head.text)
    # Direct object: only the first one found is highlighted.
    for tok in sentence:
        if tok.dep_ in DIRECT_OBJECT_DEPS:
            add_subtree(tok, "role-object-do")
            if summary is not None:
                summary.objects.append(_subtree_text(tok))
            break
    # Indirect object: prefer a dative dependency, fall back to "to/for" pobj.
    io_token = next((tok for tok in sentence if tok.dep_ in INDIRECT_OBJECT_DEPS), None)
    if io_token is None:
        for tok in sentence:
            if tok.dep_ == "pobj" and tok.head.dep_ == "prep" and tok.head.lemma_.lower() in {"to", "for"}:
                io_token = tok
                break
    if io_token:
        add_subtree(io_token, "role-object-io")
        if summary is not None:
            summary.objects.append(_subtree_text(io_token))
    # Complement: only the first one found is highlighted.
    for tok in sentence:
        if tok.dep_ in COMPLEMENT_DEPS:
            add_subtree(tok, "role-complement")
            if summary is not None:
                summary.complements.append(_subtree_text(tok))
            break
    # Single-token roles: connectors, determiners, modifiers.
    for tok in sentence:
        lowered = tok.text.lower()
        if tok.dep_ in {"cc", "mark", "preconj"} or tok.pos_ in {"CCONJ", "SCONJ"}:
            add_token(tok, "role-connector")
            if summary is not None:
                summary.connectors.append(lowered)
        if tok.dep_ == "det" or tok.pos_ == "DET":
            add_token(tok, "role-determiner")
        if tok.dep_ in {"amod", "poss", "compound", "nummod"}:
            add_token(tok, "role-modifier")
    # Dependency-based adverbial spans are a fallback when constituency data is unavailable.
    if not HAS_BENEPAR or BENE_PAR_WARNING:
        adverbial_ranges = set()
        for tok in sentence:
            if tok.dep_ in ADVERBIAL_DEPS:
                adverbial_ranges.add(subtree_char_span(tok))
        for start_char, end_char in adverbial_ranges:
            add_char_based_span(spans, start_char, end_char, "role-adverbial", mapping)
    for tok in sentence:
        if tok.dep_ == "appos":
            add_subtree(tok, "role-apposition")
    if sent_start_tok >= 0 and sent_end_tok >= 0:
        # Parentheticals: match '(' ... ')' pairs inside the sentence's token range.
        stack = []
        for idx in range(sent_start_tok, sent_end_tok):
            token = tokens[idx]
            if token.text == "(":
                stack.append(idx)
            elif token.text == ")" and stack:
                add_span(spans, stack.pop(), idx + 1, "role-parenthetical")
        # Absolute constructions: a comma-delimited stretch containing a VBG.
        comma_token_idxs = [
            i
            for i in range(sent_start_tok, sent_end_tok)
            if tokens[i].kind == "punct" and tokens[i].text == ","
        ]
        for idx, first_comma in enumerate(comma_token_idxs):
            if idx + 1 >= len(comma_token_idxs):
                break
            second_comma = comma_token_idxs[idx + 1]
            start_char = tokens[first_comma].start
            end_char = tokens[second_comma].end
            span = sentence.doc.char_span(start_char, end_char, alignment_mode="expand")
            if span and any(tok.tag_ == "VBG" for tok in span):
                add_span(spans, first_comma, second_comma + 1, "role-absolute")
    _annotate_nonfinite_verbals(sentence, spans, mapping)
    annotate_constituents(
        sentence,
        spans,
        mapping,
        sentence.start_char,
        sentence.end_char,
        summary,
    )
    _add_fixed_phrases(sentence, mapping, spans, summary)
    _prune_adverbial_spans(spans, sent_bounds)
    return spans, summary
  713. def _label_residual_token(token: SpacyToken) -> Optional[str]:
  714. dep_label = RESIDUAL_DEP_LABELS.get(token.dep_)
  715. if dep_label:
  716. return dep_label
  717. return RESIDUAL_POS_LABELS.get(token.pos_)
def _collect_residual_roles(
    sentence: SpacySpan,
    tokens: List[Token],
    spans: List[Span],
    sent_bounds: Tuple[int, int],
    summary: Optional[SentenceSummary],
    mapping: Dict[int, int],
) -> None:
    """Tag word tokens not covered by any existing span with a residual role.

    Mutates ``spans`` (adds ``role-residual`` spans) and, when provided,
    ``summary.residual_roles`` (unique labels, in discovery order).
    """
    sent_start, sent_end = sent_bounds
    if sent_start < 0 or sent_end < 0 or sent_start >= sent_end:
        return
    # Boolean mask over the sentence's token range; True = already highlighted.
    coverage = [False] * (sent_end - sent_start)
    for span in spans:
        # Clip each span to the sentence bounds before marking.
        lo = max(span.start_token, sent_start)
        hi = min(span.end_token, sent_end)
        for idx in range(lo, hi):
            coverage[idx - sent_start] = True
    doc = sentence.doc
    for offset, covered in enumerate(coverage):
        if covered:
            continue
        token = tokens[sent_start + offset]
        # Only plain word tokens get residual labels (skip punctuation etc.).
        if token.kind != "word":
            continue
        span = doc.char_span(token.start, token.end, alignment_mode="expand")
        if not span or not span.text.strip():
            continue
        label = _label_residual_token(span[0])
        if summary is not None and label and label not in summary.residual_roles:
            summary.residual_roles.append(label)
        if label:
            add_char_based_span(
                spans,
                token.start,
                token.end,
                "role-residual",
                mapping,
                attrs={"data-role": label},
            )
  757. def _classify_sentence_complexity(summary: SentenceSummary) -> Tuple[str, bool]:
  758. clause_count = len(summary.clauses)
  759. connector_count = len(summary.connectors)
  760. word_count = summary.sentence_length
  761. if clause_count >= 2:
  762. return "多重复杂句", True
  763. if clause_count == 1:
  764. return "主从复合句", True
  765. if connector_count >= 2:
  766. return "并列复合句", True
  767. if word_count >= 25:
  768. return "长句", True
  769. return "简单句", False
  770. def _translate_clause_functions(functions: List[str]) -> List[str]:
  771. translated = []
  772. for item in functions:
  773. label = CLAUSE_FUNCTION_LABELS.get(item, item)
  774. if label not in translated:
  775. translated.append(label)
  776. return translated
def build_sentence_note(summary: SentenceSummary) -> Tuple[str, bool]:
    """Render the Chinese helper note for a sentence plus its complexity flag.

    Returns a semicolon-joined string of labelled parts (sentence type,
    subject/predicate/object/complement, clauses, connectors, residual roles,
    word count) and the boolean from ``_classify_sentence_complexity``.
    """
    note_parts: List[str] = []
    clause_label = "无"
    if summary.clauses:
        # Collapse repeated clause types into "name×count".
        counts = Counter(summary.clauses)
        clause_label = "、".join(
            f"{name}×{count}" if count > 1 else name for name, count in counts.items()
        )
    functions = _translate_clause_functions(summary.clause_functions)
    # dict.fromkeys dedupes while preserving first-seen order.
    connectors = list(dict.fromkeys(summary.connectors))
    residual = summary.residual_roles
    subjects_seq = list(dict.fromkeys(summary.subjects))
    predicates_seq = list(dict.fromkeys(summary.predicates))
    objects_seq = list(dict.fromkeys(summary.objects))
    complements_seq = list(dict.fromkeys(summary.complements))
    subjects = "、".join(subjects_seq) if subjects_seq else "未识别"
    predicates = "、".join(predicates_seq) if predicates_seq else "未识别"
    objects = "、".join(objects_seq) if objects_seq else "无"
    complements = "、".join(complements_seq) if complements_seq else "无"
    note_parts.append(f"主语:{subjects}")
    note_parts.append(f"谓语:{predicates}")
    note_parts.append(f"宾语:{objects}")
    # Complements are omitted entirely when none were found.
    if complements != "无":
        note_parts.append(f"补语:{complements}")
    note_parts.append(f"从句:{clause_label}")
    if functions:
        note_parts.append(f"从句功能:{'、'.join(functions)}")
    connector_text = "、".join(connectors) if connectors else "未检测到典型连接词"
    note_parts.append(f"连接词:{connector_text}")
    if residual:
        note_parts.append(f"未高亮:{'、'.join(residual)}")
    complexity_label, is_complex = _classify_sentence_complexity(summary)
    # Sentence type goes first; word count goes last.
    note_parts.insert(0, f"句型:{complexity_label}")
    note_parts.append(f"词数:{summary.sentence_length}")
    return ";".join(note_parts), is_complex
def render_with_spans(tokens: List[Token], spans: List[Span]) -> str:
    """Serialize tokens to HTML, wrapping them in the computed <span> tags.

    Spans are opened at their start token and closed when the top of the
    active stack reaches its end token, so properly nested spans produce
    well-formed markup. NOTE(review): spans that partially overlap without
    nesting are not closed until the final cleanup loop, which can extend
    them past their intended end — confirm upstream spans are nested.
    """
    # Sort so that at the same start token, the longest span opens first.
    spans = sorted(spans, key=lambda s: (s.start_token, -s.end_token))
    out_parts: List[str] = []
    active_stack: List[Span] = []
    span_queue = list(spans)
    current_idx = 0

    def open_span(span: Span):
        # Attribute values are HTML-escaped; keys are trusted literals.
        attrs = ""
        if span.attrs:
            attrs = " " + " ".join(
                f"{k}='" + html.escape(v, quote=True) + "'" for k, v in span.attrs.items()
            )
        out_parts.append(f"<span class='{span.cls}'{attrs}>")

    def close_span():
        out_parts.append("</span>")

    while current_idx < len(tokens):
        # Open every span starting at this token, in sorted order.
        opening = [sp for sp in span_queue if sp.start_token == current_idx]
        for sp in opening:
            open_span(sp)
            active_stack.append(sp)
            span_queue.remove(sp)
        token = tokens[current_idx]
        out_parts.append(html.escape(token.text))
        current_idx += 1
        # Close innermost spans that end exactly here.
        while active_stack and active_stack[-1].end_token == current_idx:
            active_stack.pop()
            close_span()
    # Close anything still open so the fragment stays well-formed.
    while active_stack:
        active_stack.pop()
        close_span()
    return "".join(out_parts)
  843. def _run_pipeline_without_benepar(text: str) -> "spacy.tokens.Doc":
  844. """Run the spaCy pipeline skipping benepar, for robust fallback."""
  845. assert NLP is not None
  846. doc = NLP.make_doc(text)
  847. for name, proc in NLP.pipeline:
  848. if name == "benepar":
  849. continue
  850. doc = proc(doc)
  851. return doc
def highlight_text_with_spacy(
    text: str,
    paragraph_meta: Optional[List[Dict[str, str]]] = None,
    include_helper: bool = False,
    paragraph_ranges: Optional[List[Tuple[int, int]]] = None,
) -> str:
    """Produce the highlighted HTML fragment for ``text``.

    Parameters:
        text: raw input text to analyze.
        paragraph_meta: optional per-paragraph span attributes; used only when
            its length matches the number of paragraph ranges.
        include_helper: when True, attach per-sentence helper notes.
        paragraph_ranges: optional pre-computed (start, end) char ranges; they
            are validated and fall back to ``_split_paragraph_ranges`` when
            invalid or absent.

    Raises:
        RuntimeError: if the spaCy pipeline failed to load.
    """
    if NLP is None:
        raise RuntimeError(f"spaCy pipeline unavailable: {NLP_LOAD_ERROR}")
    tokens = tokenize_preserve(text)
    if not tokens:
        return ""
    mapping = build_char_to_token_map(tokens)
    # Robust doc creation: if benepar causes any error, skip it and fallback.
    try:
        doc = NLP(text)
    except Exception as exc:
        _ensure_benepar_warning(
            f"Benepar failed during processing: {exc}. Falling back to dependency-based spans."
        )
        doc = _run_pipeline_without_benepar(text)
    # Validate caller-supplied paragraph ranges; any bad bound rejects them all.
    ranges = None
    if paragraph_ranges:
        valid = True
        for start, end in paragraph_ranges:
            if start < 0 or end < start or end > len(text):
                valid = False
                break
        if valid:
            ranges = list(paragraph_ranges)
    if not ranges:
        ranges = _split_paragraph_ranges(text)
    # NOTE(review): an empty ``ranges`` would make ``len(ranges) - 1`` == -1
    # below — presumably _split_paragraph_ranges never returns [] for
    # non-empty tokenized text; confirm.
    paragraph_counters = [0 for _ in ranges]
    paragraph_idx = 0
    paragraph_spans: List[Span] = []
    # Paragraph attributes are only honored when counts line up exactly.
    paragraph_attrs = paragraph_meta if paragraph_meta and len(paragraph_meta) == len(ranges) else None
    for idx, (start, end) in enumerate(ranges):
        attrs = None
        if paragraph_attrs:
            attrs = paragraph_attrs[idx] or None
        add_char_based_span(paragraph_spans, start, end, "paragraph-scope", mapping, attrs=attrs)
    spans: List[Span] = list(paragraph_spans)
    for sent in doc.sents:
        # Advance to the paragraph containing this sentence; counters number
        # sentences per paragraph for the circled "data-sid" labels.
        while paragraph_idx < len(ranges) and ranges[paragraph_idx][1] <= sent.start_char:
            paragraph_idx += 1
        current_idx = min(paragraph_idx, len(ranges) - 1)
        paragraph_counters[current_idx] += 1
        sentence_label = _circled_number(paragraph_counters[current_idx])
        sentence_spans, summary = annotate_sentence(tokens, sent, mapping, collect_summary=include_helper)
        sent_bounds = char_span_to_token_span(sent.start_char, sent.end_char, mapping)
        sent_start, sent_end = sent_bounds
        if sent_start >= 0 and sent_end >= 0:
            _collect_residual_roles(sent, tokens, sentence_spans, sent_bounds, summary, mapping)
        helper_note = ""
        is_complex = False
        if include_helper and summary is not None:
            helper_note, is_complex = build_sentence_note(summary)
        attrs = {
            "data-sid": sentence_label,
        }
        if include_helper:
            attrs["data-complex"] = "1" if is_complex else "0"
            attrs["data-note"] = helper_note
        sentence_spans.append(Span(start_token=sent_start, end_token=sent_end, cls="sentence-scope", attrs=attrs))
        spans.extend(sentence_spans)
    return render_with_spans(tokens, spans)
  917. def _build_analysis_container(fragment: str, include_helper: bool) -> str:
  918. helper_state = "on" if include_helper else "off"
  919. return f"<div class='analysis' data-helper='{helper_state}'>{fragment}</div>"
  920. def _build_highlighted_html(fragment: str, include_helper: bool) -> str:
  921. return f"{STYLE_BLOCK}{_build_analysis_container(fragment, include_helper)}"
  922. def _perform_analysis(text: str, include_helper: bool) -> AnalyzeResponse:
  923. sanitized_fragment = highlight_text_with_spacy(text, include_helper=include_helper)
  924. highlighted_html = _build_highlighted_html(sanitized_fragment, include_helper)
  925. return AnalyzeResponse(highlighted_html=highlighted_html)
# FastAPI application exposing the grammar-highlighting endpoints.
app = FastAPI(title="Grammar Highlight API (spaCy + benepar)")
# Open CORS so browser front-ends on any origin can call the API.
# NOTE(review): browsers reject allow_origins=["*"] combined with
# allow_credentials=True for credentialed requests — confirm whether
# credentials are actually needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.post("/analyze", response_model=AnalyzeResponse)
async def analyze(req: AnalyzeRequest):
    """Highlight the submitted text (helper notes disabled on this route)."""
    text = req.text
    if text is None or not text.strip():
        raise HTTPException(status_code=400, detail="Text is required")
    try:
        return _perform_analysis(text, include_helper=False)
    except RuntimeError as exc:
        # Pipeline-unavailable errors surface verbatim as 500s.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    except Exception as exc:  # pragma: no cover - defensive
        raise HTTPException(status_code=500, detail=f"Analysis failed: {exc}") from exc
@app.post("/analyze/detail", response_model=AnalyzeResponse)
async def analyze_with_helper(req: AnalyzeRequest):
    """Highlight the submitted text with per-sentence helper notes attached."""
    text = req.text
    if text is None or not text.strip():
        raise HTTPException(status_code=400, detail="Text is required")
    try:
        return _perform_analysis(text, include_helper=True)
    except RuntimeError as exc:
        # Pipeline-unavailable errors surface verbatim as 500s.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    except Exception as exc:  # pragma: no cover - defensive
        raise HTTPException(status_code=500, detail=f"Analysis failed: {exc}") from exc
  956. @app.get("/health")
  957. async def health():
  958. status = "ok" if NLP is not None else "failed"
  959. detail = None if NLP is not None else str(NLP_LOAD_ERROR)
  960. payload = {"status": status}
  961. if detail:
  962. payload["detail"] = detail
  963. if BENE_PAR_WARNING:
  964. payload["warning"] = BENE_PAR_WARNING
  965. payload["benepar_attached"] = HAS_BENEPAR
  966. return payload
@app.get("/proxy", response_class=HTMLResponse)
async def proxy(url: Optional[str] = None, show_images: bool = False):
    """Fetch a remote page, highlight its text, and render the proxy UI.

    Without ``url`` the empty form page is returned. Errors map to:
    ValueError -> 400, httpx.HTTPError -> 502, anything else -> 500, each
    rendered as an error page rather than a bare error response.
    """
    if not url:
        return HTMLResponse(_render_proxy_page(show_images=show_images))
    try:
        (
            normalized_url,
            title,
            page_text,
            images,
            code_blocks,
            paragraph_meta,
            paragraph_ranges,
        ) = await _fetch_remote_plaintext(url)
        highlighted_fragment = highlight_text_with_spacy(
            page_text,
            paragraph_meta=paragraph_meta or None,
            paragraph_ranges=paragraph_ranges or None,
        )
        # Re-insert extracted code blocks after highlighting.
        if code_blocks:
            highlighted_fragment = _inject_proxy_codeblocks(highlighted_fragment, code_blocks)
        image_notice = None
        if images:
            if show_images:
                highlighted_fragment = _inject_proxy_images(highlighted_fragment, images)
            else:
                # Images hidden by default for speed; tell the user how many.
                highlighted_fragment = _strip_proxy_image_markers(highlighted_fragment)
                image_notice = (
                    f"检测到 {len(images)} 张正文图片,为提速默认隐藏。勾选“显示图片”后重新抓取即可加载原图。"
                )
        html_body = _render_proxy_page(
            url_value=normalized_url,
            message="分析完成,结果如下。",
            highlight_fragment=highlighted_fragment,
            source_url=normalized_url,
            source_title=title,
            show_images=show_images,
            image_notice=image_notice,
            source_plaintext=page_text,
        )
        return HTMLResponse(html_body)
    except ValueError as exc:
        # Invalid/rejected URL: show the message on the form page as a 400.
        body = _render_proxy_page(url_value=url or "", message=str(exc), is_error=True, show_images=show_images)
        return HTMLResponse(body, status_code=400)
    except httpx.HTTPError as exc:
        # Provide a clearer message for common HTTP errors from the remote site.
        msg = None
        if isinstance(exc, httpx.HTTPStatusError) and exc.response is not None:
            status = exc.response.status_code
            if status == 403:
                msg = (
                    "抓取页面失败:目标站点返回 403 Forbidden(禁止访问)。"
                    "该网站很可能禁止自动抓取或代理访问,目前无法通过本工具获取正文,"
                    "可以尝试在浏览器中打开并手动复制需要的内容。"
                )
            else:
                msg = f"抓取页面失败:目标站点返回 HTTP {status}。"
        if msg is None:
            msg = f"抓取页面失败:{exc}"
        body = _render_proxy_page(
            url_value=url or "",
            message=msg,
            is_error=True,
            show_images=show_images,
        )
        return HTMLResponse(body, status_code=502)
    except Exception as exc:
        # Catch-all: render the failure on the proxy page as a 500.
        body = _render_proxy_page(
            url_value=url or "",
            message=f"代理分析失败:{exc}",
            is_error=True,
            show_images=show_images,
        )
        return HTMLResponse(body, status_code=500)
@app.get("/", response_class=HTMLResponse)
async def ui():
    """Serve the single-page UI (inline HTML/CSS/JS) for the highlighter.

    The page posts to /analyze, renders the returned fragment, and offers
    streaming TTS playback (full text, selection, or from a clicked
    sentence) against the hard-coded TTS_ENDPOINT.
    """
    # The body below is a single verbatim string literal; do not edit its
    # contents without updating the matching element ids/classes in the JS.
    return """<!DOCTYPE html>
<html lang=\"zh-CN\">
<head>
<meta charset=\"UTF-8\" />
<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
<title>Grammar Highlighter</title>
<style>
body { font-family: system-ui, -apple-system, sans-serif; margin: 2rem; line-height: 1.6; }
textarea { width: 100%; min-height: 140px; font-size: 1rem; padding: 0.75rem; border: 1px solid #d0d7de; border-radius: 0.5rem; }
button { margin-top: 0.75rem; padding: 0.6rem 1.4rem; font-size: 1rem; cursor: pointer; border: none; border-radius: 999px; background: #1f7a8c; color: #fff; }
button + button { margin-left: 0.5rem; background: #6b7280; }
button:disabled { opacity: 0.6; cursor: wait; }
#result { margin-top: 1.5rem; border-top: 1px solid #e5e7eb; padding-top: 1rem; min-height: 2rem; }
#status { margin-left: 0.75rem; color: #3b82f6; }
.err { color: #b00020; }
.muted { color: #6b7280; font-size: 0.9rem; }
.tts-controls { margin-top: 0.75rem; display: flex; align-items: center; gap: 0.75rem; flex-wrap: wrap; }
.tts-controls button { margin-top: 0; background: #f97316; }
.tts-status { font-size: 0.95rem; color: #475569; }
.sentence-scope.anchor-highlight { outline: 2px dashed #f97316; outline-offset: 2px; }
</style>
</head>
<body>
<h1>Grammar Highlighter (spaCy + benepar)</h1>
<textarea id=\"text\" placeholder=\"Type the English text you want to analyze...\"></textarea>
<div>
<button type=\"button\" id=\"submit\">Analyze</button>
<button type=\"button\" id=\"clear\">清空输入</button>
<span id=\"status\"></span>
</div>
<div class=\"tts-controls\">
<button type=\"button\" id=\"tts\">朗读高亮文本</button>
<button type=\"button\" id=\"tts-selection\">朗读选中文本</button>
<button type=\"button\" id=\"tts-anchor\" disabled>从点击处朗读</button>
<button type=\"button\" id=\"tts-toggle\" disabled>暂停播放</button>
<span class=\"tts-status\" id=\"tts-status\"></span>
</div>
<div id=\"result\"></div>
<script>
const btn = document.getElementById('submit');
const btnClear = document.getElementById('clear');
const textarea = document.getElementById('text');
const statusEl = document.getElementById('status');
const ttsBtn = document.getElementById('tts');
const ttsSelectionBtn = document.getElementById('tts-selection');
const ttsAnchorBtn = document.getElementById('tts-anchor');
const ttsToggleBtn = document.getElementById('tts-toggle');
const ttsStatus = document.getElementById('tts-status');
const result = document.getElementById('result');
const TTS_ENDPOINT = 'http://141.140.15.30:8028/generate';
let currentAudio = null;
let queuedAudios = [];
let streamingFinished = false;
let lastAnalyzedText = '';
let anchorSentenceIndex = 0;
let isPaused = false;
let hasHighlightContent = false;
function resetUI() {
result.innerHTML = '';
statusEl.textContent = '';
statusEl.classList.remove('err');
ttsStatus.textContent = '';
hasHighlightContent = false;
if (ttsAnchorBtn) {
ttsAnchorBtn.disabled = true;
}
resetAnchorState();
setTtsButtonsDisabled(false);
resetAudioPlayback();
}
function getSentenceNodes() {
const analysis = result.querySelector('.analysis');
return analysis ? Array.from(analysis.querySelectorAll('.sentence-scope')) : [];
}
function clearAnchorHighlight() {
const highlighted = result.querySelectorAll('.sentence-scope.anchor-highlight');
highlighted.forEach(el => el.classList.remove('anchor-highlight'));
}
function resetAnchorState() {
anchorSentenceIndex = 0;
clearAnchorHighlight();
}
function setAnchorFromSentence(sentenceEl) {
const sentences = getSentenceNodes();
const idx = sentences.indexOf(sentenceEl);
if (idx === -1) return;
anchorSentenceIndex = idx;
clearAnchorHighlight();
sentenceEl.classList.add('anchor-highlight');
const sid = sentenceEl.getAttribute('data-sid') || (idx + 1);
ttsStatus.textContent = '已选择第 ' + sid + ' 句作为朗读起点';
}
btn.addEventListener('click', async () => {
resetUI();
const value = textarea.value.trim();
if (!value) {
statusEl.textContent = '请输入要分析的英文文本。';
statusEl.classList.add('err');
return;
}
btn.disabled = true;
statusEl.textContent = 'Analyzing ...';
try {
const response = await fetch('/analyze', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ text: value })
});
if (!response.ok) {
const error = await response.json().catch(() => ({ detail: 'Request failed' }));
throw new Error(error.detail || 'Request failed');
}
const data = await response.json();
result.innerHTML = data.highlighted_html || '';
lastAnalyzedText = value;
resetAnchorState();
hasHighlightContent = true;
if (ttsAnchorBtn) {
ttsAnchorBtn.disabled = false;
}
statusEl.textContent = '';
} catch (err) {
statusEl.textContent = '错误:' + (err.message || 'Unknown error');
statusEl.classList.add('err');
} finally {
btn.disabled = false;
}
});
btnClear.addEventListener('click', () => {
textarea.value = '';
lastAnalyzedText = '';
resetUI();
textarea.focus();
});
result.addEventListener('click', event => {
if (!hasHighlightContent) {
return;
}
const target = event.target;
const isTextNode = typeof Node !== 'undefined' && target && target.nodeType === Node.TEXT_NODE;
const base = isTextNode ? target.parentElement : target;
if (!base || typeof base.closest !== 'function') {
return;
}
const sentenceEl = base.closest('.sentence-scope');
if (sentenceEl) {
setAnchorFromSentence(sentenceEl);
}
});
function extractHighlightedText() {
const highlightRoot = result.querySelector('.analysis');
return highlightRoot ? highlightRoot.textContent.trim() : '';
}
function getFullTextForTts() {
return lastAnalyzedText || extractHighlightedText();
}
function extractAnchorText() {
const sentences = getSentenceNodes();
if (!sentences.length) return '';
const start = Math.min(anchorSentenceIndex, sentences.length - 1);
const parts = [];
for (let i = start; i < sentences.length; i++) {
const text = sentences[i].textContent.trim();
if (text) {
parts.push(text);
}
}
return parts.join(' ');
}
function setTtsButtonsDisabled(disabled) {
if (ttsBtn) {
ttsBtn.disabled = disabled;
}
if (ttsSelectionBtn) {
ttsSelectionBtn.disabled = disabled;
}
if (ttsAnchorBtn) {
ttsAnchorBtn.disabled = disabled || !hasHighlightContent;
}
}
function resetAudioPlayback() {
queuedAudios = [];
streamingFinished = false;
if (currentAudio) {
currentAudio.pause();
currentAudio = null;
}
resetPauseResumeState();
}
function setPauseResumeEnabled(enabled) {
if (ttsToggleBtn) {
ttsToggleBtn.disabled = !enabled;
}
}
function resetPauseResumeState() {
isPaused = false;
if (ttsToggleBtn) {
ttsToggleBtn.textContent = '暂停播放';
}
setPauseResumeEnabled(false);
}
function markStreamingFinished() {
streamingFinished = true;
if (!currentAudio && !queuedAudios.length && !isPaused) {
ttsStatus.textContent = '播放完成';
setPauseResumeEnabled(false);
}
}
function playNextAudioChunk() {
if (!queuedAudios.length) {
currentAudio = null;
if (streamingFinished && !isPaused) {
ttsStatus.textContent = '播放完成';
setPauseResumeEnabled(false);
} else if (!streamingFinished) {
ttsStatus.textContent = '等待更多语音...';
}
return;
}
const chunk = queuedAudios.shift();
ttsStatus.textContent = '播放中...';
currentAudio = new Audio('data:audio/wav;base64,' + chunk);
currentAudio.onended = () => {
if (!isPaused) {
playNextAudioChunk();
}
};
currentAudio.onerror = () => {
ttsStatus.textContent = '播放失败';
currentAudio = null;
setPauseResumeEnabled(false);
};
currentAudio.play().catch(err => {
ttsStatus.textContent = '自动播放被阻止:' + err.message;
currentAudio = null;
queuedAudios.unshift(chunk);
setPauseResumeEnabled(true);
});
}
function enqueueAudioChunk(chunk) {
queuedAudios.push(chunk);
setPauseResumeEnabled(true);
if (!currentAudio) {
playNextAudioChunk();
}
}
function handlePauseResumeToggle() {
if (!ttsToggleBtn) {
return;
}
if (!currentAudio && !queuedAudios.length) {
ttsStatus.textContent = '暂无可暂停的语音';
return;
}
if (!currentAudio) {
playNextAudioChunk();
ttsToggleBtn.textContent = '暂停播放';
return;
}
if (!isPaused) {
currentAudio.pause();
isPaused = true;
ttsToggleBtn.textContent = '继续播放';
ttsStatus.textContent = '已暂停';
} else {
currentAudio.play().then(() => {
isPaused = false;
ttsToggleBtn.textContent = '暂停播放';
ttsStatus.textContent = '播放中...';
}).catch(err => {
ttsStatus.textContent = '无法继续播放:' + err.message;
});
}
}
function normalizeTtsLine(rawLine) {
if (typeof rawLine !== 'string') {
return '';
}
let trimmed = rawLine.replace(/\\r/g, '').trim();
if (!trimmed) {
return '';
}
if (trimmed.startsWith('data:')) {
trimmed = trimmed.slice(5).trim();
}
if (!trimmed || trimmed === '[DONE]') {
return '';
}
return trimmed;
}
function parseTtsLine(line) {
const normalized = normalizeTtsLine(line);
if (!normalized) {
return false;
}
try {
const parsed = JSON.parse(normalized);
if (parsed && parsed.audio) {
enqueueAudioChunk(parsed.audio);
return true;
}
} catch (err) {
console.warn('无法解析TTS响应行', err);
}
return false;
}
async function consumeTtsResponse(response) {
let chunkCount = 0;
const handleLine = rawLine => {
if (parseTtsLine(rawLine)) {
chunkCount += 1;
}
};
if (response.body && response.body.getReader) {
const reader = response.body.getReader();
const decoder = new TextDecoder();
let buffer = '';
while (true) {
const { value, done } = await reader.read();
if (done) break;
buffer += decoder.decode(value, { stream: true });
let newlineIndex;
while ((newlineIndex = buffer.indexOf('\\n')) >= 0) {
const line = buffer.slice(0, newlineIndex);
buffer = buffer.slice(newlineIndex + 1);
handleLine(line);
}
}
buffer += decoder.decode();
if (buffer) {
handleLine(buffer);
}
} else {
const payload = await response.text();
payload.split('\\n').forEach(handleLine);
}
return chunkCount;
}
function getSelectedPageText() {
const selection = window.getSelection ? window.getSelection() : null;
return selection ? selection.toString().trim() : '';
}
async function streamTtsRequest(text) {
const response = await fetch(TTS_ENDPOINT, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ text })
});
if (!response.ok) {
throw new Error('接口响应错误');
}
const chunkCount = await consumeTtsResponse(response);
if (!chunkCount) {
throw new Error('接口未返回音频数据');
}
markStreamingFinished();
}
function createTtsRequest(textResolver, emptyMessage) {
return async () => {
const text = textResolver();
if (!text) {
ttsStatus.textContent = emptyMessage;
return;
}
setTtsButtonsDisabled(true);
ttsStatus.textContent = '请求语音...';
resetAudioPlayback();
try {
await streamTtsRequest(text);
} catch (err) {
ttsStatus.textContent = 'TTS 出错:' + (err && err.message ? err.message : err);
resetAudioPlayback();
} finally {
setTtsButtonsDisabled(false);
}
};
}
if (ttsBtn) {
ttsBtn.addEventListener('click', createTtsRequest(getFullTextForTts, '请先生成高亮结果'));
}
if (ttsSelectionBtn) {
ttsSelectionBtn.addEventListener('click', createTtsRequest(getSelectedPageText, '请先选择要朗读的文本'));
}
if (ttsAnchorBtn) {
ttsAnchorBtn.addEventListener('click', createTtsRequest(extractAnchorText, '请先在结果中点击句子作为朗读起点'));
}
if (ttsToggleBtn) {
ttsToggleBtn.addEventListener('click', handlePauseResumeToggle);
}
</script>
</body>
</html>"""
  1435. PROXY_PAGE_TEMPLATE = Template(
  1436. """<!DOCTYPE html>
  1437. <html lang=\"zh-CN\">
  1438. <head>
  1439. <meta charset=\"UTF-8\" />
  1440. <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
  1441. <title>Grammar Proxy Highlighter</title>
  1442. <style>
  1443. body { font-family: system-ui, -apple-system, \"Segoe UI\", sans-serif; margin: 0 auto; max-width: 860px; padding: 1.5rem; line-height: 1.65; }
  1444. h1 { font-size: 1.45rem; margin-bottom: 1rem; }
  1445. form { display: flex; flex-wrap: wrap; gap: 0.5rem; margin-bottom: 0.75rem; }
  1446. input[type=\"url\"] { flex: 1 1 260px; padding: 0.65rem; font-size: 1rem; border-radius: 0.5rem; border: 1px solid #d0d7de; }
  1447. button { padding: 0.65rem 1.4rem; border: none; border-radius: 999px; background: #2563eb; color: #fff; font-size: 1rem; cursor: pointer; }
  1448. .show-images-toggle { display: inline-flex; align-items: center; gap: 0.35rem; font-size: 0.9rem; color: #475569; }
  1449. .show-images-toggle input { width: auto; }
  1450. .tts-controls { margin-top: 0.5rem; display: flex; align-items: center; flex-wrap: wrap; gap: 0.75rem; }
  1451. .tts-controls button { background: #f97316; }
  1452. .tts-status { font-size: 0.95rem; color: #475569; }
  1453. .sentence-scope.anchor-highlight { outline: 2px dashed #f97316; outline-offset: 2px; }
  1454. .status { margin-top: 0.25rem; font-size: 0.95rem; }
  1455. .status.err { color: #b00020; }
  1456. .status.ok { color: #059669; }
  1457. section.result { margin-top: 1.4rem; padding-top: 1rem; border-top: 1px solid #e5e7eb; }
  1458. section.result .source { font-size: 0.95rem; margin-bottom: 0.5rem; color: #475569; word-break: break-word; }
  1459. section.result .source a { color: inherit; text-decoration: underline; }
  1460. section.result img { display:block; margin:0.75rem auto; max-width:100%; height:auto; max-width:min(100%,800px); }
  1461. .image-hint { font-size:0.9rem; color:#6b7280; margin:0.5rem 0 0; }
  1462. .clear-floating { position: fixed; left: 0; right: 0; bottom: 0; padding: 0.55rem 1.5rem; border-radius: 0; border-top: 1px solid #e5e7eb; background: rgba(249,250,251,0.96); display: flex; justify-content: center; z-index: 40; }
  1463. .clear-floating button { padding: 0.55rem 1.8rem; border-radius: 999px; background: #6b7280; color: #fff; font-size: 0.95rem; }
  1464. .clear-floating button:hover { filter: brightness(1.05); }
  1465. @media (prefers-reduced-motion: reduce) { .clear-floating { scroll-behavior: auto; } }
  1466. @media (max-width: 640px) { body { padding-bottom: 3.2rem; } }
  1467. </style>
  1468. $style_block
  1469. </head>
  1470. <body>
  1471. <h1>网页代理高亮</h1>
  1472. <form method=\"get\" action=\"/proxy\" class=\"url-form\">
  1473. <input type=\"url\" name=\"url\" value=\"$url_value\" placeholder=\"https://example.com/article\" required />
  1474. <button type=\"submit\">抓取并高亮</button>
  1475. <label class=\"show-images-toggle\">
  1476. <input type=\"checkbox\" name=\"show_images\" value=\"1\" $show_images_checked />
  1477. <span>显示图片(默认关闭以提升速度)</span>
  1478. </label>
  1479. </form>
  1480. $status_block
  1481. <div class=\"tts-controls\">
  1482. <button type=\"button\" id=\"proxy-tts-btn\" disabled>朗读高亮文本</button>
  1483. <button type=\"button\" id=\"proxy-tts-selection\">朗读选中文本</button>
  1484. <button type=\"button\" id=\"proxy-tts-anchor\" disabled>从点击处朗读</button>
  1485. <button type=\"button\" id=\"proxy-tts-toggle\" disabled>暂停播放</button>
  1486. <span class=\"tts-status\" id=\"proxy-tts-status\"></span>
  1487. </div>
  1488. $result_block
  1489. $source_text_script
  1490. <div class=\"clear-floating\">
  1491. <button type=\"button\" id=\"proxy-reset\">清空并重置</button>
  1492. </div>
  1493. <script>
  1494. (function() {
  1495. var resetBtn = document.getElementById('proxy-reset');
  1496. if (resetBtn) {
  1497. resetBtn.addEventListener('click', function() {
  1498. window.location.href = '/proxy';
  1499. });
  1500. }
  1501. var ttsBtn = document.getElementById('proxy-tts-btn');
  1502. var ttsSelectionBtn = document.getElementById('proxy-tts-selection');
  1503. var ttsAnchorBtn = document.getElementById('proxy-tts-anchor');
  1504. var ttsToggleBtn = document.getElementById('proxy-tts-toggle');
  1505. var ttsStatus = document.getElementById('proxy-tts-status');
  1506. var analysisRoot = document.querySelector('section.result .analysis');
  1507. var proxySourceText = window.__proxySourceText || '';
  1508. var TTS_ENDPOINT = 'http://141.140.15.30:8028/generate';
  1509. var currentAudio = null;
  1510. var queuedAudios = [];
  1511. var streamingFinished = false;
  1512. var anchorSentenceIndex = 0;
  1513. var isPaused = false;
  1514. if (analysisRoot && ttsBtn) {
  1515. ttsBtn.disabled = false;
  1516. }
  1517. if (analysisRoot && ttsAnchorBtn) {
  1518. ttsAnchorBtn.disabled = false;
  1519. }
  1520. function extractProxyText() {
  1521. var container = document.querySelector('section.result .analysis');
  1522. return container ? container.textContent.trim() : '';
  1523. }
  1524. function getSentenceNodes() {
  1525. var container = document.querySelector('section.result .analysis');
  1526. return container ? Array.from(container.querySelectorAll('.sentence-scope')) : [];
  1527. }
  1528. function clearAnchorHighlight() {
  1529. var highlighted = document.querySelectorAll('section.result .sentence-scope.anchor-highlight');
  1530. highlighted.forEach(function(el) {
  1531. el.classList.remove('anchor-highlight');
  1532. });
  1533. }
  1534. function resetAnchorState() {
  1535. anchorSentenceIndex = 0;
  1536. clearAnchorHighlight();
  1537. }
  1538. function setAnchorFromSentence(sentenceEl) {
  1539. var sentences = getSentenceNodes();
  1540. var idx = sentences.indexOf(sentenceEl);
  1541. if (idx === -1) return;
  1542. anchorSentenceIndex = idx;
  1543. clearAnchorHighlight();
  1544. sentenceEl.classList.add('anchor-highlight');
  1545. var sid = sentenceEl.getAttribute('data-sid') || (idx + 1);
  1546. ttsStatus.textContent = '已选择第 ' + sid + ' 句作为朗读起点';
  1547. }
  1548. resetAnchorState();
  1549. var resultSection = document.querySelector('section.result');
  1550. if (resultSection) {
  1551. resultSection.addEventListener('click', function(evt) {
  1552. var target = evt.target;
  1553. var isTextNode = typeof Node !== 'undefined' && target && target.nodeType === Node.TEXT_NODE;
  1554. var base = isTextNode ? target.parentElement : target;
  1555. if (!base || typeof base.closest !== 'function') {
  1556. return;
  1557. }
  1558. var sentenceEl = base.closest('.sentence-scope');
  1559. if (sentenceEl) {
  1560. setAnchorFromSentence(sentenceEl);
  1561. }
  1562. });
  1563. }
  1564. function getFullTextForTts() {
  1565. var text = proxySourceText || extractProxyText();
  1566. return text.trim();
  1567. }
  1568. function extractAnchorText() {
  1569. var sentences = getSentenceNodes();
  1570. if (!sentences.length) return '';
  1571. var start = Math.min(anchorSentenceIndex, sentences.length - 1);
  1572. var parts = [];
  1573. for (var i = start; i < sentences.length; i++) {
  1574. var text = sentences[i].textContent.trim();
  1575. if (text) {
  1576. parts.push(text);
  1577. }
  1578. }
  1579. return parts.join(' ');
  1580. }
  1581. function setTtsButtonsDisabled(disabled) {
  1582. if (ttsBtn) {
  1583. ttsBtn.disabled = disabled;
  1584. }
  1585. if (ttsSelectionBtn) {
  1586. ttsSelectionBtn.disabled = disabled;
  1587. }
  1588. if (ttsAnchorBtn) {
  1589. ttsAnchorBtn.disabled = disabled || !analysisRoot;
  1590. }
  1591. }
  1592. function resetAudioPlayback() {
  1593. queuedAudios = [];
  1594. streamingFinished = false;
  1595. if (currentAudio) {
  1596. currentAudio.pause();
  1597. currentAudio = null;
  1598. }
  1599. resetPauseResumeState();
  1600. }
  1601. function setPauseResumeEnabled(enabled) {
  1602. if (ttsToggleBtn) {
  1603. ttsToggleBtn.disabled = !enabled;
  1604. }
  1605. }
  1606. function resetPauseResumeState() {
  1607. isPaused = false;
  1608. if (ttsToggleBtn) {
  1609. ttsToggleBtn.textContent = '暂停播放';
  1610. }
  1611. setPauseResumeEnabled(false);
  1612. }
  1613. function markStreamingFinished() {
  1614. streamingFinished = true;
  1615. if (!currentAudio && !queuedAudios.length && !isPaused) {
  1616. ttsStatus.textContent = '播放完成';
  1617. setPauseResumeEnabled(false);
  1618. }
  1619. }
  1620. function playNextAudioChunk() {
  1621. if (!queuedAudios.length) {
  1622. currentAudio = null;
  1623. if (streamingFinished && !isPaused) {
  1624. ttsStatus.textContent = '播放完成';
  1625. setPauseResumeEnabled(false);
  1626. } else if (!streamingFinished) {
  1627. ttsStatus.textContent = '等待更多语音...';
  1628. }
  1629. return;
  1630. }
  1631. var chunk = queuedAudios.shift();
  1632. ttsStatus.textContent = '播放中...';
  1633. currentAudio = new Audio('data:audio/wav;base64,' + chunk);
  1634. currentAudio.onended = function() {
  1635. if (!isPaused) {
  1636. playNextAudioChunk();
  1637. }
  1638. };
  1639. currentAudio.onerror = function() {
  1640. ttsStatus.textContent = '播放失败';
  1641. currentAudio = null;
  1642. setPauseResumeEnabled(false);
  1643. };
  1644. currentAudio.play().catch(function(err) {
  1645. ttsStatus.textContent = '自动播放被阻止:' + err.message;
  1646. currentAudio = null;
  1647. queuedAudios.unshift(chunk);
  1648. setPauseResumeEnabled(true);
  1649. });
  1650. }
  1651. function enqueueAudioChunk(chunk) {
  1652. queuedAudios.push(chunk);
  1653. setPauseResumeEnabled(true);
  1654. if (!currentAudio) {
  1655. playNextAudioChunk();
  1656. }
  1657. }
  1658. function handlePauseResumeToggle() {
  1659. if (!ttsToggleBtn) {
  1660. return;
  1661. }
  1662. if (!currentAudio && !queuedAudios.length) {
  1663. ttsStatus.textContent = '暂无可暂停的语音';
  1664. return;
  1665. }
  1666. if (!currentAudio) {
  1667. playNextAudioChunk();
  1668. ttsToggleBtn.textContent = '暂停播放';
  1669. return;
  1670. }
  1671. if (!isPaused) {
  1672. currentAudio.pause();
  1673. isPaused = true;
  1674. ttsToggleBtn.textContent = '继续播放';
  1675. ttsStatus.textContent = '已暂停';
  1676. } else {
  1677. currentAudio.play().then(function() {
  1678. isPaused = false;
  1679. ttsToggleBtn.textContent = '暂停播放';
  1680. ttsStatus.textContent = '播放中...';
  1681. }).catch(function(err) {
  1682. ttsStatus.textContent = '无法继续播放:' + err.message;
  1683. });
  1684. }
  1685. }
  1686. function normalizeProxyTtsLine(rawLine) {
  1687. if (typeof rawLine !== 'string') {
  1688. return '';
  1689. }
  1690. var trimmed = rawLine.replace(/\\r/g, '').trim();
  1691. if (!trimmed) {
  1692. return '';
  1693. }
  1694. if (trimmed.indexOf('data:') === 0) {
  1695. trimmed = trimmed.slice(5).trim();
  1696. }
  1697. if (!trimmed || trimmed === '[DONE]') {
  1698. return '';
  1699. }
  1700. return trimmed;
  1701. }
  1702. function parseTtsLine(line) {
  1703. var normalized = normalizeProxyTtsLine(line);
  1704. if (!normalized) {
  1705. return false;
  1706. }
  1707. try {
  1708. var parsed = JSON.parse(normalized);
  1709. if (parsed && parsed.audio) {
  1710. enqueueAudioChunk(parsed.audio);
  1711. return true;
  1712. }
  1713. } catch (err) {
  1714. console.warn('无法解析TTS响应行', err);
  1715. }
  1716. return false;
  1717. }
  1718. async function consumeTtsResponse(response) {
  1719. var chunkCount = 0;
  1720. var handleLine = function(rawLine) {
  1721. if (parseTtsLine(rawLine)) {
  1722. chunkCount += 1;
  1723. }
  1724. };
  1725. if (response.body && response.body.getReader) {
  1726. var reader = response.body.getReader();
  1727. var decoder = new TextDecoder();
  1728. var buffer = '';
  1729. while (true) {
  1730. var readResult = await reader.read();
  1731. if (readResult.done) {
  1732. break;
  1733. }
  1734. buffer += decoder.decode(readResult.value, { stream: true });
  1735. var newlineIndex;
  1736. while ((newlineIndex = buffer.indexOf('\\n')) >= 0) {
  1737. var line = buffer.slice(0, newlineIndex);
  1738. buffer = buffer.slice(newlineIndex + 1);
  1739. handleLine(line);
  1740. }
  1741. }
  1742. buffer += decoder.decode();
  1743. if (buffer) {
  1744. handleLine(buffer);
  1745. }
  1746. } else {
  1747. var payload = await response.text();
  1748. payload.split('\\n').forEach(handleLine);
  1749. }
  1750. return chunkCount;
  1751. }
  1752. function getSelectedPageText() {
  1753. var selection = window.getSelection ? window.getSelection() : null;
  1754. return selection ? selection.toString().trim() : '';
  1755. }
  1756. async function streamTtsRequest(text) {
  1757. var response = await fetch(TTS_ENDPOINT, {
  1758. method: 'POST',
  1759. headers: { 'Content-Type': 'application/json' },
  1760. body: JSON.stringify({ text: text })
  1761. });
  1762. if (!response.ok) {
  1763. throw new Error('接口响应错误');
  1764. }
  1765. var chunkCount = await consumeTtsResponse(response);
  1766. if (!chunkCount) {
  1767. throw new Error('接口未返回音频数据');
  1768. }
  1769. markStreamingFinished();
  1770. }
  1771. function createTtsRequest(textResolver, emptyMessage) {
  1772. return async function() {
  1773. var text = textResolver();
  1774. if (!text) {
  1775. ttsStatus.textContent = emptyMessage;
  1776. return;
  1777. }
  1778. setTtsButtonsDisabled(true);
  1779. ttsStatus.textContent = '请求语音...';
  1780. resetAudioPlayback();
  1781. try {
  1782. await streamTtsRequest(text);
  1783. } catch (err) {
  1784. ttsStatus.textContent = 'TTS 出错:' + (err && err.message ? err.message : err);
  1785. resetAudioPlayback();
  1786. } finally {
  1787. setTtsButtonsDisabled(false);
  1788. }
  1789. };
  1790. }
  1791. if (ttsBtn) {
  1792. ttsBtn.addEventListener('click', createTtsRequest(getFullTextForTts, '请先抓取文章内容再朗读'));
  1793. }
  1794. if (ttsSelectionBtn) {
  1795. ttsSelectionBtn.addEventListener('click', createTtsRequest(getSelectedPageText, '请先选择要朗读的文本'));
  1796. }
  1797. if (ttsAnchorBtn) {
  1798. ttsAnchorBtn.addEventListener('click', createTtsRequest(extractAnchorText, '请先点击句子作为朗读起点'));
  1799. }
  1800. if (ttsToggleBtn) {
  1801. ttsToggleBtn.addEventListener('click', handlePauseResumeToggle);
  1802. }
  1803. })();
  1804. </script>
  1805. </body>
  1806. </html>"""
  1807. )
# Only plain web schemes are fetched by the proxy; presumably enforced by the
# URL-validation path elsewhere in this file — confirm against the handler.
ALLOWED_URL_SCHEMES = {"http", "https"}
# Hard cap on the remote document size (bytes) to avoid huge downloads.
MAX_REMOTE_HTML_BYTES = 1_000_000
# Timeout (seconds) for a single remote fetch attempt.
REMOTE_FETCH_TIMEOUT = 10.0
REMOTE_FETCH_HEADERS = {
    # Use a browser-like user agent and common headers so that sites which
    # block generic HTTP clients are more likely to return normal content.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    # Let httpx / the underlying HTTP stack negotiate an encoding it can
    # actually decode. If we unconditionally advertise "br" but the runtime
    # does not have brotli support installed, some sites will respond with
    # brotli-compressed payloads that end up garbled (mojibake) or as
    # decoding errors.
    #
    # Most modern servers default to gzip or identity when the header is
    # absent, which are both handled fine by httpx.
    # "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    # A few anti-bot setups check these request headers; keeping them close
    # to real desktop Chrome values slightly improves compatibility, even
    # though they are not a guarantee against 403 responses.
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-User": "?1",
    "Sec-Fetch-Dest": "document",
}
SIMPLE_FETCH_HEADERS = {
    # Minimal browser-like headers for the fallback "simple request" path.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Connection": "close",
}
  1850. def _inject_proxy_images(html_fragment: str, images: List[Dict[str, str]]) -> str:
  1851. """Replace stable image placeholders with <img> tags in the highlighted HTML."""
  1852. result = html_fragment
  1853. for idx, img in enumerate(images):
  1854. marker = img.get("marker") or f"__GHIMG_{idx}__"
  1855. src = html.escape(img.get("src", "") or "", quote=True)
  1856. if not src:
  1857. continue
  1858. alt = html.escape(img.get("alt", "") or "", quote=True)
  1859. title = html.escape(img.get("title", "") or "", quote=True)
  1860. attrs = [f"src='{src}'"]
  1861. if alt:
  1862. attrs.append(f"alt='{alt}'")
  1863. if title:
  1864. attrs.append(f"title='{title}'")
  1865. # Preserve simple width/height hints when they look safe. Most modern
  1866. # pages rely on CSS for sizing, but explicit attributes can help keep
  1867. # code snippets or diagrams close to their original scale.
  1868. def _safe_dim(value: Optional[str]) -> Optional[str]:
  1869. if not value:
  1870. return None
  1871. value = value.strip()
  1872. if re.fullmatch(r"\d+(?:\.\d+)?(px|%)?", value):
  1873. return value
  1874. return None
  1875. width = _safe_dim(img.get("width"))
  1876. height = _safe_dim(img.get("height"))
  1877. if width:
  1878. attrs.append(f"width='{html.escape(width, quote=True)}'")
  1879. if height:
  1880. attrs.append(f"height='{html.escape(height, quote=True)}'")
  1881. img_tag = "<img " + " ".join(attrs) + " />"
  1882. # Simple textual replacement is sufficient because placeholders
  1883. # are emitted as plain word tokens without HTML meta characters.
  1884. result = result.replace(marker, img_tag)
  1885. return result
  1886. IMG_MARKER_RE = re.compile(r"__GHIMG_\d+__")
  1887. def _strip_proxy_image_markers(html_fragment: str) -> str:
  1888. """Remove residual image placeholders when images are hidden."""
  1889. if IMG_MARKER_RE.search(html_fragment) is None:
  1890. return html_fragment
  1891. return IMG_MARKER_RE.sub("", html_fragment)
  1892. def _inject_proxy_codeblocks(html_fragment: str, code_blocks: List[Dict[str, str]]) -> str:
  1893. """Replace code placeholders with <pre><code> blocks, preserving formatting."""
  1894. result = html_fragment
  1895. for idx, block in enumerate(code_blocks):
  1896. marker = block.get("marker") or f"__GHCODE_{idx}__"
  1897. raw = block.get("text") or ""
  1898. if not raw.strip():
  1899. continue
  1900. # Escape HTML but keep newlines so that <pre> preserves formatting.
  1901. code_html = html.escape(raw, quote=False)
  1902. pre_tag = f"<pre><code>{code_html}</code></pre>"
  1903. result = result.replace(marker, pre_tag)
  1904. return result
  1905. class SimpleHTMLStripper(HTMLParser):
    def __init__(self):
        """Initialize all per-document parser state for one HTML pass."""
        super().__init__()
        # Accumulate visible text into paragraph-like blocks while skipping
        # navigation / sidebars / ads etc. We do this with a small HTML
        # structure-aware state machine instead of flattening everything.
        self._blocks: List[Dict[str, Any]] = []
        # Inline tokens of the paragraph currently being assembled.
        self._current_parts: List[str] = []
        # Track when we are inside potentially main content containers
        # like <article> or <main>.
        self._article_depth = 0
        # Track whether we are inside a preformatted code block so that we
        # can preserve indentation and line breaks instead of collapsing
        # whitespace as normal text.
        self._in_pre = False
        self._in_code = False
        # Raw character chunks of the <pre> block currently being collected.
        self._current_code_chunks: List[str] = []
        # Finished code blocks, each with a stable placeholder marker.
        self._code_blocks: List[Dict[str, str]] = []
        # Stack of flags indicating which open tags should be skipped.
        # When any active flag is True, textual data is ignored.
        self._skip_stack: List[bool] = []
        self._skip_depth = 0
        # <title> / <h1> capture state for deriving a page heading.
        self._title_chunks: List[str] = []
        self._in_title = False
        self._h1_chunks: List[str] = []
        self._h1_main_chunks: List[str] = []
        self._in_h1 = False
        # Collected inline images from the main content, in document order.
        # Each image is represented as a small dict with sanitized attributes.
        self._images: List[Dict[str, str]] = []
        # Active list containers (<ul>/<ol>) and current <li> nesting state.
        self._list_stack: List[Dict[str, Any]] = []
        self._list_item_stack: List[Dict[str, Any]] = []
    # Keywords commonly used in class/id attributes for non-article areas.
    # Matched as plain substrings against the combined class+id string.
    _NOISE_KEYWORDS = {
        "sidebar",
        "side-bar",
        "aside",
        "nav",
        "menu",
        "breadcrumb",
        "breadcrumbs",
        "pagination",
        "pager",
        "comment",
        "comments",
        "reply",
        "advert",
        "ad-",
        "ads",
        "sponsor",
        "promo",
        "promotion",
        "related",
        "recommend",
        "share",
        "social",
        "subscribe",
        "signup",
        "login",
        "popup",
        "modal",
        "banner",
        "cookie",
        "notification",
        "toolbar",
        "footer",
        "header-bar",
    }
    # Tags whose textual content is almost never part of the main article.
    _ALWAYS_SKIP_TAGS = {
        "script",
        "style",
        "noscript",
        "nav",
        "aside",
        "footer",
        "form",
        "svg",
        "iframe",
        "button",
        "input",
        "textarea",
        "select",
        "option",
        "label",
    }
    # Structural container tags where noise classes/roles are meaningful.
    # For purely inline tags we avoid applying aggressive noise heuristics
    # so that important inline text (e.g. spans in the first sentence) is
    # not accidentally dropped.
    _STRUCTURAL_NOISE_TAGS = {
        "div",
        "section",
        "aside",
        "nav",
        "header",
        "footer",
        "main",
        "article",
        "ul",
        "ol",
        "li",
    }
    # Block-level tags that naturally mark paragraph boundaries.
    _BLOCK_TAGS = {
        "p",
        "li",
        "blockquote",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "pre",
        "table",
        "tr",
    }
    # Keywords for containers that are likely to hold the main article body.
    # Used to decide which regions count as "main content" for both text
    # and inline images.
    _CONTENT_KEYWORDS = {
        "content",
        "main-content",
        "article-body",
        "post-body",
        "post-content",
        "entry-content",
        "story-body",
        "blog-post",
        "markdown-body",
        "readable-content",
    }
    # Keywords on image-related class/id/src that usually indicate avatars,
    # logo icons, decorative banners, etc., which we want to drop from the
    # extracted main content.
    _IMAGE_NOISE_KEYWORDS = {
        "avatar",
        "author",
        "logo",
        "icon",
        "favicon",
        "badge",
        "banner",
        "thumb",
        "thumbnail",
        "profile",
        "cover",
        "background",
        "sprite",
        "emoji",
        "reaction",
    }
    # Keywords (English and Chinese) that mark short boilerplate paragraphs
    # such as menus, share bars and prev/next links for removal.
    _TEXT_NOISE_KEYWORDS = {
        "menu",
        "menus",
        "navigation",
        "nav",
        "目录",
        "目錄",
        "导航",
        "導航",
        "菜单",
        "菜單",
        "广告",
        "廣告",
        "ad",
        "ads",
        "sponsor",
        "sponsored",
        "上一篇",
        "下一篇",
        "返回顶部",
        "返回頂部",
        "分享",
        "分享至",
        "相关推荐",
        "相关阅读",
        "相關閱讀",
        "recommended",
        "related posts",
        "login",
        "signup",
    }
    # Paragraph prefixes that mark boilerplate regardless of keyword position.
    _TEXT_NOISE_PREFIXES = (
        "目录",
        "目錄",
        "导航",
        "導航",
        "菜单",
        "菜單",
        "广告",
        "廣告",
        "上一篇",
        "下一篇",
        "上一页",
        "下一页",
        "返回目录",
        "返回目錄",
        "返回顶部",
        "返回頂部",
        "分享",
        "相关",
        "相關",
        "recommended",
        "login",
        "signup",
    )
  2114. def _finish_paragraph(self) -> None:
  2115. """Flush current buffered tokens into a paragraph list."""
  2116. if not self._current_parts:
  2117. return
  2118. # For regular paragraphs we still collapse excessive internal
  2119. # whitespace, but we keep logical breaks between paragraphs
  2120. # themselves so that the downstream highlighter can reconstruct
  2121. # paragraph structure.
  2122. text = " ".join(self._current_parts)
  2123. text = re.sub(r"\s+", " ", text).strip()
  2124. self._current_parts = []
  2125. if not text:
  2126. return
  2127. if self._looks_like_noise_paragraph(text):
  2128. return
  2129. block_kind = "paragraph"
  2130. list_kind: Optional[str] = None
  2131. list_depth = 0
  2132. list_index: Optional[int] = None
  2133. if self._list_item_stack:
  2134. list_ctx = self._list_item_stack[-1]
  2135. block_kind = "list-item"
  2136. list_kind = list_ctx.get("list_type") or "ul"
  2137. depth_value = list_ctx.get("depth", 1)
  2138. try:
  2139. depth_int = int(depth_value)
  2140. except (TypeError, ValueError):
  2141. depth_int = 1
  2142. list_depth = min(max(depth_int, 1), 5)
  2143. if list_kind == "ol":
  2144. idx = list_ctx.get("index")
  2145. if isinstance(idx, int):
  2146. list_index = idx
  2147. self._blocks.append(
  2148. {
  2149. "text": text,
  2150. "is_main": self._article_depth > 0,
  2151. "kind": block_kind,
  2152. "list_kind": list_kind,
  2153. "list_depth": list_depth,
  2154. "list_index": list_index,
  2155. }
  2156. )
  2157. def _looks_like_noise_paragraph(self, text: str) -> bool:
  2158. normalized = text.strip()
  2159. if not normalized:
  2160. return True
  2161. lowered = normalized.lower()
  2162. compact = re.sub(r"\s+", "", lowered)
  2163. for prefix in self._TEXT_NOISE_PREFIXES:
  2164. if lowered.startswith(prefix.lower()):
  2165. if len(normalized) <= 80:
  2166. return True
  2167. if len(normalized) <= 80:
  2168. for keyword in self._TEXT_NOISE_KEYWORDS:
  2169. if keyword in lowered or keyword in compact:
  2170. return True
  2171. # Skip very short bullet-like crumbs that mostly consist of symbols.
  2172. if len(normalized) <= 6 and sum(ch.isalnum() for ch in normalized) <= 1:
  2173. return True
  2174. return False
  2175. @staticmethod
  2176. def _parse_ordered_start(raw_value: Optional[str]) -> int:
  2177. if raw_value is None:
  2178. return 1
  2179. value = raw_value.strip()
  2180. if not value:
  2181. return 1
  2182. try:
  2183. parsed = int(value)
  2184. return parsed if parsed >= 1 else 1
  2185. except ValueError:
  2186. return 1
    def handle_starttag(self, tag, attrs):
        """Update parser state for an opening tag.

        In order: flush the current paragraph at block boundaries, enter
        <pre> capture mode, push a skip flag for noise/boilerplate elements,
        track article/list/title/h1 context, and record inline <img> tokens
        from the main content. The statement order is significant: the skip
        flag is pushed exactly once per start tag before any early return.
        """
        lowered = tag.lower()
        # Paragraph boundary before starting a new block element or <br>.
        if lowered in self._BLOCK_TAGS or lowered == "br":
            if self._skip_depth == 0:
                self._finish_paragraph()
        # Entering a <pre> region - treat it as a dedicated code block.
        if lowered == "pre" and self._skip_depth == 0:
            self._finish_paragraph()
            self._in_pre = True
            self._current_code_chunks = []
        # Decide whether this element should be skipped entirely.
        attr_dict = {k.lower(): (v or "") for k, v in attrs}
        role = attr_dict.get("role", "").lower()
        classes_ids = (attr_dict.get("class", "") + " " + attr_dict.get("id", "")).lower()
        is_noise_attr = False
        # Only treat class/id keywords as layout "noise" on structural
        # containers (div/section/nav/etc). Inline tags with "comment"
        # in their class (like mdspan-comment on Towards Data Science)
        # should not be discarded, otherwise we lose the first words
        # of sentences.
        if lowered in self._STRUCTURAL_NOISE_TAGS:
            is_noise_attr = any(key in classes_ids for key in self._NOISE_KEYWORDS)
        if role in {"navigation", "banner", "contentinfo", "complementary"}:
            is_noise_attr = True
        skip_this = lowered in self._ALWAYS_SKIP_TAGS or is_noise_attr
        if skip_this:
            self._skip_depth += 1
        # NOTE(review): this also pushes a flag for void elements (<img>,
        # <br>, <input>, ...) that never produce a matching end tag — confirm
        # handle_endtag keeps _skip_stack balanced for those.
        self._skip_stack.append(skip_this)
        # Track when we are inside an article-like container; only count if not skipped.
        if self._skip_depth == 0 and lowered in {"article", "main", "section", "div"}:
            # Treat semantic containers and common "main content" classes as
            # part of the article area so that we keep their text and inline
            # media but still avoid sidebars / nav.
            if lowered in {"article", "main"} or any(
                key in classes_ids for key in self._CONTENT_KEYWORDS
            ) or role == "main":
                self._article_depth += 1
        if self._skip_depth == 0 and lowered in {"ul", "ol"}:
            start = 1
            if lowered == "ol":
                start = self._parse_ordered_start(attr_dict.get("start"))
            self._list_stack.append(
                {
                    "type": lowered,
                    "start": start,
                    "next_index": start,
                }
            )
        if lowered == "li" and self._skip_depth == 0:
            list_ctx = self._list_stack[-1] if self._list_stack else None
            depth = len(self._list_stack) if self._list_stack else 1
            list_type = list_ctx.get("type") if list_ctx else "ul"
            index = None
            if list_ctx and list_ctx["type"] == "ol":
                index = list_ctx["next_index"]
                list_ctx["next_index"] = index + 1
            # An explicit <li value="..."> overrides the running counter,
            # mirroring browser behavior for ordered lists.
            li_value = attr_dict.get("value")
            if li_value and list_ctx and list_ctx["type"] == "ol":
                try:
                    value_idx = int(li_value)
                    index = value_idx
                    list_ctx["next_index"] = value_idx + 1
                except ValueError:
                    pass
            self._list_item_stack.append(
                {
                    "list_type": list_type,
                    "index": index,
                    "depth": depth,
                }
            )
        if lowered == "title" and self._skip_depth == 0:
            self._in_title = True
        if lowered == "h1" and self._skip_depth == 0:
            self._in_h1 = True
        if lowered == "code" and self._skip_depth == 0 and self._in_pre:
            # Nested <code> inside <pre> - keep track but we don't need
            # separate buffering beyond the enclosing pre block.
            self._in_code = True
        # Inline image handling: only keep <img> elements that are inside the
        # main article content (tracked via _article_depth) and that do not
        # look like avatars / logos / decorative icons. We insert a stable
        # placeholder token into the text stream so that the /proxy renderer
        # can later replace it with a real <img> tag while preserving the
        # grammar highlighting.
        if lowered == "img" and self._skip_depth == 0 and self._article_depth > 0:
            src = attr_dict.get("src", "").strip()
            if src:
                alt = attr_dict.get("alt", "") or ""
                title = attr_dict.get("title", "") or ""
                width = (attr_dict.get("width") or "").strip()
                height = (attr_dict.get("height") or "").strip()
                img_classes_ids = classes_ids + " " + src.lower()
                if any(key in img_classes_ids for key in self._IMAGE_NOISE_KEYWORDS):
                    return
                marker = f"__GHIMG_{len(self._images)}__"
                img_info: Dict[str, str] = {
                    "marker": marker,
                    "src": src,
                    "alt": alt,
                    "title": title,
                }
                if width:
                    img_info["width"] = width
                if height:
                    img_info["height"] = height
                self._images.append(img_info)
                # Treat the image as an inline token within the current
                # paragraph. Paragraph finishing logic will ensure it
                # stays grouped with surrounding text.
                self._current_parts.append(marker)
  2299. def handle_endtag(self, tag):
  2300. lowered = tag.lower()
  2301. if lowered == "code" and self._in_code:
  2302. self._in_code = False
  2303. if lowered == "pre" and self._in_pre:
  2304. self._in_pre = False
  2305. # Finalize the current code block into a single placeholder
  2306. # token so that it passes through the grammar highlighter
  2307. # untouched, and can later be restored as a <pre><code> block.
  2308. code_text = "".join(self._current_code_chunks)
  2309. self._current_code_chunks = []
  2310. if code_text.strip() and self._skip_depth == 0:
  2311. marker = f"__GHCODE_{len(self._code_blocks)}__"
  2312. self._code_blocks.append({"marker": marker, "text": code_text})
  2313. # We append the marker to the paragraph parts so that
  2314. # get_text() emits it in the right position.
  2315. self._current_parts.append(marker)
  2316. # Closing a block element ends the current paragraph.
  2317. if lowered in self._BLOCK_TAGS and self._skip_depth == 0:
  2318. self._finish_paragraph()
  2319. if lowered == "li" and self._skip_depth == 0 and self._list_item_stack:
  2320. self._list_item_stack.pop()
  2321. if lowered in {"ul", "ol"} and self._skip_depth == 0 and self._list_stack:
  2322. self._list_stack.pop()
  2323. if lowered == "title":
  2324. self._in_title = False
  2325. if lowered == "h1":
  2326. self._in_h1 = False
  2327. if lowered in {"article", "main", "section"} and self._skip_depth == 0 and self._article_depth > 0:
  2328. self._article_depth -= 1
  2329. if self._skip_stack:
  2330. skip_this = self._skip_stack.pop()
  2331. if skip_this and self._skip_depth > 0:
  2332. self._skip_depth -= 1
  2333. def handle_data(self, data):
  2334. if self._skip_depth > 0:
  2335. return
  2336. if self._in_pre or self._in_code:
  2337. # Preserve code blocks exactly as they appear, including
  2338. # newlines and indentation.
  2339. self._current_code_chunks.append(data)
  2340. return
  2341. stripped = data.strip()
  2342. if not stripped:
  2343. return
  2344. if self._in_title:
  2345. self._title_chunks.append(stripped)
  2346. return
  2347. # Regular visible text
  2348. self._current_parts.append(stripped)
  2349. if self._in_h1:
  2350. self._h1_chunks.append(stripped)
  2351. if self._article_depth > 0:
  2352. self._h1_main_chunks.append(stripped)
  2353. def get_text(self) -> str:
  2354. # Flush any trailing paragraph.
  2355. self._finish_paragraph()
  2356. blocks = self._selected_blocks()
  2357. if not blocks:
  2358. return ""
  2359. # Keep natural paragraphs contiguous with a single newline instead of
  2360. # injecting blank lines that did not exist in the source.
  2361. return "\n".join(block["text"] for block in blocks)
  2362. def _selected_blocks(self) -> List[Dict[str, Any]]:
  2363. if not self._blocks:
  2364. return []
  2365. main_blocks = [block for block in self._blocks if block.get("is_main")]
  2366. return main_blocks if main_blocks else self._blocks
  2367. def get_blocks(self) -> List[Dict[str, Any]]:
  2368. blocks = self._selected_blocks()
  2369. return [dict(block) for block in blocks]
  2370. def get_title(self) -> str:
  2371. # Prefer <h1> heading (especially inside <article>/<main>) as the
  2372. # primary title; fall back to <title>.
  2373. if self._h1_main_chunks:
  2374. raw = " ".join(self._h1_main_chunks)
  2375. elif self._h1_chunks:
  2376. raw = " ".join(self._h1_chunks)
  2377. elif self._title_chunks:
  2378. raw = " ".join(self._title_chunks)
  2379. else:
  2380. return ""
  2381. return re.sub(r"\s+", " ", raw).strip()
  2382. def get_images(self) -> List[Dict[str, str]]:
  2383. """Return the list of captured inline images in document order."""
  2384. return list(self._images)
  2385. def get_code_blocks(self) -> List[Dict[str, str]]:
  2386. """Return captured code blocks (from <pre>/<code>) in document order."""
  2387. return list(self._code_blocks)
  2388. def _normalize_target_url(raw_url: str) -> str:
  2389. candidate = (raw_url or "").strip()
  2390. if not candidate:
  2391. raise ValueError("请输入要抓取的 URL。")
  2392. parsed = urlparse(candidate if "://" in candidate else f"https://{candidate}")
  2393. if parsed.scheme not in ALLOWED_URL_SCHEMES:
  2394. raise ValueError("仅支持 http/https 协议链接。")
  2395. if not parsed.netloc:
  2396. raise ValueError("URL 缺少域名部分。")
  2397. sanitized = parsed._replace(fragment="")
  2398. return urlunparse(sanitized)
  2399. def _fallback_html_to_text(html_body: str) -> str:
  2400. """Very simple HTML-to-text fallback used when structured extraction fails.
  2401. This does not attempt to distinguish main content from navigation, but it
  2402. guarantees we return *something* for pages whose structure confuses the
  2403. SimpleHTMLStripper heuristics (e.g. some mirror sites).
  2404. """
  2405. # Drop script/style/noscript content outright.
  2406. cleaned = re.sub(
  2407. r"(?is)<(script|style|noscript)[^>]*>.*?</\1>",
  2408. " ",
  2409. html_body,
  2410. )
  2411. # Convert common block separators into newlines.
  2412. cleaned = re.sub(r"(?i)<br\s*/?>", "\n", cleaned)
  2413. cleaned = re.sub(r"(?i)</p\s*>", "\n\n", cleaned)
  2414. cleaned = re.sub(r"(?i)</(div|section|article|li|h[1-6])\s*>", "\n\n", cleaned)
  2415. # Remove all remaining tags.
  2416. cleaned = re.sub(r"(?is)<[^>]+>", " ", cleaned)
  2417. cleaned = html.unescape(cleaned)
  2418. # Normalize whitespace but keep paragraph-level blank lines.
  2419. cleaned = cleaned.replace("\r", "")
  2420. # Collapse runs of spaces/tabs inside lines.
  2421. cleaned = re.sub(r"[ \t\f\v]+", " ", cleaned)
  2422. # Collapse 3+ blank lines into just 2.
  2423. cleaned = re.sub(r"\n\s*\n\s*\n+", "\n\n", cleaned)
  2424. cleaned = cleaned.strip()
  2425. return cleaned
  2426. def _build_paragraph_metadata(blocks: List[Dict[str, Any]]) -> List[Dict[str, str]]:
  2427. """Convert stripped block info into span attributes for downstream rendering."""
  2428. if not blocks:
  2429. return []
  2430. paragraph_meta: List[Dict[str, str]] = []
  2431. for block in blocks:
  2432. attrs: Dict[str, str] = {}
  2433. if block.get("kind") == "list-item" and block.get("list_kind"):
  2434. attrs["data-list-kind"] = str(block["list_kind"])
  2435. depth = block.get("list_depth")
  2436. if depth:
  2437. attrs["data-list-depth"] = str(depth)
  2438. if block.get("list_kind") == "ol" and block.get("list_index") is not None:
  2439. attrs["data-list-index"] = str(block["list_index"])
  2440. paragraph_meta.append(attrs)
  2441. return paragraph_meta
  2442. def _build_paragraph_ranges(blocks: List[Dict[str, Any]]) -> List[Tuple[int, int]]:
  2443. """Map each stripped block to its char span within the joined plain text."""
  2444. if not blocks:
  2445. return []
  2446. ranges: List[Tuple[int, int]] = []
  2447. cursor = 0
  2448. for idx, block in enumerate(blocks):
  2449. text = block.get("text") or ""
  2450. start = cursor
  2451. end = start + len(text)
  2452. ranges.append((start, end))
  2453. cursor = end
  2454. # Plain text joins blocks with a single newline; skip trailing newline.
  2455. if idx < len(blocks) - 1:
  2456. cursor += 1
  2457. return ranges
  2458. def _decode_html_bytes(raw_content: bytes, encoding_hint: Optional[str]) -> str:
  2459. encoding_candidates: List[str] = []
  2460. if encoding_hint:
  2461. encoding_candidates.append(encoding_hint)
  2462. encoding_candidates.extend(["utf-8", "latin-1"])
  2463. last_exc: Optional[Exception] = None
  2464. for enc in encoding_candidates:
  2465. try:
  2466. html_body = raw_content.decode(enc, errors="replace")
  2467. break
  2468. except Exception as exc: # pragma: no cover - defensive
  2469. last_exc = exc
  2470. else: # pragma: no cover - extremely unlikely
  2471. raise RuntimeError(f"无法解码远程页面内容: {last_exc}")
  2472. if len(html_body) > MAX_REMOTE_HTML_BYTES:
  2473. html_body = html_body[:MAX_REMOTE_HTML_BYTES]
  2474. return html_body
  2475. async def _download_html_via_httpx(url: str) -> str:
  2476. async with httpx.AsyncClient(timeout=REMOTE_FETCH_TIMEOUT, follow_redirects=True) as client:
  2477. response = await client.get(url, headers=REMOTE_FETCH_HEADERS)
  2478. html_body = _decode_html_bytes(response.content, response.encoding)
  2479. response.raise_for_status()
  2480. return html_body
  2481. async def _download_html_via_stdlib(url: str) -> str:
  2482. def _sync_fetch() -> Tuple[bytes, Optional[str]]:
  2483. req = urllib_request.Request(url, headers=SIMPLE_FETCH_HEADERS)
  2484. opener = urllib_request.build_opener(urllib_request.ProxyHandler({}))
  2485. with opener.open(req, timeout=REMOTE_FETCH_TIMEOUT) as resp:
  2486. data = resp.read(MAX_REMOTE_HTML_BYTES + 1)
  2487. headers = getattr(resp, "headers", None)
  2488. encoding_hint = None
  2489. if headers is not None:
  2490. get_charset = getattr(headers, "get_content_charset", None)
  2491. if callable(get_charset):
  2492. encoding_hint = get_charset()
  2493. if not encoding_hint:
  2494. content_type = headers.get("Content-Type", "")
  2495. match = re.search(r"charset=([\w-]+)", content_type or "", re.IGNORECASE)
  2496. if match:
  2497. encoding_hint = match.group(1)
  2498. return data, encoding_hint
  2499. raw_content, encoding_hint = await asyncio.to_thread(_sync_fetch)
  2500. return _decode_html_bytes(raw_content, encoding_hint)
  2501. async def _download_html_with_fallback(url: str) -> str:
  2502. first_exc: Optional[Exception] = None
  2503. try:
  2504. return await _download_html_via_httpx(url)
  2505. except httpx.HTTPStatusError as exc:
  2506. status = exc.response.status_code if exc.response is not None else None
  2507. if status not in {401, 403, 407, 451, 429}:
  2508. raise
  2509. first_exc = exc
  2510. except httpx.HTTPError as exc:
  2511. first_exc = exc
  2512. try:
  2513. return await _download_html_via_stdlib(url)
  2514. except (urllib_error.URLError, urllib_error.HTTPError, TimeoutError) as fallback_exc:
  2515. if first_exc:
  2516. raise first_exc from fallback_exc
  2517. raise
  2518. async def _fetch_remote_plaintext(
  2519. url: str,
  2520. ) -> Tuple[
  2521. str,
  2522. str,
  2523. str,
  2524. List[Dict[str, str]],
  2525. List[Dict[str, str]],
  2526. List[Dict[str, str]],
  2527. List[Tuple[int, int]],
  2528. ]:
  2529. normalized = _normalize_target_url(url)
  2530. html_body = await _download_html_with_fallback(normalized)
  2531. stripper = SimpleHTMLStripper()
  2532. stripper.feed(html_body)
  2533. title = stripper.get_title() or normalized
  2534. images = stripper.get_images()
  2535. code_blocks = stripper.get_code_blocks()
  2536. plain_text = stripper.get_text()
  2537. block_info = stripper.get_blocks()
  2538. paragraph_ranges = _build_paragraph_ranges(block_info)
  2539. if not plain_text:
  2540. plain_text = _fallback_html_to_text(html_body)
  2541. if not plain_text:
  2542. raise ValueError("未能从该页面提取正文。")
  2543. # Fallback text no longer contains structured placeholders, so any
  2544. # collected media/code markers would be invalid.
  2545. images = []
  2546. code_blocks = []
  2547. block_info = []
  2548. paragraph_ranges = []
  2549. paragraph_meta = _build_paragraph_metadata(block_info)
  2550. return normalized, title, plain_text, images, code_blocks, paragraph_meta, paragraph_ranges
  2551. def _render_proxy_page(
  2552. *,
  2553. url_value: str = "",
  2554. message: Optional[str] = None,
  2555. is_error: bool = False,
  2556. highlight_fragment: Optional[str] = None,
  2557. helper_enabled: bool = False,
  2558. source_url: Optional[str] = None,
  2559. source_title: Optional[str] = None,
  2560. show_images: bool = False,
  2561. image_notice: Optional[str] = None,
  2562. source_plaintext: Optional[str] = None,
  2563. ) -> str:
  2564. helper_state = "on" if helper_enabled else "off"
  2565. status_block = ""
  2566. if message:
  2567. cls = "status err" if is_error else "status ok"
  2568. status_block = f"<p class='{cls}'>{html.escape(message)}</p>"
  2569. style_block = STYLE_BLOCK if highlight_fragment else ""
  2570. result_block = ""
  2571. source_script = ""
  2572. if highlight_fragment and source_url:
  2573. safe_url = html.escape(source_url, quote=True)
  2574. safe_title = html.escape(source_title or source_url)
  2575. image_hint = ""
  2576. if image_notice:
  2577. image_hint = f"<p class='image-hint'>{html.escape(image_notice)}</p>"
  2578. if source_plaintext:
  2579. source_script = f"<script>window.__proxySourceText = {json.dumps(source_plaintext)}</script>"
  2580. result_block = (
  2581. "<section class='result'>"
  2582. f"<div class='source'>原页面:<a href='{safe_url}' target='_blank' rel='noopener'>{safe_title}</a></div>"
  2583. f"<div class='analysis' data-helper='{helper_state}'>{highlight_fragment}</div>"
  2584. f"{image_hint}"
  2585. "</section>"
  2586. )
  2587. show_images_checked = "checked" if show_images else ""
  2588. return PROXY_PAGE_TEMPLATE.substitute(
  2589. style_block=style_block,
  2590. url_value=html.escape(url_value or "", quote=True),
  2591. status_block=status_block,
  2592. result_block=result_block,
  2593. show_images_checked=show_images_checked,
  2594. source_text_script=source_script,
  2595. )