|
|
@@ -9,7 +9,7 @@ from collections import Counter
|
|
|
from dataclasses import dataclass, field
|
|
|
from html.parser import HTMLParser
|
|
|
from string import Template
|
|
|
-from typing import Any, Dict, List, Optional, Tuple
|
|
|
+from typing import Any, Dict, List, Optional, Set, Tuple
|
|
|
from urllib import error as urllib_error, request as urllib_request
|
|
|
from urllib.parse import urlparse, urlunparse
|
|
|
|
|
|
@@ -380,6 +380,35 @@ def add_span(spans: List[Span], start_token: int, end_token: int, cls: str, attr
|
|
|
spans.append(Span(start_token=start_token, end_token=end_token, cls=cls, attrs=attrs))
|
|
|
|
|
|
|
|
|
def _prune_adverbial_spans(spans: List[Span], sentence_token_bounds: Tuple[int, int]) -> None:
    """Filter ``role-adverbial`` spans in place so they cannot underline a whole sentence.

    Spans whose whitespace-separated ``cls`` list does not contain
    ``role-adverbial`` are always kept, in their original order.  An
    adverbial span is discarded when any of the following holds:

    * ``end_token - start_token`` is 1 or less;
    * after clipping to the sentence bounds it is at least as long as the
      sentence itself;
    * an earlier adverbial span already produced the same clipped range.

    Args:
        spans: Span list to filter; rewritten via slice assignment, so the
            caller's list object is the one that changes.
        sentence_token_bounds: ``(start, end)`` token bounds of the sentence.
            If ``start`` is negative or ``end <= start`` the function
            returns without touching ``spans``.
    """
    first_tok, stop_tok = sentence_token_bounds
    if first_tok < 0 or stop_tok <= first_tok:
        # Unusable bounds: nothing to compare coverage against.
        return

    sentence_width = stop_tok - first_tok
    emitted: Set[Tuple[int, int]] = set()
    survivors: List[Span] = []

    for candidate in spans:
        if "role-adverbial" in candidate.cls.split():
            # Very short adverbial spans are dropped outright.
            if candidate.end_token - candidate.start_token <= 1:
                continue
            # Clip to the sentence so coverage and de-duplication are judged
            # on the part of the span that lies inside these bounds.
            clipped = (
                max(candidate.start_token, first_tok),
                min(candidate.end_token, stop_tok),
            )
            if clipped[1] - clipped[0] >= sentence_width:
                continue
            if clipped in emitted:
                continue
            emitted.add(clipped)
        survivors.append(candidate)

    spans[:] = survivors
|
|
|
+
|
|
|
+
|
|
|
def subtree_char_span(token: SpacyToken) -> Tuple[int, int]:
|
|
|
subtree = list(token.subtree)
|
|
|
if not subtree:
|
|
|
@@ -724,12 +753,14 @@ def annotate_sentence(
|
|
|
if tok.dep_ in {"amod", "poss", "compound", "nummod"}:
|
|
|
add_token(tok, "role-modifier")
|
|
|
|
|
|
- adverbial_ranges = set()
|
|
|
- for tok in sentence:
|
|
|
- if tok.dep_ in ADVERBIAL_DEPS:
|
|
|
- adverbial_ranges.add(subtree_char_span(tok))
|
|
|
- for start_char, end_char in adverbial_ranges:
|
|
|
- add_char_based_span(spans, start_char, end_char, "role-adverbial", mapping)
|
|
|
+ # Dependency-based adverbial spans are a fallback when constituency data is unavailable.
|
|
|
+ if not HAS_BENEPAR or BENE_PAR_WARNING:
|
|
|
+ adverbial_ranges = set()
|
|
|
+ for tok in sentence:
|
|
|
+ if tok.dep_ in ADVERBIAL_DEPS:
|
|
|
+ adverbial_ranges.add(subtree_char_span(tok))
|
|
|
+ for start_char, end_char in adverbial_ranges:
|
|
|
+ add_char_based_span(spans, start_char, end_char, "role-adverbial", mapping)
|
|
|
|
|
|
for tok in sentence:
|
|
|
if tok.dep_ == "appos":
|
|
|
@@ -769,6 +800,7 @@ def annotate_sentence(
|
|
|
summary,
|
|
|
)
|
|
|
_add_fixed_phrases(sentence, mapping, spans, summary)
|
|
|
+ _prune_adverbial_spans(spans, sent_bounds)
|
|
|
|
|
|
return spans, summary
|
|
|
|