#!/usr/bin/env python
# coding=utf8
from __future__ import absolute_import
import re
from .annospan import SpanGroup, AnnoSpan
from . import maximum_weight_interval_set as mwis
[docs]class AnnoTier(object):
"""
A group of AnnoSpans stored sorted by start offset.
"""
def __init__(self, spans=None, presorted=False):
if spans is None:
self.spans = []
elif isinstance(spans, AnnoTier):
self.spans = list(spans.spans)
else:
if presorted:
self.spans = spans
else:
self.spans = sorted(spans)
def __repr__(self):
return ('AnnoTier([' +
', '.join([span.__repr__() for span in self.spans]) +
'])')
def __len__(self):
return len(self.spans)
def __add__(self, other_tier):
return AnnoTier(self.spans + other_tier.spans)
def __iter__(self):
return iter(self.spans)
def __getitem__(self, idx):
return self.spans[idx]
[docs] def subtract_overlaps(self, other_tier):
"""
:param other_tier: The spans to be removed from the territory of this tier
:type other_tier: AnnoTier
:return: A copy of this tier with spans truncated and split so that
none of the new spans overlap a span in other_tier
:rtype: AnnoTier
>>> from .annospan import AnnoSpan
>>> from .annodoc import AnnoDoc
>>> doc = AnnoDoc('one two three four')
>>> tier_a = AnnoTier([AnnoSpan(0, 18, doc)])
>>> tier_b = AnnoTier([AnnoSpan(3, 8, doc), AnnoSpan(13, 18, doc)])
>>> tier_a.subtract_overlaps(tier_b)
AnnoTier([AnnoSpan(0-3, one), AnnoSpan(8-13, three)])
"""
result_spans = []
for span, overlapping_spans in self.group_spans_by_containing_span(other_tier, allow_partial_containment=True):
new_start = span.start
for overlapping_span in overlapping_spans:
if overlapping_span.start <= new_start:
new_start = max(overlapping_span.end, new_start)
else:
result_spans.append(AnnoSpan(
new_start,
overlapping_span.start,
span.doc,
span.label,
span.metadata
))
new_start = overlapping_span.end
if new_start >= span.end:
break
if new_start < span.end:
result_spans.append(AnnoSpan(
new_start,
span.end,
span.doc,
span.label,
span.metadata
))
return AnnoTier(result_spans)
[docs] def group_spans_by_containing_span(self,
other_tier,
allow_partial_containment=False):
"""
Group spans in other_tier by the spans that contain them in this one.
:param other_tier: The spans to be grouped together
:type other_tier: AnnoTier
:param allow_partial_containment: Include spans in groups for spans that partially overlap them.
:return: An iterator that returns pairs of values, the first of which is
the containing span from this tier, the second is an array of
spans from other_tier that the span from this tier contans.
>>> from .annospan import AnnoSpan
>>> from .annodoc import AnnoDoc
>>> doc = AnnoDoc('one two three')
>>> tier_a = AnnoTier([AnnoSpan(0, 3, doc), AnnoSpan(4, 7, doc)])
>>> tier_b = AnnoTier([AnnoSpan(0, 1, doc)])
>>> list(tier_a.group_spans_by_containing_span(tier_b))
[(AnnoSpan(0-3, one), [AnnoSpan(0-1, o)]), (AnnoSpan(4-7, two), [])]
"""
if isinstance(other_tier, AnnoTier):
other_spans = other_tier.spans
else:
other_spans = sorted(other_tier)
other_spans_idx = 0
for span in self.spans:
span_group = []
# iterate over the other spans that come before this span.
while other_spans_idx < len(other_spans):
if allow_partial_containment:
if other_spans[other_spans_idx].end > span.start:
break
else:
if other_spans[other_spans_idx].start >= span.start:
break
other_spans_idx += 1
other_span_idx_2 = other_spans_idx
while other_span_idx_2 < len(other_spans):
if other_spans[other_span_idx_2].start >= span.end:
break
if not allow_partial_containment:
# Skip the other span if it is not contained by this span.
# It is possible there is another shorter span that starts
# after it and is fully contained by this span.
if other_spans[other_span_idx_2].end > span.end:
other_span_idx_2 += 1
continue
span_group.append(other_spans[other_span_idx_2])
other_span_idx_2 += 1
yield span, span_group
[docs] def spans_contained_by_span(self, selector_span):
"""
Return a list of spans that are contained by a "selector span".
>>> from epitator.annospan import AnnoSpan
>>> from epitator.annodoc import AnnoDoc
>>> from epitator.annotier import AnnoTier
>>> doc = AnnoDoc('one two three')
>>> tier1 = AnnoTier([AnnoSpan(0, 3, doc), AnnoSpan(4, 7, doc)])
>>> span1 = AnnoSpan(3, 9, doc)
>>> tier1.spans_contained_by_span(span1)
AnnoTier([AnnoSpan(4-7, two)])
"""
return(
AnnoTier([span for span in self if selector_span.contains(span)])
)
[docs] def spans_overlapped_by_span(self, selector_span):
"""
Return a list of spans that overlap a "selector span".
>>> from epitator.annospan import AnnoSpan
>>> from epitator.annodoc import AnnoDoc
>>> from epitator.annotier import AnnoTier
>>> doc = AnnoDoc('one two three')
>>> tier1 = AnnoTier([AnnoSpan(0, 3, doc), AnnoSpan(4, 7, doc)])
>>> span1 = AnnoSpan(0, 1, doc)
>>> tier1.spans_overlapped_by_span(span1)
AnnoTier([AnnoSpan(0-3, one)])
"""
return(
AnnoTier([span for span in self if selector_span.overlaps(span)])
)
[docs] def with_label(self, label):
"""
Create a tier from the spans which have the given label
>>> from .annospan import AnnoSpan
>>> from .annodoc import AnnoDoc
>>> doc = AnnoDoc('one two three')
>>> tier = AnnoTier([AnnoSpan(0, 3, doc, 'odd'),
... AnnoSpan(4, 7, doc, 'even'),
... AnnoSpan(8, 13, doc, 'odd')])
>>> tier.with_label("odd")
AnnoTier([AnnoSpan(0-3, odd), AnnoSpan(8-13, odd)])
"""
return AnnoTier([span for span in self if span.label == label])
[docs] def optimal_span_set(self, prefer="text_length"):
"""
:param perfer: A function that takes a span and returns a numeric tuple score.
The following predefined functions may be specified via string:
text_length, text_length_min_spans, num_spans, and num_spans_and_no_linebreaks
:type prefer: string, function
:return: A tier with the set of non-overlapping spans from this tier that
maximizes the prefer function.
:rtype: AnnoTier
>>> from .annospan import AnnoSpan
>>> from .annodoc import AnnoDoc
>>> doc = AnnoDoc('one two three')
>>> tier = AnnoTier([AnnoSpan(0, 3, doc, 'odd'),
... AnnoSpan(4, 7, doc, 'even'),
... AnnoSpan(3, 13, doc, 'long_span'),
... AnnoSpan(8, 13, doc, 'odd')])
>>> tier.optimal_span_set()
AnnoTier([AnnoSpan(0-3, odd), AnnoSpan(3-13, long_span)])
"""
all_spans = self.spans
def first(x):
"""
Perfers the matches that appear first in the first result list.
"""
# Using an exponent makes it so that a first match will be prefered
# over multiple non-overlapping later matches.
return 2 ** (len(all_spans) - all_spans.index(x))
def text_length(x):
"""
Prefers the match with the longest span of text that contains all the
matching content.
"""
return len(x)
def text_length_min_spans(x):
"""
Prefer the spans that cover the largest amount of text,
and as a secondary objective the minimizes the
overall number of matches.
"""
return len(x), -1
def num_spans(x):
"""
Prefers the match with the most distinct base spans.
"""
if isinstance(x, SpanGroup):
return len(set(x.iterate_leaf_base_spans()))
else:
return 1
def num_spans_and_no_linebreaks(x):
"""
Same as num_spans, but linebreaks are avoided as a secondary objective,
and overall text length is minimized as a third objective.
"""
return num_spans(x), int("\n" not in x.text), -len(x)
if prefer == "first":
prefunc = first
elif prefer == "text_length":
prefunc = text_length
elif prefer == "text_length_min_spans":
prefunc = text_length_min_spans
elif prefer == "num_spans":
prefunc = num_spans
elif prefer == "num_spans_and_no_linebreaks":
prefunc = num_spans_and_no_linebreaks
else:
prefunc = prefer
my_mwis = mwis.find_maximum_weight_interval_set([
mwis.Interval(
start=match.start,
end=match.end,
weight=prefunc(match),
corresponding_object=match
)
for match in all_spans
])
return AnnoTier([
interval.corresponding_object
for interval in my_mwis
])
[docs] def without_overlaps(self, other_tier):
"""
Create a copy of this tier without spans that overlap a span in the
other tier.
"""
span_groups = self.group_spans_by_containing_span(other_tier,
allow_partial_containment=True)
result = []
for span, group in span_groups:
if len(group) == 0:
result.append(span)
return AnnoTier(result)
[docs] def with_contained_spans_from(self, other_tier, allow_partial_containment=False):
"""
Create a new tier from pairs spans in this tier and the other tier
where the span in this tier contains one in the other tier.
"""
span_groups = self.group_spans_by_containing_span(other_tier,
allow_partial_containment=allow_partial_containment)
result = []
for span, group in span_groups:
for other_span in group:
result.append(SpanGroup([span, other_span]))
return AnnoTier(result)
[docs] def with_nearby_spans_from(self, other_tier, max_dist=100):
"""
Create a new tier from pairs spans in this tier and the other tier
that are near eachother.
"""
return AnnoTier(
self.with_following_spans_from(other_tier, max_dist=max_dist, allow_overlap=True) +
other_tier.with_following_spans_from(self, max_dist=max_dist, allow_overlap=True))
[docs] def with_following_spans_from(self, other_tier, max_dist=1, allow_overlap=False):
"""
Create a new tier from pairs of spans where the one in the other tier follows a span from this tier.
>>> from .annospan import AnnoSpan
>>> from .annodoc import AnnoDoc
>>> doc = AnnoDoc('one two three four')
>>> tier1 = AnnoTier([AnnoSpan(0, 3, doc),
... AnnoSpan(8, 13, doc)])
>>> tier2 = AnnoTier([AnnoSpan(14, 18, doc)])
>>> tier1.with_following_spans_from(tier2)
AnnoTier([SpanGroup(text=three four, label=None, AnnoSpan(8-13, three), AnnoSpan(14-18, four))])
"""
extended_spans = []
for span in self:
extended_spans.append(
AnnoSpan(span.start, span.end + max_dist + 1, span.doc, metadata=span))
extended_spans = AnnoTier(extended_spans, presorted=True)
span_groups = extended_spans.group_spans_by_containing_span(other_tier,
allow_partial_containment=True)
if allow_overlap:
def starts_before_f(span_a, span_b):
return span_a.start < span_b.start
else:
def starts_before_f(span_a, span_b):
return span_a.end <= span_b.start
result = []
for extended_span, span_group in span_groups:
idx = 0
for span in span_group:
if starts_before_f(extended_span.metadata, span):
break
idx += 1
for span in span_group[idx:]:
result.append(SpanGroup([extended_span.metadata, span]))
return AnnoTier(result)
[docs] def combined_adjacent_spans(self, max_dist=1):
"""
Create a new tier from groups of spans within max_dist of eachother.
>>> from .annospan import AnnoSpan
>>> from .annodoc import AnnoDoc
>>> doc = AnnoDoc('one two three four')
>>> tier = AnnoTier([AnnoSpan(0, 3, doc),
... AnnoSpan(8, 13, doc),
... AnnoSpan(14, 18, doc)])
>>> tier.combined_adjacent_spans()
AnnoTier([SpanGroup(text=one, label=None, AnnoSpan(0-3, one)), SpanGroup(text=three four, label=None, AnnoSpan(8-13, three), AnnoSpan(14-18, four))])
"""
prev_span = None
span_groups = []
span_group = None
for span in self:
if not prev_span:
span_group = [span]
elif prev_span.end + max_dist >= span.start:
span_group.append(span)
else:
span_groups.append(SpanGroup(span_group))
span_group = [span]
prev_span = span
if span_group:
span_groups.append(SpanGroup(span_group))
return AnnoTier(span_groups)
[docs] def chains(self, at_least=1, at_most=None, max_dist=1):
"""
Create a new tier from all chains of spans within max_dist of eachother.
"""
combined_spans = AnnoTier()
new_combined_spans = self
chain_len = 1
while True:
if chain_len >= at_least:
combined_spans += new_combined_spans
if len(new_combined_spans) == 0:
break
chain_len += 1
if at_most and chain_len > at_most:
break
new_combined_spans = new_combined_spans.with_following_spans_from(self, max_dist=max_dist)
return combined_spans
[docs] def span_before(self, target_span, allow_overlap=True):
"""
Find the nearest span that comes before the target span.
>>> from .annospan import AnnoSpan
>>> from .annodoc import AnnoDoc
>>> doc = AnnoDoc('one two three four')
>>> tier = AnnoTier([AnnoSpan(0, 3, doc),
... AnnoSpan(8, 13, doc),
... AnnoSpan(14, 18, doc)])
>>> tier.span_before(AnnoSpan(4, 7, doc))
AnnoSpan(0-3, one)
"""
closest_span = None
for span in self:
if span.start >= target_span.start:
break
if not allow_overlap and span.end > target_span.start:
break
closest_span = span
return closest_span
[docs] def span_after(self, target_span):
"""
Find the nearest span that comes after the target span.
"""
span = None
for span in self:
if span.start >= target_span.end:
break
return span
[docs] def nearest_to(self, target_span):
"""
Find the nearest span to the target span.
"""
closest_span = None
min_distance = None
for span in self:
span_distance = span.distance(target_span)
if closest_span is None or span_distance <= min_distance:
closest_span = span
min_distance = span_distance
else:
# Once the span distance stops decreasing
# it will only increase.
break
return closest_span
[docs] def label_spans(self, label):
"""
Create a new tier based on this one
with labeled spans that can be looked up by groupdict.
"""
return AnnoTier([SpanGroup([span], label) for span in self], presorted=True)
[docs] def search_spans(self, regex, label=None):
"""
Search spans for ones matching the given regular expression.
"""
regex = re.compile(regex + r'$', re.I)
match_spans = []
for span in self:
if regex.match(span.text):
match_spans.append(SpanGroup([span], label))
return AnnoTier(match_spans, presorted=True)
[docs] def match_subspans(self, regex):
"""
Create a new tier from the components of spans matching the given
regular expression.
>>> from .annospan import AnnoSpan
>>> from .annodoc import AnnoDoc
>>> doc = AnnoDoc('one two three four')
>>> tier = AnnoTier([AnnoSpan(0, 3, doc),
... AnnoSpan(4, 13, doc),
... AnnoSpan(14, 18, doc)])
>>> tier.match_subspans(r"two")
AnnoTier([AnnoSpan(4-7, two)])
"""
regex = re.compile(regex)
match_spans = []
for span in self:
for match in regex.finditer(span.text):
match_spans.append(AnnoSpan(
match.start() + span.start,
match.end() + span.start,
span.doc
))
return AnnoTier(match_spans, presorted=True)