Source code for epitator.annotier

#!/usr/bin/env python
# coding=utf8
from __future__ import absolute_import
import re
from .annospan import SpanGroup, AnnoSpan
from . import maximum_weight_interval_set as mwis


class AnnoTier(object):
    """
    A group of AnnoSpans stored sorted by start offset.

    The spans are kept in the ``spans`` list attribute. The query and
    combination methods build new tiers rather than modifying this one.
    """
    def __init__(self, spans=None, presorted=False):
        """
        :param spans: The spans to store. May be None (empty tier), another
            AnnoTier (its span list is copied) or an iterable of spans.
        :param presorted: Pass True to skip sorting when the spans are
            already known to be in sorted order.
        :type presorted: bool
        """
        if spans is None:
            self.spans = []
        elif isinstance(spans, AnnoTier):
            self.spans = list(spans.spans)
        elif presorted:
            # Copy into a list so the tier does not alias the caller's list
            # and so one-shot iterables still support len() and indexing.
            self.spans = list(spans)
        else:
            self.spans = sorted(spans)

    def __repr__(self):
        return ('AnnoTier([' +
                ', '.join([repr(span) for span in self.spans]) +
                '])')

    def __len__(self):
        return len(self.spans)

    def __add__(self, other_tier):
        # The concatenated list is re-sorted by the constructor.
        return AnnoTier(self.spans + other_tier.spans)

    def __iter__(self):
        return iter(self.spans)

    def __getitem__(self, idx):
        return self.spans[idx]

    def subtract_overlaps(self, other_tier):
        """
        :param other_tier: The spans to be removed from the territory of this tier
        :type other_tier: AnnoTier
        :return: A copy of this tier with spans truncated and split so that
            none of the new spans overlap a span in other_tier
        :rtype: AnnoTier

        >>> from .annospan import AnnoSpan
        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two three four')
        >>> tier_a = AnnoTier([AnnoSpan(0, 18, doc)])
        >>> tier_b = AnnoTier([AnnoSpan(3, 8, doc), AnnoSpan(13, 18, doc)])
        >>> tier_a.subtract_overlaps(tier_b)
        AnnoTier([AnnoSpan(0-3, one), AnnoSpan(8-13, three)])
        """
        result_spans = []
        span_groups = self.group_spans_by_containing_span(
            other_tier, allow_partial_containment=True)
        for span, overlapping_spans in span_groups:
            # new_start is the beginning of the part of span that has not
            # been covered by an overlapping span so far.
            new_start = span.start
            for overlapping_span in overlapping_spans:
                if overlapping_span.start <= new_start:
                    # The overlap covers the current beginning, so trim it.
                    new_start = max(overlapping_span.end, new_start)
                else:
                    # There is an uncovered gap before this overlap; keep it.
                    result_spans.append(AnnoSpan(
                        new_start,
                        overlapping_span.start,
                        span.doc,
                        span.label,
                        span.metadata
                    ))
                    new_start = overlapping_span.end
                if new_start >= span.end:
                    break
            if new_start < span.end:
                # Keep whatever remains after the last overlap.
                result_spans.append(AnnoSpan(
                    new_start,
                    span.end,
                    span.doc,
                    span.label,
                    span.metadata
                ))
        return AnnoTier(result_spans)

    def group_spans_by_containing_span(self,
                                       other_tier,
                                       allow_partial_containment=False):
        """
        Group spans in other_tier by the spans that contain them in this one.

        :param other_tier: The spans to be grouped together. An iterable
            that is not an AnnoTier is sorted first.
        :type other_tier: AnnoTier
        :param allow_partial_containment: Include spans in groups for spans that partially overlap them.
        :return: An iterator that returns pairs of values, the first of which is
            the containing span from this tier, the second is an array of
            spans from other_tier that the span from this tier contains.

        >>> from .annospan import AnnoSpan
        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two three')
        >>> tier_a = AnnoTier([AnnoSpan(0, 3, doc), AnnoSpan(4, 7, doc)])
        >>> tier_b = AnnoTier([AnnoSpan(0, 1, doc)])
        >>> list(tier_a.group_spans_by_containing_span(tier_b))
        [(AnnoSpan(0-3, one), [AnnoSpan(0-1, o)]), (AnnoSpan(4-7, two), [])]
        """
        if isinstance(other_tier, AnnoTier):
            other_spans = other_tier.spans
        else:
            other_spans = sorted(other_tier)
        num_other_spans = len(other_spans)
        # Index of the first other span that may still belong to the current
        # or a later span of this tier. It only ever moves forward.
        first_candidate_idx = 0
        for span in self.spans:
            # Skip the other spans that come before this span.
            while first_candidate_idx < num_other_spans:
                candidate = other_spans[first_candidate_idx]
                if allow_partial_containment:
                    if candidate.end > span.start:
                        break
                elif candidate.start >= span.start:
                    break
                first_candidate_idx += 1
            span_group = []
            idx = first_candidate_idx
            while idx < num_other_spans:
                candidate = other_spans[idx]
                idx += 1
                if candidate.start >= span.end:
                    break
                if allow_partial_containment:
                    # A long earlier span can keep the cursor behind shorter
                    # spans that end before this span starts; they do not
                    # overlap it and must not be grouped with it.
                    if candidate.end <= span.start:
                        continue
                elif candidate.end > span.end:
                    # Skip the other span if it is not contained by this span.
                    # It is possible there is another shorter span that starts
                    # after it and is fully contained by this span.
                    continue
                span_group.append(candidate)
            yield span, span_group

    def spans_contained_by_span(self, selector_span):
        """
        Return a tier of the spans that are contained by a "selector span".

        >>> from epitator.annospan import AnnoSpan
        >>> from epitator.annodoc import AnnoDoc
        >>> from epitator.annotier import AnnoTier
        >>> doc = AnnoDoc('one two three')
        >>> tier1 = AnnoTier([AnnoSpan(0, 3, doc), AnnoSpan(4, 7, doc)])
        >>> span1 = AnnoSpan(3, 9, doc)
        >>> tier1.spans_contained_by_span(span1)
        AnnoTier([AnnoSpan(4-7, two)])
        """
        return AnnoTier(
            [span for span in self if selector_span.contains(span)])

    def spans_overlapped_by_span(self, selector_span):
        """
        Return a tier of the spans that overlap a "selector span".

        >>> from epitator.annospan import AnnoSpan
        >>> from epitator.annodoc import AnnoDoc
        >>> from epitator.annotier import AnnoTier
        >>> doc = AnnoDoc('one two three')
        >>> tier1 = AnnoTier([AnnoSpan(0, 3, doc), AnnoSpan(4, 7, doc)])
        >>> span1 = AnnoSpan(0, 1, doc)
        >>> tier1.spans_overlapped_by_span(span1)
        AnnoTier([AnnoSpan(0-3, one)])
        """
        return AnnoTier(
            [span for span in self if selector_span.overlaps(span)])

    def with_label(self, label):
        """
        Create a tier from the spans which have the given label

        >>> from .annospan import AnnoSpan
        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two three')
        >>> tier = AnnoTier([AnnoSpan(0, 3, doc, 'odd'),
        ...                  AnnoSpan(4, 7, doc, 'even'),
        ...                  AnnoSpan(8, 13, doc, 'odd')])
        >>> tier.with_label("odd")
        AnnoTier([AnnoSpan(0-3, odd), AnnoSpan(8-13, odd)])
        """
        return AnnoTier([span for span in self if span.label == label])

    def optimal_span_set(self, prefer="text_length"):
        """
        :param prefer: A function that takes a span and returns a numeric tuple score.
            The following predefined functions may be specified via string:
            first, text_length, text_length_min_spans, num_spans,
            and num_spans_and_no_linebreaks
        :type prefer: string, function
        :return: A tier with the set of non-overlapping spans from this tier that
            maximizes the prefer function.
        :rtype: AnnoTier

        >>> from .annospan import AnnoSpan
        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two three')
        >>> tier = AnnoTier([AnnoSpan(0, 3, doc, 'odd'),
        ...                  AnnoSpan(4, 7, doc, 'even'),
        ...                  AnnoSpan(3, 13, doc, 'long_span'),
        ...                  AnnoSpan(8, 13, doc, 'odd')])
        >>> tier.optimal_span_set()
        AnnoTier([AnnoSpan(0-3, odd), AnnoSpan(3-13, long_span)])
        """
        all_spans = self.spans

        def first(x):
            """
            Prefers the matches that appear first in the span list.
            """
            # Using an exponent makes it so that a first match will be preferred
            # over multiple non-overlapping later matches.
            return 2 ** (len(all_spans) - all_spans.index(x))

        def text_length(x):
            """
            Prefers the match with the longest span of text that contains all the
            matching content.
            """
            return len(x)

        def text_length_min_spans(x):
            """
            Prefer the spans that cover the largest amount of text,
            and as a secondary objective minimize the
            overall number of matches.
            """
            return len(x), -1

        def num_spans(x):
            """
            Prefers the match with the most distinct base spans.
            """
            if isinstance(x, SpanGroup):
                return len(set(x.iterate_leaf_base_spans()))
            return 1

        def num_spans_and_no_linebreaks(x):
            """
            Same as num_spans, but linebreaks are avoided as a secondary objective,
            and overall text length is minimized as a third objective.
            """
            return num_spans(x), int("\n" not in x.text), -len(x)

        predefined = {
            "first": first,
            "text_length": text_length,
            "text_length_min_spans": text_length_min_spans,
            "num_spans": num_spans,
            "num_spans_and_no_linebreaks": num_spans_and_no_linebreaks,
        }
        if callable(prefer):
            prefunc = prefer
        else:
            # Unknown values are passed through unchanged, as before.
            prefunc = predefined.get(prefer, prefer)
        my_mwis = mwis.find_maximum_weight_interval_set([
            mwis.Interval(
                start=match.start,
                end=match.end,
                weight=prefunc(match),
                corresponding_object=match
            )
            for match in all_spans
        ])
        return AnnoTier([
            interval.corresponding_object
            for interval in my_mwis
        ])

    def without_overlaps(self, other_tier):
        """
        Create a copy of this tier without spans that overlap a span in the
        other tier.
        """
        span_groups = self.group_spans_by_containing_span(
            other_tier, allow_partial_containment=True)
        return AnnoTier([span for span, group in span_groups if not group])

    def with_contained_spans_from(self, other_tier, allow_partial_containment=False):
        """
        Create a new tier from pairs of spans in this tier and the other tier
        where the span in this tier contains one in the other tier.

        :param allow_partial_containment: Also pair spans that only
            partially overlap.
        """
        span_groups = self.group_spans_by_containing_span(
            other_tier, allow_partial_containment=allow_partial_containment)
        return AnnoTier([
            SpanGroup([span, other_span])
            for span, group in span_groups
            for other_span in group
        ])

    def with_nearby_spans_from(self, other_tier, max_dist=100):
        """
        Create a new tier from pairs of spans in this tier and the other tier
        that are near each other.

        :param other_tier: The tier to pair spans with.
        :type other_tier: AnnoTier
        :param max_dist: Forwarded to with_following_spans_from for both
            directions.
        """
        # Adding two tiers already yields a new, sorted AnnoTier.
        return (
            self.with_following_spans_from(other_tier, max_dist=max_dist, allow_overlap=True) +
            other_tier.with_following_spans_from(self, max_dist=max_dist, allow_overlap=True))

    def with_following_spans_from(self, other_tier, max_dist=1, allow_overlap=False):
        """
        Create a new tier from pairs of spans where the one in the other tier follows a span from this tier.

        :param max_dist: How far past the end of a span from this tier a
            following span may start.
        :param allow_overlap: When True the following span only has to start
            after the span from this tier starts; otherwise it must start at
            or after that span's end.

        >>> from .annospan import AnnoSpan
        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two three four')
        >>> tier1 = AnnoTier([AnnoSpan(0, 3, doc),
        ...                   AnnoSpan(8, 13, doc)])
        >>> tier2 = AnnoTier([AnnoSpan(14, 18, doc)])
        >>> tier1.with_following_spans_from(tier2)
        AnnoTier([SpanGroup(text=three four, label=None, AnnoSpan(8-13, three), AnnoSpan(14-18, four))])
        """
        # Stretch every span to the right by the allowed distance and keep
        # the original span in the metadata so it can be recovered below.
        extended_spans = AnnoTier([
            AnnoSpan(span.start, span.end + max_dist + 1, span.doc, metadata=span)
            for span in self
        ], presorted=True)
        span_groups = extended_spans.group_spans_by_containing_span(
            other_tier, allow_partial_containment=True)
        if allow_overlap:
            def starts_before_f(span_a, span_b):
                return span_a.start < span_b.start
        else:
            def starts_before_f(span_a, span_b):
                return span_a.end <= span_b.start
        result = []
        for extended_span, span_group in span_groups:
            original_span = extended_span.metadata
            # Find the first grouped span that really follows the original
            # span; every span after it in the group is paired as well.
            idx = 0
            for candidate in span_group:
                if starts_before_f(original_span, candidate):
                    break
                idx += 1
            for following_span in span_group[idx:]:
                result.append(SpanGroup([original_span, following_span]))
        return AnnoTier(result)

    def combined_adjacent_spans(self, max_dist=1):
        """
        Create a new tier from groups of spans within max_dist of each other.

        >>> from .annospan import AnnoSpan
        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two three four')
        >>> tier = AnnoTier([AnnoSpan(0, 3, doc),
        ...                  AnnoSpan(8, 13, doc),
        ...                  AnnoSpan(14, 18, doc)])
        >>> tier.combined_adjacent_spans()
        AnnoTier([SpanGroup(text=one, label=None, AnnoSpan(0-3, one)), SpanGroup(text=three four, label=None, AnnoSpan(8-13, three), AnnoSpan(14-18, four))])
        """
        prev_span = None
        span_groups = []
        span_group = None
        for span in self:
            # Compare against None explicitly: spans support len(), so a
            # zero-length span is falsy and a truthiness test would restart
            # the group and drop the spans collected so far.
            if prev_span is None:
                span_group = [span]
            elif prev_span.end + max_dist >= span.start:
                span_group.append(span)
            else:
                span_groups.append(SpanGroup(span_group))
                span_group = [span]
            prev_span = span
        if span_group:
            span_groups.append(SpanGroup(span_group))
        return AnnoTier(span_groups)

    def chains(self, at_least=1, at_most=None, max_dist=1):
        """
        Create a new tier from all chains of spans within max_dist of each other.

        :param at_least: Minimum number of spans in a returned chain.
        :param at_most: Maximum number of spans in a returned chain. A falsy
            value (None or 0) means no limit.
        :param max_dist: Forwarded to with_following_spans_from when a chain
            is extended by one more span.
        """
        combined_spans = AnnoTier()
        new_combined_spans = self
        chain_len = 1
        while True:
            if chain_len >= at_least:
                combined_spans += new_combined_spans
            if len(new_combined_spans) == 0:
                # No chain of the current length exists, so none can be longer.
                break
            chain_len += 1
            if at_most and chain_len > at_most:
                break
            new_combined_spans = new_combined_spans.with_following_spans_from(self, max_dist=max_dist)
        return combined_spans

    def span_before(self, target_span, allow_overlap=True):
        """
        Find the nearest span that comes before the target span.

        :param allow_overlap: When False, spans that end after the target
            span starts are ignored.
        :return: The last span in tier order that starts before the target
            span (and satisfies allow_overlap), or None if there is none.

        >>> from .annospan import AnnoSpan
        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two three four')
        >>> tier = AnnoTier([AnnoSpan(0, 3, doc),
        ...                  AnnoSpan(8, 13, doc),
        ...                  AnnoSpan(14, 18, doc)])
        >>> tier.span_before(AnnoSpan(4, 7, doc))
        AnnoSpan(0-3, one)
        """
        closest_span = None
        for span in self:
            if span.start >= target_span.start:
                break
            if not allow_overlap and span.end > target_span.start:
                # Skip rather than stop: a later-starting, shorter span may
                # still end before the target span starts.
                continue
            closest_span = span
        return closest_span

    def span_after(self, target_span):
        """
        Find the nearest span that comes after the target span.

        :return: The first span that starts at or after the end of the
            target span, or None if there is no such span.
        """
        for span in self:
            if span.start >= target_span.end:
                return span
        return None

    def nearest_to(self, target_span):
        """
        Find the nearest span to the target span.

        :return: The span with the smallest ``distance`` to the target span
            (the last such span in tier order on ties), or None if the tier
            is empty.
        """
        closest_span = None
        min_distance = None
        # Every span is examined: with overlapping or nested spans the
        # distance can rise and then fall again along the tier, so stopping
        # at the first increase could miss the nearest span.
        for span in self:
            span_distance = span.distance(target_span)
            if closest_span is None or span_distance <= min_distance:
                closest_span = span
                min_distance = span_distance
        return closest_span

    def label_spans(self, label):
        """
        Create a new tier based on this one
        with labeled spans that can be looked up by groupdict.
        """
        return AnnoTier([SpanGroup([span], label) for span in self], presorted=True)

    def search_spans(self, regex, label=None):
        """
        Search spans for ones matching the given regular expression.

        The pattern is matched case-insensitively from the start of each
        span's text and must reach its end.

        :param regex: The pattern source. It is wrapped in a non-capturing
            group, so a pattern that begins with an inline global flag such
            as ``(?s)`` may be rejected by newer Python versions.
        :type regex: string
        :param label: Label given to the SpanGroup created for each match.
        """
        # Group the pattern so the end anchor applies to every alternative
        # of a top-level "a|b" and not only to the last one.
        regex = re.compile(r'(?:' + regex + r')$', re.I)
        match_spans = []
        for span in self:
            if regex.match(span.text):
                match_spans.append(SpanGroup([span], label))
        return AnnoTier(match_spans, presorted=True)

    def match_subspans(self, regex):
        """
        Create a new tier from the components of spans matching the given
        regular expression.

        >>> from .annospan import AnnoSpan
        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two three four')
        >>> tier = AnnoTier([AnnoSpan(0, 3, doc),
        ...                  AnnoSpan(4, 13, doc),
        ...                  AnnoSpan(14, 18, doc)])
        >>> tier.match_subspans(r"two")
        AnnoTier([AnnoSpan(4-7, two)])
        """
        regex = re.compile(regex)
        match_spans = []
        for span in self:
            for match in regex.finditer(span.text):
                match_spans.append(AnnoSpan(
                    match.start() + span.start,
                    match.end() + span.start,
                    span.doc
                ))
        # Matches from overlapping source spans can come out of order, so
        # let the constructor sort them instead of claiming they are sorted.
        return AnnoTier(match_spans)
Source code for epitator.annotier

EpiTator

Navigation

Related Topics