Source code for epitator.annospan

#!/usr/bin/env python
# coding=utf8
from __future__ import absolute_import


# Placeholder assigned to ``base_spans`` of every plain AnnoSpan. The same
# list object is shared by reference between all of those spans, so it must
# never be mutated in place (append/extend/etc.).
EMPTY_LIST = []


class AnnoSpan(object):
    """
    A span of text with an annotation applied to it.

    The span covers ``doc.text[start:end]``, so ``start`` is inclusive and
    ``end`` is exclusive. ``label`` and ``metadata`` are optional and are
    stored unchanged.
    """
    __slots__ = ["start", "end", "doc", "metadata", "label", "base_spans"]

    def __init__(self, start, end, doc, label=None, metadata=None):
        self.start = start
        self.end = end
        self.doc = doc
        self.metadata = metadata
        # Base spans is only non-empty on span groups. Plain spans all share
        # the module-level EMPTY_LIST object, so it must not be mutated.
        self.base_spans = EMPTY_LIST
        self.label = label

    def __repr__(self):
        # Fall back to the covered text when the span has no (truthy) label.
        return u'AnnoSpan({0}-{1}, {2})'.format(self.start, self.end, self.label or self.text)

    def __lt__(self, other):
        """
        Order spans by start offset, breaking ties by end offset.
        """
        return (self.start, self.end) < (other.start, other.end)

    def __len__(self):
        """
        Return the number of characters covered by the span.
        """
        return self.end - self.start

    def distance(self, other_span):
        """
        The number of characters between this span and the other one.
        If the spans overlap the distance is the negative length of their
        overlap.

        >>> from .annotier import AnnoTier
        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two three')
        >>> tier = AnnoTier([AnnoSpan(0, 3, doc), AnnoSpan(8, 13, doc)])
        >>> tier.spans[0].distance(tier.spans[1])
        5
        >>> AnnoSpan(0, 10, doc).distance(AnnoSpan(2, 5, doc))
        -3
        """
        # The gap (or, when negative, the overlap) always lies between the
        # later of the two starts and the earlier of the two ends. Comparing
        # only one start with one end gives the wrong answer when one span
        # contains the other or when both spans start at the same offset.
        return (max(self.start, other_span.start) -
                min(self.end, other_span.end))

    def overlaps(self, other_span):
        """
        Return true if the span overlaps other_span.

        Spans that merely touch (one ends where the other starts) do not
        overlap.
        """
        return (
            (self.start >= other_span.start and self.start < other_span.end) or
            (other_span.start >= self.start and other_span.start < self.end)
        )

    def contains(self, other_span):
        """
        Return true if the span completely contains other_span.

        A span with identical offsets counts as contained.
        """
        return self.start <= other_span.start and self.end >= other_span.end

    def adjacent_to(self, other_span, max_dist=1):
        """
        Return true if the span comes before or after other_span with at most
        max_dist characters between them.
        """
        return (
            self.comes_before(other_span, max_dist) or
            other_span.comes_before(self, max_dist)
        )

    def comes_before(self, other_span, max_dist=1, allow_overlap=False):
        """
        Return True if the span comes before the other_span and there are
        max_dist or fewer characters between them.

        When allow_overlap is False this span must end at or before the start
        of other_span. When it is True this span only has to start at or
        before the start of other_span.

        >>> from .annotier import AnnoTier
        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two three')
        >>> tier = AnnoTier([AnnoSpan(0, 3, doc), AnnoSpan(4, 7, doc)])
        >>> tier.spans[0].comes_before(tier.spans[1])
        True
        >>> tier.spans[1].comes_before(tier.spans[0])
        False
        """
        if allow_overlap:
            ok_start = self.start <= other_span.start
        else:
            ok_start = self.end <= other_span.start
        # The end of this span must be no more than max_dist characters short
        # of the start of the other span.
        return ok_start and self.end >= other_span.start - max_dist

    def extended_through(self, other_span):
        """
        Create a new span that includes this one and the other span.

        The result is a SpanGroup carrying this span's label.
        """
        return SpanGroup([self, other_span], self.label)

    def trimmed(self):
        """
        Create a new AnnoSpan based on this one with the offsets adjusted
        so that there are no space characters at the beginning or end.

        Only the space character (" ") is stripped. The label and metadata
        are carried over to the new span.

        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two three')
        >>> original_span = AnnoSpan(3, 8, doc)
        >>> original_span.trimmed()
        AnnoSpan(4-7, two)
        """
        start = self.start
        end = self.end
        doc_text = self.doc.text
        while start < end and doc_text[start] == " ":
            start += 1
        while start < end and doc_text[end - 1] == " ":
            end -= 1
        return AnnoSpan(start, end, self.doc, label=self.label, metadata=self.metadata)

    @property
    def text(self):
        """
        The slice of the document text covered by the span.
        """
        return self.doc.text[self.start:self.end]

    def to_dict(self):
        """
        Return a json serializable dictionary.

        The dictionary holds the label and a single [start, end] pair under
        "textOffsets".
        """
        return dict(
            label=self.label,
            textOffsets=[[self.start, self.end]]
        )

    def groupdict(self):
        """
        Return a dict with all the labeled matches.

        Keys are labels and values are sorted lists of the spans carrying
        that label, collected recursively from base_spans. If this span has
        a label of its own, that key maps to a list holding only this span.

        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two wolf')
        >>> number_span_g = SpanGroup([AnnoSpan(0, 3, doc, 'number'),
        ...                            AnnoSpan(4, 7, doc, 'number'),
        ...                            AnnoSpan(8, 12, doc, 'animal')])
        >>> number_span_g.groupdict()['number']
        [AnnoSpan(0-3, number), AnnoSpan(4-7, number)]
        >>> number_span_g.groupdict()['animal']
        [AnnoSpan(8-12, animal)]
        """
        out = {}
        for base_span in self.base_spans:
            for key, values in base_span.groupdict().items():
                # Accumulate into one list per key rather than building a new
                # concatenated list for every base span.
                out.setdefault(key, []).extend(values)
        for values in out.values():
            values.sort()
        if self.label:
            # The span's own label replaces anything gathered from its
            # base spans under the same key.
            out[self.label] = [self]
        return out

    def iterate_base_spans(self):
        """
        Recursively iterate over all base_spans including base_spans of child SpanGroups.

        Each span is yielded before its own base spans.
        """
        for span in self.base_spans:
            yield span
            for span2 in span.iterate_base_spans():
                yield span2

    def iterate_leaf_base_spans(self):
        """
        Iterate over the leaf base spans in a SpanGroup tree, i.e. every
        nested base span that is not itself a SpanGroup.
        """
        for span in self.iterate_base_spans():
            if not isinstance(span, SpanGroup):
                yield span


class SpanGroup(AnnoSpan):
    """
    An AnnoSpan that extends through a group of AnnoSpans.

    The group runs from the smallest start offset to the largest end offset
    of its base spans and uses the document of the first base span.
    """
    def __init__(self, base_spans, label=None, metadata=None):
        assert isinstance(base_spans, list)
        assert len(base_spans) > 0
        group_start = min(span.start for span in base_spans)
        group_end = max(span.end for span in base_spans)
        group_doc = base_spans[0].doc
        super(SpanGroup, self).__init__(
            group_start, group_end, group_doc, label, metadata)
        # Replace the empty default set by the parent initializer.
        self.base_spans = base_spans

    def __repr__(self):
        covered_text = self.text
        label_text = str(self.label)
        members = ", ".join([str(span) for span in self.base_spans])
        return ("SpanGroup(text=" + covered_text +
                ", label=" + label_text +
                ", " + members + ")")
Source code for epitator.annospan

EpiTator

Navigation

Related Topics