#!/usr/bin/env python
# coding=utf8
from __future__ import absolute_import
# Shared empty list that AnnoSpan.__init__ assigns to ``base_spans``.
# The same list object is reused for every span constructed that way, so it
# must never be mutated in place (append/extend/etc.).
EMPTY_LIST = []
class AnnoSpan(object):
    """
    A span of text with an annotation applied to it.

    Attributes:
        start: offset of the first character of the span in ``doc.text``.
        end: offset one past the last character of the span.
        doc: the document object the offsets refer to; its ``text``
            attribute is sliced by the ``text`` property.
        label: optional label for the span.
        metadata: optional metadata value, stored as given.
        base_spans: the spans this span was built from. Only non-empty on
            span groups.
    """
    __slots__ = ["start", "end", "doc", "metadata", "label", "base_spans"]

    def __init__(self, start, end, doc, label=None, metadata=None):
        self.start = start
        self.end = end
        self.doc = doc
        self.metadata = metadata
        # Base spans is only non-empty on span groups.
        # EMPTY_LIST is a single module-level list shared by all spans, so
        # base_spans must be replaced by assignment, never mutated in place.
        self.base_spans = EMPTY_LIST
        self.label = label

    def __repr__(self):
        return u'AnnoSpan({0}-{1}, {2})'.format(self.start, self.end, self.label or self.text)

    def __lt__(self, other):
        # Order by start offset, breaking ties by end offset.
        return (self.start, self.end) < (other.start, other.end)

    def __len__(self):
        return self.end - self.start

    def distance(self, other_span):
        """
        The number of characters between this span and the other one.
        If the spans overlap the distance is the negative length of their
        overlap.

        >>> from .annotier import AnnoTier
        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two three')
        >>> tier = AnnoTier([AnnoSpan(0, 3, doc), AnnoSpan(8, 13, doc)])
        >>> tier.spans[0].distance(tier.spans[1])
        5
        >>> AnnoSpan(0, 13, doc).distance(AnnoSpan(4, 7, doc))
        -3
        """
        # The gap (or overlap, when negative) runs from the earlier of the two
        # ends to the later of the two starts. Using min/max also gives the
        # right answer when one span contains the other, where simply pairing
        # one span's start with the other's end over-counts the overlap.
        later_start = max(self.start, other_span.start)
        earlier_end = min(self.end, other_span.end)
        return later_start - earlier_end

    def overlaps(self, other_span):
        """
        Return true if the span overlaps other_span.
        """
        return (
            (self.start >= other_span.start and self.start < other_span.end) or
            (other_span.start >= self.start and other_span.start < self.end)
        )

    def contains(self, other_span):
        """
        Return true if the span completely contains other_span.
        """
        return self.start <= other_span.start and self.end >= other_span.end

    def adjacent_to(self, other_span, max_dist=1):
        """
        Return true if the span comes before or after other_span with at most
        max_dist characters between them.
        """
        return (
            self.comes_before(other_span, max_dist) or
            other_span.comes_before(self, max_dist)
        )

    def comes_before(self, other_span, max_dist=1, allow_overlap=False):
        """
        Return True if the span comes before the other_span and there are
        max_dist or fewer characters between them.

        If allow_overlap is True the span only has to start at or before the
        start of other_span; otherwise it has to end at or before the start
        of other_span.

        >>> from .annotier import AnnoTier
        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two three')
        >>> tier = AnnoTier([AnnoSpan(0, 3, doc), AnnoSpan(4, 7, doc)])
        >>> tier.spans[0].comes_before(tier.spans[1])
        True
        >>> tier.spans[1].comes_before(tier.spans[0])
        False
        """
        if allow_overlap:
            ok_start = self.start <= other_span.start
        else:
            ok_start = self.end <= other_span.start
        return ok_start and self.end >= other_span.start - max_dist

    def extended_through(self, other_span):
        """
        Create a new span that includes this one and the other span.
        The new span is a SpanGroup carrying this span's label.
        """
        return SpanGroup([self, other_span], self.label)

    def trimmed(self):
        """
        Create a new AnnoSpan based on this one with the offsets adjusted
        so that there are no space characters at the beginning or end.
        Only the space character (" ") is trimmed; other whitespace such as
        tabs or newlines is left in place.

        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two three')
        >>> original_span = AnnoSpan(3, 8, doc)
        >>> original_span.trimmed()
        AnnoSpan(4-7, two)
        """
        start = self.start
        end = self.end
        doc_text = self.doc.text
        # The start < end guards stop the scan on an all-space span, which
        # collapses to an empty span instead of running past its own bounds.
        while start < end and doc_text[start] == " ":
            start += 1
        while start < end and doc_text[end - 1] == " ":
            end -= 1
        return AnnoSpan(start, end, self.doc, label=self.label, metadata=self.metadata)

    @property
    def text(self):
        return self.doc.text[self.start:self.end]

    def to_dict(self):
        """
        Return a json serializable dictionary.
        """
        return dict(
            label=self.label,
            textOffsets=[[self.start, self.end]]
        )

    def groupdict(self):
        """
        Return a dict with all the labeled matches.
        Keys are labels and values are sorted lists of the spans with that
        label found among the base spans. If this span has a label, its entry
        is a list containing only this span.

        >>> from .annodoc import AnnoDoc
        >>> doc = AnnoDoc('one two wolf')
        >>> number_span_g = SpanGroup([AnnoSpan(0, 3, doc, 'number'),
        ...                            AnnoSpan(4, 7, doc, 'number'),
        ...                            AnnoSpan(8, 12, doc, 'animal')])
        >>> number_span_g.groupdict()['number']
        [AnnoSpan(0-3, number), AnnoSpan(4-7, number)]
        >>> number_span_g.groupdict()['animal']
        [AnnoSpan(8-12, animal)]
        """
        out = {}
        for base_span in self.base_spans:
            for key, values in base_span.groupdict().items():
                # Each list in out is created here, so extending it in place
                # never touches a list owned by a child's groupdict result.
                out.setdefault(key, []).extend(values)
        for values in out.values():
            values.sort()
        if self.label:
            # This span's own label replaces anything collected from below.
            out[self.label] = [self]
        return out

    def iterate_base_spans(self):
        """
        Recursively iterate over all base_spans including base_spans of child SpanGroups.
        Each span is yielded before the spans nested beneath it.
        """
        for span in self.base_spans:
            yield span
            for span2 in span.iterate_base_spans():
                yield span2

    def iterate_leaf_base_spans(self):
        """
        Return the leaf base spans in a SpanGroup tree.
        A leaf is any base span that is not a SpanGroup instance.
        """
        for span in self.iterate_base_spans():
            if not isinstance(span, SpanGroup):
                yield span
class SpanGroup(AnnoSpan):
    """
    An AnnoSpan that extends through a group of AnnoSpans.

    The group starts at the smallest start offset and ends at the largest
    end offset found among its base spans, and uses the doc of the first
    base span.
    """

    def __init__(self, base_spans, label=None, metadata=None):
        assert isinstance(base_spans, list)
        assert len(base_spans) > 0
        group_start = min(member.start for member in base_spans)
        group_end = max(member.end for member in base_spans)
        group_doc = base_spans[0].doc
        super(SpanGroup, self).__init__(
            group_start, group_end, group_doc, label, metadata)
        # Assigned after the parent initializer, which sets its own default
        # value for base_spans.
        self.base_spans = base_spans

    def __repr__(self):
        pieces = [
            "SpanGroup(",
            "text=", self.text, ", ",
            "label=", str(self.label), ", ",
            ", ".join(str(member) for member in self.base_spans),
            ")",
        ]
        return "".join(pieces)