From cbcb728e7ea0031d25738297f906a7f247fa2f75 Mon Sep 17 00:00:00 2001 From: Brian Warner Date: Wed, 4 Aug 2010 00:26:00 -0700 Subject: [PATCH] Add a byte-spans utility class, like perl's Set::IntSpan for .newsrc files. Also a data-spans class, which records a byte (instead of a bit) for each index. --- src/allmydata/test/test_util.py | 565 ++++++++++++++++++++++++++++++++ src/allmydata/util/spans.py | 431 ++++++++++++++++++++++++ 2 files changed, 996 insertions(+) create mode 100755 src/allmydata/util/spans.py diff --git a/src/allmydata/test/test_util.py b/src/allmydata/test/test_util.py index 6ba9ff66..48d4711d 100644 --- a/src/allmydata/test/test_util.py +++ b/src/allmydata/test/test_util.py @@ -7,12 +7,14 @@ from twisted.trial import unittest from twisted.internet import defer, reactor from twisted.python.failure import Failure from twisted.python import log +from hashlib import md5 from allmydata.util import base32, idlib, humanreadable, mathutil, hashutil from allmydata.util import assertutil, fileutil, deferredutil, abbreviate from allmydata.util import limiter, time_format, pollmixin, cachedir from allmydata.util import statistics, dictutil, pipeline from allmydata.util import log as tahoe_log +from allmydata.util.spans import Spans, overlap, DataSpans class Base32(unittest.TestCase): def test_b2a_matches_Pythons(self): @@ -1568,3 +1570,566 @@ class Log(unittest.TestCase): tahoe_log.err(format="intentional sample error", failure=f, level=tahoe_log.OPERATIONAL, umid="wO9UoQ") self.flushLoggedErrors(SampleError) + + +class SimpleSpans: + # this is a simple+inefficient form of util.spans.Spans . We compare the + # behavior of this reference model against the real (efficient) form. + + def __init__(self, _span_or_start=None, length=None): + self._have = set() + if length is not None: + for i in range(_span_or_start, _span_or_start+length): + self._have.add(i) + elif _span_or_start: + for (start,length) in _span_or_start: + self.add(start, length) + + def add(self, start, length): + for i in range(start, start+length): + self._have.add(i) + return self + + def remove(self, start, length): + for i in range(start, start+length): + self._have.discard(i) + return self + + def each(self): + return sorted(self._have) + + def __iter__(self): + items = sorted(self._have) + prevstart = None + prevend = None + for i in items: + if prevstart is None: + prevstart = prevend = i + continue + if i == prevend+1: + prevend = i + continue + yield (prevstart, prevend-prevstart+1) + prevstart = prevend = i + if prevstart is not None: + yield (prevstart, prevend-prevstart+1) + + def __len__(self): + # this also gets us bool(s) + return len(self._have) + + def __add__(self, other): + s = self.__class__(self) + for (start, length) in other: + s.add(start, length) + return s + + def __sub__(self, other): + s = self.__class__(self) + for (start, length) in other: + s.remove(start, length) + return s + + def __iadd__(self, other): + for (start, length) in other: + self.add(start, length) + return self + + def __isub__(self, other): + for (start, length) in other: + self.remove(start, length) + return self + + def __and__(self, other): + s = self.__class__() + for i in other.each(): + if i in self._have: + s.add(i, 1) + return s + + def __contains__(self, (start,length)): + for i in range(start, start+length): + if i not in self._have: + return False + return True + +class ByteSpans(unittest.TestCase): + def test_basic(self): + s = Spans() + self.failUnlessEqual(list(s), []) + self.failIf(s) + self.failIf((0,1) in s) + self.failUnlessEqual(len(s), 0) + + s1 = Spans(3, 4) # 3,4,5,6 + self._check1(s1) + + s2 = Spans(s1) + self._check1(s2) + + s2.add(10,2) # 10,11 + self._check1(s1) + self.failUnless((10,1) in s2) + self.failIf((10,1) in s1) + self.failUnlessEqual(list(s2.each()), [3,4,5,6,10,11]) + self.failUnlessEqual(len(s2), 6) + + s2.add(15,2).add(20,2) + self.failUnlessEqual(list(s2.each()), [3,4,5,6,10,11,15,16,20,21]) + self.failUnlessEqual(len(s2), 10) + + s2.remove(4,3).remove(15,1) + self.failUnlessEqual(list(s2.each()), [3,10,11,16,20,21]) + self.failUnlessEqual(len(s2), 6) + + s1 = SimpleSpans(3, 4) # 3 4 5 6 + s2 = SimpleSpans(5, 4) # 5 6 7 8 + i = s1 & s2 + self.failUnlessEqual(list(i.each()), [5, 6]) + + def _check1(self, s): + self.failUnlessEqual(list(s), [(3,4)]) + self.failUnless(s) + self.failUnlessEqual(len(s), 4) + self.failIf((0,1) in s) + self.failUnless((3,4) in s) + self.failUnless((3,1) in s) + self.failUnless((5,2) in s) + self.failUnless((6,1) in s) + self.failIf((6,2) in s) + self.failIf((7,1) in s) + self.failUnlessEqual(list(s.each()), [3,4,5,6]) + + def test_math(self): + s1 = Spans(0, 10) # 0,1,2,3,4,5,6,7,8,9 + s2 = Spans(5, 3) # 5,6,7 + s3 = Spans(8, 4) # 8,9,10,11 + + s = s1 - s2 + self.failUnlessEqual(list(s.each()), [0,1,2,3,4,8,9]) + s = s1 - s3 + self.failUnlessEqual(list(s.each()), [0,1,2,3,4,5,6,7]) + s = s2 - s3 + self.failUnlessEqual(list(s.each()), [5,6,7]) + s = s1 & s2 + self.failUnlessEqual(list(s.each()), [5,6,7]) + s = s2 & s1 + self.failUnlessEqual(list(s.each()), [5,6,7]) + s = s1 & s3 + self.failUnlessEqual(list(s.each()), [8,9]) + s = s3 & s1 + self.failUnlessEqual(list(s.each()), [8,9]) + s = s2 & s3 + self.failUnlessEqual(list(s.each()), []) + s = s3 & s2 + self.failUnlessEqual(list(s.each()), []) + s = Spans() & s3 + self.failUnlessEqual(list(s.each()), []) + s = s3 & Spans() + self.failUnlessEqual(list(s.each()), []) + + s = s1 + s2 + self.failUnlessEqual(list(s.each()), [0,1,2,3,4,5,6,7,8,9]) + s = s1 + s3 + self.failUnlessEqual(list(s.each()), [0,1,2,3,4,5,6,7,8,9,10,11]) + s = s2 + s3 + self.failUnlessEqual(list(s.each()), [5,6,7,8,9,10,11]) + + s = Spans(s1) + s -= s2 + self.failUnlessEqual(list(s.each()), [0,1,2,3,4,8,9]) + s = Spans(s1) + s -= s3 + self.failUnlessEqual(list(s.each()), [0,1,2,3,4,5,6,7]) + s = Spans(s2) + s -= s3 + self.failUnlessEqual(list(s.each()), [5,6,7]) + + s = Spans(s1) + s += s2 + self.failUnlessEqual(list(s.each()), [0,1,2,3,4,5,6,7,8,9]) + s = Spans(s1) + s += s3 + self.failUnlessEqual(list(s.each()), [0,1,2,3,4,5,6,7,8,9,10,11]) + s = Spans(s2) + s += s3 + self.failUnlessEqual(list(s.each()), [5,6,7,8,9,10,11]) + + def test_random(self): + # attempt to increase coverage of corner cases by comparing behavior + # of a simple-but-slow model implementation against the + # complex-but-fast actual implementation, in a large number of random + # operations + S1 = SimpleSpans + S2 = Spans + s1 = S1(); s2 = S2() + seed = "" + def _create(subseed): + ns1 = S1(); ns2 = S2() + for i in range(10): + what = md5(subseed+str(i)).hexdigest() + start = int(what[2:4], 16) + length = max(1,int(what[5:6], 16)) + ns1.add(start, length); ns2.add(start, length) + return ns1, ns2 + + #print + for i in range(1000): + what = md5(seed+str(i)).hexdigest() + op = what[0] + subop = what[1] + start = int(what[2:4], 16) + length = max(1,int(what[5:6], 16)) + #print what + if op in "0": + if subop in "01234": + s1 = S1(); s2 = S2() + elif subop in "5678": + s1 = S1(start, length); s2 = S2(start, length) + else: + s1 = S1(s1); s2 = S2(s2) + #print "s2 = %s" % s2.dump() + elif op in "123": + #print "s2.add(%d,%d)" % (start, length) + s1.add(start, length); s2.add(start, length) + elif op in "456": + #print "s2.remove(%d,%d)" % (start, length) + s1.remove(start, length); s2.remove(start, length) + elif op in "78": + ns1, ns2 = _create(what[7:11]) + #print "s2 + %s" % ns2.dump() + s1 = s1 + ns1; s2 = s2 + ns2 + elif op in "9a": + ns1, ns2 = _create(what[7:11]) + #print "%s - %s" % (s2.dump(), ns2.dump()) + s1 = s1 - ns1; s2 = s2 - ns2 + elif op in "bc": + ns1, ns2 = _create(what[7:11]) + #print "s2 += %s" % ns2.dump() + s1 += ns1; s2 += ns2 + elif op in "de": + ns1, ns2 = _create(what[7:11]) + #print "%s -= %s" % (s2.dump(), ns2.dump()) + s1 -= ns1; s2 -= ns2 + else: + ns1, ns2 = _create(what[7:11]) + #print "%s &= %s" % (s2.dump(), ns2.dump()) + s1 = s1 & ns1; s2 = s2 & ns2 + #print "s2 now %s" % s2.dump() + self.failUnlessEqual(list(s1.each()), list(s2.each())) + self.failUnlessEqual(len(s1), len(s2)) + self.failUnlessEqual(bool(s1), bool(s2)) + self.failUnlessEqual(list(s1), list(s2)) + for j in range(10): + what = md5(what[12:14]+str(j)).hexdigest() + start = int(what[2:4], 16) + length = max(1, int(what[5:6], 16)) + span = (start, length) + self.failUnlessEqual(bool(span in s1), bool(span in s2)) + + + # s() + # s(start,length) + # s(s0) + # s.add(start,length) : returns s + # s.remove(start,length) + # s.each() -> list of byte offsets, mostly for testing + # list(s) -> list of (start,length) tuples, one per span + # (start,length) in s -> True if (start..start+length-1) are all members + # NOT equivalent to x in list(s) + # len(s) -> number of bytes, for testing, bool(), and accounting/limiting + # bool(s) (__len__) + # s = s1+s2, s1-s2, +=s1, -=s1 + + def test_overlap(self): + for a in range(20): + for b in range(10): + for c in range(20): + for d in range(10): + self._test_overlap(a,b,c,d) + + def _test_overlap(self, a, b, c, d): + s1 = set(range(a,a+b)) + s2 = set(range(c,c+d)) + #print "---" + #self._show_overlap(s1, "1") + #self._show_overlap(s2, "2") + o = overlap(a,b,c,d) + expected = s1.intersection(s2) + if not expected: + self.failUnlessEqual(o, None) + else: + start,length = o + so = set(range(start,start+length)) + #self._show(so, "o") + self.failUnlessEqual(so, expected) + + def _show_overlap(self, s, c): + import sys + out = sys.stdout + if s: + for i in range(max(s)): + if i in s: + out.write(c) + else: + out.write(" ") + out.write("\n") + +def extend(s, start, length, fill): + if len(s) >= start+length: + return s + assert len(fill) == 1 + return s + fill*(start+length-len(s)) + +def replace(s, start, data): + assert len(s) >= start+len(data) + return s[:start] + data + s[start+len(data):] + +class SimpleDataSpans: + def __init__(self, other=None): + self.missing = "" # "1" where missing, "0" where found + self.data = "" + if other: + for (start, data) in other.get_chunks(): + self.add(start, data) + + def __len__(self): + return len(self.missing.translate(None, "1")) + def _dump(self): + return [i for (i,c) in enumerate(self.missing) if c == "0"] + def _have(self, start, length): + m = self.missing[start:start+length] + if not m or len(m) prev_end + prev_end = start+length + except AssertionError: + print "BAD:", self.dump() + raise + + def add(self, start, length): + assert start >= 0 + assert length > 0 + #print " ADD [%d+%d -%d) to %s" % (start, length, start+length, self.dump()) + first_overlap = last_overlap = None + for i,(s_start,s_length) in enumerate(self._spans): + #print " (%d+%d)-> overlap=%s adjacent=%s" % (s_start,s_length, overlap(s_start, s_length, start, length), adjacent(s_start, s_length, start, length)) + if (overlap(s_start, s_length, start, length) + or adjacent(s_start, s_length, start, length)): + last_overlap = i + if first_overlap is None: + first_overlap = i + continue + # no overlap + if first_overlap is not None: + break + #print " first_overlap", first_overlap, last_overlap + if first_overlap is None: + # no overlap, so just insert the span and sort by starting + # position. + self._spans.insert(0, (start,length)) + self._spans.sort() + else: + # everything from [first_overlap] to [last_overlap] overlapped + first_start,first_length = self._spans[first_overlap] + last_start,last_length = self._spans[last_overlap] + newspan_start = min(start, first_start) + newspan_end = max(start+length, last_start+last_length) + newspan_length = newspan_end - newspan_start + newspan = (newspan_start, newspan_length) + self._spans[first_overlap:last_overlap+1] = [newspan] + #print " ADD done: %s" % self.dump() + self._check() + + return self + + def remove(self, start, length): + assert start >= 0 + assert length > 0 + #print " REMOVE [%d+%d -%d) from %s" % (start, length, start+length, self.dump()) + first_complete_overlap = last_complete_overlap = None + for i,(s_start,s_length) in enumerate(self._spans): + s_end = s_start + s_length + o = overlap(s_start, s_length, start, length) + if o: + o_start, o_length = o + o_end = o_start+o_length + if o_start == s_start and o_end == s_end: + # delete this span altogether + if first_complete_overlap is None: + first_complete_overlap = i + last_complete_overlap = i + elif o_start == s_start: + # we only overlap the left side, so trim the start + # 1111 + # rrrr + # oo + # -> 11 + new_start = o_end + new_end = s_end + assert new_start > s_start + new_length = new_end - new_start + self._spans[i] = (new_start, new_length) + elif o_end == s_end: + # we only overlap the right side + # 1111 + # rrrr + # oo + # -> 11 + new_start = s_start + new_end = o_start + assert new_end < s_end + new_length = new_end - new_start + self._spans[i] = (new_start, new_length) + else: + # we overlap the middle, so create a new span. No need to + # examine any other spans. + # 111111 + # rr + # LL RR + left_start = s_start + left_end = o_start + left_length = left_end - left_start + right_start = o_end + right_end = s_end + right_length = right_end - right_start + self._spans[i] = (left_start, left_length) + self._spans.append( (right_start, right_length) ) + self._spans.sort() + break + if first_complete_overlap is not None: + del self._spans[first_complete_overlap:last_complete_overlap+1] + #print " REMOVE done: %s" % self.dump() + self._check() + return self + + def dump(self): + return "len=%d: %s" % (len(self), + ",".join(["[%d-%d]" % (start,start+l-1) + for (start,l) in self._spans]) ) + + def each(self): + for start, length in self._spans: + for i in range(start, start+length): + yield i + + def __iter__(self): + for s in self._spans: + yield s + + def __len__(self): + # this also gets us bool(s) + return sum([length for start,length in self._spans]) + + def __add__(self, other): + s = self.__class__(self) + for (start, length) in other: + s.add(start, length) + return s + + def __sub__(self, other): + s = self.__class__(self) + for (start, length) in other: + s.remove(start, length) + return s + + def __iadd__(self, other): + for (start, length) in other: + self.add(start, length) + return self + + def __isub__(self, other): + for (start, length) in other: + self.remove(start, length) + return self + + def __and__(self, other): + if not self._spans: + return self.__class__() + bounds = self.__class__(self._spans[0][0], + self._spans[-1][0]+self._spans[-1][1]) + not_other = bounds - other + return self - not_other + + def __contains__(self, (start,length)): + for span_start,span_length in self._spans: + o = overlap(start, length, span_start, span_length) + if o: + o_start,o_length = o + if o_start == start and o_length == length: + return True + return False + +def overlap(start0, length0, start1, length1): + # return start2,length2 of the overlapping region, or None + # 00 00 000 0000 00 00 000 00 00 00 00 + # 11 11 11 11 111 11 11 1111 111 11 11 + left = max(start0, start1) + right = min(start0+length0, start1+length1) + # if there is overlap, 'left' will be its start, and right-1 will + # be the end' + if left < right: + return (left, right-left) + return None + +def adjacent(start0, length0, start1, length1): + if (start0 < start1) and start0+length0 == start1: + return True + elif (start1 < start0) and start1+length1 == start0: + return True + return False + +class DataSpans: + """I represent portions of a large string. Equivalently, I can be said to + maintain a large array of characters (with gaps of empty elements). I can + be used to manage access to a remote share, where some pieces have been + retrieved, some have been requested, and others have not been read. + """ + + def __init__(self, other=None): + self.spans = [] # (start, data) tuples, non-overlapping, merged + if other: + for (start, data) in other.get_chunks(): + self.add(start, data) + + def __len__(self): + # return number of bytes we're holding + return sum([len(data) for (start,data) in self.spans]) + + def _dump(self): + # return iterator of sorted list of offsets, one per byte + for (start,data) in self.spans: + for i in range(start, start+len(data)): + yield i + + def dump(self): + return "len=%d: %s" % (len(self), + ",".join(["[%d-%d]" % (start,start+len(data)-1) + for (start,data) in self.spans]) ) + + def get_chunks(self): + return list(self.spans) + + def get_spans(self): + """Return a Spans object with a bit set for each byte I hold""" + return Spans([(start, len(data)) for (start,data) in self.spans]) + + def assert_invariants(self): + if not self.spans: + return + prev_start = self.spans[0][0] + prev_end = prev_start + len(self.spans[0][1]) + for start, data in self.spans[1:]: + if not start > prev_end: + # adjacent or overlapping: bad + print "ASSERTION FAILED", self.spans + raise AssertionError + + def get(self, start, length): + # returns a string of LENGTH, or None + #print "get", start, length, self.spans + end = start+length + for (s_start,s_data) in self.spans: + s_end = s_start+len(s_data) + #print " ",s_start,s_end + if s_start <= start < s_end: + # we want some data from this span. Because we maintain + # strictly merged and non-overlapping spans, everything we + # want must be in this span. + offset = start - s_start + if offset + length > len(s_data): + #print " None, span falls short" + return None # span falls short + #print " some", s_data[offset:offset+length] + return s_data[offset:offset+length] + if s_start >= end: + # we've gone too far: no further spans will overlap + #print " None, gone too far" + return None + #print " None, ran out of spans" + return None + + def add(self, start, data): + # first: walk through existing spans, find overlap, modify-in-place + # create list of new spans + # add new spans + # sort + # merge adjacent spans + #print "add", start, data, self.spans + end = start + len(data) + i = 0 + while len(data): + #print " loop", start, data, i, len(self.spans), self.spans + if i >= len(self.spans): + #print " append and done" + # append a last span + self.spans.append( (start, data) ) + break + (s_start,s_data) = self.spans[i] + # five basic cases: + # a: OLD b:OLDD c1:OLD c2:OLD d1:OLDD d2:OLD e: OLLDD + # NEW NEW NEW NEWW NEW NEW NEW + # + # we handle A by inserting a new segment (with "N") and looping, + # turning it into B or C. We handle B by replacing a prefix and + # terminating. We handle C (both c1 and c2) by replacing the + # segment (and, for c2, looping, turning it into A). We handle D + # by replacing a suffix (and, for d2, looping, turning it into + # A). We handle E by replacing the middle and terminating. + if start < s_start: + # case A: insert a new span, then loop with the remainder + #print " insert new psan" + s_len = s_start-start + self.spans.insert(i, (start, data[:s_len])) + i += 1 + start = s_start + data = data[s_len:] + continue + s_len = len(s_data) + s_end = s_start+s_len + if s_start <= start < s_end: + #print " modify this span", s_start, start, s_end + # we want to modify some data in this span: a prefix, a + # suffix, or the whole thing + if s_start == start: + if s_end <= end: + #print " replace whole segment" + # case C: replace this segment + self.spans[i] = (s_start, data[:s_len]) + i += 1 + start += s_len + data = data[s_len:] + # C2 is where len(data)>0 + continue + # case B: modify the prefix, retain the suffix + #print " modify prefix" + self.spans[i] = (s_start, data + s_data[len(data):]) + break + if start > s_start and end < s_end: + # case E: modify the middle + #print " modify middle" + prefix_len = start - s_start # we retain this much + suffix_len = s_end - end # and retain this much + newdata = s_data[:prefix_len] + data + s_data[-suffix_len:] + self.spans[i] = (s_start, newdata) + break + # case D: retain the prefix, modify the suffix + #print " modify suffix" + prefix_len = start - s_start # we retain this much + suffix_len = s_len - prefix_len # we replace this much + #print " ", s_data, prefix_len, suffix_len, s_len, data + self.spans[i] = (s_start, + s_data[:prefix_len] + data[:suffix_len]) + i += 1 + start += suffix_len + data = data[suffix_len:] + #print " now", start, data + # D2 is where len(data)>0 + continue + # else we're not there yet + #print " still looking" + i += 1 + continue + # now merge adjacent spans + #print " merging", self.spans + newspans = [] + for (s_start,s_data) in self.spans: + if newspans and adjacent(newspans[-1][0], len(newspans[-1][1]), + s_start, len(s_data)): + newspans[-1] = (newspans[-1][0], newspans[-1][1] + s_data) + else: + newspans.append( (s_start, s_data) ) + self.spans = newspans + self.assert_invariants() + #print " done", self.spans + + def remove(self, start, length): + i = 0 + end = start + length + #print "remove", start, length, self.spans + while i < len(self.spans): + (s_start,s_data) = self.spans[i] + if s_start >= end: + # this segment is entirely right of the removed region, and + # all further segments are even further right. We're done. + break + s_len = len(s_data) + s_end = s_start + s_len + o = overlap(start, length, s_start, s_len) + if not o: + i += 1 + continue + o_start, o_len = o + o_end = o_start + o_len + if o_len == s_len: + # remove the whole segment + del self.spans[i] + continue + if o_start == s_start: + # remove a prefix, leaving the suffix from o_end to s_end + prefix_len = o_end - o_start + self.spans[i] = (o_end, s_data[prefix_len:]) + i += 1 + continue + elif o_end == s_end: + # remove a suffix, leaving the prefix from s_start to o_start + prefix_len = o_start - s_start + self.spans[i] = (s_start, s_data[:prefix_len]) + i += 1 + continue + # remove the middle, creating a new segment + # left is s_start:o_start, right is o_end:s_end + left_len = o_start - s_start + left = s_data[:left_len] + right_len = s_end - o_end + right = s_data[-right_len:] + self.spans[i] = (s_start, left) + self.spans.insert(i+1, (o_end, right)) + break + #print " done", self.spans + + def pop(self, start, length): + data = self.get(start, length) + if data: + self.remove(start, length) + return data -- 2.45.2