zfec: rearrange files

[tahoe-lafs/zfec.git] / zfec / zfec / filefec.py
diff --git a/zfec/zfec/filefec.py b/zfec/zfec/filefec.py

deleted file mode 100644 (file)

index 48f3d9d..0000000
--- a/zfec/zfec/filefec.py
+++ /dev/null
@@ -1,504 +0,0 @@
-import easyfec, zfec
-from pyutil import fileutil
-from pyutil.mathutil import pad_size, log_ceil
-
-import array, os, struct
-
-CHUNKSIZE = 4096
-
-from base64 import b32encode
-def ab(x): # debuggery
-    if len(x) >= 3:
-        return "%s:%s" % (len(x), b32encode(x[-3:]),)
-    elif len(x) == 2:
-        return "%s:%s" % (len(x), b32encode(x[-2:]),)
-    elif len(x) == 1:
-        return "%s:%s" % (len(x), b32encode(x[-1:]),)
-    elif len(x) == 0:
-        return "%s:%s" % (len(x), "--empty--",)
-
-class InsufficientShareFilesError(zfec.Error):
-    def __init__(self, k, kb, *args, **kwargs):
-        zfec.Error.__init__(self, *args, **kwargs)
-        self.k = k
-        self.kb = kb
-
-    def __repr__(self):
-        return "Insufficient share files -- %d share files are required to recover this file, but only %d were given" % (self.k, self.kb,)
-
-    def __str__(self):
-        return self.__repr__()
-
-class CorruptedShareFilesError(zfec.Error):
-    pass
-
-def _build_header(m, k, pad, sh):
-    """
-    @param m: the total number of shares; 1 <= m <= 256
-    @param k: the number of shares required to reconstruct; 1 <= k <= m
-    @param pad: the number of bytes of padding added to the file before encoding; 0 <= pad < k
-    @param sh: the shnum of this share; 0 <= k < m
-
-    @return: a compressed string encoding m, k, pad, and sh
-    """
-    assert m >= 1
-    assert m <= 2**8
-    assert k >= 1
-    assert k <= m
-    assert pad >= 0
-    assert pad < k
-
-    assert sh >= 0
-    assert sh < m
-
-    bitsused = 0
-    val = 0
-
-    val |= (m - 1)
-    bitsused += 8 # the first 8 bits always encode m
-
-    kbits = log_ceil(m, 2) # num bits needed to store all possible values of k
-    val <<= kbits
-    bitsused += kbits
-
-    val |= (k - 1)
-
-    padbits = log_ceil(k, 2) # num bits needed to store all possible values of pad
-    val <<= padbits
-    bitsused += padbits
-
-    val |= pad
-
-    shnumbits = log_ceil(m, 2) # num bits needed to store all possible values of shnum
-    val <<= shnumbits
-    bitsused += shnumbits
-
-    val |= sh
-
-    assert bitsused >= 8, bitsused
-    assert bitsused <= 32, bitsused
-
-    if bitsused <= 16:
-        val <<= (16-bitsused)
-        cs = struct.pack('>H', val)
-        assert cs[:-2] == '\x00' * (len(cs)-2)
-        return cs[-2:]
-    if bitsused <= 24:
-        val <<= (24-bitsused)
-        cs = struct.pack('>I', val)
-        assert cs[:-3] == '\x00' * (len(cs)-3)
-        return cs[-3:]
-    else:
-        val <<= (32-bitsused)
-        cs = struct.pack('>I', val)
-        assert cs[:-4] == '\x00' * (len(cs)-4)
-        return cs[-4:]
-
-def MASK(bits):
-    return (1<<bits)-1
-
-def _parse_header(inf):
-    """
-    @param inf: an object which I can call read(1) on to get another byte
-
-    @return: tuple of (m, k, pad, sh,); side-effect: the first one to four
-        bytes of inf will be read
-    """
-    # The first 8 bits always encode m.
-    ch = inf.read(1)
-    if not ch:
-        raise CorruptedShareFilesError("Share files were corrupted -- share file %r didn't have a complete metadata header at the front.  Perhaps the file was truncated." % (inf.name,))
-    byte = ord(ch)
-    m = byte + 1
-
-    # The next few bits encode k.
-    kbits = log_ceil(m, 2) # num bits needed to store all possible values of k
-    b2_bits_left = 8-kbits
-    kbitmask = MASK(kbits) << b2_bits_left
-    ch = inf.read(1)
-    if not ch:
-        raise CorruptedShareFilesError("Share files were corrupted -- share file %r didn't have a complete metadata header at the front.  Perhaps the file was truncated." % (inf.name,))
-    byte = ord(ch)
-    k = ((byte & kbitmask) >> b2_bits_left) + 1
-
-    shbits = log_ceil(m, 2) # num bits needed to store all possible values of shnum
-    padbits = log_ceil(k, 2) # num bits needed to store all possible values of pad
-
-    val = byte & (~kbitmask)
-
-    needed_padbits = padbits - b2_bits_left
-    if needed_padbits > 0:
-        ch = inf.read(1)
-        if not ch:
-            raise CorruptedShareFilesError("Share files were corrupted -- share file %r didn't have a complete metadata header at the front.  Perhaps the file was truncated." % (inf.name,))
-        byte = struct.unpack(">B", ch)[0]
-        val <<= 8
-        val |= byte
-        needed_padbits -= 8
-    assert needed_padbits <= 0
-    extrabits = -needed_padbits
-    pad = val >> extrabits
-    val &= MASK(extrabits)
-
-    needed_shbits = shbits - extrabits
-    if needed_shbits > 0:
-        ch = inf.read(1)
-        if not ch:
-            raise CorruptedShareFilesError("Share files were corrupted -- share file %r didn't have a complete metadata header at the front.  Perhaps the file was truncated." % (inf.name,))
-        byte = struct.unpack(">B", ch)[0]
-        val <<= 8
-        val |= byte
-        needed_shbits -= 8
-    assert needed_shbits <= 0
-
-    gotshbits = -needed_shbits
-
-    sh = val >> gotshbits
-
-    return (m, k, pad, sh,)
-
-FORMAT_FORMAT = "%%s.%%0%dd_%%0%dd%%s"
-RE_FORMAT = "%s.[0-9]+_[0-9]+%s"
-def encode_to_files(inf, fsize, dirname, prefix, k, m, suffix=".fec", overwrite=False, verbose=False):
-    """
-    Encode inf, writing the shares to specially named, newly created files.
-
-    @param fsize: calling read() on inf must yield fsize bytes of data and
-        then raise an EOFError
-    @param dirname: the name of the directory into which the sharefiles will
-        be written
-    """
-    mlen = len(str(m))
-    format = FORMAT_FORMAT % (mlen, mlen,)
-
-    padbytes = pad_size(fsize, k)
-
-    fns = []
-    fs = []
-    try:
-        for shnum in range(m):
-            hdr = _build_header(m, k, padbytes, shnum)
-
-            fn = os.path.join(dirname, format % (prefix, shnum, m, suffix,))
-            if verbose:
-                print "Creating share file %r..." % (fn,)
-            if overwrite:
-                f = open(fn, "wb")
-            else:
-                flags = os.O_WRONLY|os.O_CREAT|os.O_EXCL | (hasattr(os, 'O_BINARY') and os.O_BINARY)
-                fd = os.open(fn, flags)
-                f = os.fdopen(fd, "wb")
-            f.write(hdr)
-            fs.append(f)
-            fns.append(fn)
-        sumlen = [0]
-        def cb(blocks, length):
-            assert len(blocks) == len(fs)
-            oldsumlen = sumlen[0]
-            sumlen[0] += length
-            if verbose:
-                if int((float(oldsumlen) / fsize) * 10) != int((float(sumlen[0]) / fsize) * 10):
-                    print str(int((float(sumlen[0]) / fsize) * 10) * 10) + "% ...",
-
-            if sumlen[0] > fsize:
-                raise IOError("Wrong file size -- possibly the size of the file changed during encoding.  Original size: %d, observed size at least: %s" % (fsize, sumlen[0],))
-            for i in range(len(blocks)):
-                data = blocks[i]
-                fs[i].write(data)
-                length -= len(data)
-
-        encode_file_stringy_easyfec(inf, cb, k, m, chunksize=4096)
-    except EnvironmentError, le:
-        print "Cannot complete because of exception: "
-        print le
-        print "Cleaning up..."
-        # clean up
-        while fs:
-            f = fs.pop()
-            f.close() ; del f
-            fn = fns.pop()
-            if verbose:
-                print "Cleaning up: trying to remove %r..." % (fn,)
-            fileutil.remove_if_possible(fn)
-        return 1
-    if verbose:
-        print
-        print "Done!"
-    return 0
-
-# Note: if you really prefer base-2 and you change this code, then please
-# denote 2^20 as "MiB" instead of "MB" in order to avoid ambiguity.  See:
-# http://en.wikipedia.org/wiki/Megabyte
-# Thanks.
-MILLION_BYTES=10**6
-
-def decode_from_files(outf, infiles, verbose=False):
-    """
-    Decode from the first k files in infiles, writing the results to outf.
-    """
-    assert len(infiles) >= 2
-    infs = []
-    shnums = []
-    m = None
-    k = None
-    padlen = None
-
-    byteswritten = 0
-    for f in infiles:
-        (nm, nk, npadlen, shnum,) = _parse_header(f)
-        if not (m is None or m == nm):
-            raise CorruptedShareFilesError("Share files were corrupted -- share file %r said that m was %s but another share file previously said that m was %s" % (f.name, nm, m,))
-        m = nm
-        if not (k is None or k == nk):
-            raise CorruptedShareFilesError("Share files were corrupted -- share file %r said that k was %s but another share file previously said that k was %s" % (f.name, nk, k,))
-        if k > len(infiles):
-            raise InsufficientShareFilesError(k, len(infiles))
-        k = nk
-        if not (padlen is None or padlen == npadlen):
-            raise CorruptedShareFilesError("Share files were corrupted -- share file %r said that pad length was %s but another share file previously said that pad length was %s" % (f.name, npadlen, padlen,))
-        padlen = npadlen
-
-        infs.append(f)
-        shnums.append(shnum)
-
-        if len(infs) == k:
-            break
-
-    dec = easyfec.Decoder(k, m)
-
-    while True:
-        chunks = [ inf.read(CHUNKSIZE) for inf in infs ]
-        if [ch for ch in chunks if len(ch) != len(chunks[-1])]:
-            raise CorruptedShareFilesError("Share files were corrupted -- all share files are required to be the same length, but they weren't.")
-
-        if len(chunks[-1]) == CHUNKSIZE:
-            # Then this was a full read, so we're still in the sharefiles.
-            resultdata = dec.decode(chunks, shnums, padlen=0)
-            outf.write(resultdata)
-            byteswritten += len(resultdata)
-            if verbose:
-                if ((byteswritten - len(resultdata)) / (10*MILLION_BYTES)) != (byteswritten / (10*MILLION_BYTES)):
-                    print str(byteswritten / MILLION_BYTES) + " MB ...",
-        else:
-            # Then this was a short read, so we've reached the end of the sharefiles.
-            resultdata = dec.decode(chunks, shnums, padlen)
-            outf.write(resultdata)
-            return # Done.
-    if verbose:
-        print
-        print "Done!"
-
-def encode_file(inf, cb, k, m, chunksize=4096):
-    """
-    Read in the contents of inf, encode, and call cb with the results.
-
-    First, k "input blocks" will be read from inf, each input block being of
-    size chunksize.  Then these k blocks will be encoded into m "result
-    blocks".  Then cb will be invoked, passing a list of the m result blocks
-    as its first argument, and the length of the encoded data as its second
-    argument.  (The length of the encoded data is always equal to k*chunksize,
-    until the last iteration, when the end of the file has been reached and
-    less than k*chunksize bytes could be read from the file.)  This procedure
-    is iterated until the end of the file is reached, in which case the space
-    of the input blocks that is unused is filled with zeroes before encoding.
-
-    Note that the sequence passed in calls to cb() contains mutable array
-    objects in its first k elements whose contents will be overwritten when
-    the next segment is read from the input file.  Therefore the
-    implementation of cb() has to either be finished with those first k arrays
-    before returning, or if it wants to keep the contents of those arrays for
-    subsequent use after it has returned then it must make a copy of them to
-    keep.
-
-    @param inf the file object from which to read the data
-    @param cb the callback to be invoked with the results
-    @param k the number of shares required to reconstruct the file
-    @param m the total number of shares created
-    @param chunksize how much data to read from inf for each of the k input
-        blocks
-    """
-    enc = zfec.Encoder(k, m)
-    l = tuple([ array.array('c') for i in range(k) ])
-    indatasize = k*chunksize # will be reset to shorter upon EOF
-    eof = False
-    ZEROES=array.array('c', ['\x00'])*chunksize
-    while not eof:
-        # This loop body executes once per segment.
-        i = 0
-        while (i<len(l)):
-            # This loop body executes once per chunk.
-            a = l[i]
-            del a[:]
-            try:
-                a.fromfile(inf, chunksize)
-                i += 1
-            except EOFError:
-                eof = True
-                indatasize = i*chunksize + len(a)
-
-                # padding
-                a.fromstring("\x00" * (chunksize-len(a)))
-                i += 1
-                while (i<len(l)):
-                    a = l[i]
-                    a[:] = ZEROES
-                    i += 1
-
-        res = enc.encode(l)
-        cb(res, indatasize)
-
-try:
-    from hashlib import sha1
-    sha1 = sha1 # hush pyflakes
-except ImportError:
-    # hashlib was added in Python 2.5.0.
-    import sha
-    sha1 = sha
-
-def encode_file_not_really(inf, cb, k, m, chunksize=4096):
-    enc = zfec.Encoder(k, m)
-    l = tuple([ array.array('c') for i in range(k) ])
-    indatasize = k*chunksize # will be reset to shorter upon EOF
-    eof = False
-    ZEROES=array.array('c', ['\x00'])*chunksize
-    while not eof:
-        # This loop body executes once per segment.
-        i = 0
-        while (i<len(l)):
-            # This loop body executes once per chunk.
-            a = l[i]
-            del a[:]
-            try:
-                a.fromfile(inf, chunksize)
-                i += 1
-            except EOFError:
-                eof = True
-                indatasize = i*chunksize + len(a)
-
-                # padding
-                a.fromstring("\x00" * (chunksize-len(a)))
-                i += 1
-                while (i<len(l)):
-                    a = l[i]
-                    a[:] = ZEROES
-                    i += 1
-
-        # res = enc.encode(l)
-        cb(None, None)
-
-def encode_file_not_really_and_hash(inf, cb, k, m, chunksize=4096):
-    hasher = sha1.new()
-    enc = zfec.Encoder(k, m)
-    l = tuple([ array.array('c') for i in range(k) ])
-    indatasize = k*chunksize # will be reset to shorter upon EOF
-    eof = False
-    ZEROES=array.array('c', ['\x00'])*chunksize
-    while not eof:
-        # This loop body executes once per segment.
-        i = 0
-        while (i<len(l)):
-            # This loop body executes once per chunk.
-            a = l[i]
-            del a[:]
-            try:
-                a.fromfile(inf, chunksize)
-                i += 1
-            except EOFError:
-                eof = True
-                indatasize = i*chunksize + len(a)
-
-                # padding
-                a.fromstring("\x00" * (chunksize-len(a)))
-                i += 1
-                while (i<len(l)):
-                    a = l[i]
-                    a[:] = ZEROES
-                    i += 1
-
-        # res = enc.encode(l)
-        for thing in l:
-            hasher.update(thing)
-        cb(None, None)
-
-def encode_file_stringy(inf, cb, k, m, chunksize=4096):
-    """
-    Read in the contents of inf, encode, and call cb with the results.
-
-    First, k "input blocks" will be read from inf, each input block being of
-    size chunksize.  Then these k blocks will be encoded into m "result
-    blocks".  Then cb will be invoked, passing a list of the m result blocks
-    as its first argument, and the length of the encoded data as its second
-    argument.  (The length of the encoded data is always equal to k*chunksize,
-    until the last iteration, when the end of the file has been reached and
-    less than k*chunksize bytes could be read from the file.)  This procedure
-    is iterated until the end of the file is reached, in which case the part
-    of the input shares that is unused is filled with zeroes before encoding.
-
-    @param inf the file object from which to read the data
-    @param cb the callback to be invoked with the results
-    @param k the number of shares required to reconstruct the file
-    @param m the total number of shares created
-    @param chunksize how much data to read from inf for each of the k input
-        blocks
-    """
-    enc = zfec.Encoder(k, m)
-    indatasize = k*chunksize # will be reset to shorter upon EOF
-    while indatasize == k*chunksize:
-        # This loop body executes once per segment.
-        i = 0
-        l = []
-        ZEROES = '\x00'*chunksize
-        while i<k:
-            # This loop body executes once per chunk.
-            i += 1
-            l.append(inf.read(chunksize))
-            if len(l[-1]) < chunksize:
-                indatasize = i*chunksize + len(l[-1])
-
-                # padding
-                l[-1] = l[-1] + "\x00" * (chunksize-len(l[-1]))
-                while i<k:
-                    l.append(ZEROES)
-                    i += 1
-
-        res = enc.encode(l)
-        cb(res, indatasize)
-
-def encode_file_stringy_easyfec(inf, cb, k, m, chunksize=4096):
-    """
-    Read in the contents of inf, encode, and call cb with the results.
-
-    First, chunksize*k bytes will be read from inf, then encoded into m
-    "result blocks".  Then cb will be invoked, passing a list of the m result
-    blocks as its first argument, and the length of the encoded data as its
-    second argument.  (The length of the encoded data is always equal to
-    k*chunksize, until the last iteration, when the end of the file has been
-    reached and less than k*chunksize bytes could be read from the file.)
-    This procedure is iterated until the end of the file is reached, in which
-    case the space of the input that is unused is filled with zeroes before
-    encoding.
-
-    @param inf the file object from which to read the data
-    @param cb the callback to be invoked with the results
-    @param k the number of shares required to reconstruct the file
-    @param m the total number of shares created
-    @param chunksize how much data to read from inf for each of the k input
-        blocks
-    """
-    enc = easyfec.Encoder(k, m)
-
-    readsize = k*chunksize
-    indata = inf.read(readsize)
-    while indata:
-        res = enc.encode(indata)
-        cb(res, len(indata))
-        indata = inf.read(readsize)
-
-# zfec -- fast forward error correction library with Python interface
-#
-# Copyright (C) 2007-2010 Allmydata, Inc.
-# Author: Zooko Wilcox-O'Hearn
-#
-# This file is part of zfec.
-#
-# See README.rst for licensing information.