whitespace, docstrings, copyright statements

[tahoe-lafs/zfec.git] / zfec / zfec / filefec.py
diff --git a/zfec/zfec/filefec.py b/zfec/zfec/filefec.py

index efacd12e801db72223c5df6b280355d989b481cd..25b6513a5e84f682653368aac963e1ebbe2b74d5 100644 (file)
--- a/zfec/zfec/filefec.py
+++ b/zfec/zfec/filefec.py
@@ -1,11 +1,22 @@
  import easyfec, zfec
-from util import fileutil
-from util.mathutil import log_ceil
+from pyutil import fileutil
+from pyutil.mathutil import pad_size, log_ceil
  
-import array, os, re, struct, traceback
+import array, os, struct
  
  CHUNKSIZE = 4096
  
+from base64 import b32encode
+def ab(x): # debuggery
+    """
+    Return a short debugging label for x: its length, a colon, and the
+    base-32 encoding of its last three (or fewer) bytes, or "--empty--"
+    in place of the encoding when x is empty.
+    """
+    if not len(x):
+        return "%s:%s" % (len(x), "--empty--",)
+    return "%s:%s" % (len(x), b32encode(x[-3:]),)
+
  class InsufficientShareFilesError(zfec.Error):
      def __init__(self, k, kb, *args, **kwargs):
          zfec.Error.__init__(self, *args, **kwargs)
@@ -23,17 +34,17 @@ class CorruptedShareFilesError(zfec.Error):
  
  def _build_header(m, k, pad, sh):
      """
-    @param m: the total number of shares; 3 <= m <= 256
-    @param k: the number of shares required to reconstruct; 2 <= k < m
+    @param m: the total number of shares; 1 <= m <= 256
+    @param k: the number of shares required to reconstruct; 1 <= k <= m
      @param pad: the number of bytes of padding added to the file before encoding; 0 <= pad < k
      @param sh: the shnum of this share; 0 <= k < m
  
-    @return: a string (which is hopefully short) encoding m, k, sh, and pad
+    @return: a compressed string encoding m, k, pad, and sh
      """
-    assert m >= 3
+    assert m >= 1
      assert m <= 2**8
-    assert k >= 2
-    assert k < m
+    assert k >= 1
+    assert k <= m
      assert pad >= 0
      assert pad < k
  
@@ -43,14 +54,14 @@ def _build_header(m, k, pad, sh):
      bitsused = 0
      val = 0
  
-    val |= (m - 3)
+    val |= (m - 1)
      bitsused += 8 # the first 8 bits always encode m
  
-    kbits = log_ceil(m-2, 2) # num bits needed to store all possible values of k
+    kbits = log_ceil(m, 2) # num bits needed to store all possible values of k
      val <<= kbits
      bitsused += kbits
  
-    val |= (k - 2)
+    val |= (k - 1)
  
      padbits = log_ceil(k, 2) # num bits needed to store all possible values of pad
      val <<= padbits
@@ -64,8 +75,8 @@ def _build_header(m, k, pad, sh):
  
      val |= sh
  
-    assert bitsused >= 11
-    assert bitsused <= 32
+    assert bitsused >= 8, bitsused
+    assert bitsused <= 32, bitsused
  
      if bitsused <= 16:
          val <<= (16-bitsused)
@@ -98,17 +109,17 @@ def _parse_header(inf):
      if not ch:
          raise CorruptedShareFilesError("Share files were corrupted -- share file %r didn't have a complete metadata header at the front.  Perhaps the file was truncated." % (inf.name,))
      byte = ord(ch)
-    m = byte + 3
+    m = byte + 1
  
      # The next few bits encode k.
-    kbits = log_ceil(m-2, 2) # num bits needed to store all possible values of k
+    kbits = log_ceil(m, 2) # num bits needed to store all possible values of k
      b2_bits_left = 8-kbits
      kbitmask = MASK(kbits) << b2_bits_left
      ch = inf.read(1)
      if not ch:
          raise CorruptedShareFilesError("Share files were corrupted -- share file %r didn't have a complete metadata header at the front.  Perhaps the file was truncated." % (inf.name,))
      byte = ord(ch)
-    k = ((byte & kbitmask) >> b2_bits_left) + 2
+    k = ((byte & kbitmask) >> b2_bits_left) + 1
  
      shbits = log_ceil(m, 2) # num bits needed to store all possible values of shnum
      padbits = log_ceil(k, 2) # num bits needed to store all possible values of pad
@@ -122,7 +133,7 @@ def _parse_header(inf):
              raise CorruptedShareFilesError("Share files were corrupted -- share file %r didn't have a complete metadata header at the front.  Perhaps the file was truncated." % (inf.name,))
          byte = struct.unpack(">B", ch)[0]
          val <<= 8
-        val |= byte 
+        val |= byte
          needed_padbits -= 8
      assert needed_padbits <= 0
      extrabits = -needed_padbits
@@ -136,7 +147,7 @@ def _parse_header(inf):
              raise CorruptedShareFilesError("Share files were corrupted -- share file %r didn't have a complete metadata header at the front.  Perhaps the file was truncated." % (inf.name,))
          byte = struct.unpack(">B", ch)[0]
          val <<= 8
-        val |= byte 
+        val |= byte
          needed_shbits -= 8
      assert needed_shbits <= 0
  
@@ -152,7 +163,7 @@ def encode_to_files(inf, fsize, dirname, prefix, k, m, suffix=".fec", overwrite=
      """
      Encode inf, writing the shares to specially named, newly created files.
  
-    @param fsize: calling read() on inf must yield fsize bytes of data and 
+    @param fsize: calling read() on inf must yield fsize bytes of data and
          then raise an EOFError
      @param dirname: the name of the directory into which the sharefiles will
          be written
@@ -160,7 +171,7 @@ def encode_to_files(inf, fsize, dirname, prefix, k, m, suffix=".fec", overwrite=
      mlen = len(str(m))
      format = FORMAT_FORMAT % (mlen, mlen,)
  
-    padbytes = zfec.util.mathutil.pad_size(fsize, k)
+    padbytes = pad_size(fsize, k)
  
      fns = []
      fs = []
@@ -174,7 +185,8 @@ def encode_to_files(inf, fsize, dirname, prefix, k, m, suffix=".fec", overwrite=
              if overwrite:
                  f = open(fn, "wb")
              else:
-                fd = os.open(fn, os.O_WRONLY|os.O_CREAT|os.O_EXCL|os.O_BINARY)
+                flags = os.O_WRONLY|os.O_CREAT|os.O_EXCL | (hasattr(os, 'O_BINARY') and os.O_BINARY)
+                fd = os.open(fn, flags)
                  f = os.fdopen(fd, "wb")
              f.write(hdr)
              fs.append(f)
@@ -187,7 +199,7 @@ def encode_to_files(inf, fsize, dirname, prefix, k, m, suffix=".fec", overwrite=
              if verbose:
                  if int((float(oldsumlen) / fsize) * 10) != int((float(sumlen[0]) / fsize) * 10):
                      print str(int((float(sumlen[0]) / fsize) * 10) * 10) + "% ...",
-            
+
              if sumlen[0] > fsize:
                  raise IOError("Wrong file size -- possibly the size of the file changed during encoding.  Original size: %d, observed size at least: %s" % (fsize, sumlen[0],))
              for i in range(len(blocks)):
@@ -210,14 +222,14 @@ def encode_to_files(inf, fsize, dirname, prefix, k, m, suffix=".fec", overwrite=
              fileutil.remove_if_possible(fn)
          return 1
      if verbose:
-        print 
+        print
          print "Done!"
      return 0
  
  # Note: if you really prefer base-2 and you change this code, then please
-# denote 2^20 as "MiB" instead of "MB" in order to avoid ambiguity.
-# Thanks.
+# denote 2^20 as "MiB" instead of "MB" in order to avoid ambiguity.  See:
  # http://en.wikipedia.org/wiki/Megabyte
+# Thanks.
  MILLION_BYTES=10**6
  
  def decode_from_files(outf, infiles, verbose=False):
@@ -280,29 +292,29 @@ def encode_file(inf, cb, k, m, chunksize=4096):
      """
      Read in the contents of inf, encode, and call cb with the results.
  
-    First, k "input blocks" will be read from inf, each input block being of 
-    size chunksize.  Then these k blocks will be encoded into m "result 
-    blocks".  Then cb will be invoked, passing a list of the m result blocks 
-    as its first argument, and the length of the encoded data as its second 
-    argument.  (The length of the encoded data is always equal to k*chunksize, 
-    until the last iteration, when the end of the file has been reached and 
-    less than k*chunksize bytes could be read from the file.)  This procedure 
-    is iterated until the end of the file is reached, in which case the space 
+    First, k "input blocks" will be read from inf, each input block being of
+    size chunksize.  Then these k blocks will be encoded into m "result
+    blocks".  Then cb will be invoked, passing a list of the m result blocks
+    as its first argument, and the length of the encoded data as its second
+    argument.  (The length of the encoded data is always equal to k*chunksize,
+    until the last iteration, when the end of the file has been reached and
+    less than k*chunksize bytes could be read from the file.)  This procedure
+    is iterated until the end of the file is reached, in which case the space
      of the input blocks that is unused is filled with zeroes before encoding.
  
      Note that the sequence passed in calls to cb() contains mutable array
-    objects in its first k elements whose contents will be overwritten when 
-    the next segment is read from the input file.  Therefore the 
-    implementation of cb() has to either be finished with those first k arrays 
-    before returning, or if it wants to keep the contents of those arrays for 
-    subsequent use after it has returned then it must make a copy of them to 
+    objects in its first k elements whose contents will be overwritten when
+    the next segment is read from the input file.  Therefore the
+    implementation of cb() has to either be finished with those first k arrays
+    before returning, or if it wants to keep the contents of those arrays for
+    subsequent use after it has returned then it must make a copy of them to
      keep.
  
      @param inf the file object from which to read the data
      @param cb the callback to be invoked with the results
      @param k the number of shares required to reconstruct the file
      @param m the total number of shares created
-    @param chunksize how much data to read from inf for each of the k input 
+    @param chunksize how much data to read from inf for each of the k input
          blocks
      """
      enc = zfec.Encoder(k, m)
@@ -323,7 +335,7 @@ def encode_file(inf, cb, k, m, chunksize=4096):
              except EOFError:
                  eof = True
                  indatasize = i*chunksize + len(a)
-                
+
                  # padding
                  a.fromstring("\x00" * (chunksize-len(a)))
                  i += 1
@@ -335,25 +347,98 @@ def encode_file(inf, cb, k, m, chunksize=4096):
          res = enc.encode(l)
          cb(res, indatasize)
  
+try:
+    from hashlib import sha1
+    sha1 = sha1 # hush pyflakes
+except ImportError:
+    # hashlib was added in Python 2.5.0.
+    import sha
+    sha1 = sha
+
+def encode_file_not_really(inf, cb, k, m, chunksize=4096):
+    """
+    Read inf in segments of k zero-padded chunks as encode_file() does, but
+    skip the encoding step and just call cb(None, None) once per segment.
+
+    @param inf the file object from which to read the data
+    @param cb the callback, invoked as cb(None, None) after each segment
+    @param k the number of chunks read per segment
+    @param m the total number of shares, used only to construct the Encoder
+    @param chunksize how much data to read from inf for each of the k chunks
+    """
+    enc = zfec.Encoder(k, m)
+    bufs = tuple([ array.array('c') for _ in range(k) ])
+    zeroes = array.array('c', ['\x00'])*chunksize
+    done = False
+    while not done:
+        # One pass of this loop handles one segment of k chunks.
+        for buf in bufs:
+            if done:
+                # Input already ran out in this segment: all-zero chunk.
+                buf[:] = zeroes
+                continue
+            del buf[:]
+            try:
+                buf.fromfile(inf, chunksize)
+            except EOFError:
+                done = True
+                # Zero-pad whatever partial chunk was read.
+                buf.fromstring("\x00" * (chunksize-len(buf)))
+        cb(None, None)
+
+def encode_file_not_really_and_hash(inf, cb, k, m, chunksize=4096):
+    """
+    Read inf in segments of k zero-padded chunks as encode_file() does, but
+    SHA-1 hash the chunks instead of encoding; cb(None, None) runs per segment.
+    """
+    # sha1 may be hashlib.sha1 (callable) or the old sha module (has new()).
+    hasher = getattr(sha1, 'new', sha1)()
+    enc = zfec.Encoder(k, m)
+    l = tuple([ array.array('c') for i in range(k) ])
+    eof = False
+    ZEROES=array.array('c', ['\x00'])*chunksize
+    while not eof:
+        # This loop body executes once per segment.
+        i = 0
+        while (i<len(l)):
+            # This loop body executes once per chunk.
+            a = l[i]
+            del a[:]
+            try:
+                a.fromfile(inf, chunksize)
+                i += 1
+            except EOFError:
+                eof = True
+                # padding
+                a.fromstring("\x00" * (chunksize-len(a)))
+                i += 1
+                while (i<len(l)):
+                    a = l[i]
+                    a[:] = ZEROES
+                    i += 1
+        for thing in l:
+            hasher.update(thing)
+        cb(None, None)
+
  def encode_file_stringy(inf, cb, k, m, chunksize=4096):
      """
      Read in the contents of inf, encode, and call cb with the results.
  
-    First, k "input blocks" will be read from inf, each input block being of 
-    size chunksize.  Then these k blocks will be encoded into m "result 
-    blocks".  Then cb will be invoked, passing a list of the m result blocks 
-    as its first argument, and the length of the encoded data as its second 
-    argument.  (The length of the encoded data is always equal to k*chunksize, 
-    until the last iteration, when the end of the file has been reached and 
-    less than k*chunksize bytes could be read from the file.)  This procedure 
-    is iterated until the end of the file is reached, in which case the part 
+    First, k "input blocks" will be read from inf, each input block being of
+    size chunksize.  Then these k blocks will be encoded into m "result
+    blocks".  Then cb will be invoked, passing a list of the m result blocks
+    as its first argument, and the length of the encoded data as its second
+    argument.  (The length of the encoded data is always equal to k*chunksize,
+    until the last iteration, when the end of the file has been reached and
+    less than k*chunksize bytes could be read from the file.)  This procedure
+    is iterated until the end of the file is reached, in which case the part
      of the input shares that is unused is filled with zeroes before encoding.
  
      @param inf the file object from which to read the data
      @param cb the callback to be invoked with the results
      @param k the number of shares required to reconstruct the file
      @param m the total number of shares created
-    @param chunksize how much data to read from inf for each of the k input 
+    @param chunksize how much data to read from inf for each of the k input
          blocks
      """
      enc = zfec.Encoder(k, m)
@@ -369,7 +454,7 @@ def encode_file_stringy(inf, cb, k, m, chunksize=4096):
              l.append(inf.read(chunksize))
              if len(l[-1]) < chunksize:
                  indatasize = i*chunksize + len(l[-1])
-                
+
                  # padding
                  l[-1] = l[-1] + "\x00" * (chunksize-len(l[-1]))
                  while i<k:
@@ -397,7 +482,7 @@ def encode_file_stringy_easyfec(inf, cb, k, m, chunksize=4096):
      @param cb the callback to be invoked with the results
      @param k the number of shares required to reconstruct the file
      @param m the total number of shares created
-    @param chunksize how much data to read from inf for each of the k input 
+    @param chunksize how much data to read from inf for each of the k input
          blocks
      """
      enc = easyfec.Encoder(k, m)
@@ -410,27 +495,10 @@ def encode_file_stringy_easyfec(inf, cb, k, m, chunksize=4096):
          indata = inf.read(readsize)
  
  # zfec -- fast forward error correction library with Python interface
-# 
-# Copyright (C) 2007 Allmydata, Inc.
+#
+# Copyright (C) 2007-2010 Allmydata, Inc.
  # Author: Zooko Wilcox-O'Hearn
-# mailto:zooko@zooko.com
-# 
+#
  # This file is part of zfec.
-# 
-# This program is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by the Free
-# Software Foundation; either version 2 of the License, or (at your option)
-# any later version.  This program also comes with the added permission that,
-# in the case that you are obligated to release a derived work under this
-# licence (as per section 2.b of the GPL), you may delay the fulfillment of
-# this obligation for up to 12 months.
-# 
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-# 
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
-
+#
+# See README.txt for licensing information.