From 9c5ab89afe75963f6a50e995131930932491bc04 Mon Sep 17 00:00:00 2001
From: Brian Warner <warner@lothar.com>
Date: Sun, 22 Jul 2007 19:48:44 -0700
Subject: [PATCH] truncate storage index to 128 bits, since it's derived from a
 128 bit AES key

---
 docs/uri.txt                      | 5 +++--
 src/allmydata/interfaces.py       | 2 +-
 src/allmydata/test/test_upload.py | 2 +-
 src/allmydata/upload.py           | 6 +++---
 src/allmydata/uri.py              | 2 +-
 src/allmydata/util/hashutil.py    | 4 +++-
 6 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/docs/uri.txt b/docs/uri.txt
index e5f54723..e83c873f 100644
--- a/docs/uri.txt
+++ b/docs/uri.txt
@@ -48,8 +48,9 @@ property), and encrypted using a "read key". A secure hash of the data is
 computed to help validate the data afterwards (providing the "identification"
 property). All of these pieces, plus information about the file's size and
 the number of shares into which it has been distributed, are put into the
-"CHK" uri. The storage index is derived by hashing the read key, so it does
-not need to be physically present in the URI.
+"CHK" uri. The storage index is derived by hashing the read key (using a
+tagged SHA-256 hash whose output is truncated to 128 bits), so it does not
+need to be physically present in the URI.
 
 The current format for CHK URIs is the concatenation of the following
 strings:
diff --git a/src/allmydata/interfaces.py b/src/allmydata/interfaces.py
index 84e8de2e..cffc7649 100644
--- a/src/allmydata/interfaces.py
+++ b/src/allmydata/interfaces.py
@@ -11,7 +11,7 @@ Hash = StringConstraint(maxLength=HASH_SIZE,
 Nodeid = StringConstraint(maxLength=20,
                           minLength=20) # binary format 20-byte SHA1 hash
 FURL = StringConstraint(1000)
-StorageIndex = StringConstraint(32)
+StorageIndex = StringConstraint(16)
 URI = StringConstraint(300) # kind of arbitrary
 MAX_BUCKETS = 200  # per peer
 ShareData = StringConstraint(400000) # 1MB segment / k=3 = 334kB
diff --git a/src/allmydata/test/test_upload.py b/src/allmydata/test/test_upload.py
index 3b335d1f..15148085 100644
--- a/src/allmydata/test/test_upload.py
+++ b/src/allmydata/test/test_upload.py
@@ -163,7 +163,7 @@ class GoodServer(unittest.TestCase):
         u = IFileURI(newuri)
         self.failUnless(isinstance(u, uri.CHKFileURI))
         self.failUnless(isinstance(u.storage_index, str))
-        self.failUnlessEqual(len(u.storage_index), 32)
+        self.failUnlessEqual(len(u.storage_index), 16)
         self.failUnless(isinstance(u.key, str))
         self.failUnlessEqual(len(u.key), 16)
         self.failUnlessEqual(u.size, size)
diff --git a/src/allmydata/upload.py b/src/allmydata/upload.py
index 3a18f3f6..7f9ee0ab 100644
--- a/src/allmydata/upload.py
+++ b/src/allmydata/upload.py
@@ -288,9 +288,9 @@ class CHKUploader:
         self._encoder.set_encryption_key(key)
         storage_index = hashutil.storage_index_chk_hash(key)
         assert isinstance(storage_index, str)
-        # TODO: is there any point to having the SI be longer than the key?
-        # There's certainly no extra entropy to be had..
-        assert len(storage_index) == 32  # SHA-256
+        # There's no point in having the SI be longer than the key, so we
+        # truncate it to 128 bits, the same length as the AES key.
+        assert len(storage_index) == 16  # SHA-256 truncated to 128b
         self._storage_index = storage_index
         log.msg(" upload storage_index is [%s]" % (idlib.b2a(storage_index,)))
 
diff --git a/src/allmydata/uri.py b/src/allmydata/uri.py
index 47e617d1..72aece89 100644
--- a/src/allmydata/uri.py
+++ b/src/allmydata/uri.py
@@ -53,7 +53,7 @@ class CHKFileURI(_BaseURI):
 
         self.storage_index = hashutil.storage_index_chk_hash(self.key)
         assert isinstance(self.storage_index, str)
-        assert len(self.storage_index) == 32 # sha256 hash
+        assert len(self.storage_index) == 16 # sha256 hash truncated to 128
 
         self.uri_extension_hash = idlib.a2b(uri_extension_hash_s)
         assert isinstance(self.uri_extension_hash, str)
diff --git a/src/allmydata/util/hashutil.py b/src/allmydata/util/hashutil.py
index 62042c4b..25ac2424 100644
--- a/src/allmydata/util/hashutil.py
+++ b/src/allmydata/util/hashutil.py
@@ -23,7 +23,9 @@ def tagged_hasher(tag):
     return SHA256.new(netstring(tag))
 
 def storage_index_chk_hash(data):
-    return tagged_hash("allmydata_CHK_storage_index_v1", data)
+    # storage index is truncated to 128 bits (16 bytes). We're only hashing a
+    # 16-byte value to get it, so there's no point in using a larger value.
+    return tagged_hash("allmydata_CHK_storage_index_v1", data)[:16]
 
 def block_hash(data):
     return tagged_hash("allmydata_encoded_subshare_v1", data)
-- 
2.45.2