From 79c439d026379a582037d0449c22f82eba70d554 Mon Sep 17 00:00:00 2001
From: Zooko O'Whielacronx <zooko@zooko.com>
Date: Thu, 31 Jan 2008 16:26:28 -0700
Subject: [PATCH] storage: make two levels of share directories so as not to
 exceed certain filesystems' limitations on directory size. The filesystem
 which gets my vote for most undeservedly popular is ext3, and it has a hard
 limit of 32,000 entries in a directory.  Many other filesystems (even ones
 that I like more than I like ext3) have either hard limits or bad performance
 consequences or weird edge cases when you get too many entries in a single
 directory.

This patch makes it so that there is a layer of intermediate directories between the "shares" directory and the actual storage-index directory (the one whose name contains the entire storage index (z-base-32 encoded) and which contains one or more share files named by their share number).

The intermediate directories are named by the first 14 bits of the storage index, which means there are at most 16384 of them.  (This also means that the intermediate directory names are not a leading prefix of the storage-index directory names -- to do that would have required us to have intermediate directories limited to either 1024 (2-char), which is too few, or 32768 (3-chars of a full 5 bits each), which would overrun ext3's funny hard limit of 32,000.)

This closes #150, and please see the "convertshares.py" script attached to #150 to convert your old tahoe-0.7.0 storage/shares directory into a new tahoe-0.8.0 storage/shares directory.
---
 src/allmydata/storage.py           | 53 +++++++++++++++++-------------
 src/allmydata/test/test_storage.py | 45 +++++++++++++++++++------
 src/allmydata/test/test_system.py  | 16 ++++-----
 3 files changed, 74 insertions(+), 40 deletions(-)

diff --git a/src/allmydata/storage.py b/src/allmydata/storage.py
index b0be6b1b..a89f05c5 100644
--- a/src/allmydata/storage.py
+++ b/src/allmydata/storage.py
@@ -17,10 +17,14 @@ class DataTooLargeError(Exception):
 
 # storage/
 # storage/shares/incoming
-#   incoming/ holds temp dirs named $STORAGEINDEX/$SHARENUM which will be
-#   moved to storage/shares/$STORAGEINDEX/$SHARENUM upon success
-# storage/shares/$STORAGEINDEX
-# storage/shares/$STORAGEINDEX/$SHARENUM
+#   incoming/ holds temp dirs named $START/$STORAGEINDEX/$SHARENUM which will
+#   be moved to storage/shares/$START/$STORAGEINDEX/$SHARENUM upon success
+# storage/shares/$START/$STORAGEINDEX
+# storage/shares/$START/$STORAGEINDEX/$SHARENUM
+
+# Where "$START" denotes the first 14 bits worth of $STORAGEINDEX (that's 3
+# base-32 chars, but the last one has only 4 bits in it -- i.e. only 16 possible
+# chars in the last position).
 
 # $SHARENUM matches this regex:
 NUM_RE=re.compile("^[0-9]+$")
@@ -42,6 +46,9 @@ NUM_RE=re.compile("^[0-9]+$")
 #   B+0x44: expiration time, 4 bytes big-endian seconds-since-epoch
 #   B+0x48: next lease, or end of record
 
+def storage_index_to_dir(storageindex):
+    return os.path.join(idlib.b2a_l(storageindex[:2], 14), idlib.b2a(storageindex))
+
 class ShareFile:
     LEASE_SIZE = struct.calcsize(">L32s32sL")
 
@@ -174,12 +181,13 @@ class BucketWriter(Referenceable):
         self.closed = False
         self.throw_out_all_data = False
         # touch the file, so later callers will see that we're working on it.
+        assert not os.path.exists(incominghome)
+        fileutil.make_dirs(os.path.dirname(incominghome))
         # Also construct the metadata.
-        assert not os.path.exists(self.incominghome)
-        f = open(self.incominghome, 'wb')
+        f = open(incominghome, 'wb')
         f.write(struct.pack(">LLL", 1, size, 0))
         f.close()
-        self._sharefile = ShareFile(self.incominghome)
+        self._sharefile = ShareFile(incominghome)
         # also, add our lease to the file now, so that other ones can be
         # added by simultaneous uploaders
         self._sharefile.add_lease(lease_info)
@@ -195,7 +203,15 @@ class BucketWriter(Referenceable):
 
     def remote_close(self):
         precondition(not self.closed)
+
+        fileutil.make_dirs(os.path.dirname(self.finalhome))
         fileutil.rename(self.incominghome, self.finalhome)
+        try:
+            os.rmdir(os.path.dirname(self.incominghome))
+            os.rmdir(os.path.dirname(os.path.dirname(self.incominghome)))
+            os.rmdir(os.path.dirname(os.path.dirname(os.path.dirname(self.incominghome))))
+        except EnvironmentError:
+            pass
         self._sharefile = None
         self.closed = True
         self._canary.dontNotifyOnDisconnect(self._disconnect_marker)
@@ -203,12 +219,6 @@ class BucketWriter(Referenceable):
         filelen = os.stat(self.finalhome)[stat.ST_SIZE]
         self.ss.bucket_writer_closed(self, filelen)
 
-        # if we were the last share to be moved, remove the incoming/
-        # directory that was our parent
-        parentdir = os.path.split(self.incominghome)[0]
-        if not os.listdir(parentdir):
-            os.rmdir(parentdir)
-
     def _disconnected(self):
         if not self.closed:
             self._abort()
@@ -235,8 +245,8 @@ class BucketWriter(Referenceable):
 class BucketReader(Referenceable):
     implements(RIBucketReader)
 
-    def __init__(self, home):
-        self._share_file = ShareFile(home)
+    def __init__(self, sharefname):
+        self._share_file = ShareFile(sharefname)
 
     def remote_read(self, offset, length):
         return self._share_file.read_share_data(offset, length)
@@ -719,7 +729,7 @@ class StorageServer(service.MultiService, Referenceable):
         # to a particular owner.
         alreadygot = set()
         bucketwriters = {} # k: shnum, v: BucketWriter
-        si_s = idlib.b2a(storage_index)
+        si_s = storage_index_to_dir(storage_index)
 
         # in this implementation, the lease information (including secrets)
         # goes into the share files themselves. It could also be put into a
@@ -752,7 +762,6 @@ class StorageServer(service.MultiService, Referenceable):
                 pass
             elif no_limits or remaining_space >= space_per_bucket:
                 # ok! we need to create the new share file.
-                fileutil.make_dirs(os.path.join(self.incomingdir, si_s))
                 bw = BucketWriter(self, incominghome, finalhome,
                                   space_per_bucket, lease_info, canary)
                 if self.no_storage:
@@ -792,7 +801,7 @@ class StorageServer(service.MultiService, Referenceable):
             raise IndexError("no such lease to renew")
 
     def remote_cancel_lease(self, storage_index, cancel_secret):
-        storagedir = os.path.join(self.sharedir, idlib.b2a(storage_index))
+        storagedir = os.path.join(self.sharedir, storage_index_to_dir(storage_index))
 
         remaining_files = 0
         total_space_freed = 0
@@ -844,7 +853,7 @@ class StorageServer(service.MultiService, Referenceable):
         """Return a list of (shnum, pathname) tuples for files that hold
         shares for this storage_index. In each tuple, 'shnum' will always be
         the integer form of the last component of 'pathname'."""
-        storagedir = os.path.join(self.sharedir, idlib.b2a(storage_index))
+        storagedir = os.path.join(self.sharedir, storage_index_to_dir(storage_index))
         try:
             for f in os.listdir(storagedir):
                 if NUM_RE.match(f):
@@ -855,7 +864,7 @@ class StorageServer(service.MultiService, Referenceable):
             pass
 
     def _get_incoming_shares(self, storage_index):
-        incomingdir = os.path.join(self.incomingdir, idlib.b2a(storage_index))
+        incomingdir = os.path.join(self.incomingdir, storage_index_to_dir(storage_index))
         try:
             for f in os.listdir(incomingdir):
                 if NUM_RE.match(f):
@@ -891,7 +900,7 @@ class StorageServer(service.MultiService, Referenceable):
                                                secrets,
                                                test_and_write_vectors,
                                                read_vector):
-        si_s = idlib.b2a(storage_index)
+        si_s = storage_index_to_dir(storage_index)
         (write_enabler, renew_secret, cancel_secret) = secrets
         # shares exist if there is a file for them
         bucketdir = os.path.join(self.sharedir, si_s)
@@ -968,7 +977,7 @@ class StorageServer(service.MultiService, Referenceable):
         return share
 
     def remote_slot_readv(self, storage_index, shares, readv):
-        si_s = idlib.b2a(storage_index)
+        si_s = storage_index_to_dir(storage_index)
         # shares exist if there is a file for them
         bucketdir = os.path.join(self.sharedir, si_s)
         if not os.path.isdir(bucketdir):
diff --git a/src/allmydata/test/test_storage.py b/src/allmydata/test/test_storage.py
index 5d09e14d..550891e4 100644
--- a/src/allmydata/test/test_storage.py
+++ b/src/allmydata/test/test_storage.py
@@ -7,7 +7,8 @@ import itertools
 from allmydata import interfaces
 from allmydata.util import fileutil, hashutil, idlib
 from allmydata.storage import BucketWriter, BucketReader, \
-     WriteBucketProxy, ReadBucketProxy, StorageServer, MutableShareFile
+     WriteBucketProxy, ReadBucketProxy, StorageServer, MutableShareFile, \
+     storage_index_to_dir
 from allmydata.interfaces import BadWriteEnablerError
 from allmydata.test.common import LoggingServiceParent
 
@@ -56,7 +57,7 @@ class Bucket(unittest.TestCase):
         bw.remote_close()
 
         # now read from it
-        br = BucketReader(final)
+        br = BucketReader(bw.finalhome)
         self.failUnlessEqual(br.remote_read(0, 25), "a"*25)
         self.failUnlessEqual(br.remote_read(25, 25), "b"*25)
         self.failUnlessEqual(br.remote_read(50, 7), "c"*7)
@@ -93,7 +94,7 @@ class BucketProxy(unittest.TestCase):
         pass
 
     def test_create(self):
-        bw, rb, final = self.make_bucket("test_create", 500)
+        bw, rb, sharefname = self.make_bucket("test_create", 500)
         bp = WriteBucketProxy(rb,
                               data_size=300,
                               segment_size=10,
@@ -123,7 +124,7 @@ class BucketProxy(unittest.TestCase):
                         for i in (1,9,13)]
         uri_extension = "s" + "E"*498 + "e"
 
-        bw, rb, final = self.make_bucket("test_readwrite", 1414)
+        bw, rb, sharefname = self.make_bucket("test_readwrite", 1414)
         bp = WriteBucketProxy(rb,
                               data_size=95,
                               segment_size=25,
@@ -146,7 +147,7 @@ class BucketProxy(unittest.TestCase):
 
         # now read everything back
         def _start_reading(res):
-            br = BucketReader(final)
+            br = BucketReader(sharefname)
             rb = RemoteBucket()
             rb.target = br
             rbp = ReadBucketProxy(rb)
@@ -212,16 +213,40 @@ class Server(unittest.TestCase):
                                           renew_secret, cancel_secret,
                                           sharenums, size, FakeCanary())
 
+    def test_dont_overfill_dirs(self):
+        """
+        This test asserts that if you add a second share whose storage index
+        share lots of leading bits with an extant share (but isn't the exact
+        same storage index), this won't add an entry to the share directory.
+        """
+        ss = self.create("test_dont_overfill_dirs")
+        already, writers = self.allocate(ss, "storageindex", [0], 10)
+        for i, wb in writers.items():
+            wb.remote_write(0, "%10d" % i)
+            wb.remote_close()
+        storedir = os.path.join(self.workdir("test_dont_overfill_dirs"),
+                                "shares")
+        children_of_storedir = set(os.listdir(storedir))
+
+        # Now store another one under another storageindex that has leading
+        # chars the same as the first storageindex.
+        already, writers = self.allocate(ss, "storageindey", [0], 10)
+        for i, wb in writers.items():
+            wb.remote_write(0, "%10d" % i)
+            wb.remote_close()
+        storedir = os.path.join(self.workdir("test_dont_overfill_dirs"),
+                                "shares")
+        new_children_of_storedir = set(os.listdir(storedir))
+        self.failUnlessEqual(children_of_storedir, new_children_of_storedir)
+
     def test_remove_incoming(self):
         ss = self.create("test_remove_incoming")
         already, writers = self.allocate(ss, "vid", range(3), 10)
         for i,wb in writers.items():
             wb.remote_write(0, "%10d" % i)
             wb.remote_close()
-        incomingdir = os.path.join(self.workdir("test_remove_incoming"),
-                                   "shares", "incoming")
-        leftover_dirs = os.listdir(incomingdir)
-        self.failUnlessEqual(leftover_dirs, [])
+        incomingdir = os.path.dirname(os.path.dirname(os.path.dirname(wb.incominghome)))
+        self.failIf(os.path.exists(incomingdir))
 
     def test_allocate(self):
         ss = self.create("test_allocate")
@@ -785,7 +810,7 @@ class MutableServer(unittest.TestCase):
         # create a random non-numeric file in the bucket directory, to
         # exercise the code that's supposed to ignore those.
         bucket_dir = os.path.join(self.workdir("test_leases"),
-                                  "shares", idlib.b2a("si1"))
+                                  "shares", storage_index_to_dir("si1"))
         f = open(os.path.join(bucket_dir, "ignore_me.txt"), "w")
         f.write("you ought to be ignoring me\n")
         f.close()
diff --git a/src/allmydata/test/test_system.py b/src/allmydata/test/test_system.py
index 75de3da6..8bcb7069 100644
--- a/src/allmydata/test/test_system.py
+++ b/src/allmydata/test/test_system.py
@@ -372,7 +372,7 @@ class SystemTest(testutil.SignalMixin, unittest.TestCase):
                 for i in range(self.numclients):
                     incdir = os.path.join(self.getdir("client%d" % i),
                                           "storage", "shares", "incoming")
-                    self.failUnlessEqual(os.listdir(incdir), [])
+                    self.failIf(os.path.exists(incdir) and os.listdir(incdir))
             d.addCallback(_disconnected)
 
             def _wait_for_reconnect(res):
@@ -442,11 +442,11 @@ class SystemTest(testutil.SignalMixin, unittest.TestCase):
             if not filenames:
                 continue
             pieces = dirpath.split(os.sep)
-            if pieces[-3] == "storage" and pieces[-2] == "shares":
-                # we're sitting in .../storage/shares/$SINDEX , and there
+            if pieces[-4] == "storage" and pieces[-3] == "shares":
+                # we're sitting in .../storage/shares/$START/$SINDEX , and there
                 # are sharefiles here
-                assert pieces[-4].startswith("client")
-                client_num = int(pieces[-4][-1])
+                assert pieces[-5].startswith("client")
+                client_num = int(pieces[-5][-1])
                 storage_index_s = pieces[-1]
                 storage_index = idlib.a2b(storage_index_s)
                 for sharename in filenames:
@@ -1115,9 +1115,9 @@ class SystemTest(testutil.SignalMixin, unittest.TestCase):
             if not filenames:
                 continue
             pieces = dirpath.split(os.sep)
-            if pieces[-3] == "storage" and pieces[-2] == "shares":
-                # we're sitting in .../storage/shares/$SINDEX , and there are
-                # sharefiles here
+            if pieces[-4] == "storage" and pieces[-3] == "shares":
+                # we're sitting in .../storage/shares/$START/$SINDEX , and there
+                # are sharefiles here
                 filename = os.path.join(dirpath, filenames[0])
                 # peek at the magic to see if it is a chk share
                 magic = open(filename, "rb").read(4)
-- 
2.45.2