From 6c68dd897ab354bb141102dd15acc9fde21b3372 Mon Sep 17 00:00:00 2001
From: zooko <zooko@zooko.com>
Date: Mon, 12 Nov 2007 20:28:19 +0530
Subject: [PATCH] zfec: reorder the inner loop to be more cache-friendly

Make the stride loop the outermost loop: for each stride, compute that stride
of every output block (reading the same stride of each input block) before
moving on to the next stride.  In theory, this should allow the strides of the
input blocks to remain in cache while we produce all of the output blocks.

darcs-hash:8f0ac74d2150507519463d2d711607f467f18ea6
---
 zfec/zfec/fec.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/zfec/zfec/fec.c b/zfec/zfec/fec.c
index 2054697..84f4645 100644
--- a/zfec/zfec/fec.c
+++ b/zfec/zfec/fec.c
@@ -482,15 +482,16 @@ fec_encode(const fec_t* code, const gf*restrict const*restrict const src, gf*res
     unsigned fecnum;
     const gf* p;
 
-    for (i=0; i<num_block_nums; i++) {
-        fecnum=block_nums[i];
-        assert (fecnum >= code->k);
-        memset(fecs[i], 0, sz);
-        p = &(code->enc_matrix[fecnum * code->k]);
-// DUFF ME
-        for (k = 0; k < sz; k += STRIDE)
+    for (k = 0; k < sz; k += STRIDE) {
+        size_t stride = ((sz-k) < STRIDE)?(sz-k):STRIDE;
+        for (i=0; i<num_block_nums; i++) {
+            fecnum=block_nums[i];
+            assert (fecnum >= code->k);
+            memset(fecs[i]+k, 0, stride);
+            p = &(code->enc_matrix[fecnum * code->k]);
             for (j = 0; j < code->k; j++)
-                addmul(fecs[i]+k, src[j]+k, p[j], ((sz-k) < STRIDE)?(sz-k):STRIDE);
+                addmul(fecs[i]+k, src[j]+k, p[j], stride);
+        }
     }
 }
 
-- 
2.45.2