# Provenance (from the commit that introduced this file):
#   From: Daira Hopwood
#   Date: Sun, 6 Dec 2015 04:36:30 +0000 (+0000)
#   Subject: Add implementation of NTFS uppercase mapping table for Windows Vista.
#   Signed-off-by: Daira Hopwood
#   X-Git-Url: https://git.rkrishnan.org/simplejson/components/%22file:/something?a=commitdiff_plain;h=e0c8210216a0542a2584a2a5f6a8277fe9e7746c;p=tahoe-lafs%2Ftahoe-lafs.git
#   Path: src/allmydata/windows/ntfs_casemap.py
"""NTFS uppercase mapping (Windows Vista table).

The mapping is described by three compact tables taken from NTFS-3G, which
are expanded at import time into a two-level lookup structure
(``offset_trie``).  ``uppercase(s)`` applies the mapping to every character
of ``s``.
"""

try:
    # Not referenced in this module; the import is kept so that nothing
    # relying on it breaks, and guarded so the module also loads standalone.
    from allmydata.util.assertutil import _assert
except ImportError:
    _assert = None

try:  # Python 2
    _unichr = unichr
except NameError:  # Python 3: chr() already covers the whole Unicode range
    _unichr = chr

# These are taken directly from the NTFS-3G table for Windows Vista,
# with trivial syntax changes.
#

# (start, end, offset) => for c in range(start, end): c maps to c+offset
uc_run_table = (
    (0x0061, 0x007b, -32), (0x00e0, 0x00f7, -32), (0x00f8, 0x00ff, -32),
    (0x0256, 0x0258, -205), (0x028a, 0x028c, -217), (0x037b, 0x037e, 130),
    (0x03ac, 0x03ad, -38), (0x03ad, 0x03b0, -37), (0x03b1, 0x03c2, -32),
    (0x03c2, 0x03c3, -31), (0x03c3, 0x03cc, -32), (0x03cc, 0x03cd, -64),
    (0x03cd, 0x03cf, -63), (0x0430, 0x0450, -32), (0x0450, 0x0460, -80),
    (0x0561, 0x0587, -48), (0x1f00, 0x1f08, 8), (0x1f10, 0x1f16, 8),
    (0x1f20, 0x1f28, 8), (0x1f30, 0x1f38, 8), (0x1f40, 0x1f46, 8),
    (0x1f51, 0x1f52, 8), (0x1f53, 0x1f54, 8), (0x1f55, 0x1f56, 8),
    (0x1f57, 0x1f58, 8), (0x1f60, 0x1f68, 8), (0x1f70, 0x1f72, 74),
    (0x1f72, 0x1f76, 86), (0x1f76, 0x1f78, 100), (0x1f78, 0x1f7a, 128),
    (0x1f7a, 0x1f7c, 112), (0x1f7c, 0x1f7e, 126), (0x1f80, 0x1f88, 8),
    (0x1f90, 0x1f98, 8), (0x1fa0, 0x1fa8, 8), (0x1fb0, 0x1fb2, 8),
    (0x1fb3, 0x1fb4, 9), (0x1fcc, 0x1fcd, -9), (0x1fd0, 0x1fd2, 8),
    (0x1fe0, 0x1fe2, 8), (0x1fe5, 0x1fe6, 7), (0x1ffc, 0x1ffd, -9),
    (0x2170, 0x2180, -16), (0x24d0, 0x24ea, -26), (0x2c30, 0x2c5f, -48),
    (0x2d00, 0x2d26, -7264), (0xff41, 0xff5b, -32),
)

# (start, end) => for c in range(start+1, end+1, 2): c maps to c-1
#
# The adjusted code points are start+1, start+3, ... up to and INCLUDING end
# when (end - start) is odd.  Treating `end` as an exclusive bound for c would
# turn single-pair entries such as (0x01f4, 0x01f5) into no-ops and drop the
# last code point of every odd-length entry.
uc_dup_table = (
    (0x0100, 0x012f), (0x0132, 0x0137), (0x0139, 0x0149), (0x014a, 0x0178),
    (0x0179, 0x017e), (0x01a0, 0x01a6), (0x01b3, 0x01b7), (0x01cd, 0x01dd),
    (0x01de, 0x01ef), (0x01f4, 0x01f5), (0x01f8, 0x01f9), (0x01fa, 0x0220),
    (0x0222, 0x0234), (0x023b, 0x023c), (0x0241, 0x0242), (0x0246, 0x024f),
    (0x03d8, 0x03ef), (0x03f7, 0x03f8), (0x03fa, 0x03fb), (0x0460, 0x0481),
    (0x048a, 0x04bf), (0x04c1, 0x04c4), (0x04c5, 0x04c8), (0x04c9, 0x04ce),
    (0x04ec, 0x04ed), (0x04d0, 0x04eb), (0x04ee, 0x04f5), (0x04f6, 0x0513),
    (0x1e00, 0x1e95), (0x1ea0, 0x1ef9), (0x2183, 0x2184), (0x2c60, 0x2c61),
    (0x2c67, 0x2c6c), (0x2c75, 0x2c76), (0x2c80, 0x2ce3),
)

# (c, v) => c maps to v
# This is uc_byte_table in NTFS-3G, but that's a poor name.
uc_singleton_table = (
    (0x00ff, 0x0178), (0x0180, 0x0243), (0x0183, 0x0182), (0x0185, 0x0184),
    (0x0188, 0x0187), (0x018c, 0x018b), (0x0192, 0x0191), (0x0195, 0x01f6),
    (0x0199, 0x0198), (0x019a, 0x023d), (0x019e, 0x0220), (0x01a8, 0x01a7),
    (0x01ad, 0x01ac), (0x01b0, 0x01af), (0x01b9, 0x01b8), (0x01bd, 0x01bc),
    (0x01bf, 0x01f7), (0x01c6, 0x01c4), (0x01c9, 0x01c7), (0x01cc, 0x01ca),
    (0x01dd, 0x018e), (0x01f3, 0x01f1), (0x023a, 0x2c65), (0x023e, 0x2c66),
    (0x0253, 0x0181), (0x0254, 0x0186), (0x0259, 0x018f), (0x025b, 0x0190),
    (0x0260, 0x0193), (0x0263, 0x0194), (0x0268, 0x0197), (0x0269, 0x0196),
    (0x026b, 0x2c62), (0x026f, 0x019c), (0x0272, 0x019d), (0x0275, 0x019f),
    (0x027d, 0x2c64), (0x0280, 0x01a6), (0x0283, 0x01a9), (0x0288, 0x01ae),
    (0x0289, 0x0244), (0x028c, 0x0245), (0x0292, 0x01b7), (0x03f2, 0x03f9),
    (0x04cf, 0x04c0), (0x1d7d, 0x2c63), (0x214e, 0x2132),
)

# Let's simplify by converting these to a single table.
# (start, end, step, offset) => for c in range(start, end, step): c maps to c+offset
uc_full_table = (
    [(start, end, 1, offset) for (start, end, offset) in uc_run_table] +
    # end+1: the last adjusted code point of a dup entry may be `end` itself.
    [(start + 1, end + 1, 2, -1) for (start, end) in uc_dup_table] +
    [(c, c + 1, 1, v - c) for (c, v) in uc_singleton_table]
)

# Now we create an offset_trie such that
# c maps to c + offset_trie[c >> TABLE_BITS][c & TABLE_MASK].
# This is memory-efficient because almost all of the subtables
# are represented by an all-zero array, which can be shared.

TABLE_BITS = 9
TABLE_SIZE = 1 << TABLE_BITS
TABLE_MASK = TABLE_SIZE - 1

# Shared all-zero subtable ("identity mapping"); it must never be mutated.
idmap = [0] * TABLE_SIZE


def _build_offset_trie(full_table):
    """Expand (start, end, step, offset) entries into the two-level table.

    Returns a list with one subtable per block of TABLE_SIZE code points,
    covering 0 .. 0x10FFFF.  Blocks without any mapping all reference the
    shared ``idmap`` list.  If entries overlap, the later entry wins.
    """
    # Floor division: plain `/` yields a float on Python 3.
    trie = [idmap] * (0x110000 // TABLE_SIZE)

    for (start, end, step, offset) in full_table:
        for c in range(start, end, step):
            high = c >> TABLE_BITS
            lowmap = trie[high]
            if lowmap is idmap:
                # First mapping in this block: clone so that the shared
                # all-zero subtable is not modified through the alias.
                lowmap = idmap[:]
                trie[high] = lowmap

            lowmap[c & TABLE_MASK] = offset

    return trie


offset_trie = _build_offset_trie(uc_full_table)


def uppercase(s):
    """Return ``s`` with every character replaced by its NTFS uppercase form.

    ``s`` is any iterable of single characters accepted by ``ord()``.
    Characters without an entry in the tables are returned unchanged.  The
    result is always a text (unicode) string, including for empty input.
    """
    def ucase(c):
        return _unichr(c + offset_trie[c >> TABLE_BITS][c & TABLE_MASK])

    return u"".join([ucase(ord(ch)) for ch in s])