diff -rN -u old-ticket393/src/allmydata/mutable/filenode.py new-ticket393/src/allmydata/mutable/filenode.py
--- old-ticket393/src/allmydata/mutable/filenode.py	2011-04-01 17:04:28.000000000 -0600
+++ new-ticket393/src/allmydata/mutable/filenode.py	2011-04-01 17:04:30.000000000 -0600
@@ -986,7 +986,7 @@
         power-of-two boundary, this operation will use roughly
         O(data.get_size()) memory/bandwidth/CPU to perform the update.
         Otherwise, it must download, re-encode, and upload the entire
-        file again, which will use O(filesize) resources.
+        file again, which will use O(filesize) resources. XXX no, it needs to download, modify, and re-upload only the hash tree, not the entire file contents
         """
         return self._do_serialized(self._update, data, offset)
@@ -994,7 +994,7 @@
     def _update(self, data, offset):
         """
         I update the mutable file version represented by this particular
-        IMutableVersion by inserting the data in data at the offset
+        IMutableVersion by overwriting (XXX not inserting) the data in data at the offset
         offset. I return a Deferred that fires when this has been
         completed.
         """
@@ -1054,9 +1054,10 @@
         # what we'll do later.
         start_segment = offset // segsize

-        # We only need the end segment if the data we append does not go
-        # beyond the current end-of-file.
+        # We only need the end segment if the data we write does not
+        # reach the end of the file.
         end_segment = start_segment
+        # print "offset: %s, spansize: %s, end=%s, filesize: %s" % (offset, data.get_size(), offset+data.get_size(), self.get_size())
         if offset + data.get_size() < self.get_size():
             end_data = offset + data.get_size()
             end_segment = end_data // segsize
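XXX note on the hunk above: the start/end segment arithmetic is easier to see pulled out of context. The following is a standalone sketch with hypothetical names, not code from the patch:

    def touched_segments(offset, span_size, segsize, filesize):
        # the segment holding the first overwritten byte
        start_segment = offset // segsize
        # we only need a distinct end segment if the write stops short
        # of EOF; a write that reaches EOF has no partial tail to preserve
        end_segment = start_segment
        if offset + span_size < filesize:
            end_segment = (offset + span_size) // segsize
        return (start_segment, end_segment)

    # e.g. with segsize=9 and filesize=45, a 10-byte write at offset 5
    # touches segments 0 and 1: touched_segments(5, 10, 9, 45) == (0, 1)
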
""" # Expected layout, MDMF: - # offset: size: name: + # offset: size: name: pass # to know location: pass # to know contents: #-- signed part -- - # 0 1 version number (01) - # 1 8 sequence number - # 9 32 share tree root hash - # 41 1 The "k" encoding parameter - # 42 1 The "N" encoding parameter - # 43 8 The segment size of the uploaded file - # 51 8 The data length of the original plaintext + # 0 1 version number (01) 0 + # 1 8 sequence number 0 + # 9 32 share tree root hash 0 + # 41 1 The "k" encoding parameter 0 + # 42 1 The "N" encoding parameter 0 + # 43 8 The segment size of the uploaded file 0 + # 51 8 The data length of the original plaintext 0 #-- end signed part -- # 59 8 The offset of the encrypted private key # 83 8 The offset of the signature # 91 8 The offset of the verification key - # 67 8 The offset of the block hash tree # 75 8 The offset of the share hash chain - # 99 8 The offset of the EOF + # 99 8 The offset of the share data + # 67 8 The offset of the block hash tree # + # __ 1280 encrypted signing key (XXX note: experimentally serialized private keys by pycryptopp's implementation of RSA-PSS-SHA256 always come out to 1216 or 1217 bytes...) + # __ 256 verification key + # __ f(N) share hash chain + # __ 256 signature over the first (?) 8 (?) fields + # __ L share data (with per-block salt) + # __+L f(L) block hash tree + # followed by salts and share data, the encrypted private key, the # block hash tree, the share hash chain, a signature over the first # eight fields, and a verification key. # # The checkstring is the first three fields -- the version number, - # sequence number, root hash and root salt hash. This is consistent + # sequence number, root hash and root salt hash. This is consistent # XXX the first *four* fields? Should root salt hash be added to the schema above? # in meaning to what we have with SDMF files, except now instead of # using the literal salt, we use a value derived from all of the # salts -- the share hash root. @@ -597,7 +605,11 @@ # and where they should go.. We can also figure out where the # encrypted private key should go, because we can figure out how # big the share data will be. - # + +# XXX it would be a major added feature if we could initialize without knowing the data length. From a cursory inspection, I think this might be possible if the fields whose length are changed by the data length came at the end of the layout. Those fields are block hash tree, and... Oh, there is only one field whose length changes if the length of the file changes, and that is block hash tree. If we put that one at the end of the layout, then we can know where all the other fields go without knowing the data length. --Zooko 2011-03-16 + +# Hm, and also it would be better if we could compute the exact positions of more fields just starting from facts that we get earlier, such as k, N, segsize. (We get those early facts from cap or CEB.) For example, the offset of the encrypted private key. If the encrypted private key comes before any variable length fields, and therefore if we can compute its offset a priori instead of looking it up in a field, then we might be able to optimize out a round trip on read sometimes, and we definitely get to reduce complexity of the data format. + # 1: Encrypt, encode, and upload the file in chunks. 
diff -rN -u old-ticket393/src/allmydata/mutable/publish.py new-ticket393/src/allmydata/mutable/publish.py
--- old-ticket393/src/allmydata/mutable/publish.py	2011-04-01 17:04:28.000000000 -0600
+++ new-ticket393/src/allmydata/mutable/publish.py	2011-04-01 17:04:30.000000000 -0600
@@ -534,6 +534,7 @@
     def setup_encoding_parameters(self, offset=0):
         if self._version == MDMF_VERSION:
+            # print "WHEEE I'm USING D_M_S_S: ", DEFAULT_MAX_SEGMENT_SIZE
             segment_size = DEFAULT_MAX_SEGMENT_SIZE # 128 KiB by default
         else:
             segment_size = self.datalength # SDMF is only one segment
diff -rN -u old-ticket393/src/allmydata/mutable/servermap.py new-ticket393/src/allmydata/mutable/servermap.py
--- old-ticket393/src/allmydata/mutable/servermap.py	2011-04-01 17:04:28.000000000 -0600
+++ new-ticket393/src/allmydata/mutable/servermap.py	2011-04-01 17:04:30.000000000 -0600
@@ -724,7 +724,7 @@
             # fetch the block hash tree and first + last segment, as
             # configured earlier.
             # Then set them in wherever we happen to want to set
-            # them.
+            # them.  XXX this comment does not help me. :-)
             ds = []
             # XXX: We do this above, too. Is there a good way to
             # make the two routines share the value without
diff -rN -u old-ticket393/src/allmydata/nodemaker.py new-ticket393/src/allmydata/nodemaker.py
--- old-ticket393/src/allmydata/nodemaker.py	2011-04-01 17:04:28.000000000 -0600
+++ new-ticket393/src/allmydata/nodemaker.py	2011-04-01 17:04:30.000000000 -0600
@@ -94,6 +94,7 @@
         n = MutableFileNode(self.storage_broker, self.secret_holder,
                             self.default_encoding_parameters, self.history)
         n.set_version(version)
+        print "-> thingie: ", self.key_generator.generate  # XXX temporary debugging output
         d = self.key_generator.generate(keysize)
         d.addCallback(n.create_with_keys, contents)
         d.addCallback(lambda res: n)
diff -rN -u old-ticket393/src/allmydata/storage/crawler.py new-ticket393/src/allmydata/storage/crawler.py
--- old-ticket393/src/allmydata/storage/crawler.py	2011-04-01 17:04:28.000000000 -0600
+++ new-ticket393/src/allmydata/storage/crawler.py	2011-04-01 17:04:30.000000000 -0600
@@ -72,8 +72,9 @@
         self.server = server
         self.sharedir = server.sharedir
         self.statefile = statefile
-        self.prefixes = [si_b2a(struct.pack(">H", i << (16-10)))[:2]
-                         for i in range(2**10)]
+        # self.prefixes = [si_b2a(struct.pack(">H", i << (16-10)))[:2]
+        #                  for i in range(2**10)]
+        self.prefixes = []  # XXX crawler disabled, presumably to speed up local test runs
         self.prefixes.sort()
         self.timer = None
         self.bucket_cache = (None, [])
diff -rN -u old-ticket393/src/allmydata/storage/server.py new-ticket393/src/allmydata/storage/server.py
--- old-ticket393/src/allmydata/storage/server.py	2011-04-01 17:04:28.000000000 -0600
+++ new-ticket393/src/allmydata/storage/server.py	2011-04-01 17:04:30.000000000 -0600
@@ -94,7 +94,7 @@
                                    expiration_override_lease_duration,
                                    expiration_cutoff_date,
                                    expiration_sharetypes)
-        self.lease_checker.setServiceParent(self)
+        # XXX self.lease_checker.setServiceParent(self)

     def __repr__(self):
         return "<StorageServer %s>" % (idlib.shortnodeid_b2a(self.my_nodeid),)
@@ -102,7 +102,7 @@
     def add_bucket_counter(self):
         statefile = os.path.join(self.storedir, "bucket_counter.state")
         self.bucket_counter = BucketCountingCrawler(self, statefile)
-        self.bucket_counter.setServiceParent(self)
+        # XXX self.bucket_counter.setServiceParent(self)

     def count(self, name, delta=1):
         if self.stats_provider:
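XXX note on the crawler.py hunk above: the commented-out computation produces the 1024 two-character prefixes of base32-encoded storage indexes. The following is a standalone approximation, assuming (as in allmydata.util.base32) that si_b2a is lowercase RFC 3548 base32 of the storage index, which base64.b32encode().lower() reproduces here:

    import struct, base64

    def share_prefixes():
        prefixes = []
        for i in range(2 ** 10):                      # 2 base32 chars = 10 bits
            packed = struct.pack(">H", i << (16 - 10))
            prefixes.append(base64.b32encode(packed).lower()[:2])
        return sorted(prefixes)

    # len(share_prefixes()) == 1024; replacing the list with [] just makes
    # the crawler visit nothing, hence the "disabled" note above.
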
diff -rN -u old-ticket393/src/allmydata/test/common_util.py new-ticket393/src/allmydata/test/common_util.py
--- old-ticket393/src/allmydata/test/common_util.py	2011-04-01 17:04:28.000000000 -0600
+++ new-ticket393/src/allmydata/test/common_util.py	2011-04-01 17:04:30.000000000 -0600
@@ -1,11 +1,13 @@
 import os, signal, time
 from random import randrange
+
 from twisted.internet import reactor, defer
 from twisted.python import failure

 def insecurerandstr(n):
-    return ''.join(map(chr, map(randrange, [0]*n, [256]*n)))
+    # return ''.join(map(chr, map(randrange, [0]*n, [256]*n)))
+    return os.urandom(n)

 def flip_bit(good, which):
     # flip the low-order bit of good[which]
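XXX note on the common_util.py hunk above: the UpdateSpans suite added below draws two 1.5 MB buffers from insecurerandstr, and the old implementation makes two Python-level calls per byte, while os.urandom fills the whole buffer in one call. A rough way to see the difference (timings are machine-dependent, so none are claimed here):

    import os, timeit
    from random import randrange

    def old_insecurerandstr(n):
        return ''.join(map(chr, map(randrange, [0]*n, [256]*n)))

    # timeit.timeit(lambda: old_insecurerandstr(10**5), number=1)
    # timeit.timeit(lambda: os.urandom(10**5), number=1)

diff -rN -u old-ticket393/src/allmydata/test/test_mutable.py new-ticket393/src/allmydata/test/test_mutable.py
--- old-ticket393/src/allmydata/test/test_mutable.py	2011-04-01 17:04:29.000000000 -0600
+++ new-ticket393/src/allmydata/test/test_mutable.py	2011-04-01 17:04:30.000000000 -0600
@@ -5,7 +5,7 @@
 from twisted.internet import defer, reactor
 from allmydata import uri, client
 from allmydata.nodemaker import NodeMaker
-from allmydata.util import base32, consumer
+from allmydata.util import base32, consumer, mathutil
 from allmydata.util.hashutil import tagged_hash, ssk_writekey_hash, \
                                     ssk_pubkey_fingerprint_hash
 from allmydata.util.deferredutil import gatherResults
@@ -13,6 +13,7 @@
      NotEnoughSharesError, SDMF_VERSION, MDMF_VERSION
 from allmydata.monitor import Monitor
 from allmydata.test.common import ShouldFailMixin
+from allmydata.test.common_util import insecurerandstr
 from allmydata.test.no_network import GridTestMixin
 from foolscap.api import eventually, fireEventually
 from foolscap.logging import log
@@ -25,14 +26,15 @@
      NotEnoughServersError, CorruptShareError
 from allmydata.mutable.retrieve import Retrieve
 from allmydata.mutable.publish import Publish, MutableFileHandle, \
-                                      MutableData, \
-                                      DEFAULT_MAX_SEGMENT_SIZE
+                                      DEFAULT_MAX_SEGMENT_SIZE, MutableData
 from allmydata.mutable.servermap import ServerMap, ServermapUpdater
 from allmydata.mutable.layout import unpack_header, MDMFSlotReadProxy
 from allmydata.mutable.repairer import MustForceRepairError
 import allmydata.test.common_util as testutil

+import mock
+
 # this "FakeStorage" exists to put the share data in RAM and avoid using real
 # network connections, both to speed up the tests and to reduce the amount of
 # non-mutable.py code being exercised.
@@ -3111,23 +3113,6 @@
                       self.failUnlessEqual(results, new_data))
         return d

-    def test_replace_in_last_segment(self):
-        # The wrapper should know how to handle the tail segment
-        # appropriately.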
-        replace_offset = len(self.data) - 100
-        new_data = self.data[:replace_offset] + "replaced"
-        rest_offset = replace_offset + len("replaced")
-        new_data += self.data[rest_offset:]
-        d = self.mdmf_node.get_best_mutable_version()
-        d.addCallback(lambda mv:
-            mv.update(MutableData("replaced"), replace_offset))
-        d.addCallback(lambda ignored:
-            self.mdmf_node.download_best_version())
-        d.addCallback(lambda results:
-            self.failUnlessEqual(results, new_data))
-        return d
-
-
     def test_multiple_segment_replace(self):
         replace_offset = 2 * DEFAULT_MAX_SEGMENT_SIZE
         new_data = self.data[:replace_offset]
@@ -3145,3 +3130,125 @@
         d.addCallback(lambda results:
                       self.failUnlessEqual(results, new_data))
         return d
+
+CONFIGUREDSEGSIZE = 7
+REALSEGSIZE = mathutil.next_multiple(CONFIGUREDSEGSIZE, 3)
+SIZE = 15 * 10 ** 5
+
+filedata = insecurerandstr(SIZE)
+def get_file_data(size):
+    assert size <= len(filedata), (size, len(filedata))
+    return filedata[:size]
+
+spandata = insecurerandstr(SIZE)
+def get_span_data(size):
+    assert size <= len(spandata)
+    return spandata[:size]
+
+class UpdateSpans(GridTestMixin, unittest.TestCase, testutil.ShouldFailMixin):
+    """ This is like class Update, but we moved some tests out of there
+    and refactored the upload of files to be done on a per-test-method
+    basis instead of per instance. """
+
+    def setUp(self):
+        self.patcher1 = mock.patch('allmydata.mutable.publish.DEFAULT_MAX_SEGMENT_SIZE', CONFIGUREDSEGSIZE)
+        self.patcher1.__enter__()
+
+        GridTestMixin.setUp(self)
+        self.basedir = self.mktemp()
+        self.set_up_grid()
+        self.c = self.g.clients[0]
+        self.nm = self.c.nodemaker
+
+        # XXX stub out time-wasting RSA key generation, something like:
+        # def generate_mock(keysize):
+        #     return defer.succeed((mock.Mock(), mock.Mock()))
+        # self.nm.key_generator.generate = generate_mock
+
+    def tearDown(self):
+        self.patcher1.__exit__(None, None, None)
+        return GridTestMixin.tearDown(self)
+
+    def do_upload(self, filesize):
+        data = get_file_data(filesize)
+        d = self.nm.create_mutable_file(MutableData(data),
+                                        version=MDMF_VERSION)
+        def _then(n):
+            assert isinstance(n, MutableFileNode)
+            self.mdmf_node = n
+            return n, data
+        d.addCallback(_then)
+        return d
+
+    def test_edge_cases(self):
+        d = defer.succeed(None)
+        EVEN_MULTIPLE = REALSEGSIZE * 5
+        NOT_EVEN_MULTIPLE = EVEN_MULTIPLE + 1
+
+        cases = []
+        ss = REALSEGSIZE
+        for filesize in (EVEN_MULTIPLE,): # XXX also test NOT_EVEN_MULTIPLE
+            d.addCallback(lambda ignored, filesize=filesize:
+                          self.do_upload(filesize))
+
+            for startseg in (0,): # XXX eventually (0, 1, 3, 4)
+                for endseg in range(startseg, 7):
+                    for startindex in (0,): # XXX eventually (0, 1, 2, ss-1)
+                        for endindex in (1, ss-3,): # XXX eventually (1, ss-3, ss-2, ss-1, ss)
+                            offset = ss*startseg + startindex
+                            end = ss*endseg + endindex
+                            size = end - offset
+                            if size <= 0:
+                                continue
+                            cases.append((offset, end))
+                            d.addCallback(self._do_replacement_test, offset, size)
+
+        # print "list, set ", len(cases), len(set(cases))
+        return d
+    test_edge_cases.timeout = 1000
+
+    def test_replace_in_last_segment_uneven_file_size(self):
+        # XXX marked for death
+        filesize = 900000
+        d = self.do_upload(filesize)
+        d.addCallback(self._do_replacement_test, filesize - 100, 8)
+        return d
+
+    def test_replace_first_segment(self):
+        d = self.do_upload(900000)
+        d.addCallback(self._do_replacement_test, 0, REALSEGSIZE)
+        return d
+    def test_replace_last_segment_even_file_size(self):
+        filesize = REALSEGSIZE * 9
+        numsegs = mathutil.div_ceil(filesize, REALSEGSIZE)
+        offset = (numsegs-1) * REALSEGSIZE
+        size = filesize - offset
+        d = self.do_upload(filesize)
+        d.addCallback(self._do_replacement_test, offset, size)
+        return d
+
+    def test_replace_last_segment_uneven_file_size(self):
+        filesize = 900000
+        numsegs = mathutil.div_ceil(filesize, REALSEGSIZE)
+        offset = (numsegs-1) * REALSEGSIZE
+        size = filesize - offset
+        d = self.do_upload(filesize)
+        d.addCallback(self._do_replacement_test, offset, size)
+        return d
+
+    def _do_replacement_test(self, (node, olddata), offset, size):
+        d = node.get_best_mutable_version()
+
+        newdata = get_span_data(size)
+        d.addCallback(lambda mv:
+            mv.update(MutableData(newdata), offset))
+        d.addCallback(lambda ignored:
+            node.download_best_version())
+
+        newfiledata = olddata[:offset] + newdata + olddata[offset+size:]
+        d.addCallback(lambda results:
+            self.failUnlessEqual(results, newfiledata))
+        d.addCallback(lambda ignore: (node, newfiledata))
+        return d
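XXX note on the UpdateSpans suite above: CONFIGUREDSEGSIZE = 7 rounds up to REALSEGSIZE = 9 (the next multiple of k = 3), so these tests run with tiny 9-byte segments and multi-segment edge cases stay cheap. The (node, newfiledata) return value makes _do_replacement_test chainable; a hypothetical extra test (not part of the patch) could apply several updates to one upload while tracking the expected contents across steps:

    def test_two_replacements(self):
        d = self.do_upload(REALSEGSIZE * 5)                    # -> (node, data)
        d.addCallback(self._do_replacement_test, 0, 10)        # -> (node, newfiledata)
        d.addCallback(self._do_replacement_test, REALSEGSIZE * 2, 10)
        return d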