Ticket #1628: fix-1628.diff

File fix-1628.diff, 18.4 KB (added by kevan, at 2011-12-13T02:19:28Z)

darcs-free version of fix-1628.darcs.patch

  • src/allmydata/test/no_network.py

    From 1a1a575120f5f58067c8d26f9c228dae67d1dcde Mon Sep 17 00:00:00 2001
    From: Kevan <kevan@isnotajoke.com>
    Date: Sun, 11 Dec 2011 15:42:49 -0800
    Subject: [PATCH 1/5] Make NoNetworkGrid.remove_server return the removed
     storage server object
    
    This supports tests in which servers leave the grid only to return with
    their shares intact at a later time.
    ---
     src/allmydata/test/no_network.py |    1 +
     1 files changed, 1 insertions(+), 0 deletions(-)
    
    diff --git a/src/allmydata/test/no_network.py b/src/allmydata/test/no_network.py
    index bb9c8f2..4bac7d1 100644
    @@ -289,6 +289,7 @@ class NoNetworkGrid(service.MultiService):
             del self.wrappers_by_id[serverid]
             del self.proxies_by_id[serverid]
             self.rebuild_serverlist()
    +        return ss

         def break_server(self, serverid):
             # mark the given server as broken, so it will throw exceptions when

    -- 
    1.7.7
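
With remove_server now returning the removed server object, a test can detach a server and later reattach the same object with its shares intact. A minimal sketch of the pattern this enables (the test name is hypothetical; self.g is the NoNetworkGrid that GridTestMixin sets up):

    # Minimal sketch, inside a GridTestMixin-style test; self.g is a
    # NoNetworkGrid from src/allmydata/test/no_network.py.
    def test_server_leaves_and_returns(self):  # hypothetical test name
        serverid = self.g.get_all_serverids()[-1]
        ss = self.g.remove_server(serverid)  # now returns the server

        # ... exercise the grid while the server is gone ...

        # Reattach the very same server object; the shares it held
        # before removal are visible to the grid again.
        self.g.add_server(len(self.g.servers_by_number), ss)
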
  • src/allmydata/test/test_mutable.py

    
    From c13d982fea3547d88b6d306904a9766d5e7e4969 Mon Sep 17 00:00:00 2001
    From: Kevan <kevan@isnotajoke.com>
    Date: Sun, 11 Dec 2011 15:46:42 -0800
    Subject: [PATCH 2/5] Add test_mutable.Problems.test_multiply_placed_shares
    
    The UCWEs in the incident reports attached to #1628 all seem to
    involve shares that the servermap knows about, but which aren't
    accounted for during the publish process for whatever reason.
    Specifically, it looks like the publisher is only capable of keeping
    track of a single storage server for a given share. This makes the
    repair process worse than it was pre-MDMF at updating all of the
    shares of a particular file to the newest version, and can also
    cause spurious UCWEs. This test simulates such a layout and fails if
    a UCWE is thrown.
    We need to write another test to ensure that all copies of a share are
    updated to the latest version (or alter this test to do that), so that
    the test suite doesn't pass unless both regressions are fixed.
    ---
     src/allmydata/test/test_mutable.py |   31 +++++++++++++++++++++++++++++++
     1 files changed, 31 insertions(+), 0 deletions(-)
    
    diff --git a/src/allmydata/test/test_mutable.py b/src/allmydata/test/test_mutable.py
    index 32602bd..6d2d0fa 100644
    @@ -2534,6 +2534,37 @@ class Problems(GridTestMixin, unittest.TestCase, testutil.ShouldFailMixin):
             d.addCallback(_created)
             return d

    +    def test_multiply_placed_shares(self):
    +        self.basedir = "mutable/Problems/test_multiply_placed_shares"
    +        self.set_up_grid()
    +        self.g.clients[0].DEFAULT_ENCODING_PARAMETERS['n'] = 75
    +        nm = self.g.clients[0].nodemaker
    +        d = nm.create_mutable_file(MutableData("contents 1"))
    +        # remove one of the servers and reupload the file.
    +        def _created(n):
    +            self._node = n
    +
    +            servers = self.g.get_all_serverids()
    +            self.ss = self.g.remove_server(servers[len(servers)-1])
    +
    +            new_server = self.g.make_server(len(servers)-1)
    +            self.g.add_server(len(servers)-1, new_server)
    +
    +            return self._node.download_best_version()
    +        d.addCallback(_created)
    +        d.addCallback(lambda data: MutableData(data))
    +        d.addCallback(lambda data: self._node.overwrite(data))
    +
    +        # restore the server we removed earlier, then download+upload
    +        # the file again
    +        def _overwritten(ign):
    +            self.g.add_server(len(self.g.servers_by_number), self.ss)
    +            return self._node.download_best_version()
    +        d.addCallback(_overwritten)
    +        d.addCallback(lambda data: MutableData(data))
    +        d.addCallback(lambda data: self._node.overwrite(data))
    +        return d
    +
         def test_bad_server(self):
             # Break one server, then create the file: the initial publish should
             # complete with an alternate server. Breaking a second server should

    -- 
    1.7.7
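
The grid state the test manufactures is easiest to picture as a map from share number to the servers holding a copy. A hypothetical sketch (server names and sequence numbers are invented for illustration):

    # Hypothetical sketch of the layout test_multiply_placed_shares
    # creates (server names and seqnums invented). With n=75 shares on
    # a ten-server grid, every server holds several shares; removing a
    # server, overwriting the file, and re-adding the old server leaves
    # some share numbers placed on two servers, one copy stale:
    servermap_view = {
        # shnum: [(server, seqnum), ...]
        7: [("server_3", 2)],                   # singly placed, current
        8: [("server_4", 2), ("rejoined", 1)],  # doubly placed; the
    }                                           # "rejoined" copy is stale

    # A publisher that tracks one writer per shnum updates only one of
    # share 8's copies; the stale copy then either survives the update
    # or is noticed mid-publish and misread as an uncoordinated write.
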
  • src/allmydata/test/test_mutable.py

    
    From d7f4cf18e134b65c07eb2737fcfe90e21afc2490 Mon Sep 17 00:00:00 2001
    From: Kevan <kevan@isnotajoke.com>
    Date: Sun, 11 Dec 2011 15:58:26 -0800
    Subject: [PATCH 3/5] Make test_multiply_placed_shares fail on partial
     updates.
    
    We want the publisher to follow the existing share placement when
    uploading a new version of a mutable file, and we don't want this test
    to pass unless it does.
    ---
     src/allmydata/test/test_mutable.py |    8 ++++++++
     1 files changed, 8 insertions(+), 0 deletions(-)
    
    diff --git a/src/allmydata/test/test_mutable.py b/src/allmydata/test/test_mutable.py
    index 6d2d0fa..815b840 100644
    @@ -2563,6 +2563,14 @@ class Problems(GridTestMixin, unittest.TestCase, testutil.ShouldFailMixin):
             d.addCallback(_overwritten)
             d.addCallback(lambda data: MutableData(data))
             d.addCallback(lambda data: self._node.overwrite(data))
    +        d.addCallback(lambda ignored:
    +            self._node.get_servermap(MODE_CHECK))
    +        def _overwritten_again(smap):
    +            # Make sure that all shares were updated by making sure that
    +            # there aren't any other versions in the sharemap.
    +            self.failUnlessEqual(len(smap.recoverable_versions()), 1)
    +            self.failUnlessEqual(len(smap.unrecoverable_versions()), 0)
    +        d.addCallback(_overwritten_again)
             return d

         def test_bad_server(self):

    -- 
    1.7.7
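
The idea behind the new assertions: if the publisher rewrote every copy of every share, a fresh servermap sees exactly one version of the file, and any stale copy left behind surfaces as an extra recoverable or unrecoverable version. Restated as a minimal predicate over the servermap methods the test uses:

    # Minimal restatement of the invariant the test asserts, using the
    # servermap methods from the patch. A complete update leaves
    # exactly one visible version; a missed share copy shows up as a
    # second (possibly unrecoverable) version.
    def fully_updated(smap):
        return (len(smap.recoverable_versions()) == 1
                and len(smap.unrecoverable_versions()) == 0)
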
  • src/allmydata/mutable/publish.py

    
    From 86688d20b17281f85396a645a1689ed5d4e0a5b4 Mon Sep 17 00:00:00 2001
    From: Kevan <kevan@isnotajoke.com>
    Date: Mon, 12 Dec 2011 12:13:10 -0800
    Subject: [PATCH 4/5] Change self.writers to map a shnum to a list of writers.
    
    Before this commit, the publisher only kept track of a single writer for
    each share. This is insufficient to handle updates in which a single
    share may live on multiple servers. At best, an update will rewrite
    only one of the existing copies of a share instead of all of them.
    In some cases, the update will encounter the extra copies while
    publishing some other share, interpret them as a sign of an
    uncoordinated update, and fail. Keeping track of all of the writers
    helps ensure that all existing copies of a share are updated, and
    helps avoid spurious uncoordinated write errors.
    ---
     src/allmydata/mutable/publish.py |  140 +++++++++++++++++++++-----------------
     1 files changed, 78 insertions(+), 62 deletions(-)
    
    diff --git a/src/allmydata/mutable/publish.py b/src/allmydata/mutable/publish.py
    index b028779..1e2de76 100644
    @@ -269,31 +269,33 @@ class Publish:
                 cancel_secret = self._node.get_cancel_secret(server)
                 secrets = (write_enabler, renew_secret, cancel_secret)

    -            self.writers[shnum] =  writer_class(shnum,
    -                                                server.get_rref(),
    -                                                self._storage_index,
    -                                                secrets,
    -                                                self._new_seqnum,
    -                                                self.required_shares,
    -                                                self.total_shares,
    -                                                self.segment_size,
    -                                                self.datalength)
    -            self.writers[shnum].server = server
    +            writer = writer_class(shnum,
    +                                  server.get_rref(),
    +                                  self._storage_index,
    +                                  secrets,
    +                                  self._new_seqnum,
    +                                  self.required_shares,
    +                                  self.total_shares,
    +                                  self.segment_size,
    +                                  self.datalength)
    +
    +            self.writers.setdefault(shnum, []).append(writer)
    +            writer.server = server
                 known_shares = self._servermap.get_known_shares()
                 assert (server, shnum) in known_shares
                 old_versionid, old_timestamp = known_shares[(server,shnum)]
                 (old_seqnum, old_root_hash, old_salt, old_segsize,
                  old_datalength, old_k, old_N, old_prefix,
                  old_offsets_tuple) = old_versionid
    -            self.writers[shnum].set_checkstring(old_seqnum,
    -                                                old_root_hash,
    -                                                old_salt)
    +            writer.set_checkstring(old_seqnum,
    +                                   old_root_hash,
    +                                   old_salt)

             # Our remote shares will not have a complete checkstring until
             # after we are done writing share data and have started to write
             # blocks. In the meantime, we need to know what to look for when
             # writing, so that we can detect UncoordinatedWriteErrors.
    -        self._checkstring = self.writers.values()[0].get_checkstring()
    +        self._checkstring = self.writers.values()[0][0].get_checkstring()

             # Now, we start pushing shares.
             self._status.timings["setup"] = time.time() - self._started
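
For context, the checkstring these hunks keep fetching is a compare-and-swap style tag: each writer records the version it expects to replace, and a write whose expectation no longer matches the share on the server is what surfaces as an UncoordinatedWriteError. A rough sketch of the idea (simplified; not the actual wire format or test-vector code):

    # Rough sketch of the checkstring's role (simplified; the real code
    # packs seqnum, root hash, and salt into a binary prefix). A writer
    # records the version it expects to replace; the storage server
    # applies the write only if the existing share still matches, and a
    # mismatch is reported as an UncoordinatedWriteError.
    def expected_checkstring(seqnum, root_hash, salt):
        return (seqnum, root_hash, salt)

    def write_allowed(current_share_version, expected):
        # the test-and-set comparison the servers perform, in spirit
        return current_share_version == expected
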
    @@ -466,34 +468,35 @@ class Publish:
                 cancel_secret = self._node.get_cancel_secret(server)
                 secrets = (write_enabler, renew_secret, cancel_secret)

    -            self.writers[shnum] =  writer_class(shnum,
    -                                                server.get_rref(),
    -                                                self._storage_index,
    -                                                secrets,
    -                                                self._new_seqnum,
    -                                                self.required_shares,
    -                                                self.total_shares,
    -                                                self.segment_size,
    -                                                self.datalength)
    -            self.writers[shnum].server = server
    +            writer =  writer_class(shnum,
    +                                   server.get_rref(),
    +                                   self._storage_index,
    +                                   secrets,
    +                                   self._new_seqnum,
    +                                   self.required_shares,
    +                                   self.total_shares,
    +                                   self.segment_size,
    +                                   self.datalength)
    +            self.writers.setdefault(shnum, []).append(writer)
    +            writer.server = server
                 known_shares = self._servermap.get_known_shares()
                 if (server, shnum) in known_shares:
                     old_versionid, old_timestamp = known_shares[(server,shnum)]
                     (old_seqnum, old_root_hash, old_salt, old_segsize,
                      old_datalength, old_k, old_N, old_prefix,
                      old_offsets_tuple) = old_versionid
    -                self.writers[shnum].set_checkstring(old_seqnum,
    -                                                    old_root_hash,
    -                                                    old_salt)
    +                writer.set_checkstring(old_seqnum,
    +                                       old_root_hash,
    +                                       old_salt)
                 elif (server, shnum) in self.bad_share_checkstrings:
                     old_checkstring = self.bad_share_checkstrings[(server, shnum)]
    -                self.writers[shnum].set_checkstring(old_checkstring)
    +                writer.set_checkstring(old_checkstring)

             # Our remote shares will not have a complete checkstring until
             # after we are done writing share data and have started to write
             # blocks. In the meantime, we need to know what to look for when
             # writing, so that we can detect UncoordinatedWriteErrors.
    -        self._checkstring = self.writers.values()[0].get_checkstring()
    +        self._checkstring = self.writers.values()[0][0].get_checkstring()

             # Now, we start pushing shares.
             self._status.timings["setup"] = time.time() - self._started
    @@ -620,7 +623,10 @@ class Publish:
             # Can we still successfully publish this file?
             # TODO: Keep track of outstanding queries before aborting the
             #       process.
    -        if len(self.writers) < self.required_shares or self.surprised:
    +        all_writers = []
    +        for shnum, writers in self.writers.iteritems():
    +            all_writers.extend(writers)
    +        if len(all_writers) < self.required_shares or self.surprised:
                 return self._failure()

             # Figure out what we need to do next. Each of these needs to
    @@ -675,8 +681,9 @@ class Publish:
             salt = os.urandom(16)
             assert self._version == SDMF_VERSION

    -        for writer in self.writers.itervalues():
    -            writer.put_salt(salt)
    +        for shnum, writers in self.writers.iteritems():
    +            for writer in writers:
    +                writer.put_salt(salt)


         def _encode_segment(self, segnum):
    @@ -751,8 +758,9 @@ class Publish:
                 block_hash = hashutil.block_hash(hashed)
                 self.blockhashes[shareid][segnum] = block_hash
                 # find the writer for this share
    -            writer = self.writers[shareid]
    -            writer.put_block(sharedata, segnum, salt)
    +            writers = self.writers[shareid]
    +            for writer in writers:
    +                writer.put_block(sharedata, segnum, salt)


         def push_everything_else(self):
    @@ -775,8 +783,9 @@ class Publish:
         def push_encprivkey(self):
             encprivkey = self._encprivkey
             self._status.set_status("Pushing encrypted private key")
    -        for writer in self.writers.itervalues():
    -            writer.put_encprivkey(encprivkey)
    +        for shnum, writers in self.writers.iteritems():
    +            for writer in writers:
    +                writer.put_encprivkey(encprivkey)


         def push_blockhashes(self):
    @@ -788,8 +797,9 @@ class Publish:
                 # set the leaf for future use.
                 self.sharehash_leaves[shnum] = t[0]

    -            writer = self.writers[shnum]
    -            writer.put_blockhashes(self.blockhashes[shnum])
    +            writers = self.writers[shnum]
    +            for writer in writers:
    +                writer.put_blockhashes(self.blockhashes[shnum])


         def push_sharehashes(self):
    @@ -799,8 +809,9 @@ class Publish:
                 needed_indices = share_hash_tree.needed_hashes(shnum)
                 self.sharehashes[shnum] = dict( [ (i, share_hash_tree[i])
                                                  for i in needed_indices] )
    -            writer = self.writers[shnum]
    -            writer.put_sharehashes(self.sharehashes[shnum])
    +            writers = self.writers[shnum]
    +            for writer in writers:
    +                writer.put_sharehashes(self.sharehashes[shnum])
             self.root_hash = share_hash_tree[0]


    @@ -811,8 +822,9 @@ class Publish:
             #   - Push the signature
             self._status.set_status("Pushing root hashes and signature")
             for shnum in xrange(self.total_shares):
    -            writer = self.writers[shnum]
    -            writer.put_root_hash(self.root_hash)
    +            writers = self.writers[shnum]
    +            for writer in writers:
    +                writer.put_root_hash(self.root_hash)
             self._update_checkstring()
             self._make_and_place_signature()

    @@ -825,7 +837,7 @@ class Publish:
             uncoordinated writes. SDMF files will have the same checkstring,
             so we need not do anything.
             """
    -        self._checkstring = self.writers.values()[0].get_checkstring()
    +        self._checkstring = self.writers.values()[0][0].get_checkstring()


         def _make_and_place_signature(self):
    @@ -834,11 +846,12 @@ class Publish:
             """
             started = time.time()
             self._status.set_status("Signing prefix")
    -        signable = self.writers[0].get_signable()
    +        signable = self.writers.values()[0][0].get_signable()
             self.signature = self._privkey.sign(signable)

    -        for (shnum, writer) in self.writers.iteritems():
    -            writer.put_signature(self.signature)
    +        for (shnum, writers) in self.writers.iteritems():
    +            for writer in writers:
    +                writer.put_signature(self.signature)
             self._status.timings['sign'] = time.time() - started


    @@ -851,25 +864,26 @@ class Publish:
             ds = []
             verification_key = self._pubkey.serialize()

    -        for (shnum, writer) in self.writers.copy().iteritems():
    -            writer.put_verification_key(verification_key)
    -            self.num_outstanding += 1
    -            def _no_longer_outstanding(res):
    -                self.num_outstanding -= 1
    -                return res
    +        for (shnum, writers) in self.writers.copy().iteritems():
    +            for writer in writers:
    +                writer.put_verification_key(verification_key)
    +                self.num_outstanding += 1
    +                def _no_longer_outstanding(res):
    +                    self.num_outstanding -= 1
    +                    return res

    -            d = writer.finish_publishing()
    -            d.addBoth(_no_longer_outstanding)
    -            d.addErrback(self._connection_problem, writer)
    -            d.addCallback(self._got_write_answer, writer, started)
    -            ds.append(d)
    +                d = writer.finish_publishing()
    +                d.addBoth(_no_longer_outstanding)
    +                d.addErrback(self._connection_problem, writer)
    +                d.addCallback(self._got_write_answer, writer, started)
    +                ds.append(d)
             self._record_verinfo()
             self._status.timings['pack'] = time.time() - started
             return defer.DeferredList(ds)


         def _record_verinfo(self):
    -        self.versioninfo = self.writers.values()[0].get_verinfo()
    +        self.versioninfo = self.writers.values()[0][0].get_verinfo()


         def _connection_problem(self, f, writer):
    @@ -879,7 +893,7 @@ class Publish:
             """
             self.log("found problem: %s" % str(f))
             self._last_failure = f
    -        del(self.writers[writer.shnum])
    +        self.writers[writer.shnum].remove(writer)


         def log_goal(self, goal, message=""):
    @@ -988,9 +1002,11 @@ class Publish:
             # knowingly also writing to that server from other writers.

             # TODO: Precompute this.
    -        known_shnums = [x.shnum for x in self.writers.values()
    -                        if x.server == server]
    -        surprise_shares -= set(known_shnums)
    +        shares = []
    +        for shnum, writers in self.writers.iteritems():
    +            shares.extend([x.shnum for x in writers if x.server == server])
    +        known_shnums = set(shares)
    +        surprise_shares -= known_shnums
             self.log("found the following surprise shares: %s" %
                      str(surprise_shares))

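
In miniature, the data-structure change this patch makes: self.writers used to keep a single writer per share number, so a second server holding the same share silently displaced the first entry; it now keeps a list of writers per share number, and every per-share operation fans out over that list. A simplified, self-contained sketch (Python 2, to match the code above; Writer stands in for the real slot-writer proxies):

    # Simplified sketch of the new shape of self.writers (Python 2;
    # Writer stands in for the real SDMF/MDMF slot-writer proxies).
    class Writer(object):
        def __init__(self, shnum, server):
            self.shnum, self.server = shnum, server
        def put_salt(self, salt):
            pass  # the real proxy queues a write to self.server

    writers = {}
    for shnum, server in [(0, "A"), (1, "B"), (1, "C")]:  # share 1 twice
        # old code: writers[shnum] = Writer(...) lost one copy of share 1
        writers.setdefault(shnum, []).append(Writer(shnum, server))

    # every per-share operation now fans out over all copies of a share
    for shnum, ws in writers.iteritems():
        for w in ws:
            w.put_salt("16-byte-salt")

    # a failed writer is removed individually; the old "del
    # writers[shnum]" would have discarded healthy copies of the share
    failed = writers[1][0]
    writers[failed.shnum].remove(failed)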