Ticket #798: new-downloader-v6.diff

File new-downloader-v6.diff, 165.4 KB (added by warner, at 2010-05-01T02:37:16Z)

v6 patch: most existing tests pass

  • src/allmydata/client.py

    diff --git a/src/allmydata/client.py b/src/allmydata/client.py
    index 12e7473..b01fbe8 100644
    @@ -12,11 +12,11 @@ import allmydata
     from allmydata.storage.server import StorageServer
     from allmydata import storage_client
     from allmydata.immutable.upload import Uploader
    -from allmydata.immutable.download import Downloader
    +from allmydata.immutable.download2_util import Terminator
     from allmydata.immutable.offloaded import Helper
     from allmydata.control import ControlServer
     from allmydata.introducer.client import IntroducerClient
    -from allmydata.util import hashutil, base32, pollmixin, cachedir, log
    +from allmydata.util import hashutil, base32, pollmixin, log
     from allmydata.util.abbreviate import parse_abbreviated_size
     from allmydata.util.time_format import parse_duration, parse_date
     from allmydata.stats import StatsProvider
    @@ -278,12 +278,9 @@ class Client(node.Node, pollmixin.PollMixin):

             self.init_client_storage_broker()
             self.history = History(self.stats_provider)
    +        self.terminator = Terminator()
    +        self.terminator.setServiceParent(self)
             self.add_service(Uploader(helper_furl, self.stats_provider))
    -        download_cachedir = os.path.join(self.basedir,
    -                                         "private", "cache", "download")
    -        self.download_cache_dirman = cachedir.CacheDirectoryManager(download_cachedir)
    -        self.download_cache_dirman.setServiceParent(self)
    -        self.downloader = Downloader(self.storage_broker, self.stats_provider)
             self.init_stub_client()
             self.init_nodemaker()

    @@ -342,8 +339,7 @@ class Client(node.Node, pollmixin.PollMixin):
                                        self._secret_holder,
                                        self.get_history(),
                                        self.getServiceNamed("uploader"),
    -                                   self.downloader,
    -                                   self.download_cache_dirman,
    +                                   self.terminator,
                                        self.get_encoding_parameters(),
                                        self._key_generator)

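
    This hunk removes the old per-client Downloader and its download cache directory:
    the client now runs a single Terminator service and hands it to the NodeMaker in
    their place, so the new per-download objects can register themselves to be shut
    down cleanly when the node stops. Terminator itself is defined in download2_util.py,
    which is not part of this excerpt; a minimal sketch of such a service (the
    register()/stop() names and the weakref bookkeeping are assumptions, not taken
    from the patch) might look like:

      import weakref
      from twisted.application import service

      class Terminator(service.Service):
          """Tell registered downloaders to stop when the client shuts down."""
          def __init__(self):
              self._clients = weakref.WeakKeyDictionary()

          def register(self, c):
              # 'c' must offer a stop() method; weak references avoid keeping
              # finished downloads alive just because they once registered
              self._clients[c] = None

          def stopService(self):
              for c in self._clients:
                  c.stop()
              return service.Service.stopService(self)
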
  • new file src/allmydata/immutable/download2.py

    diff --git a/src/allmydata/immutable/download2.py b/src/allmydata/immutable/download2.py
    new file mode 100644
    index 0000000..e88ff1a
     1
     2import binascii
     3import struct
     4import copy
     5from zope.interface import implements
     6from twisted.python.failure import Failure
     7from twisted.internet import defer
     8from twisted.internet.interfaces import IPushProducer, IConsumer
     9
     10from foolscap.api import eventually
     11from allmydata.interfaces import IImmutableFileNode, IUploadResults, \
     12     NotEnoughSharesError, NoSharesError, HASH_SIZE
     13from allmydata.hashtree import IncompleteHashTree, BadHashError, \
     14     NotEnoughHashesError
     15from allmydata.util import base32, log, hashutil, mathutil, idlib
     16from allmydata.util.spans import Spans, DataSpans, overlap
     17from allmydata.util.dictutil import DictOfSets
     18from allmydata.check_results import CheckResults, CheckAndRepairResults
     19from allmydata.codec import CRSDecoder
     20from allmydata import uri
     21from pycryptopp.cipher.aes import AES
     22from download2_util import Observer2, incidentally
     23from layout import make_write_bucket_proxy
     24from checker import Checker
     25from repairer import Repairer
     26
     27(AVAILABLE, PENDING, OVERDUE, COMPLETE, CORRUPT, DEAD, BADSEGNUM) = \
     28 ("AVAILABLE", "PENDING", "OVERDUE", "COMPLETE", "CORRUPT", "DEAD", "BADSEGNUM")
     29
     30KiB = 1024
     31class BadSegmentNumberError(Exception):
     32    pass
     33class BadSegmentError(Exception):
     34    pass
     35class BadCiphertextHashError(Exception):
     36    pass
     37
     38class Share:
     39    """I represent a single instance of a single share (e.g. I reference the
     40    shnum2 for share SI=abcde on server xy12t, not the one on server ab45q).
     41    I am associated with a CommonShare that remembers data that is held in
     42    common among e.g. SI=abcde/shnum2 across all servers. I am also
     43    associated with a CiphertextFileNode for e.g. SI=abcde (all shares, all
     44    servers).
     45    """
     46    # this is a specific implementation of IShare for tahoe's native storage
     47    # servers. A different backend would use a different class.
     48
     49    def __init__(self, rref, server_version, verifycap, commonshare, node,
     50                 peerid, shnum, logparent):
     51        self._rref = rref
     52        self._server_version = server_version
     53        self._node = node # holds share_hash_tree and UEB
     54        self._guess_offsets(verifycap, node.guessed_segment_size)
     55        self.actual_offsets = None
     56        self.actual_segment_size = None
     57        self._UEB_length = None
     58        self._commonshare = commonshare # holds block_hash_tree
     59        self._peerid = peerid
     60        self._peerid_s = base32.b2a(peerid)[:5]
     61        self._storage_index = verifycap.storage_index
     62        self._si_prefix = base32.b2a(verifycap.storage_index)[:8]
     63        self._shnum = shnum
     64
     65        self._lp = log.msg(format="%(share)s created", share=repr(self),
     66                           level=log.NOISY, parent=logparent, umid="P7hv2w")
     67
      68        # any given byte of the share can be in one of the following states:
     69        #  in: _wanted, _requested, _received
     70        #      FALSE    FALSE       FALSE : don't care about it at all
     71        #      TRUE     FALSE       FALSE : want it, haven't yet asked for it
     72        #      TRUE     TRUE        FALSE : request is in-flight
     73        #                                   or didn't get it
     74        #      FALSE    TRUE        TRUE  : got it, haven't used it yet
     75        #      FALSE    TRUE        FALSE : got it and used it
     76        #      FALSE    FALSE       FALSE : block consumed, ready to ask again
     77        #
     78        # when we request data and get a NAK, we leave it in _requested
      79        # to remind ourselves not to ask for it again. We don't explicitly
     80        # remove it from anything (maybe this should change).
     81        #
     82        # We retain the hashtrees in the Node, so we leave those spans in
     83        # _requested (and never ask for them again, as long as the Node is
     84        # alive). But we don't retain data blocks (too big), so when we
     85        # consume a data block, we remove it from _requested, so a later
     86        # download can re-fetch it.
     87
     88        # self._wanted contains data that we need, either metadata (like
     89        # hashes) or block data. Once we've received the data, we remove it
     90        # from self._wanted
     91        self._wanted = Spans()
     92
     93        # self._requested contains ranges we've requested before: this data
     94        # is either in-flight or answered-yes or answered-no.
     95        self._requested = Spans() # we've sent a request for this
     96        # self._received contains data that we haven't yet used
     97        self._received = DataSpans() # we've received a response for this
     98
     99        self._requested_blocks = [] # (segnum, set(observer2..))
     100        ver = server_version["http://allmydata.org/tahoe/protocols/storage/v1"]
     101        self._overrun_ok = ver["tolerates-immutable-read-overrun"]
     102        # If _overrun_ok and we guess the offsets correctly, we can get
     103        # everything in one RTT. If _overrun_ok and we guess wrong, we might
     104        # need two RTT (but we could get lucky and do it in one). If overrun
     105        # is *not* ok (tahoe-1.3.0 or earlier), we need four RTT: 1=version,
     106        # 2=offset table, 3=UEB_length and everything else (hashes, block),
     107        # 4=UEB.
     108
     109        self._dead = False
     110
     111    def __repr__(self):
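      # Illustrative sketch (not from the patch): the _wanted/_requested/_received
      # bookkeeping above, exercised with the same Spans/DataSpans calls this class
      # uses, for the 4-byte version field:
      from allmydata.util.spans import Spans, DataSpans
      wanted, requested, received = Spans(), Spans(), DataSpans()
      wanted.add(0, 4)                     # TRUE/FALSE/FALSE: want it, not asked yet
      requested.add(0, 4)                  # TRUE/TRUE/FALSE:  request in flight
      received.add(0, "\x00\x00\x00\x01")  # response arrives
      wanted.remove(0, 4)                  # FALSE/TRUE/TRUE:  got it, not yet used
      version_s = received.pop(0, 4)       # FALSE/TRUE/FALSE: got it and used it
      requested.remove(0, 4)               # (data blocks only) ready to re-fetch later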
     112        return "Share(sh%d-on-%s)" % (self._shnum, self._peerid_s)
     113
     114    def not_dead(self):
     115        # XXX: reconsider. If the share sees a single error, should it remain
     116        # dead for all time? Or should the next segment try again? Also,
     117        # 'not_dead' is a dorky method name. This DEAD state is stored
     118        # elsewhere too (SegmentFetcher per-share states?) and needs to be
     119        # consistent.
     120        return not self._dead
     121
     122    def _guess_offsets(self, verifycap, guessed_segment_size):
     123        self.guessed_segment_size = guessed_segment_size
     124        size = verifycap.size
     125        k = verifycap.needed_shares
     126        N = verifycap.total_shares
     127        r = self._node._calculate_sizes(guessed_segment_size)
     128        # num_segments, block_size/tail_block_size
     129        # guessed_segment_size/tail_segment_size/tail_segment_padded
     130        share_size = mathutil.div_ceil(size, k)
     131        # share_size is the amount of block data that will be put into each
     132        # share, summed over all segments. It does not include hashes, the
     133        # UEB, or other overhead.
     134
     135        # use the upload-side code to get this as accurate as possible
     136        ht = IncompleteHashTree(N)
     137        num_share_hashes = len(ht.needed_hashes(0, include_leaf=True))
     138        wbp = make_write_bucket_proxy(None, share_size, r["block_size"],
     139                                      r["num_segments"], num_share_hashes, 0,
     140                                      None)
     141        self._fieldsize = wbp.fieldsize
     142        self._fieldstruct = wbp.fieldstruct
     143        self.guessed_offsets = wbp._offsets
     144
     145    # called by our client, the SegmentFetcher
     146    def get_block(self, segnum):
     147        """Add a block number to the list of requests. This will eventually
     148        result in a fetch of the data necessary to validate the block, then
     149        the block itself. The fetch order is generally
     150        first-come-first-served, but requests may be answered out-of-order if
     151        data becomes available sooner.
     152
     153        I return an Observer2, which has two uses. The first is to call
     154        o.subscribe(), which gives me a place to send state changes and
     155        eventually the data block. The second is o.cancel(), which removes
     156        the request (if it is still active).
     157
     158        I will distribute the following events through my Observer2:
     159         - state=OVERDUE: ?? I believe I should have had an answer by now.
     160                          You may want to ask another share instead.
     161         - state=BADSEGNUM: the segnum you asked for is too large. I must
     162                            fetch a valid UEB before I can determine this,
     163                            so the notification is asynchronous
     164         - state=COMPLETE, block=data: here is a valid block
     165         - state=CORRUPT: this share contains corrupted data
     166         - state=DEAD, f=Failure: the server reported an error, this share
     167                                  is unusable
     168        """
     169        log.msg("%s.get_block(%d)" % (repr(self), segnum),
     170                level=log.NOISY, parent=self._lp, umid="RTo9MQ")
     171        assert segnum >= 0
     172        o = Observer2()
     173        o.set_canceler(self._cancel_block_request)
     174        for i,(segnum0,observers) in enumerate(self._requested_blocks):
     175            if segnum0 == segnum:
     176                observers.add(o)
     177                break
     178        else:
     179            self._requested_blocks.append( (segnum, set([o])) )
     180        eventually(self.loop)
     181        return o
     182
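      # Illustrative sketch (not from the patch) of the caller side of get_block();
      # SegmentFetcher._send_new_request/_block_request_activity below do essentially
      # this. 'deliver' is a hypothetical callback supplied by the caller.
      def fetch_block(share, segnum, deliver):
          o = share.get_block(segnum)
          def _activity(state, block=None, f=None):
              if state == COMPLETE:
                  deliver(block)        # 'block' has already been validated
              elif state in (CORRUPT, DEAD, BADSEGNUM):
                  deliver(None)         # give up on this share for this segment
              # OVERDUE is not terminal: keep waiting, but consider asking
              # another share for the same block
          o.subscribe(_activity)
          return o                      # the caller may call o.cancel() later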
     183    def _cancel_block_request(self, o):
     184        new_requests = []
     185        for e in self._requested_blocks:
     186            (segnum0, observers) = e
     187            observers.discard(o)
     188            if observers:
     189                new_requests.append(e)
     190        self._requested_blocks = new_requests
     191
     192    # internal methods
     193    def _active_segnum(self):
     194        if self._requested_blocks:
      195            return self._requested_blocks[0][0] # segnum of the frontmost request
     196        return None
     197
     198    def _active_segnum_and_observers(self):
     199        if self._requested_blocks:
     200            # we only retrieve information for one segment at a time, to
     201            # minimize alacrity (first come, first served)
     202            return self._requested_blocks[0]
     203        return None, []
     204
     205    def loop(self):
     206        try:
     207            # if any exceptions occur here, kill the download
     208            log.msg("%s.loop, reqs=[%s], wanted=%s, requested=%s, received=%s" %
     209                    (repr(self),
     210                     ",".join([str(req[0]) for req in self._requested_blocks]),
     211                     self._wanted.dump(), self._requested.dump(),
     212                     self._received.dump() ),
     213                    level=log.NOISY, parent=self._lp, umid="BaL1zw")
     214            self._do_loop()
     215        except BaseException:
     216            self._fail(Failure())
     217            raise
     218
     219    def _do_loop(self):
     220        # we are (eventually) called after all state transitions:
     221        #  new segments added to self._requested_blocks
     222        #  new data received from servers (responses to our read() calls)
     223        #  impatience timer fires (server appears slow)
     224
     225        # First, consume all of the information that we currently have, for
     226        # all the segments people currently want.
     227        while self._get_satisfaction():
     228            pass
     229
     230        # When we get no satisfaction (from the data we've received so far),
     231        # we determine what data we desire (to satisfy more requests). The
     232        # number of segments is finite, so I can't get no satisfaction
     233        # forever.
     234        self._desire()
     235
     236        # finally send out requests for whatever we need (desire minus have).
     237        # You can't always get what you want, but, sometimes, you get what
     238        # you need.
     239        self._request_needed() # express desire
     240
     241    def _get_satisfaction(self):
     242        # return True if we retired a data block, and should therefore be
     243        # called again. Return False if we don't retire a data block (even if
     244        # we do retire some other data, like hash chains).
     245
     246        if self.actual_offsets is None:
     247            if not self._satisfy_offsets():
     248                # can't even look at anything without the offset table
     249                return False
     250
     251        if not self._node.have_UEB:
     252            if not self._satisfy_UEB():
     253                # can't check any hashes without the UEB
     254                return False
     255
     256        # knowing the UEB means knowing num_segments. Despite the redundancy,
     257        # this is the best place to set this. CommonShare.set_numsegs will
     258        # ignore duplicate calls.
     259        cs = self._commonshare
     260        cs.set_numsegs(self._node.num_segments)
     261
     262        segnum, observers = self._active_segnum_and_observers()
     263        # if segnum is None, we don't really need to do anything (we have no
     264        # outstanding readers right now), but we'll fill in the bits that
     265        # aren't tied to any particular segment.
     266
     267        if segnum is not None and segnum >= self._node.num_segments:
     268            for o in observers:
     269                o.notify(state=BADSEGNUM)
     270            self._requested_blocks.pop(0)
     271            return True
     272
     273        if self._node.share_hash_tree.needed_hashes(self._shnum):
     274            if not self._satisfy_share_hash_tree():
     275                # can't check block_hash_tree without a root
     276                return False
     277
     278        if cs.need_block_hash_root():
     279            block_hash_root = self._node.share_hash_tree.get_leaf(self._shnum)
     280            cs.set_block_hash_root(block_hash_root)
     281
     282        if segnum is None:
     283            return False # we don't want any particular segment right now
     284
     285        # block_hash_tree
     286        needed_hashes = self._commonshare.get_needed_block_hashes(segnum)
     287        if needed_hashes:
     288            if not self._satisfy_block_hash_tree(needed_hashes):
     289                # can't check block without block_hash_tree
     290                return False
     291
     292        # ciphertext_hash_tree
     293        needed_hashes = self._node.get_needed_ciphertext_hashes(segnum)
     294        if needed_hashes:
     295            if not self._satisfy_ciphertext_hash_tree(needed_hashes):
     296                # can't check decoded blocks without ciphertext_hash_tree
     297                return False
     298
     299        # data blocks
     300        return self._satisfy_data_block(segnum, observers)
     301
     302    def _satisfy_offsets(self):
     303        version_s = self._received.get(0, 4)
     304        if version_s is None:
     305            return False
     306        (version,) = struct.unpack(">L", version_s)
     307        if version == 1:
     308            table_start = 0x0c
     309            self._fieldsize = 0x4
     310            self._fieldstruct = "L"
     311        else:
     312            table_start = 0x14
     313            self._fieldsize = 0x8
     314            self._fieldstruct = "Q"
     315        offset_table_size = 6 * self._fieldsize
     316        table_s = self._received.pop(table_start, offset_table_size)
     317        if table_s is None:
     318            return False
     319        fields = struct.unpack(">"+6*self._fieldstruct, table_s)
     320        offsets = {}
     321        for i,field in enumerate(['data',
     322                                  'plaintext_hash_tree', # UNUSED
     323                                  'crypttext_hash_tree',
     324                                  'block_hashes',
     325                                  'share_hashes',
     326                                  'uri_extension',
     327                                  ] ):
     328            offsets[field] = fields[i]
     329        self.actual_offsets = offsets
     330        log.msg("actual offsets: data=%d, plaintext_hash_tree=%d, crypttext_hash_tree=%d, block_hashes=%d, share_hashes=%d, uri_extension=%d" % tuple(fields))
     331        self._received.remove(0, 4) # don't need this anymore
     332        return True
     333
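      # Illustrative sketch (not from the patch): a minimal v1 header that
      # _satisfy_offsets() above would accept. The two 4-byte fields between the
      # version and the offset table are not needed by this parser and are zeroed.
      import struct
      offsets = (36, 0, 100, 200, 300, 400)  # data, plaintext_hash_tree (unused),
                                             # crypttext_hash_tree, block_hashes,
                                             # share_hashes, uri_extension
      header = struct.pack(">L", 1) + "\x00"*8 + struct.pack(">6L", *offsets)
      (version,) = struct.unpack(">L", header[0:4])
      table = struct.unpack(">6L", header[0x0c:0x0c+6*4])
      assert version == 1 and table == offsets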
     334    def _satisfy_UEB(self):
     335        o = self.actual_offsets
     336        fsize = self._fieldsize
     337        UEB_length_s = self._received.get(o["uri_extension"], fsize)
     338        if not UEB_length_s:
     339            return False
     340        (UEB_length,) = struct.unpack(">"+self._fieldstruct, UEB_length_s)
     341        UEB_s = self._received.pop(o["uri_extension"]+fsize, UEB_length)
     342        if not UEB_s:
     343            return False
     344        self._received.remove(o["uri_extension"], fsize)
     345        try:
     346            self._node.validate_and_store_UEB(UEB_s)
     347            self.actual_segment_size = self._node.segment_size
     348            assert self.actual_segment_size is not None
     349            return True
     350        except BadHashError:
     351            # TODO: if this UEB was bad, we'll keep trying to validate it
     352            # over and over again. Only log.err on the first one, or better
     353            # yet skip all but the first
     354            f = Failure()
     355            self._signal_corruption(f, o["uri_extension"], fsize+UEB_length)
     356            return False
     357
     358    def _satisfy_share_hash_tree(self):
     359        # the share hash chain is stored as (hashnum,hash) tuples, so you
     360        # can't fetch just the pieces you need, because you don't know
     361        # exactly where they are. So fetch everything, and parse the results
     362        # later.
     363        o = self.actual_offsets
     364        hashlen = o["uri_extension"] - o["share_hashes"]
     365        assert hashlen % (2+HASH_SIZE) == 0
     366        hashdata = self._received.get(o["share_hashes"], hashlen)
     367        if not hashdata:
     368            return False
     369        share_hashes = {}
     370        for i in range(0, hashlen, 2+HASH_SIZE):
     371            (hashnum,) = struct.unpack(">H", hashdata[i:i+2])
     372            hashvalue = hashdata[i+2:i+2+HASH_SIZE]
     373            share_hashes[hashnum] = hashvalue
     374        try:
     375            self._node.process_share_hashes(share_hashes)
     376            # adds to self._node.share_hash_tree
     377            self._received.remove(o["share_hashes"], hashlen)
     378            return True
     379        except (BadHashError, NotEnoughHashesError, IndexError):
     380            f = Failure()
     381            self._signal_corruption(f, o["share_hashes"], hashlen)
     382            return False
     383
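      # Illustrative sketch (not from the patch): the share-hash-chain wire format
      # parsed above is a flat concatenation of (hashnum, hash) records, each
      # 2+HASH_SIZE bytes long.
      import struct
      records = {3: "\x03"*HASH_SIZE, 7: "\x07"*HASH_SIZE}
      hashdata = "".join([struct.pack(">H", n) + h for n, h in sorted(records.items())])
      parsed = {}
      for i in range(0, len(hashdata), 2+HASH_SIZE):
          (hashnum,) = struct.unpack(">H", hashdata[i:i+2])
          parsed[hashnum] = hashdata[i+2:i+2+HASH_SIZE]
      assert parsed == records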
     384    def _signal_corruption(self, f, start, offset):
     385        # there was corruption somewhere in the given range
     386        reason = "corruption in share[%d-%d): %s" % (start, start+offset,
     387                                                     str(f.value))
     388        self._rref.callRemoteOnly("advise_corrupt_share", "immutable",
     389                                  self._storage_index, self._shnum, reason)
     390
     391    def _satisfy_block_hash_tree(self, needed_hashes):
     392        o_bh = self.actual_offsets["block_hashes"]
     393        block_hashes = {}
     394        for hashnum in needed_hashes:
     395            hashdata = self._received.get(o_bh+hashnum*HASH_SIZE, HASH_SIZE)
     396            if hashdata:
     397                block_hashes[hashnum] = hashdata
     398            else:
     399                return False # missing some hashes
     400        # note that we don't submit any hashes to the block_hash_tree until
     401        # we've gotten them all, because the hash tree will throw an
     402        # exception if we only give it a partial set (which it therefore
     403        # cannot validate)
     404        commonshare = self._commonshare
     405        ok = commonshare.process_block_hashes(block_hashes, self._peerid_s)
     406        if not ok:
     407            return False
     408        for hashnum in needed_hashes:
     409            self._received.remove(o_bh+hashnum*HASH_SIZE, HASH_SIZE)
     410        return True
     411
     412    def _satisfy_ciphertext_hash_tree(self, needed_hashes):
     413        start = self.actual_offsets["crypttext_hash_tree"]
     414        ciphertext_hashes = {}
     415        for hashnum in needed_hashes:
     416            hashdata = self._received.get(start+hashnum*HASH_SIZE, HASH_SIZE)
     417            if hashdata:
     418                ciphertext_hashes[hashnum] = hashdata
     419            else:
     420                return False # missing some hashes
     421        # we don't submit any hashes to the ciphertext_hash_tree until we've
     422        # gotten them all
     423        ok = self._node.process_ciphertext_hashes(ciphertext_hashes,
     424                                                  self._shnum, self._peerid_s)
     425        if not ok:
     426            return False
     427        for hashnum in needed_hashes:
     428            self._received.remove(start+hashnum*HASH_SIZE, HASH_SIZE)
     429        return True
     430
     431    def _satisfy_data_block(self, segnum, observers):
     432        tail = (segnum == self._node.num_segments-1)
     433        datastart = self.actual_offsets["data"]
     434        blockstart = datastart + segnum * self._node.block_size
     435        blocklen = self._node.block_size
     436        if tail:
     437            blocklen = self._node.tail_block_size
     438
     439        block = self._received.pop(blockstart, blocklen)
     440        if not block:
     441            return False
     442        log.msg(format="%(share)s._satisfy_data_block, len(block)=%(blocklen)d",
     443                share=repr(self),
     444                blocklen=len(block),
     445                level=log.NOISY, parent=self._lp, umid="uTDNZg")
     446        # we removed the block from _received, but don't retain the data in
     447        # our Node or CommonShare, so also remove it from _requested: this
     448        # lets us ask for it again in a later download which uses this same
     449        # Share object.
     450        self._requested.remove(blockstart, blocklen)
     451        # this block is being retired, either as COMPLETE or CORRUPT, since
     452        # no further data reads will help
     453        assert self._requested_blocks[0][0] == segnum
     454        commonshare = self._commonshare
     455        ok = commonshare.check_block(segnum, block, self._peerid_s)
     456        if ok:
     457            for o in observers:
     458                # goes to SegmentFetcher._block_request_activity
     459                o.notify(state=COMPLETE, block=block)
     460        else:
     461            for o in observers:
     462                o.notify(state=CORRUPT)
     463        self._requested_blocks.pop(0) # retired
     464        # popping the request keeps us from turning around and wanting the
     465        # block again right away
     466        return True # got satisfaction
     467
     468    def _desire(self):
     469        segnum, observers = self._active_segnum_and_observers() # maybe None
     470        commonshare = self._commonshare
     471
     472        if not self.actual_offsets:
     473            self._desire_offsets()
     474
     475        # we can use guessed offsets as long as this server tolerates overrun
     476        if not self.actual_offsets and not self._overrun_ok:
     477            return # must wait for the offsets to arrive
     478
     479        o = self.actual_offsets or self.guessed_offsets
     480        segsize = self.actual_segment_size or self.guessed_segment_size
     481        if not self._node.have_UEB:
     482            self._desire_UEB(o)
     483
     484        if self._node.share_hash_tree.needed_hashes(self._shnum):
     485            hashlen = o["uri_extension"] - o["share_hashes"]
     486            self._wanted.add(o["share_hashes"], hashlen)
     487
     488        if segnum is None:
     489            return # I have achieved Zen: I desire nothing.
     490
     491        # block hash chain
     492        for hashnum in commonshare.get_needed_block_hashes(segnum):
     493            self._wanted.add(o["block_hashes"]+hashnum*HASH_SIZE, HASH_SIZE)
     494
     495        # ciphertext hash chain
     496        for hashnum in self._node.get_needed_ciphertext_hashes(segnum):
     497            self._wanted.add(o["crypttext_hash_tree"]+hashnum*HASH_SIZE, HASH_SIZE)
     498
     499        # data
     500        r = self._node._calculate_sizes(segsize)
      501        tail = (segnum == r["num_segments"]-1)
     502        datastart = o["data"]
     503        blockstart = datastart + segnum * r["block_size"]
     504        blocklen = r["block_size"]
     505        if tail:
     506            blocklen = r["tail_block_size"]
     507        self._wanted.add(blockstart, blocklen)
     508        #log.msg("end _desire: wanted=%s" % (self._wanted.dump(),))
     509
     510    def _desire_offsets(self):
     511        if self._overrun_ok:
     512            # easy! this includes version number, sizes, and offsets
     513            self._wanted.add(0,1024)
     514            return
     515
     516        # v1 has an offset table that lives [0x0,0x24). v2 lives [0x0,0x44).
     517        # To be conservative, only request the data that we know lives there,
     518        # even if that means more roundtrips.
     519
     520        self._wanted.add(0,4)  # version number, always safe
     521        version_s = self._received.get(0, 4)
     522        if not version_s:
     523            return
     524        (version,) = struct.unpack(">L", version_s)
     525        if version == 1:
     526            table_start = 0x0c
     527            fieldsize = 0x4
     528        else:
     529            table_start = 0x14
     530            fieldsize = 0x8
     531        offset_table_size = 6 * fieldsize
     532        self._wanted.add(table_start, offset_table_size)
     533
     534    def _desire_UEB(self, o):
     535        # UEB data is stored as (length,data).
     536        if self._overrun_ok:
     537            # We can pre-fetch 2kb, which should probably cover it. If it
     538            # turns out to be larger, we'll come back here later with a known
     539            # length and fetch the rest.
     540            self._wanted.add(o["uri_extension"], 2048)
     541            # now, while that is probably enough to fetch the whole UEB, it
     542            # might not be, so we need to do the next few steps as well. In
     543            # most cases, the following steps will not actually add anything
     544            # to self._wanted
     545
     546        self._wanted.add(o["uri_extension"], self._fieldsize)
     547        # only use a length if we're sure it's correct, otherwise we'll
     548        # probably fetch a huge number
     549        if not self.actual_offsets:
     550            return
     551        UEB_length_s = self._received.get(o["uri_extension"], self._fieldsize)
     552        if UEB_length_s:
     553            (UEB_length,) = struct.unpack(">"+self._fieldstruct, UEB_length_s)
     554            # we know the length, so make sure we grab everything
     555            self._wanted.add(o["uri_extension"]+self._fieldsize, UEB_length)
     556
     557    def _request_needed(self):
     558        received = self._received.get_spans()
     559        ask = self._wanted - self._requested - received
      560        self._send_requests(ask) # _wanted is pruned as the data arrives (see _got_data)
     561        # XXX then send requests for data blocks. All the hashes should
     562        # arrive before the blocks, so the blocks can be consumed and
     563        # released in a single turn. TODO: I removed this for simplicity.
     564        # Reconsider the removal: maybe bring it back.
     565
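      # Illustrative sketch (not from the patch) of the "desire minus have"
      # arithmetic above:
      from allmydata.util.spans import Spans, DataSpans
      wanted, requested, received = Spans(), Spans(), DataSpans()
      wanted.add(0, 1024)          # desire the whole header region
      requested.add(0, 4)          # the version field is already in flight
      received.add(4, "\x00"*8)    # and bytes [4:12) have already arrived
      ask = wanted - requested - received.get_spans()
      # 'ask' now covers only [12:1024): wanted, but neither in flight nor held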
     566    def _send_requests(self, needed):
     567        for (start, length) in needed:
     568            # TODO: quantize to reasonably-large blocks
     569            self._requested.add(start, length)
     570            lp = log.msg(format="%(share)s._send_request"
     571                         " [%(start)d:+%(length)d]",
     572                         share=repr(self),
     573                         start=start, length=length,
     574                         level=log.NOISY, parent=self._lp, umid="sgVAyA")
     575            d = self._send_request(start, length)
     576            d.addCallback(self._got_data, start, length, lp)
     577            d.addErrback(self._got_error, start, length, lp)
     578            d.addCallback(incidentally, eventually, self.loop)
     579            d.addErrback(lambda f:
     580                         log.err(format="unhandled error during send_request",
     581                                 failure=f, parent=self._lp,
     582                                 level=log.WEIRD, umid="qZu0wg"))
     583
     584    def _send_request(self, start, length):
     585        return self._rref.callRemote("read", start, length)
     586
     587    def _got_data(self, data, start, length, lp):
     588        log.msg(format="%(share)s._got_data [%(start)d:+%(length)d] -> %(datalen)d",
     589                share=repr(self), start=start, length=length, datalen=len(data),
     590                level=log.NOISY, parent=lp, umid="sgVAyA")
     591        span = (start, length)
     592        assert span in self._requested # XXX eh, not important
     593        self._received.add(start, data)
     594        self._wanted.remove(start, length)
     595
     596    def _got_error(self, f, start, length, lp):
     597        log.msg(format="error requesting %(start)d+%(length)d"
     598                " from %(server)s for si %(si)s",
     599                start=start, length=length,
     600                server=self._peerid_s, si=self._si_prefix,
     601                failure=f, parent=lp, level=log.UNUSUAL, umid="qZu0wg")
     602        # retire our observers, assuming we won't be able to make any
     603        # further progress
     604        self._fail(f)
     605
     606    def _fail(self, f):
     607        log.msg(format="abandoning %(share)s",
     608                share=repr(self), failure=f,
     609                level=log.UNUSUAL, parent=self._lp, umid="JKM2Og")
     610        self._dead = True
     611        for (segnum, observers) in self._requested_blocks:
     612            for o in observers:
     613                o.notify(state=DEAD, f=f)
     614
     615
     616class CommonShare:
     617    """I hold data that is common across all instances of a single share,
     618    like sh2 on both servers A and B. This is just the block hash tree.
     619    """
     620    def __init__(self, guessed_numsegs, si_prefix, shnum, logparent):
     621        self.si_prefix = si_prefix
     622        self.shnum = shnum
     623        # in the beginning, before we have the real UEB, we can only guess at
     624        # the number of segments. But we want to ask for block hashes early.
     625        # So if we're asked for which block hashes are needed before we know
     626        # numsegs for sure, we return a guess.
     627        self._block_hash_tree = IncompleteHashTree(guessed_numsegs)
     628        self._know_numsegs = False
     629        self._logparent = logparent
     630
     631    def set_numsegs(self, numsegs):
     632        if self._know_numsegs:
     633            return
     634        self._block_hash_tree = IncompleteHashTree(numsegs)
     635        self._know_numsegs = True
     636
     637    def need_block_hash_root(self):
     638        return bool(not self._block_hash_tree[0])
     639
     640    def set_block_hash_root(self, roothash):
     641        assert self._know_numsegs
     642        self._block_hash_tree.set_hashes({0: roothash})
     643
     644    def get_needed_block_hashes(self, segnum):
     645        needed = ",".join([str(n) for n in sorted(self._block_hash_tree.needed_hashes(segnum))])
     646        log.msg("CommonShare.get_needed_block_hashes: segnum=%d needs %s" %
     647                (segnum, needed),
     648                level=log.NOISY, parent=self._logparent, umid="6qTMnw")
     649        # XXX: include_leaf=True needs thought: how did the old downloader do
     650        # it? I think it grabbed *all* block hashes and set them all at once.
     651        # Since we want to fetch less data, we either need to fetch the leaf
     652        # too, or wait to set the block hashes until we've also received the
     653        # block itself, so we can hash it too, and set the chain+leaf all at
     654        # the same time.
     655        return self._block_hash_tree.needed_hashes(segnum, include_leaf=True)
     656
     657    def process_block_hashes(self, block_hashes, serverid_s):
     658        assert self._know_numsegs
     659        try:
     660            self._block_hash_tree.set_hashes(block_hashes)
     661            return True
     662        except (BadHashError, NotEnoughHashesError):
     663            hashnums = ",".join([str(n) for n in sorted(block_hashes.keys())])
     664            log.msg(format="hash failure in block_hashes=(%(hashnums)s),"
     665                    " shnum=%(shnum)d SI=%(si)s server=%(server)s",
     666                    hashnums=hashnums, shnum=self.shnum,
     667                    si=self.si_prefix, server=serverid_s, failure=Failure(),
     668                    level=log.WEIRD, parent=self._logparent, umid="yNyFdA")
     669        return False
     670
     671    def check_block(self, segnum, block, serverid_s):
     672        assert self._know_numsegs
     673        h = hashutil.block_hash(block)
     674        try:
     675            self._block_hash_tree.set_hashes(leaves={segnum: h})
     676            return True
     677        except (BadHashError, NotEnoughHashesError):
      678            log.msg(format="hash failure in block %(segnum)d,"
     679                     " shnum=%(shnum)d SI=%(si)s server=%(server)s",
     680                     segnum=segnum, shnum=self.shnum, si=self.si_prefix,
     681                     server=serverid_s, failure=Failure(),
     682                     level=log.WEIRD, parent=self._logparent, umid="mZjkqA")
     683        return False
     684
     685# all classes are also Services, and the rule is that you don't initiate more
     686# work unless self.running
     687
     688# GC: decide whether each service is restartable or not. For non-restartable
     689# services, stopService() should delete a lot of attributes to kill reference
     690# cycles. The primary goal is to decref remote storage BucketReaders when a
     691# download is complete.
     692
     693class SegmentFetcher:
     694    """I am responsible for acquiring blocks for a single segment. I will use
     695    the Share instances passed to my add_shares() method to locate, retrieve,
     696    and validate those blocks. I expect my parent node to call my
     697    no_more_shares() method when there are no more shares available. I will
     698    call my parent's want_more_shares() method when I want more: I expect to
     699    see at least one call to add_shares or no_more_shares afterwards.
     700
     701    When I have enough validated blocks, I will call my parent's
     702    process_blocks() method with a dictionary that maps shnum to blockdata.
     703    If I am unable to provide enough blocks, I will call my parent's
     704    fetch_failed() method with (self, f). After either of these events, I
     705    will shut down and do no further work. My parent can also call my stop()
     706    method to have me shut down early."""
     707
     708    def __init__(self, node, segnum, k):
     709        self._node = node # _Node
     710        self.segnum = segnum
     711        self._k = k
     712        self._shares = {} # maps non-dead Share instance to a state, one of
     713                          # (AVAILABLE, PENDING, OVERDUE, COMPLETE, CORRUPT).
     714                          # State transition map is:
     715                          #  AVAILABLE -(send-read)-> PENDING
     716                          #  PENDING -(timer)-> OVERDUE
     717                          #  PENDING -(rx)-> COMPLETE, CORRUPT, DEAD, BADSEGNUM
     718                          #  OVERDUE -(rx)-> COMPLETE, CORRUPT, DEAD, BADSEGNUM
     719                          # If a share becomes DEAD, it is removed from the
     720                          # dict. If it becomes BADSEGNUM, the whole fetch is
     721                          # terminated.
     722        self._share_observers = {} # maps Share to Observer2 for active ones
     723        self._shnums = DictOfSets() # maps shnum to the shares that provide it
     724        self._blocks = {} # maps shnum to validated block data
     725        self._no_more_shares = False
     726        self._bad_segnum = False
     727        self._last_failure = None
     728        self._running = True
     729
     730    def stop(self):
     731        log.msg("SegmentFetcher(%s).stop" % self._node._si_prefix,
     732                level=log.NOISY, umid="LWyqpg")
     733        self._cancel_all_requests()
     734        self._running = False
     735        self._shares.clear() # let GC work # ??? XXX
     736
     737
     738    # called by our parent _Node
     739
     740    def add_shares(self, shares):
     741        # called when ShareFinder locates a new share, and when a non-initial
     742        # segment fetch is started and we already know about shares from the
     743        # previous segment
     744        for s in shares:
     745            self._shares[s] = AVAILABLE
     746            self._shnums.add(s._shnum, s)
     747        eventually(self.loop)
     748
     749    def no_more_shares(self):
     750        # ShareFinder tells us it's reached the end of its list
     751        self._no_more_shares = True
     752        eventually(self.loop)
     753
     754    # internal methods
     755
     756    def _count_shnums(self, *states):
      757        """Return the number of shnums for which at least one share is in one of the given states."""
     758        shnums = []
     759        for shnum,shares in self._shnums.iteritems():
     760            matches = [s for s in shares if self._shares.get(s) in states]
     761            if matches:
     762                shnums.append(shnum)
     763        return len(shnums)
     764
     765    def loop(self):
     766        try:
     767            # if any exception occurs here, kill the download
     768            self._do_loop()
     769        except BaseException:
     770            self._node.fetch_failed(self, Failure())
     771            raise
     772
     773    def _do_loop(self):
     774        k = self._k
     775        if not self._running:
     776            return
     777        if self._bad_segnum:
     778            # oops, we were asking for a segment number beyond the end of the
     779            # file. This is an error.
     780            self.stop()
     781            e = BadSegmentNumberError("%d > %d" % (self.segnum,
     782                                                   self._node.num_segments))
     783            f = Failure(e)
     784            self._node.fetch_failed(self, f)
     785            return
     786
     787        # are we done?
     788        if self._count_shnums(COMPLETE) >= k:
     789            # yay!
     790            self.stop()
     791            self._node.process_blocks(self.segnum, self._blocks)
     792            return
     793
     794        # we may have exhausted everything
     795        if (self._no_more_shares and
     796            self._count_shnums(AVAILABLE, PENDING, OVERDUE, COMPLETE) < k):
     797            # no more new shares are coming, and the remaining hopeful shares
     798            # aren't going to be enough. boo!
     799
     800            log.msg("share states: %r" % (self._shares,),
     801                    level=log.NOISY, umid="0ThykQ")
     802            if self._count_shnums(AVAILABLE, PENDING, OVERDUE, COMPLETE) == 0:
     803                format = ("no shares (need %(k)d)."
     804                          " Last failure: %(last_failure)s")
     805                args = { "k": k,
     806                         "last_failure": self._last_failure }
     807                error = NoSharesError
     808            else:
     809                format = ("ran out of shares: %(complete)d complete,"
     810                          " %(pending)d pending, %(overdue)d overdue,"
     811                          " %(unused)d unused, need %(k)d."
     812                          " Last failure: %(last_failure)s")
     813                args = {"complete": self._count_shnums(COMPLETE),
     814                        "pending": self._count_shnums(PENDING),
     815                        "overdue": self._count_shnums(OVERDUE),
     816                        # 'unused' should be zero
     817                        "unused": self._count_shnums(AVAILABLE),
     818                        "k": k,
     819                        "last_failure": self._last_failure,
     820                        }
     821                error = NotEnoughSharesError
     822            log.msg(format=format, level=log.UNUSUAL, umid="1DsnTg", **args)
     823            e = error(format % args)
     824            f = Failure(e)
     825            self.stop()
     826            self._node.fetch_failed(self, f)
     827            return
     828
     829        # nope, not done. Are we "block-hungry" (i.e. do we want to send out
     830        # more read requests, or do we think we have enough in flight
     831        # already?)
     832        while self._count_shnums(PENDING, COMPLETE) < k:
     833            # we're hungry.. are there any unused shares?
     834            sent = self._send_new_request()
     835            if not sent:
     836                break
     837
     838        # ok, now are we "share-hungry" (i.e. do we have enough known shares
     839        # to make us happy, or should we ask the ShareFinder to get us more?)
     840        if self._count_shnums(AVAILABLE, PENDING, COMPLETE) < k:
     841            # we're hungry for more shares
     842            self._node.want_more_shares()
     843            # that will trigger the ShareFinder to keep looking
     844
     845    def _find_one(self, shares, state):
     846        # TODO could choose fastest
     847        for s in shares:
     848            if self._shares[s] == state:
     849                return s
     850        raise IndexError("shouldn't get here")
     851
     852    def _send_new_request(self):
     853        for shnum,shares in self._shnums.iteritems():
     854            states = [self._shares[s] for s in shares]
     855            if COMPLETE in states or PENDING in states:
     856                # don't send redundant requests
     857                continue
     858            if AVAILABLE not in states:
     859                # no candidates for this shnum, move on
     860                continue
     861            # here's a candidate. Send a request.
     862            s = self._find_one(shares, AVAILABLE)
     863            self._shares[s] = PENDING
     864            self._share_observers[s] = o = s.get_block(self.segnum)
     865            o.subscribe(self._block_request_activity, share=s, shnum=shnum)
     866            # TODO: build up a list of candidates, then walk through the
      867            # list, sending requests to the most desirable servers,
     868            # re-checking our block-hunger each time. For non-initial segment
     869            # fetches, this would let us stick with faster servers.
     870            return True
     871        # nothing was sent: don't call us again until you have more shares to
     872        # work with, or one of the existing shares has been declared OVERDUE
     873        return False
     874
     875    def _cancel_all_requests(self):
     876        for o in self._share_observers.values():
     877            o.cancel()
     878        self._share_observers = {}
     879
     880    def _block_request_activity(self, share, shnum, state, block=None, f=None):
      881        # called by Shares, in response to our s.get_block() subscriptions.
     882        log.msg("SegmentFetcher(%s)._block_request_activity:"
     883                " Share(sh%d-on-%s) -> %s" %
     884                (self._node._si_prefix, shnum, share._peerid_s, state),
     885                level=log.NOISY, umid="vilNWA")
     886        # COMPLETE, CORRUPT, DEAD, BADSEGNUM are terminal.
     887        if state in (COMPLETE, CORRUPT, DEAD, BADSEGNUM):
     888            self._share_observers.pop(share, None)
     889        if state is COMPLETE:
     890            # 'block' is fully validated
     891            self._shares[share] = COMPLETE
     892            self._blocks[shnum] = block
     893        elif state is OVERDUE:
     894            self._shares[share] = OVERDUE
     895            # OVERDUE is not terminal: it will eventually transition to
     896            # COMPLETE, CORRUPT, or DEAD.
     897        elif state is CORRUPT:
     898            self._shares[share] = CORRUPT
     899        elif state is DEAD:
     900            del self._shares[share]
     901            self._shnums[shnum].remove(share)
     902            self._last_failure = f
     903        elif state is BADSEGNUM:
     904            self._shares[share] = BADSEGNUM # ???
     905            self._bad_segnum = True
     906        eventually(self.loop)
     907
     908
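      # Illustrative sketch (not from the patch) of the parent-side contract that
      # SegmentFetcher's docstring describes; the real parent is the download node.
      # All names in this stub are hypothetical.
      class StubParent:
          _si_prefix = "stub"
          num_segments = 1
          def want_more_shares(self):
              pass  # would prod the ShareFinder, which later calls add_shares()
          def process_blocks(self, segnum, blocks):
              pass  # 'blocks' maps shnum -> validated block data, ready to decode
          def fetch_failed(self, fetcher, f):
              pass  # 'f' is a Failure; this segment fetch has been abandoned
      # usage: fetcher = SegmentFetcher(StubParent(), segnum=0, k=3)
      #        fetcher.add_shares(shares); fetcher.no_more_shares()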
     909class RequestToken:
     910    def __init__(self, peerid):
     911        self.peerid = peerid
     912
     913class ShareFinder:
     914    def __init__(self, storage_broker, verifycap, node, logparent=None,
     915                 max_outstanding_requests=10):
      916        self.running = True # cleared by stop(), which the Terminator calls at shutdown
     917        self.verifycap = verifycap
     918        self._started = False
     919        self._storage_broker = storage_broker
     920        self.share_consumer = self.node = node
     921        self.max_outstanding_requests = max_outstanding_requests
     922
     923        self._hungry = False
     924
     925        self._commonshares = {} # shnum to CommonShare instance
     926        self.undelivered_shares = []
     927        self.pending_requests = set()
     928
     929        self._storage_index = verifycap.storage_index
     930        self._si_prefix = base32.b2a_l(self._storage_index[:8], 60)
     931        self._node_logparent = logparent
     932        self._lp = log.msg(format="ShareFinder[si=%(si)s] starting",
     933                           si=self._si_prefix,
     934                           level=log.NOISY, parent=logparent, umid="2xjj2A")
     935
     936    def start_finding_servers(self):
     937        # don't get servers until somebody uses us: creating the
     938        # ImmutableFileNode should not cause work to happen yet. Test case is
     939        # test_dirnode, which creates us with storage_broker=None
     940        if not self._started:
     941            si = self.verifycap.storage_index
     942            s = self._storage_broker.get_servers_for_index(si)
     943            self._servers = iter(s)
     944            self._started = True
     945
     946    def log(self, *args, **kwargs):
     947        if "parent" not in kwargs:
     948            kwargs["parent"] = self._lp
     949        return log.msg(*args, **kwargs)
     950
     951    def stop(self):
     952        self.running = False
     953
     954    # called by our parent CiphertextDownloader
     955    def hungry(self):
     956        self.log(format="ShareFinder[si=%(si)s] hungry",
     957                 si=self._si_prefix, level=log.NOISY, umid="NywYaQ")
     958        self.start_finding_servers()
     959        self._hungry = True
     960        eventually(self.loop)
     961
     962    # internal methods
     963    def loop(self):
     964        undelivered_s = ",".join(["sh%d@%s" %
     965                                  (s._shnum, idlib.shortnodeid_b2a(s._peerid))
     966                                  for s in self.undelivered_shares])
     967        pending_s = ",".join([idlib.shortnodeid_b2a(rt.peerid)
     968                              for rt in self.pending_requests]) # sort?
     969        self.log(format="ShareFinder loop: running=%(running)s"
     970                 " hungry=%(hungry)s, undelivered=%(undelivered)s,"
     971                 " pending=%(pending)s",
     972                 running=self.running, hungry=self._hungry,
     973                 undelivered=undelivered_s, pending=pending_s,
     974                 level=log.NOISY, umid="kRtS4Q")
     975        if not self.running:
     976            return
     977        if not self._hungry:
     978            return
     979        if self.undelivered_shares:
     980            sh = self.undelivered_shares.pop(0)
     981            # they will call hungry() again if they want more
     982            self._hungry = False
     983            self.log(format="delivering Share(shnum=%(shnum)d, server=%(peerid)s)",
     984                     shnum=sh._shnum, peerid=sh._peerid_s,
     985                     level=log.NOISY, umid="2n1qQw")
     986            eventually(self.share_consumer.got_shares, [sh])
     987            return
     988        if len(self.pending_requests) >= self.max_outstanding_requests:
     989            # cannot send more requests, must wait for some to retire
     990            return
     991
     992        server = None
     993        try:
     994            if self._servers:
     995                server = self._servers.next()
     996        except StopIteration:
     997            self._servers = None
     998
     999        if server:
     1000            self.send_request(server)
     1001            return
     1002
     1003        if self.pending_requests:
     1004            # no server, but there are still requests in flight: maybe one of
     1005            # them will make progress
     1006            return
     1007
     1008        self.log(format="ShareFinder.loop: no_more_shares, ever",
     1009                 level=log.UNUSUAL, umid="XjQlzg")
     1010        # we've run out of servers (so we can't send any more requests), and
     1011        # we have nothing in flight. No further progress can be made. They
     1012        # are destined to remain hungry.
     1013        self.share_consumer.no_more_shares()
     1014
     1015    def send_request(self, server):
     1016        peerid, rref = server
     1017        req = RequestToken(peerid)
     1018        self.pending_requests.add(req)
     1019        lp = self.log(format="sending DYHB to [%(peerid)s]",
     1020                      peerid=idlib.shortnodeid_b2a(peerid),
     1021                      level=log.NOISY, umid="Io7pyg")
     1022        d = rref.callRemote("get_buckets", self._storage_index)
     1023        d.addBoth(incidentally, self.pending_requests.discard, req)
     1024        d.addCallbacks(self._got_response, self._got_error,
     1025                       callbackArgs=(rref.version, peerid, req, lp),
     1026                       errbackArgs=(peerid, req, lp))
     1027        d.addErrback(log.err, format="error in send_request",
     1028                     level=log.WEIRD, parent=lp, umid="rpdV0w")
     1029        d.addCallback(incidentally, eventually, self.loop)
     1030
     1031    def _got_response(self, buckets, server_version, peerid, req, lp):
     1032        if buckets:
     1033            shnums_s = ",".join([str(shnum) for shnum in buckets])
     1034            self.log(format="got shnums [%(shnums)s] from [%(peerid)s]",
     1035                     shnums=shnums_s, peerid=idlib.shortnodeid_b2a(peerid),
     1036                     level=log.NOISY, parent=lp, umid="0fcEZw")
     1037        else:
     1038            self.log(format="no shares from [%(peerid)s]",
     1039                     peerid=idlib.shortnodeid_b2a(peerid),
     1040                     level=log.NOISY, parent=lp, umid="U7d4JA")
     1041        if self.node.num_segments is None:
     1042            best_numsegs = self.node.guessed_num_segments
     1043        else:
     1044            best_numsegs = self.node.num_segments
     1045        for shnum, bucket in buckets.iteritems():
     1046            if shnum in self._commonshares:
     1047                cs = self._commonshares[shnum]
     1048            else:
     1049                cs = CommonShare(best_numsegs, self._si_prefix, shnum,
     1050                                 self._node_logparent)
      1051                # Share._get_satisfaction is responsible for calling
      1052                # CommonShare.set_numsegs once we know the UEB. Alternatives:
     1053                #  1: d = self.node.get_num_segments()
     1054                #     d.addCallback(cs.got_numsegs)
     1055                #   the problem is that the OneShotObserverList I was using
     1056                #   inserts an eventual-send between _get_satisfaction's
     1057                #   _satisfy_UEB and _satisfy_block_hash_tree, and the
     1058                #   CommonShare didn't get the num_segs message before
     1059                #   being asked to set block hash values. To resolve this
     1060                #   would require an immediate ObserverList instead of
      1061                #   an eventual-send-based one
     1062                #  2: break _get_satisfaction into Deferred-attached pieces.
     1063                #     Yuck.
     1064                self._commonshares[shnum] = cs
     1065            s = Share(bucket, server_version, self.verifycap, cs, self.node,
     1066                      peerid, shnum, self._node_logparent)
     1067            self.undelivered_shares.append(s)
     1068
     1069    def _got_error(self, f, peerid, req, lp):
     1070        self.log(format="got error from [%(peerid)s]",
     1071                 peerid=idlib.shortnodeid_b2a(peerid), failure=f,
     1072                 level=log.UNUSUAL, parent=lp, umid="zUKdCw")
     1073
     1074
     1075
     1076class Segmentation:
     1077    """I am responsible for a single offset+size read of the file. I handle
     1078    segmentation: I figure out which segments are necessary, request them
     1079    (from my CiphertextDownloader) in order, and trim the segments down to
     1080    match the offset+size span. I use the Producer/Consumer interface to only
     1081    request one segment at a time.
     1082    """
     1083    implements(IPushProducer)
     1084    def __init__(self, node, offset, size, consumer, logparent=None):
     1085        self._node = node
     1086        self._hungry = True
     1087        self._active_segnum = None
     1088        self._cancel_segment_request = None
     1089        # these are updated as we deliver data. At any given time, we still
     1090        # want to download file[offset:offset+size]
     1091        self._offset = offset
     1092        self._size = size
     1093        self._consumer = consumer
     1094        self._lp = logparent
     1095
     1096    def start(self):
     1097        self._alive = True
     1098        self._deferred = defer.Deferred()
     1099        self._consumer.registerProducer(self, True)
     1100        self._maybe_fetch_next()
     1101        return self._deferred
     1102
     1103    def _maybe_fetch_next(self):
     1104        if not self._alive or not self._hungry:
     1105            return
     1106        if self._active_segnum is not None:
     1107            return
     1108        self._fetch_next()
     1109
     1110    def _fetch_next(self):
     1111        if self._size == 0:
     1112            # done!
     1113            self._alive = False
     1114            self._hungry = False
     1115            self._consumer.unregisterProducer()
     1116            self._deferred.callback(self._consumer)
     1117            return
     1118        n = self._node
     1119        have_actual_segment_size = n.segment_size is not None
     1120        segment_size = n.segment_size or n.guessed_segment_size
     1121        if self._offset == 0:
     1122            # great! we want segment0 for sure
     1123            wanted_segnum = 0
     1124        else:
     1125            # this might be a guess
     1126            wanted_segnum = self._offset // segment_size
     1127        log.msg(format="_fetch_next(offset=%(offset)d) wants segnum=%(segnum)d",
     1128                offset=self._offset, segnum=wanted_segnum,
     1129                level=log.NOISY, parent=self._lp, umid="5WfN0w")
     1130        self._active_segnum = wanted_segnum
     1131        d,c = n.get_segment(wanted_segnum, self._lp)
     1132        self._cancel_segment_request = c
     1133        d.addBoth(self._request_retired)
     1134        d.addCallback(self._got_segment, have_actual_segment_size,
     1135                      wanted_segnum)
     1136        d.addErrback(self._retry_bad_segment, have_actual_segment_size)
     1137        d.addErrback(self._error)
     1138
     1139    def _request_retired(self, res):
     1140        self._active_segnum = None
     1141        self._cancel_segment_request = None
     1142        return res
     1143
     1144    def _got_segment(self, (segment_start,segment), had_actual_segment_size,
     1145                     wanted_segnum):
     1146        self._cancel_segment_request = None
     1147        # we got file[segment_start:segment_start+len(segment)]
     1148        # we want file[self._offset:self._offset+self._size]
     1149        log.msg(format="Segmentation got data:"
     1150                " want [%(wantstart)d-%(wantend)d),"
     1151                " given [%(segstart)d-%(segend)d), for segnum=%(segnum)d",
     1152                wantstart=self._offset, wantend=self._offset+self._size,
     1153                segstart=segment_start, segend=segment_start+len(segment),
     1154                segnum=wanted_segnum,
     1155                level=log.OPERATIONAL, parent=self._lp, umid="32dHcg")
     1156
     1157        o = overlap(segment_start, len(segment),  self._offset, self._size)
     1158        # the overlap is file[o[0]:o[0]+o[1]]
     1159        if not o or o[0] != self._offset:
     1160            # we didn't get the first byte, so we can't use this segment
      1161            if had_actual_segment_size:
      1162                # and we should have gotten it right. This is a big problem.
     1163                log.msg("Segmentation handed wrong data (but we knew better):"
     1164                        " want [%d-%d), given [%d-%d), for segnum=%d,"
     1165                        " for si=%s"
     1166                        % (self._offset, self._offset+self._size,
     1167                           segment_start, segment_start+len(segment),
     1168                           wanted_segnum, self._node._si_prefix),
     1169                        level=log.WEIRD, parent=self._lp, umid="STlIiA")
     1170                raise BadSegmentError("Despite knowing the segment size,"
     1171                                      " I was given the wrong data."
     1172                                      " I cannot cope.")
     1173            # we've wasted some bandwidth, but now we can grab the right one,
     1174            # because we should know the segsize by now.
     1175            assert self._node.segment_size is not None
     1176            self._maybe_fetch_next()
     1177            return
     1178        offset_in_segment = self._offset - segment_start
     1179        desired_data = segment[offset_in_segment:offset_in_segment+o[1]]
     1180
     1181        self._offset += len(desired_data)
     1182        self._size -= len(desired_data)
     1183        self._consumer.write(desired_data)
     1184        # the consumer might call our .pauseProducing() inside that write()
     1185        # call, setting self._hungry=False
     1186        self._maybe_fetch_next()
     1187
     1188    def _retry_bad_segment(self, f, had_actual_segment_size):
     1189        f.trap(BadSegmentNumberError) # guessed way wrong, off the end
     1190        if had_actual_segment_size:
     1191            # but we should have known better, so this is a real error
     1192            return f
     1193        # we didn't know better: try again with more information
     1194        return self._maybe_fetch_next()
     1195
     1196    def _error(self, f):
     1197        log.msg("Error in Segmentation",
     1198                level=log.WEIRD, parent=self._lp, umid="EYlXBg")
     1199        self._alive = False
     1200        self._hungry = False
     1201        self._consumer.unregisterProducer()
     1202        self._deferred.errback(f)
     1203
     1204    def stopProducing(self):
     1205        self._hungry = False
     1206        self._alive = False
     1207        # cancel any outstanding segment request
     1208        if self._cancel_segment_request:
     1209            self._cancel_segment_request()
     1210            self._cancel_segment_request = None
     1211    def pauseProducing(self):
     1212        self._hungry = False
     1213    def resumeProducing(self):
     1214        self._hungry = True
     1215        eventually(self._maybe_fetch_next)
     1216
     1217class Cancel:
     1218    def __init__(self, f):
     1219        self._f = f
     1220        self.cancelled = False
     1221    def cancel(self):
     1222        if not self.cancelled:
     1223            self.cancelled = True
     1224            self._f(self)
     1225
     1226class _Node:
     1227    """Internal class which manages downloads and holds state. External
     1228    callers use CiphertextFileNode instead."""
     1229
     1230    # Share._node points to me
     1231    def __init__(self, verifycap, storage_broker, secret_holder,
     1232                 terminator, history):
     1233        assert isinstance(verifycap, uri.CHKFileVerifierURI)
     1234        self._verifycap = verifycap
     1235        self._storage_broker = storage_broker
     1236        self._si_prefix = base32.b2a_l(verifycap.storage_index[:8], 60)
     1237        self.running = True
     1238        if terminator:
     1239            terminator.register(self) # calls self.stop() at stopService()
     1240        # the rules are:
     1241        # 1: Only send network requests if you're active (self.running is True)
     1242        # 2: Use TimerService, not reactor.callLater
     1243        # 3: You can do eventual-sends any time.
     1244        # These rules should mean that once
     1245        # stopService()+flushEventualQueue() fires, everything will be done.
     1246        self._secret_holder = secret_holder
     1247        self._history = history
     1248
     1249        k, N = self._verifycap.needed_shares, self._verifycap.total_shares
     1250        self.share_hash_tree = IncompleteHashTree(N)
     1251
     1252        # we guess the segment size, so Segmentation can pull non-initial
     1253        # segments in a single roundtrip
     1254        max_segment_size = 128*KiB # TODO: pull from elsewhere, maybe the
     1255                                   # same place as upload.BaseUploadable
     1256        s = mathutil.next_multiple(min(verifycap.size, max_segment_size), k)
     1257        self.guessed_segment_size = s
     1258        r = self._calculate_sizes(self.guessed_segment_size)
     1259        self.guessed_num_segments = r["num_segments"]
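        # Worked example (illustrative only, not part of this patch): for a
        # verifycap.size of 1000000 bytes with k=3, min(size, 128KiB) is
        # 131072, so guessed_segment_size = next_multiple(131072, 3) = 131073
        # (assuming next_multiple rounds up to the nearest multiple of k),
        # and guessed_num_segments = div_ceil(1000000, 131073) = 8.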
     1260        # as with CommonShare, our ciphertext_hash_tree is a stub until we
     1261        # get the real num_segments
     1262        self.ciphertext_hash_tree = IncompleteHashTree(self.guessed_num_segments)
     1263
     1264        # filled in when we parse a valid UEB
     1265        self.have_UEB = False
     1266        self.segment_size = None
     1267        self.tail_segment_size = None
     1268        self.tail_segment_padded = None
     1269        self.num_segments = None
     1270        self.block_size = None
     1271        self.tail_block_size = None
     1272        #self.ciphertext_hash_tree = None # size depends on num_segments
     1273        self.ciphertext_hash = None # flat hash, optional
     1274
     1275        # things to track callers that want data
     1276
     1277        # _segment_requests can have duplicates
     1278        self._segment_requests = [] # (segnum, d, cancel_handle)
     1279        self._active_segment = None # a SegmentFetcher, with .segnum
     1280
     1281        # we create one top-level logparent for this _Node, and another one
     1282        # for each read() call. Segmentation and get_segment() messages are
     1283        # associated with the read() call, everything else is tied to the
     1284        # _Node's log entry.
     1285        lp = log.msg(format="Immutable _Node(%(si)s) created: size=%(size)d,"
     1286                     " guessed_segsize=%(guessed_segsize)d,"
     1287                     " guessed_numsegs=%(guessed_numsegs)d",
     1288                     si=self._si_prefix, size=verifycap.size,
     1289                     guessed_segsize=self.guessed_segment_size,
     1290                     guessed_numsegs=self.guessed_num_segments,
     1291                     level=log.OPERATIONAL, umid="uJ0zAQ")
     1292        self._lp = lp
     1293
     1294        self._sharefinder = ShareFinder(storage_broker, verifycap, self, lp)
     1295        self._shares = set()
     1296
     1297    def stop(self):
     1298        # called by the Terminator at shutdown, mostly for tests
     1299        if self._active_segment:
     1300            self._active_segment.stop()
     1301            self._active_segment = None
     1302        self._sharefinder.stop()
     1303
     1304    # things called by outside callers, via CiphertextFileNode. get_segment()
     1305    # may also be called by Segmentation.
     1306
     1307    def read(self, consumer, offset=0, size=None):
     1308        """I am the main entry point, from which FileNode.read() can get
     1309        data. I feed the consumer with the desired range of ciphertext. I
     1310        return a Deferred that fires (with the consumer) when the read is
     1311        finished.
     1312
     1313        Note that there is no notion of a 'file pointer': each call to read()
     1314        uses an independent offset= value."""
     1315        # for concurrent operations: each gets its own Segmentation manager
     1316        if size is None:
     1317            size = self._verifycap.size
     1318        # clip size so offset+size does not go past EOF
     1319        size = min(size, self._verifycap.size-offset)
     1320        lp = log.msg(format="imm Node(%(si)s).read(%(offset)d, %(size)d)",
     1321                     si=base32.b2a(self._verifycap.storage_index)[:8],
     1322                     offset=offset, size=size,
     1323                     level=log.OPERATIONAL, parent=self._lp, umid="l3j3Ww")
     1324        sp = self._history.stats_provider
     1325        sp.count("downloader.files_downloaded", 1) # really read() calls
     1326        sp.count("downloader.bytes_downloaded", size)
     1327        s = Segmentation(self, offset, size, consumer, lp)
     1328        # this raises an interesting question: what segments to fetch? if
     1329        # offset=0, always fetch the first segment, and then allow
     1330        # Segmentation to be responsible for pulling the subsequent ones if
     1331        # the first wasn't large enough. If offset>0, we're going to need an
     1332        # extra roundtrip to get the UEB (and therefore the segment size)
     1333        # before we can figure out which segment to get. TODO: allow the
     1334        # offset-table-guessing code (which starts by guessing the segsize)
     1335        # to assist the offset>0 process.
     1336        d = s.start()
     1337        return d
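
    # Illustrative usage sketch (not part of this patch): read() drives any
    # IConsumer. The minimal consumer below is hypothetical; Segmentation
    # calls its registerProducer(), then write() repeatedly, then
    # unregisterProducer(), and the returned Deferred fires with the consumer.
    #
    #   class _MemoryConsumer: # hypothetical helper
    #       implements(IConsumer)
    #       def __init__(self):
    #           self.chunks = []
    #       def registerProducer(self, producer, streaming):
    #           self.producer = producer # streaming=True (IPushProducer)
    #       def unregisterProducer(self):
    #           self.producer = None
    #       def write(self, data):
    #           self.chunks.append(data)
    #
    #   d = node.read(_MemoryConsumer(), offset=1000, size=2000)
    #   d.addCallback(lambda c: "".join(c.chunks)) # bytes 1000..2999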
     1338
     1339    def get_segment(self, segnum, logparent=None):
     1340        """Begin downloading a segment. I return a tuple (d, c): 'd' is a
     1341        Deferred that fires with (offset,data) when the desired segment is
     1342        available, and c is an object on which c.cancel() can be called to
     1343        disavow interest in the segment (after which 'd' will never fire).
     1344
     1345        You probably need to know the segment size before calling this,
     1346        unless you want the first few bytes of the file. If you ask for a
     1347        segment number which turns out to be too large, the Deferred will
     1348        errback with BadSegmentNumberError.
     1349
     1350        The Deferred fires with the offset of the first byte of the data
     1351        segment, so that you can call get_segment() before knowing the
     1352        segment size, and still know which data you received.
     1353
     1354        The Deferred can also errback with other fatal problems, such as
     1355        NotEnoughSharesError, NoSharesError, or BadCiphertextHashError.
     1356        """
     1357        log.msg(format="imm Node(%(si)s).get_segment(%(segnum)d)",
     1358                si=base32.b2a(self._verifycap.storage_index)[:8],
     1359                segnum=segnum,
     1360                level=log.OPERATIONAL, parent=logparent, umid="UKFjDQ")
     1361        d = defer.Deferred()
     1362        c = Cancel(self._cancel_request)
     1363        self._segment_requests.append( (segnum, d, c) )
     1364        self._start_new_segment()
     1365        return (d, c)
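
    # Illustrative usage sketch (not part of this patch): typical
    # get_segment() callers keep the Cancel handle so they can disavow
    # interest before the Deferred fires.
    #
    #   (d, c) = node.get_segment(3)
    #   def _got((segment_start, segment_data)):
    #       pass # we received file[segment_start:segment_start+len(segment_data)]
    #   d.addCallback(_got)
    #   c.cancel() # if we lose interest; 'd' will then never fire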
     1366
     1367    # things called by the Segmentation object used to transform
     1368    # arbitrary-sized read() calls into quantized segment fetches
     1369
     1370    def _start_new_segment(self):
     1371        if self._active_segment is None and self._segment_requests:
     1372            segnum = self._segment_requests[0][0]
     1373            k = self._verifycap.needed_shares
     1374            self._active_segment = fetcher = SegmentFetcher(self, segnum, k)
     1375            active_shares = [s for s in self._shares if s.not_dead()]
     1376            fetcher.add_shares(active_shares) # this triggers the loop
     1377
     1378
     1379    # called by our child ShareFinder
     1380    def got_shares(self, shares):
     1381        self._shares.update(shares)
     1382        if self._active_segment:
     1383            self._active_segment.add_shares(shares)
     1384    def no_more_shares(self):
     1385        self._no_more_shares = True
     1386        if self._active_segment:
     1387            self._active_segment.no_more_shares()
     1388
     1389    # things called by our Share instances
     1390
     1391    def validate_and_store_UEB(self, UEB_s):
     1392        log.msg("validate_and_store_UEB",
     1393                level=log.OPERATIONAL, parent=self._lp, umid="7sTrPw")
     1394        h = hashutil.uri_extension_hash(UEB_s)
     1395        if h != self._verifycap.uri_extension_hash:
     1396            raise hashutil.BadHashError
     1397        UEB_dict = uri.unpack_extension(UEB_s)
     1398        self._parse_and_store_UEB(UEB_dict) # sets self._stuff
     1399        # TODO: a malformed (but authentic) UEB could throw an assertion in
     1400        # _parse_and_store_UEB, and we should abandon the download.
     1401        self.have_UEB = True
     1402
     1403    def _parse_and_store_UEB(self, d):
     1404        # Note: the UEB contains needed_shares and total_shares. These are
     1405        # redundant and inferior (the filecap contains the authoritative
     1406        # values). However, because it is possible to encode the same file in
     1407        # multiple ways, and the encoders might choose (poorly) to use the
     1408        # same key for both (therefore getting the same SI), we might
     1409        # encounter shares for both types. The UEB hashes will be different,
     1410        # however, and we'll disregard the "other" encoding's shares as
     1411        # corrupted.
     1412
     1413        # therefore, we ignore d['total_shares'] and d['needed_shares'].
     1414
     1415        log.msg(format="UEB=%(ueb)s, vcap=%(vcap)s",
     1416                ueb=repr(d), vcap=self._verifycap.to_string(),
     1417                level=log.NOISY, parent=self._lp, umid="cVqZnA")
     1418
     1419        k, N = self._verifycap.needed_shares, self._verifycap.total_shares
     1420
     1421        self.segment_size = d['segment_size']
     1422
     1423        r = self._calculate_sizes(self.segment_size)
     1424        self.tail_segment_size = r["tail_segment_size"]
     1425        self.tail_segment_padded = r["tail_segment_padded"]
     1426        self.num_segments = r["num_segments"]
     1427        self.block_size = r["block_size"]
     1428        self.tail_block_size = r["tail_block_size"]
     1429        log.msg("actual sizes: %s" % (r,),
     1430                level=log.NOISY, parent=self._lp, umid="PY6P5Q")
     1431        if (self.segment_size == self.guessed_segment_size
     1432            and self.num_segments == self.guessed_num_segments):
     1433            log.msg("my guess was right!",
     1434                    level=log.NOISY, parent=self._lp, umid="x340Ow")
     1435        else:
     1436            log.msg("my guess was wrong! Extra round trips for me.",
     1437                    level=log.NOISY, parent=self._lp, umid="tb7RJw")
     1438
     1439        # zfec.Decode() instantiation is fast, but still, let's use the same
     1440        # codec instance for all but the last segment. 3-of-10 takes 15us on
     1441        # my laptop, 25-of-100 is 900us, 3-of-255 is 97us, 25-of-255 is
     1442        # 2.5ms, worst-case 254-of-255 is 9.3ms
     1443        self._codec = CRSDecoder()
     1444        self._codec.set_params(self.segment_size, k, N)
     1445
     1446
     1447        # Ciphertext hash tree root is mandatory, so that there is at most
     1448        # one ciphertext that matches this read-cap or verify-cap. The
     1449        # integrity check on the shares is not sufficient to prevent the
     1450        # original encoder from creating some shares of file A and other
     1451        # shares of file B.
     1452        self.ciphertext_hash_tree = IncompleteHashTree(self.num_segments)
     1453        self.ciphertext_hash_tree.set_hashes({0: d['crypttext_root_hash']})
     1454
     1455        self.share_hash_tree.set_hashes({0: d['share_root_hash']})
     1456
     1457        # crypttext_hash is optional. We only pull this from the first UEB
     1458        # that we see.
     1459        if 'crypttext_hash' in d:
     1460            if len(d["crypttext_hash"]) == hashutil.CRYPTO_VAL_SIZE:
     1461                self.ciphertext_hash = d['crypttext_hash']
     1462            else:
     1463                log.msg("ignoring bad-length UEB[crypttext_hash], "
     1464                        "got %d bytes, want %d" % (len(d['crypttext_hash']),
     1465                                                   hashutil.CRYPTO_VAL_SIZE),
     1466                        level=log.WEIRD, parent=self._lp, umid="oZkGLA")
     1467
     1468        # Our job is a fast download, not verification, so we ignore any
     1469        # redundant fields. The Verifier uses a different code path which
     1470        # does not ignore them.
     1471
     1472    def _calculate_sizes(self, segment_size):
     1473        # segments of ciphertext
     1474        size = self._verifycap.size
     1475        k = self._verifycap.needed_shares
     1476
     1477        # this assert matches the one in encode.py:127 inside
     1478        # Encoded._got_all_encoding_parameters, where the UEB is constructed
     1479        assert segment_size % k == 0
     1480
     1481        # the last segment is usually short. We don't store a whole segsize,
     1482        # but we do pad the segment up to a multiple of k, because the
     1483        # encoder requires that.
     1484        tail_segment_size = size % segment_size
     1485        if tail_segment_size == 0:
     1486            tail_segment_size = segment_size
     1487        padded = mathutil.next_multiple(tail_segment_size, k)
     1488        tail_segment_padded = padded
     1489
     1490        num_segments = mathutil.div_ceil(size, segment_size)
     1491
     1492        # each segment is turned into N blocks. All but the last are of size
     1493        # block_size, and the last is of size tail_block_size
     1494        block_size = segment_size / k
     1495        tail_block_size = tail_segment_padded / k
     1496
     1497        return { "tail_segment_size": tail_segment_size,
     1498                 "tail_segment_padded": tail_segment_padded,
     1499                 "num_segments": num_segments,
     1500                 "block_size": block_size,
     1501                 "tail_block_size": tail_block_size,
     1502                 }
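
    # Worked example (illustrative, not part of this patch), assuming
    # mathutil.next_multiple rounds up to the nearest multiple of k and
    # mathutil.div_ceil is ceiling division: for size=1000000, k=3,
    # segment_size=129024 (a multiple of k):
    #   num_segments        = div_ceil(1000000, 129024)  = 8
    #   tail_segment_size   = 1000000 % 129024           = 96832
    #   tail_segment_padded = next_multiple(96832, 3)    = 96834
    #   block_size          = 129024 / 3                 = 43008
    #   tail_block_size     = 96834 / 3                  = 32278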
     1503
     1504
     1505    def process_share_hashes(self, share_hashes):
     1506        self.share_hash_tree.set_hashes(share_hashes)
     1507
     1508    def get_needed_ciphertext_hashes(self, segnum):
     1509        cht = self.ciphertext_hash_tree
     1510        return cht.needed_hashes(segnum, include_leaf=True)
     1511    def process_ciphertext_hashes(self, hashes, shnum, serverid_s):
     1512        assert self.num_segments is not None
     1513        try:
     1514            self.ciphertext_hash_tree.set_hashes(hashes)
     1515            return True
     1516        except (BadHashError, NotEnoughHashesError):
     1517            hashnums = ",".join([str(n) for n in sorted(hashes.keys())])
     1518            log.msg(format="hash failure in ciphertext_hashes=(%(hashnums)s),"
     1519                    " shnum=%(shnum)d SI=%(si)s server=%(server)s",
     1520                    hashnums=hashnums, shnum=shnum,
     1521                    si=self._si_prefix, server=serverid_s, failure=Failure(),
     1522                    level=log.WEIRD, parent=self._lp, umid="iZI0TA")
     1523        return False
     1524
     1525    # called by our child SegmentFetcher
     1526
     1527    def want_more_shares(self):
     1528        self._sharefinder.hungry()
     1529
     1530    def fetch_failed(self, sf, f):
     1531        assert sf is self._active_segment
     1532        self._active_segment = None
     1533        # deliver error upwards
     1534        for (d,c) in self._extract_requests(sf.segnum):
     1535            eventually(self._deliver, d, c, f)
     1536
     1537    def process_blocks(self, segnum, blocks):
     1538        d = defer.maybeDeferred(self._decode_blocks, segnum, blocks)
     1539        d.addCallback(self._check_ciphertext_hash, segnum)
     1540        def _deliver(result):
     1541            log.msg(format="delivering segment(%(segnum)d)",
     1542                    segnum=segnum,
     1543                    level=log.OPERATIONAL, parent=self._lp,
     1544                    umid="j60Ojg")
     1545            for (d,c) in self._extract_requests(segnum):
     1546                eventually(self._deliver, d, c, result)
     1547            self._active_segment = None
     1548            self._start_new_segment()
     1549        d.addBoth(_deliver)
     1550        d.addErrback(lambda f:
     1551                     log.err("unhandled error during process_blocks",
     1552                             failure=f, level=log.WEIRD,
     1553                             parent=self._lp, umid="MkEsCg"))
     1554
     1555    def _decode_blocks(self, segnum, blocks):
     1556        tail = (segnum == self.num_segments-1)
     1557        codec = self._codec
     1558        block_size = self.block_size
     1559        decoded_size = self.segment_size
     1560        if tail:
     1561            # account for the padding in the last segment
     1562            codec = CRSDecoder()
     1563            k, N = self._verifycap.needed_shares, self._verifycap.total_shares
     1564            codec.set_params(self.tail_segment_padded, k, N)
     1565            block_size = self.tail_block_size
     1566            decoded_size = self.tail_segment_padded
     1567
     1568        shares = []
     1569        shareids = []
     1570        for (shareid, share) in blocks.iteritems():
     1571            assert len(share) == block_size
     1572            shareids.append(shareid)
     1573            shares.append(share)
     1574        del blocks
     1575
     1576        d = codec.decode(shares, shareids)   # segment
     1577        del shares
     1578        def _process(buffers):
     1579            segment = "".join(buffers)
     1580            assert len(segment) == decoded_size
     1581            del buffers
     1582            if tail:
     1583                segment = segment[:self.tail_segment_size]
     1584            return segment
     1585        d.addCallback(_process)
     1586        return d
     1587
     1588    def _check_ciphertext_hash(self, segment, segnum):
     1589        assert self._active_segment.segnum == segnum
     1590        assert self.segment_size is not None
     1591        offset = segnum * self.segment_size
     1592
     1593        h = hashutil.crypttext_segment_hash(segment)
     1594        try:
     1595            self.ciphertext_hash_tree.set_hashes(leaves={segnum: h})
     1596            return (offset, segment)
     1597        except (BadHashError, NotEnoughHashesError):
     1598            format = ("hash failure in ciphertext_hash_tree:"
     1599                      " segnum=%(segnum)d, SI=%(si)s")
     1600            log.msg(format=format, segnum=segnum, si=self._si_prefix,
     1601                    failure=Failure(),
     1602                    level=log.WEIRD, parent=self._lp, umid="MTwNnw")
     1603            # this is especially weird, because we made it past the share
     1604            # hash tree. It implies that we're using the wrong encoding, or
     1605            # that the uploader deliberately constructed a bad UEB.
     1606            msg = format % {"segnum": segnum, "si": self._si_prefix}
     1607            raise BadCiphertextHashError(msg)
     1608
     1609    def _deliver(self, d, c, result):
     1610        # this method exists to handle cancel() that occurs between
     1611        # _got_segment and _deliver
     1612        if not c.cancelled:
     1613            d.callback(result) # might actually be an errback
     1614
     1615    def _extract_requests(self, segnum):
     1616        """Remove matching requests and return their (d,c) tuples so that the
     1617        caller can retire them."""
     1618        retire = [(d,c) for (segnum0, d, c) in self._segment_requests
     1619                  if segnum0 == segnum]
     1620        self._segment_requests = [t for t in self._segment_requests
     1621                                  if t[0] != segnum]
     1622        return retire
     1623
     1624    def _cancel_request(self, c):
     1625        self._segment_requests = [t for t in self._segment_requests
     1626                                  if t[2] != c]
     1627        segnums = [segnum for (segnum,d,c) in self._segment_requests]
      1628        if self._active_segment and self._active_segment.segnum not in segnums:
     1629            self._active_segment.stop()
     1630            self._active_segment = None
     1631            self._start_new_segment()
     1632
     1633    def check_and_repair(self, monitor, verify=False, add_lease=False):
     1634        verifycap = self._verifycap
     1635        storage_index = verifycap.storage_index
     1636        sb = self._storage_broker
     1637        servers = sb.get_all_servers()
     1638        sh = self._secret_holder
     1639
     1640        c = Checker(verifycap=verifycap, servers=servers,
     1641                    verify=verify, add_lease=add_lease, secret_holder=sh,
     1642                    monitor=monitor)
     1643        d = c.start()
     1644        def _maybe_repair(cr):
     1645            crr = CheckAndRepairResults(storage_index)
     1646            crr.pre_repair_results = cr
     1647            if cr.is_healthy():
     1648                crr.post_repair_results = cr
     1649                return defer.succeed(crr)
     1650            else:
     1651                crr.repair_attempted = True
     1652                crr.repair_successful = False # until proven successful
     1653                def _gather_repair_results(ur):
     1654                    assert IUploadResults.providedBy(ur), ur
     1655                    # clone the cr (check results) to form the basis of the
     1656                    # prr (post-repair results)
     1657                    prr = CheckResults(cr.uri, cr.storage_index)
     1658                    prr.data = copy.deepcopy(cr.data)
     1659
     1660                    sm = prr.data['sharemap']
     1661                    assert isinstance(sm, DictOfSets), sm
     1662                    sm.update(ur.sharemap)
     1663                    servers_responding = set(prr.data['servers-responding'])
      1664                    servers_responding.update(ur.sharemap.iterkeys())
     1665                    prr.data['servers-responding'] = list(servers_responding)
     1666                    prr.data['count-shares-good'] = len(sm)
     1667                    prr.data['count-good-share-hosts'] = len(sm)
     1668                    is_healthy = bool(len(sm) >= verifycap.total_shares)
     1669                    is_recoverable = bool(len(sm) >= verifycap.needed_shares)
     1670                    prr.set_healthy(is_healthy)
     1671                    prr.set_recoverable(is_recoverable)
     1672                    crr.repair_successful = is_healthy
     1673                    prr.set_needs_rebalancing(len(sm) >= verifycap.total_shares)
     1674
     1675                    crr.post_repair_results = prr
     1676                    return crr
     1677                def _repair_error(f):
     1678                    # as with mutable repair, I'm not sure if I want to pass
     1679                    # through a failure or not. TODO
     1680                    crr.repair_successful = False
     1681                    crr.repair_failure = f
     1682                    return f
     1683                r = Repairer(storage_broker=sb, secret_holder=sh,
     1684                             verifycap=verifycap, monitor=monitor)
     1685                d = r.start()
     1686                d.addCallbacks(_gather_repair_results, _repair_error)
     1687                return d
     1688
     1689        d.addCallback(_maybe_repair)
     1690        return d
     1691
     1692    def check(self, monitor, verify=False, add_lease=False):
     1693        verifycap = self._verifycap
     1694        sb = self._storage_broker
     1695        servers = sb.get_all_servers()
     1696        sh = self._secret_holder
     1697
     1698        v = Checker(verifycap=verifycap, servers=servers,
     1699                    verify=verify, add_lease=add_lease, secret_holder=sh,
     1700                    monitor=monitor)
     1701        return v.start()
     1702
     1703class CiphertextFileNode:
     1704    def __init__(self, verifycap, storage_broker, secret_holder,
     1705                 terminator, history):
     1706        assert isinstance(verifycap, uri.CHKFileVerifierURI)
     1707        self._node = _Node(verifycap, storage_broker, secret_holder,
     1708                           terminator, history)
     1709
     1710    def read(self, consumer, offset=0, size=None):
     1711        """I am the main entry point, from which FileNode.read() can get
     1712        data. I feed the consumer with the desired range of ciphertext. I
     1713        return a Deferred that fires (with the consumer) when the read is
     1714        finished."""
     1715        return self._node.read(consumer, offset, size)
     1716
     1717    def get_segment(self, segnum):
     1718        """Begin downloading a segment. I return a tuple (d, c): 'd' is a
     1719        Deferred that fires with (offset,data) when the desired segment is
     1720        available, and c is an object on which c.cancel() can be called to
     1721        disavow interest in the segment (after which 'd' will never fire).
     1722
     1723        You probably need to know the segment size before calling this,
     1724        unless you want the first few bytes of the file. If you ask for a
     1725        segment number which turns out to be too large, the Deferred will
     1726        errback with BadSegmentNumberError.
     1727
     1728        The Deferred fires with the offset of the first byte of the data
     1729        segment, so that you can call get_segment() before knowing the
     1730        segment size, and still know which data you received.
     1731        """
     1732        return self._node.get_segment(segnum)
     1733
     1734    def raise_error(self):
     1735        pass
     1736
     1737
     1738    def check_and_repair(self, monitor, verify=False, add_lease=False):
     1739        return self._node.check_and_repair(monitor, verify, add_lease)
     1740    def check(self, monitor, verify=False, add_lease=False):
     1741        return self._node.check(monitor, verify, add_lease)
     1742
     1743
     1744class DecryptingConsumer:
     1745    """I sit between a CiphertextDownloader (which acts as a Producer) and
     1746    the real Consumer, decrypting everything that passes by. The real
     1747    Consumer sees the real Producer, but the Producer sees us instead of the
     1748    real consumer."""
     1749    implements(IConsumer)
     1750
     1751    def __init__(self, consumer, readkey, offset):
     1752        self._consumer = consumer
     1753        # TODO: pycryptopp CTR-mode needs random-access operations: I want
     1754        # either a=AES(readkey, offset) or better yet both of:
     1755        #  a=AES(readkey, offset=0)
     1756        #  a.process(ciphertext, offset=xyz)
     1757        # For now, we fake it with the existing iv= argument.
     1758        offset_big = offset // 16
     1759        offset_small = offset % 16
     1760        iv = binascii.unhexlify("%032x" % offset_big)
     1761        self._decryptor = AES(readkey, iv=iv)
     1762        self._decryptor.process("\x00"*offset_small)
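        # Worked example (illustrative, not part of this patch): for
        # offset=40, offset_big = 40 // 16 = 2 and offset_small = 40 % 16 = 8,
        # so the counter starts at AES block 2 (byte 32 of the stream) and we
        # discard 8 bytes of keystream by processing 8 zero bytes. The next
        # write() we see is then decrypted starting at byte 40.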
     1763
     1764    def registerProducer(self, producer, streaming):
     1765        # this passes through, so the real consumer can flow-control the real
     1766        # producer. Therefore we don't need to provide any IPushProducer
     1767        # methods. We implement all the IConsumer methods as pass-throughs,
     1768        # and only intercept write() to perform decryption.
     1769        self._consumer.registerProducer(producer, streaming)
     1770    def unregisterProducer(self):
     1771        self._consumer.unregisterProducer()
     1772    def write(self, ciphertext):
     1773        plaintext = self._decryptor.process(ciphertext)
     1774        self._consumer.write(plaintext)
     1775
     1776class ImmutableFileNode:
     1777    implements(IImmutableFileNode)
     1778
     1779    # I wrap a CiphertextFileNode with a decryption key
     1780    def __init__(self, filecap, storage_broker, secret_holder, terminator,
     1781                 history):
     1782        assert isinstance(filecap, uri.CHKFileURI)
     1783        verifycap = filecap.get_verify_cap()
     1784        self._cnode = CiphertextFileNode(verifycap, storage_broker,
     1785                                         secret_holder, terminator, history)
     1786        assert isinstance(filecap, uri.CHKFileURI)
     1787        self.u = filecap
     1788        self._readkey = filecap.key
     1789
     1790    def read(self, consumer, offset=0, size=None):
     1791        decryptor = DecryptingConsumer(consumer, self._readkey, offset)
     1792        d = self._cnode.read(decryptor, offset, size)
     1793        d.addCallback(lambda dc: consumer)
     1794        return d
     1795
     1796    def raise_error(self):
     1797        pass
     1798
     1799    def get_write_uri(self):
     1800        return None
     1801
     1802    def get_readonly_uri(self):
     1803        return self.get_uri()
     1804
     1805    def get_uri(self):
     1806        return self.u.to_string()
     1807    def get_cap(self):
     1808        return self.u
     1809    def get_readcap(self):
     1810        return self.u.get_readonly()
     1811    def get_verify_cap(self):
     1812        return self.u.get_verify_cap()
     1813    def get_repair_cap(self):
     1814        # CHK files can be repaired with just the verifycap
     1815        return self.u.get_verify_cap()
     1816
     1817    def get_storage_index(self):
     1818        return self.u.get_storage_index()
     1819
     1820    def get_size(self):
     1821        return self.u.get_size()
     1822    def get_current_size(self):
     1823        return defer.succeed(self.get_size())
     1824
     1825    def is_mutable(self):
     1826        return False
     1827
     1828    def is_readonly(self):
     1829        return True
     1830
     1831    def is_unknown(self):
     1832        return False
     1833
     1834    def is_allowed_in_immutable_directory(self):
     1835        return True
     1836
     1837    def check_and_repair(self, monitor, verify=False, add_lease=False):
     1838        return self._cnode.check_and_repair(monitor, verify, add_lease)
     1839    def check(self, monitor, verify=False, add_lease=False):
     1840        return self._cnode.check(monitor, verify, add_lease)
     1841
     1842# TODO: if server1 has all shares, and server2-10 have one each, make the
     1843# loop stall slightly before requesting all shares from the first server, to
     1844# give it a chance to learn about the other shares and get some diversity.
     1845# Or, don't bother, let the first block all come from one server, and take
     1846# comfort in the fact that we'll learn about the other servers by the time we
     1847# fetch the second block.
     1848#
     1849# davidsarah points out that we could use sequential (instead of parallel)
      1850# fetching of multiple blocks from a single server: by the time the first
     1851# block arrives, we'll hopefully have heard about other shares. This would
     1852# induce some RTT delays (i.e. lose pipelining) in the case that this server
     1853# has the only shares, but that seems tolerable. We could rig it to only use
     1854# sequential requests on the first segment.
     1855
     1856# as a query gets later, we're more willing to duplicate work.
     1857
     1858# should change server read protocol to allow small shares to be fetched in a
     1859# single RTT. Instead of get_buckets-then-read, just use read(shnums, readv),
     1860# where shnums=[] means all shares, and the return value is a dict of
      1861# shnum->data (like with mutable files). The DYHB query should also fetch the
     1862# offset table, since everything else can be located once we have that.
     1863
     1864
     1865# ImmutableFileNode
     1866#    DecryptingConsumer
     1867#  CiphertextFileNode
     1868#    Segmentation
     1869#   ShareFinder
     1870#   SegmentFetcher[segnum] (one at a time)
     1871#   CommonShare[shnum]
     1872#   Share[shnum,server]
     1873
     1874# TODO: when we learn numsegs, any get_segment() calls for bad blocknumbers
     1875# should be failed with BadSegmentNumberError. But should this be the
     1876# responsibility of CiphertextFileNode, or SegmentFetcher? The knowledge will
     1877# first appear when a Share receives a valid UEB and calls
     1878# CiphertextFileNode.validate_UEB, then _parse_UEB. The SegmentFetcher is
     1879# expecting to hear from the Share, via the _block_request_activity observer.
     1880
     1881# make it the responsibility of the SegmentFetcher. Each Share that gets a
     1882# valid UEB will tell the SegmentFetcher BADSEGNUM (instead of COMPLETE or
      1883# CORRUPT). The SegmentFetcher is then responsible for shutting down, and
     1884# informing its parent (the CiphertextFileNode) of the BadSegmentNumberError,
     1885# which is then passed to the client of get_segment().
     1886
     1887
     1888# TODO: if offset table is corrupt, attacker could cause us to fetch whole
     1889# (large) share
     1890
     1891# log budget: when downloading at 1MBps (i.e. 8 segments-per-second), 10
     1892# log.OPERATIONAL per second, 100 log.NOISY per second. With k=3, that's 3
     1893# log.NOISY per block fetch.
     1894
     1895
     1896# test_cli.Error failed for a while: ShareFinder created, used up
     1897# (NotEnoughSharesError), started again. The self.running=False is the
     1898# problem.
     1899#
     1900# The second download is hungry, but because ShareFinder.running is false, it
     1901# never notifies the SegmentFetcher that there are no more shares coming, so
     1902# the download never completes. To trigger this in tests, we need the first
     1903# download to want more shares (so it must fail with NotEnoughSharesError, or
     1904# we must lose a share/server between downloads).
     1905#
     1906# fix was to not call self.stop when ShareFinder runs out of shares. stop()
     1907# is now only called by the Terminator.
  • new file src/allmydata/immutable/download2_off.py

    diff --git a/src/allmydata/immutable/download2_off.py b/src/allmydata/immutable/download2_off.py
    new file mode 100755
    index 0000000..d2b8b99
    - +  
     1#! /usr/bin/python
     2
     3# known (shnum,Server) pairs are sorted into a list according to
     4# desireability. This sort is picking a winding path through a matrix of
     5# [shnum][server]. The goal is to get diversity of both shnum and server.
     6
     7# The initial order is:
     8#  find the lowest shnum on the first server, add it
     9#  look at the next server, find the lowest shnum that we don't already have
     10#   if any
     11#  next server, etc, until all known servers are checked
     12#  now look at servers that we skipped (because ...
     13
     14# Keep track of which block requests are outstanding by (shnum,Server). Don't
     15# bother prioritizing "validated" shares: the overhead to pull the share hash
     16# chain is tiny (4 hashes = 128 bytes), and the overhead to pull a new block
     17# hash chain is also tiny (1GB file, 8192 segments of 128KiB each, 13 hashes,
     18# 832 bytes). Each time a block request is sent, also request any necessary
     19# hashes. Don't bother with a "ValidatedShare" class (as distinct from some
     20# other sort of Share). Don't bother avoiding duplicate hash-chain requests.
     21
     22# For each outstanding segread, walk the list and send requests (skipping
     23# outstanding shnums) until requests for k distinct shnums are in flight. If
     24# we can't do that, ask for more. If we get impatient on a request, find the
     25# first non-outstanding
     26
     27# start with the first Share in the list, and send a request. Then look at
     28# the next one. If we already have a pending request for the same shnum or
     29# server, push that Share down onto the fallback list and try the next one,
     30# etc. If we run out of non-fallback shares, use the fallback ones,
     31# preferring shnums that we don't have outstanding requests for (i.e. assume
     32# that all requests will complete). Do this by having a second fallback list.
     33
     34# hell, I'm reviving the Herder. But remember, we're still talking 3 objects
     35# per file, not thousands.
     36
     37# actually, don't bother sorting the initial list. Append Shares as the
     38# responses come back, that will put the fastest servers at the front of the
     39# list, and give a tiny preference to servers that are earlier in the
     40# permuted order.
     41
     42# more ideas:
     43#  sort shares by:
     44#   1: number of roundtrips needed to get some data
     45#   2: share number
     46#   3: ms of RTT delay
     47# maybe measure average time-to-completion of requests, compare completion
     48# time against that, much larger indicates congestion on the server side
     49# or the server's upstream speed is less than our downstream. Minimum
     50# time-to-completion indicates min(our-downstream,their-upstream). Could
     51# fetch shares one-at-a-time to measure that better.
     52
     53# when should we risk duplicate work and send a new request?
     54
     55def walk(self):
     56    shares = sorted(list)
     57    oldshares = copy(shares)
     58    outstanding = list()
     59    fallbacks = list()
     60    second_fallbacks = list()
     61    while len(outstanding.nonlate.shnums) < k: # need more requests
     62        while oldshares:
     63            s = shares.pop(0)
     64            if s.server in outstanding.servers or s.shnum in outstanding.shnums:
     65                fallbacks.append(s)
     66                continue
     67            outstanding.append(s)
     68            send_request(s)
     69            break #'while need_more_requests'
     70        # must use fallback list. Ask for more servers while we're at it.
     71        ask_for_more_servers()
     72        while fallbacks:
     73            s = fallbacks.pop(0)
     74            if s.shnum in outstanding.shnums:
     75                # assume that the outstanding requests will complete, but
     76                # send new requests for other shnums to existing servers
     77                second_fallbacks.append(s)
     78                continue
     79            outstanding.append(s)
     80            send_request(s)
     81            break #'while need_more_requests'
     82        # if we get here, we're being forced to send out multiple queries per
     83        # share. We've already asked for more servers, which might help. If
     84        # there are no late outstanding queries, then duplicate shares won't
     85        # help. Don't send queries for duplicate shares until some of the
     86        # queries are late.
     87        if outstanding.late:
     88            # we're allowed to try any non-outstanding share
     89            while second_fallbacks:
     90                pass
     91    newshares = outstanding + fallbacks + second_fallbacks + oldshares
     92       
     93
     94class Server:
     95    """I represent an abstract Storage Server. One day, the StorageBroker
     96    will return instances of me. For now, the StorageBroker returns (peerid,
     97    RemoteReference) tuples, and this code wraps a Server instance around
     98    them.
     99    """
     100    def __init__(self, peerid, ss):
     101        self.peerid = peerid
     102        self.remote = ss
     103        self._remote_buckets = {} # maps shnum to RIBucketReader
     104        # TODO: release the bucket references on shares that we no longer
     105        # want. OTOH, why would we not want them? Corruption?
     106
     107    def send_query(self, storage_index):
     108        """I return a Deferred that fires with a set of shnums. If the server
     109        had shares available, I will retain the RemoteReferences to its
     110        buckets, so that get_data(shnum, range) can be called later."""
      111        d = self.remote.callRemote("get_buckets", storage_index)
     112        d.addCallback(self._got_response)
     113        return d
     114
     115    def _got_response(self, r):
     116        self._remote_buckets = r
     117        return set(r.keys())
     118
     119class ShareOnAServer:
     120    """I represent one instance of a share, known to live on a specific
     121    server. I am created every time a server responds affirmatively to a
     122    do-you-have-block query."""
     123
     124    def __init__(self, shnum, server):
     125        self._shnum = shnum
     126        self._server = server
     127        self._block_hash_tree = None
     128
     129    def cost(self, segnum):
     130        """I return a tuple of (roundtrips, bytes, rtt), indicating how
     131        expensive I think it would be to fetch the given segment. Roundtrips
     132        indicates how many roundtrips it is likely to take (one to get the
     133        data and hashes, plus one to get the offset table and UEB if this is
     134        the first segment we've ever fetched). 'bytes' is how many bytes we
     135        must fetch (estimated). 'rtt' is estimated round-trip time (float) in
     136        seconds for a trivial request. The downloading algorithm will compare
     137        costs to decide which shares should be used."""
     138        # the most significant factor here is roundtrips: a Share for which
      139        # we already have the offset table is better than a brand new one
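        # A sketch of one possible return value (illustrative only; the
        # attribute names below are hypothetical, not part of this patch):
        #   roundtrips = 1 if self._have_offset_table else 2
        #   fetch_bytes = block_size + estimated_hash_chain_bytes
        #   return (roundtrips, fetch_bytes, self._server.estimated_rtt)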
     140
     141    def max_bandwidth(self):
     142        """Return a float, indicating the highest plausible bytes-per-second
     143        that I've observed coming from this share. This will be based upon
      144        the fastest (bytes-per-fetch / time-per-fetch) ever observed. This
      145        can be used to estimate the server's upstream bandwidth. Clearly this
     146        is only accurate if a share is retrieved with no contention for
     147        either the upstream, downstream, or middle of the connection, but it
     148        may still serve as a useful metric for deciding which servers to pull
     149        from."""
     150
     151    def get_segment(self, segnum):
     152        """I return a Deferred that will fire with the segment data, or
     153        errback."""
     154
     155class NativeShareOnAServer(ShareOnAServer):
     156    """For tahoe native (foolscap) servers, I contain a RemoteReference to
     157    the RIBucketReader instance."""
     158    def __init__(self, shnum, server, rref):
     159        ShareOnAServer.__init__(self, shnum, server)
     160        self._rref = rref # RIBucketReader
     161
     162class Share:
     163    def __init__(self, shnum):
     164        self._shnum = shnum
     165        # _servers are the Server instances which appear to hold a copy of
     166        # this share. It is populated when the ValidShare is first created,
     167        # or when we receive a get_buckets() response for a shnum that
     168        # already has a ValidShare instance. When we lose the connection to a
     169        # server, we remove it.
     170        self._servers = set()
     171        # offsets, UEB, and share_hash_tree all live in the parent.
     172        # block_hash_tree lives here.
     173        self._block_hash_tree = None
     174
     175        self._want
     176
     177    def get_servers(self):
     178        return self._servers
     179
     180
     181    def get_block(self, segnum):
     182        # read enough data to obtain a single validated block
     183        if not self.have_offsets:
     184            # we get the offsets in their own read, since they tell us where
     185            # everything else lives. We must fetch offsets for each share
     186            # separately, since they aren't directly covered by the UEB.
     187            pass
     188        if not self.parent.have_ueb:
     189            # use _guessed_segsize to make a guess about the layout, so we
     190            # can fetch both the offset table and the UEB in the same read.
     191            # This also requires making a guess about the presence or absence
     192            # of the plaintext_hash_tree. Oh, and also the version number. Oh
     193            # well.
     194            pass
     195
     196class CiphertextDownloader:
     197    """I manage all downloads for a single file. I operate a state machine
     198    with input events that are local read() requests, responses to my remote
     199    'get_bucket' and 'read_bucket' messages, and connection establishment and
     200    loss. My outbound events are connection establishment requests and bucket
     201    read requests messages.
     202    """
     203    # eventually this will merge into the FileNode
     204    ServerClass = Server # for tests to override
     205
     206    def __init__(self, storage_index, ueb_hash, size, k, N, storage_broker,
     207                 shutdowner):
     208        # values we get from the filecap
     209        self._storage_index = si = storage_index
     210        self._ueb_hash = ueb_hash
     211        self._size = size
     212        self._needed_shares = k
     213        self._total_shares = N
     214        self._share_hash_tree = IncompleteHashTree(self._total_shares)
     215        # values we discover when we first fetch the UEB
     216        self._ueb = None # is dict after UEB fetch+validate
     217        self._segsize = None
     218        self._numsegs = None
     219        self._blocksize = None
     220        self._tail_segsize = None
     221        self._ciphertext_hash = None # optional
     222        # structures we create when we fetch the UEB, then continue to fill
     223        # as we download the file
      224        # (_share_hash_tree was already created above, from the filecap's N)
     225        self._ciphertext_hash_tree = None
     226
     227        # values we learn as we download the file
     228        self._offsets = {} # (shnum,Server) to offset table (dict)
     229        self._block_hash_tree = {} # shnum to IncompleteHashTree
     230        # other things which help us
     231        self._guessed_segsize = min(128*1024, size)
     232        self._active_share_readers = {} # maps shnum to Reader instance
     233        self._share_readers = [] # sorted by preference, best first
     234        self._readers = set() # set of Reader instances
      235        self._recent_horizon = 10 # seconds
      236        self._scheduled = False # True while an eventual _loop() is pending
      237        self._storage_broker = storage_broker
      238        # 'shutdowner' is a MultiService parent used to cancel all downloads
      239        # when the node is shutting down, to let tests have a clean reactor.
     240        self._init_available_servers()
     241        self._init_find_enough_shares()
     242
     243    # _available_servers is an iterator that provides us with Server
     244    # instances. Each time we pull out a Server, we immediately send it a
     245    # query, so we don't need to keep track of who we've sent queries to.
     246
     247    def _init_available_servers(self):
     248        self._available_servers = self._get_available_servers()
     249        self._no_more_available_servers = False
     250
     251    def _get_available_servers(self):
     252        """I am a generator of servers to use, sorted by the order in which
     253        we should query them. I make sure there are no duplicates in this
     254        list."""
     255        # TODO: make StorageBroker responsible for this non-duplication, and
     256        # replace this method with a simple iter(get_servers_for_index()),
     257        # plus a self._no_more_available_servers=True
     258        seen = set()
     259        sb = self._storage_broker
     260        for (peerid, ss) in sb.get_servers_for_index(self._storage_index):
     261            if peerid not in seen:
     262                yield self.ServerClass(peerid, ss) # Server(peerid, ss)
     263                seen.add(peerid)
     264        self._no_more_available_servers = True
     265
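    # Illustrative sketch (not part of the patch) of the TODO above: if the
    # StorageBroker guaranteed de-duplication itself, this generator could
    # shrink to roughly the following. It uses the same get_servers_for_index()
    # call as above; the simplification is an assumption, not implemented here.
    def _get_available_servers_simplified(self):
        sb = self._storage_broker
        for (peerid, ss) in sb.get_servers_for_index(self._storage_index):
            yield self.ServerClass(peerid, ss)
        self._no_more_available_servers = True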
     266    # this block of code is responsible for having enough non-problematic
     267    # distinct shares/servers available and ready for download, and for
     268    # limiting the number of queries that are outstanding. The idea is that
     269    # we'll use the k fastest/best shares, and have the other ones in reserve
     270    # in case those servers stop responding or respond too slowly. We keep
     271    # track of all known shares, but we also keep track of problematic shares
     272    # (ones with hash failures or lost connections), so we can put them at
     273    # the bottom of the list.
     274
     275    def _init_find_enough_shares(self):
     276        # _unvalidated_sharemap maps shnum to set of Servers, and remembers
     277        # where viable (but not yet validated) shares are located. Each
     278        # get_bucket() response adds to this map, each act of validation
     279        # removes from it.
      280        self._unvalidated_sharemap = DictOfSets()
     281
     282        # _sharemap maps shnum to set of Servers, and remembers where viable
     283        # shares are located. Each get_bucket() response adds to this map,
     284        # each hash failure or disconnect removes from it. (TODO: if we
     285        # disconnect but reconnect later, we should be allowed to re-query).
     286        self._sharemap = DictOfSets()
     287
      288        # _problem_shares is a set of (shnum, Server) tuples which had hash
      289        # failures or lost connections, so we can deprioritize them.
     290        # _queries_in_flight maps a Server to a timestamp, which remembers
     291        # which servers we've sent queries to (and when) but have not yet
     292        # heard a response. This lets us put a limit on the number of
     293        # outstanding queries, to limit the size of the work window (how much
     294        # extra work we ask servers to do in the hopes of keeping our own
     295        # pipeline filled). We remove a Server from _queries_in_flight when
     296        # we get an answer/error or we finally give up. If we ever switch to
     297        # a non-connection-oriented protocol (like UDP, or forwarded Chord
     298        # queries), we can use this information to retransmit any query that
     299        # has gone unanswered for too long.
     300        self._queries_in_flight = dict()
     301
     302    def _count_recent_queries_in_flight(self):
     303        now = time.time()
     304        recent = now - self._recent_horizon
     305        return len([s for (s,when) in self._queries_in_flight.items()
     306                    if when > recent])
     307
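    # Illustrative arithmetic (not part of the patch): with the default
    # _recent_horizon of 10 seconds, a query sent 3 seconds ago is "recent"
    # and counts against the 4*k in-flight limit used below, while one sent
    # 30 seconds ago no longer does (it stays in _queries_in_flight until it
    # is answered or abandoned, but it stops constraining new queries).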
     308    def _find_enough_shares(self):
     309        # goal: have 2*k distinct not-invalid shares available for reading,
     310        # from 2*k distinct servers. Do not have more than 4*k "recent"
     311        # queries in flight at a time.
     312        if (len(self._sharemap) >= 2*self._needed_shares
      313            and len(set().union(*self._sharemap.values())) >= 2*self._needed_shares):
     314            return
     315        num = self._count_recent_queries_in_flight()
     316        while num < 4*self._needed_shares:
     317            try:
     318                s = self._available_servers.next()
     319            except StopIteration:
     320                return # no more progress can be made
     321            self._queries_in_flight[s] = time.time()
     322            d = s.send_query(self._storage_index)
      323            d.addBoth(incidentally, self._queries_in_flight.pop, s, None)
      324            d.addCallbacks(lambda shnums, s=s: [self._sharemap.add(shnum, s)
      325                                                for shnum in shnums],
      326                           lambda f, s=s: self._query_error(f, s))
     327            d.addErrback(self._error)
     328            d.addCallback(self._reschedule)
     329            num += 1
     330
     331    def _query_error(self, f, s):
     332        # a server returned an error, log it gently and ignore
     333        level = log.WEIRD
     334        if f.check(DeadReferenceError):
     335            level = log.UNUSUAL
      336        log.msg(format="Error during get_buckets to server=%(server)s", server=str(s),
     337                failure=f, level=level, umid="3uuBUQ")
     338
     339    # this block is responsible for turning known shares into usable shares,
     340    # by fetching enough data to validate their contents.
     341
     342    # UEB (from any share)
     343    # share hash chain, validated (from any share, for given shnum)
     344    # block hash (any share, given shnum)
     345
     346    def _got_ueb(self, ueb_data, share):
     347        if self._ueb is not None:
     348            return
     349        if hashutil.uri_extension_hash(ueb_data) != self._ueb_hash:
     350            share.error("UEB hash does not match")
     351            return
     352        d = uri.unpack_extension(ueb_data)
     353        self.share_size = mathutil.div_ceil(self._size, self._needed_shares)
     354
     355
     356        # There are several kinds of things that can be found in a UEB.
     357        # First, things that we really need to learn from the UEB in order to
     358        # do this download. Next: things which are optional but not redundant
     359        # -- if they are present in the UEB they will get used. Next, things
     360        # that are optional and redundant. These things are required to be
     361        # consistent: they don't have to be in the UEB, but if they are in
     362        # the UEB then they will be checked for consistency with the
     363        # already-known facts, and if they are inconsistent then an exception
     364        # will be raised. These things aren't actually used -- they are just
     365        # tested for consistency and ignored. Finally: things which are
     366        # deprecated -- they ought not be in the UEB at all, and if they are
     367        # present then a warning will be logged but they are otherwise
     368        # ignored.
     369
     370        # First, things that we really need to learn from the UEB:
     371        # segment_size, crypttext_root_hash, and share_root_hash.
     372        self._segsize = d['segment_size']
     373
     374        self._blocksize = mathutil.div_ceil(self._segsize, self._needed_shares)
     375        self._numsegs = mathutil.div_ceil(self._size, self._segsize)
     376
     377        self._tail_segsize = self._size % self._segsize
     378        if self._tail_segsize == 0:
     379            self._tail_segsize = self._segsize
     380        # padding for erasure code
     381        self._tail_segsize = mathutil.next_multiple(self._tail_segsize,
     382                                                    self._needed_shares)
     383
     384        # Ciphertext hash tree root is mandatory, so that there is at most
     385        # one ciphertext that matches this read-cap or verify-cap. The
     386        # integrity check on the shares is not sufficient to prevent the
     387        # original encoder from creating some shares of file A and other
     388        # shares of file B.
     389        self._ciphertext_hash_tree = IncompleteHashTree(self._numsegs)
     390        self._ciphertext_hash_tree.set_hashes({0: d['crypttext_root_hash']})
     391
     392        self._share_hash_tree.set_hashes({0: d['share_root_hash']})
     393
     394
     395        # Next: things that are optional and not redundant: crypttext_hash
     396        if 'crypttext_hash' in d:
      397            if len(d['crypttext_hash']) == hashutil.CRYPTO_VAL_SIZE:
     398                self._ciphertext_hash = d['crypttext_hash']
     399            else:
     400                log.msg("ignoring bad-length UEB[crypttext_hash], "
     401                        "got %d bytes, want %d" % (len(d['crypttext_hash']),
     402                                                   hashutil.CRYPTO_VAL_SIZE),
     403                        umid="oZkGLA", level=log.WEIRD)
     404
     405        # we ignore all of the redundant fields when downloading. The
     406        # Verifier uses a different code path which does not ignore them.
     407
     408        # finally, set self._ueb as a marker that we don't need to request it
     409        # anymore
     410        self._ueb = d
     411
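    def _example_segment_geometry(self):
        # Illustrative worked example (not part of the patch) of the geometry
        # computed in _got_ueb above, for assumed values size=1000000,
        # segment_size=131072, k=3.
        size, segsize, k = 1000000, 131072, 3
        assert mathutil.div_ceil(segsize, k) == 43691   # blocksize
        assert mathutil.div_ceil(size, segsize) == 8    # numsegs
        tail = size % segsize                           # 82496
        assert mathutil.next_multiple(tail, k) == 82497 # padded tail segment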
     412    def _got_share_hashes(self, hashes, share):
     413        assert isinstance(hashes, dict)
     414        try:
     415            self._share_hash_tree.set_hashes(hashes)
     416        except (IndexError, BadHashError, NotEnoughHashesError), le:
     417            share.error("Bad or missing hashes")
     418            return
     419
     420    #def _got_block_hashes(
     421
     422    def _init_validate_enough_shares(self):
     423        # _valid_shares maps shnum to ValidatedShare instances, and is
     424        # populated once the block hash root has been fetched and validated
     425        # (which requires any valid copy of the UEB, and a valid copy of the
     426        # share hash chain for each shnum)
     427        self._valid_shares = {}
     428
     429        # _target_shares is an ordered list of ReadyShare instances, each of
     430        # which is a (shnum, server) tuple. It is sorted in order of
     431        # preference: we expect to get the fastest response from the
     432        # ReadyShares at the front of the list. It is also sorted to
     433        # distribute the shnums, so that fetching shares from
     434        # _target_shares[:k] is likely (but not guaranteed) to give us k
     435        # distinct shares. The rule is that we skip over entries for blocks
      436        # that we've already received, and that we limit the number of recent
      437        # queries for the same block.
     438        self._target_shares = []
     439
     440    def _validate_enough_shares(self):
     441        # my goal is to have at least 2*k distinct validated shares from at
     442        # least 2*k distinct servers
     443        valid_share_servers = set()
     444        for vs in self._valid_shares.values():
     445            valid_share_servers.update(vs.get_servers())
     446        if (len(self._valid_shares) >= 2*self._needed_shares
      447            and len(valid_share_servers) >= 2*self._needed_shares):
     448            return
      449        # TODO: fetch and validate more shares until the goal above is met
     450
     451    def _reschedule(self, _ign):
     452        # fire the loop again
     453        if not self._scheduled:
     454            self._scheduled = True
     455            eventually(self._loop)
     456
     457    def _loop(self):
     458        self._scheduled = False
     459        # what do we need?
     460
     461        self._find_enough_shares()
     462        self._validate_enough_shares()
     463
     464        if not self._ueb:
     465            # we always need a copy of the UEB
     466            pass
     467
     468    def _error(self, f):
     469        # this is an unexpected error: a coding bug
     470        log.err(f, level=log.UNUSUAL)
     471           
     472
     473
     474# using a single packed string (and an offset table) may be an artifact of
     475# our native storage server: other backends might allow cheap multi-part
     476# files (think S3, several buckets per share, one for each section).
     477
     478# find new names for:
     479#  data_holder
     480#  Share / Share2  (ShareInstance / Share? but the first is more useful)
     481
     482class IShare(Interface):
      483    """I represent a single instance of a single share (e.g. I reference
      484    shnum 2 of share SI=abcde on server xy12t, not the copy on server ab45q).
     485    This interface is used by SegmentFetcher to retrieve validated blocks.
     486    """
     487    def get_block(segnum):
     488        """Return an Observer2, which will be notified with the following
     489        events:
     490         state=COMPLETE, block=data (terminal): validated block data
     491         state=OVERDUE (non-terminal): we have reason to believe that the
     492                                       request might have stalled, or we
     493                                       might just be impatient
     494         state=CORRUPT (terminal): the data we received was corrupt
     495         state=DEAD (terminal): the connection has failed
     496        """
     497
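# Illustrative sketch (not part of the patch) of how a SegmentFetcher-style
# caller might consume the Observer2 returned by get_block(). 'share' and
# 'got_block_event' are hypothetical stand-ins; the state names it will see
# are the ones listed in the docstring above.
def _example_watch_block(share, segnum, got_block_event):
    o = share.get_block(segnum)
    o.subscribe(got_block_event, share=share, segnum=segnum)
    return o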
     498
     499# it'd be nice if we receive the hashes before the block, or just
     500# afterwards, so we aren't stuck holding on to unvalidated blocks
     501# that we can't process. If we guess the offsets right, we can
     502# accomplish this by sending the block request after the metadata
     503# requests (by keeping two separate requestlists), and have a one RTT
     504# pipeline like:
     505#  1a=metadata, 1b=block
     506#  1b->process+deliver : one RTT
     507
     508# But if we guess wrong, and fetch the wrong part of the block, we'll
     509# have a pipeline that looks like:
     510#  1a=wrong metadata, 1b=wrong block
     511#  1a->2a=right metadata,2b=right block
     512#  2b->process+deliver
     513# which means two RTT and buffering one block (which, since we'll
     514# guess the segsize wrong for everything, means buffering one
     515# segment)
     516
     517# if we start asking for multiple segments, we could get something
     518# worse:
     519#  1a=wrong metadata, 1b=wrong block0, 1c=wrong block1, ..
     520#  1a->2a=right metadata,2b=right block0,2c=right block1, .
     521#  2b->process+deliver
     522
     523# which means two RTT but fetching and buffering the whole file
     524# before delivering anything. However, since we don't know when the
     525# other shares are going to arrive, we need to avoid having more than
     526# one block in the pipeline anyways. So we shouldn't be able to get
     527# into this state.
     528
     529# it also means that, instead of handling all of
     530# self._requested_blocks at once, we should only be handling one
      531# block at a time: one of the requested blocks should be special
     532# (probably FIFO). But retire all we can.
     533
     534    # this might be better with a Deferred, using COMPLETE as the success
     535    # case and CORRUPT/DEAD in an errback, because that would let us hold the
     536    # 'share' and 'shnum' arguments locally (instead of roundtripping them
      537    # through Share.send_request). But OVERDUE is not terminal. So I
     538    # want a new sort of callback mechanism, with the extra-argument-passing
     539    # aspects of Deferred, but without being so one-shot. Is this a job for
     540    # Observer? No, it doesn't take extra arguments. So this uses Observer2.
     541
     542
     543class Reader:
     544    """I am responsible for a single offset+size read of the file. I handle
     545    segmentation: I figure out which segments are necessary, request them
     546    (from my CiphertextDownloader) in order, and trim the segments down to
     547    match the offset+size span. I use the Producer/Consumer interface to only
     548    request one segment at a time.
     549    """
     550    implements(IPushProducer)
     551    def __init__(self, consumer, offset, size):
     552        self._needed = []
     553        self._consumer = consumer
     554        self._hungry = False
     555        self._offset = offset
     556        self._size = size
     557        self._segsize = None
     558    def start(self):
     559        self._alive = True
     560        self._deferred = defer.Deferred()
     561        # the process doesn't actually start until set_segment_size()
     562        return self._deferred
     563
     564    def set_segment_size(self, segsize):
     565        if self._segsize is not None:
     566            return
     567        self._segsize = segsize
      568        self._needed.extend(self._compute_segnums())
      569
      570    def _compute_segnums(self):
     571        # now that we know the file's segsize, what segments (and which
     572        # ranges of each) will we need?
     573        size = self._size
     574        offset = self._offset
     575        while size:
     576            assert size >= 0
     577            this_seg_num = int(offset / self._segsize)
      578            this_seg_offset = offset - (this_seg_num*self._segsize)
      579            this_seg_size = min(size, self._segsize-this_seg_offset)
     580            size -= this_seg_size
     581            if size:
     582                offset += this_seg_size
     583            yield (this_seg_num, this_seg_offset, this_seg_size)
     584
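    def _example_compute_segnums(self):
        # Illustrative worked example (not part of the patch): for an assumed
        # read of offset=100000, size=100000 from a file with segsize=65536,
        # the generator above yields the tail of segment 1, all of segment 2,
        # and the head of segment 3, covering exactly 100000 bytes.
        r = Reader(consumer=None, offset=100000, size=100000)
        r._segsize = 65536
        assert list(r._compute_segnums()) == [(1, 34464, 31072),
                                              (2, 0, 65536),
                                              (3, 0, 3392)]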
     585    def get_needed_segments(self):
     586        return set([segnum for (segnum, off, size) in self._needed])
     587
     588
     589    def stopProducing(self):
     590        self._hungry = False
     591        self._alive = False
     592        # TODO: cancel the segment requests
     593    def pauseProducing(self):
     594        self._hungry = False
     595    def resumeProducing(self):
     596        self._hungry = True
     597    def add_segment(self, segnum, offset, size):
     598        self._needed.append( (segnum, offset, size) )
     599    def got_segment(self, segnum, segdata):
     600        """Return True if this schedule has more to go, or False if it is
     601        done."""
      602        assert self._needed[0][0] == segnum
     603        (_ign, offset, size) = self._needed.pop(0)
     604        data = segdata[offset:offset+size]
     605        self._consumer.write(data)
     606        if not self._needed:
     607            # we're done
     608            self._alive = False
     609            self._hungry = False
     610            self._consumer.unregisterProducer()
     611            self._deferred.callback(self._consumer)
     612    def error(self, f):
     613        self._alive = False
     614        self._hungry = False
     615        self._consumer.unregisterProducer()
     616        self._deferred.errback(f)
     617
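# Illustrative sketch (not part of the patch): for pauseProducing/
# resumeProducing above to receive any signals, a downloader would register
# the Reader with the consumer as a push producer. The call site shown here
# is an assumption, not something this patch does.
def _example_register_reader(consumer, reader):
    consumer.registerProducer(reader, streaming=True)
    return reader.start()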
     618
     619
     620class x:
     621    def OFFread(self, consumer, offset=0, size=None):
     622        """I am the main entry point, from which FileNode.read() can get
     623        data."""
     624        # tolerate concurrent operations: each gets its own Reader
     625        if size is None:
     626            size = self._size - offset
     627        r = Reader(consumer, offset, size)
     628        self._readers.add(r)
     629        d = r.start()
     630        if self.segment_size is not None:
     631            r.set_segment_size(self.segment_size)
     632            # TODO: if we can't find any segments, and thus never get a
     633            # segsize, tell the Readers to give up
     634        return d
  • new file src/allmydata/immutable/download2_util.py

    diff --git a/src/allmydata/immutable/download2_util.py b/src/allmydata/immutable/download2_util.py
    new file mode 100755
    index 0000000..9e20ff4
    - +  
     1import weakref
     2
     3from twisted.application import service
     4from foolscap.api import eventually
     5
     6class Observer2:
     7    """A simple class to distribute multiple events to a single subscriber.
     8    It accepts arbitrary kwargs, but no posargs."""
     9    def __init__(self):
     10        self._watcher = None
     11        self._undelivered_results = []
     12        self._canceler = None
     13
     14    def set_canceler(self, f):
     15        # we use a weakref to avoid creating a cycle between us and the thing
     16        # we're observing: they'll be holding a reference to us to compare
     17        # against the value we pass to their canceler function.
     18        self._canceler = weakref.ref(f)
     19
     20    def subscribe(self, observer, **watcher_kwargs):
     21        self._watcher = (observer, watcher_kwargs)
     22        while self._undelivered_results:
     23            self._notify(self._undelivered_results.pop(0))
     24
     25    def notify(self, **result_kwargs):
     26        if self._watcher:
     27            self._notify(result_kwargs)
     28        else:
     29            self._undelivered_results.append(result_kwargs)
     30
     31    def _notify(self, result_kwargs):
     32        o, watcher_kwargs = self._watcher
     33        kwargs = dict(result_kwargs)
     34        kwargs.update(watcher_kwargs)
     35        eventually(o, **kwargs)
     36
     37    def cancel(self):
     38        f = self._canceler()
     39        if f:
     40            f(self)
     41
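# Illustrative sketch (not part of the patch): events posted before anyone
# subscribes are queued and flushed on subscribe(), and the subscriber's
# extra kwargs ride along with every delivery (via eventually(), so delivery
# is asynchronous). 'got_event' is a hypothetical callback.
def _example_observer2(got_event):
    o = Observer2()
    o.notify(state="OVERDUE")              # queued: no subscriber yet
    o.subscribe(got_event, shnum=3)        # flushes got_event(state="OVERDUE", shnum=3)
    o.notify(state="COMPLETE", block="x")  # got_event(state="COMPLETE", block="x", shnum=3)
    return o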
     42
     43def incidentally(res, f, *args, **kwargs):
     44    """Add me to a Deferred chain like this:
     45     d.addBoth(incidentally, func, arg)
     46    and I'll behave as if you'd added the following function:
     47     def _(res):
     48         func(arg)
     49         return res
     50    This is useful if you want to execute an expression when the Deferred
     51    fires, but don't care about its value.
     52    """
     53    f(*args, **kwargs)
     54    return res
     55
     56
     57class Terminator(service.Service):
     58    def __init__(self):
     59        self._clients = weakref.WeakKeyDictionary()
     60    def register(self, c):
     61        self._clients[c] = None
     62    def stopService(self):
     63        for c in self._clients:
     64            c.stop()
     65        return service.Service.stopService(self)
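# Illustrative sketch (not part of the patch): anything with a stop() method
# registers itself with the Terminator, and each still-live registrant is
# stopped when the service shuts down. 'parent' and 'downloader' are
# hypothetical stand-ins.
def _example_terminator(parent, downloader):
    t = Terminator()
    t.setServiceParent(parent)   # parent: a twisted MultiService
    t.register(downloader)       # downloader: provides stop()
    return t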
  • src/allmydata/nodemaker.py

    diff --git a/src/allmydata/nodemaker.py b/src/allmydata/nodemaker.py
    index a30efbf..36ddfc7 100644
    a b import weakref 
    22from zope.interface import implements
    33from allmydata.util.assertutil import precondition
    44from allmydata.interfaces import INodeMaker, MustBeDeepImmutableError
    5 from allmydata.immutable.filenode import ImmutableFileNode, LiteralFileNode
     5from allmydata.immutable.filenode import LiteralFileNode
     6from allmydata.immutable.download2 import ImmutableFileNode
    67from allmydata.immutable.upload import Data
    78from allmydata.mutable.filenode import MutableFileNode
    89from allmydata.dirnode import DirectoryNode, pack_children
    class NodeMaker: 
    1718    implements(INodeMaker)
    1819
    1920    def __init__(self, storage_broker, secret_holder, history,
    20                  uploader, downloader, download_cache_dirman,
     21                 uploader, terminator,
    2122                 default_encoding_parameters, key_generator):
    2223        self.storage_broker = storage_broker
    2324        self.secret_holder = secret_holder
    2425        self.history = history
    2526        self.uploader = uploader
    26         self.downloader = downloader
    27         self.download_cache_dirman = download_cache_dirman
     27        self.terminator = terminator
    2828        self.default_encoding_parameters = default_encoding_parameters
    2929        self.key_generator = key_generator
    3030
    class NodeMaker: 
    3434        return LiteralFileNode(cap)
    3535    def _create_immutable(self, cap):
    3636        return ImmutableFileNode(cap, self.storage_broker, self.secret_holder,
    37                                  self.downloader, self.history,
    38                                  self.download_cache_dirman)
     37                                 self.terminator, self.history)
    3938    def _create_mutable(self, cap):
    4039        n = MutableFileNode(self.storage_broker, self.secret_holder,
    4140                            self.default_encoding_parameters,
    class NodeMaker: 
    4847        # this returns synchronously. It starts with a "cap string".
    4948        assert isinstance(writecap, (str, type(None))), type(writecap)
    5049        assert isinstance(readcap,  (str, type(None))), type(readcap)
    51        
     50
    5251        bigcap = writecap or readcap
    5352        if not bigcap:
    5453            # maybe the writecap was hidden because we're in a readonly
  • src/allmydata/test/test_cli.py

    diff --git a/src/allmydata/test/test_cli.py b/src/allmydata/test/test_cli.py
    index 3503b1b..b15fcf4 100644
    a b class Errors(GridTestMixin, CLITestMixin, unittest.TestCase): 
    20352035            self.delete_shares_numbered(ur.uri, range(1,10))
    20362036        d.addCallback(_stash_bad)
    20372037
     2038        # the download is abandoned as soon as it's clear that we won't get
     2039        # enough shares. The one remaining share might be in either the
     2040        # COMPLETE or the PENDING state.
     2041        in_complete_msg = "ran out of shares: 1 complete, 0 pending, 0 overdue, 0 unused, need 3"
     2042        in_pending_msg = "ran out of shares: 0 complete, 1 pending, 0 overdue, 0 unused, need 3"
     2043
    20382044        d.addCallback(lambda ign: self.do_cli("get", self.uri_1share))
    20392045        def _check1((rc, out, err)):
    20402046            self.failIfEqual(rc, 0)
    20412047            self.failUnless("410 Gone" in err, err)
    20422048            self.failUnlessIn("NotEnoughSharesError: ", err)
    2043             self.failUnlessIn("Failed to get enough shareholders: have 1, need 3", err)
     2049            self.failUnless(in_complete_msg in err or in_pending_msg in err,
     2050                            err)
    20442051        d.addCallback(_check1)
    20452052
    20462053        targetf = os.path.join(self.basedir, "output")
    class Errors(GridTestMixin, CLITestMixin, unittest.TestCase): 
    20492056            self.failIfEqual(rc, 0)
    20502057            self.failUnless("410 Gone" in err, err)
    20512058            self.failUnlessIn("NotEnoughSharesError: ", err)
    2052             self.failUnlessIn("Failed to get enough shareholders: have 1, need 3", err)
     2059            self.failUnless(in_complete_msg in err or in_pending_msg in err,
     2060                            err)
    20532061            self.failIf(os.path.exists(targetf))
    20542062        d.addCallback(_check2)
    20552063
  • src/allmydata/test/test_dirnode.py

    diff --git a/src/allmydata/test/test_dirnode.py b/src/allmydata/test/test_dirnode.py
    index e6aaf77..3779327 100644
    a b class Packing(unittest.TestCase): 
    11061106    def test_unpack_and_pack_behavior(self):
    11071107        known_tree = b32decode(self.known_tree)
    11081108        nodemaker = NodeMaker(None, None, None,
    1109                               None, None, None,
     1109                              None, None,
    11101110                              {"k": 3, "n": 10}, None)
    11111111        write_uri = "URI:SSK-RO:e3mdrzfwhoq42hy5ubcz6rp3o4:ybyibhnp3vvwuq2vaw2ckjmesgkklfs6ghxleztqidihjyofgw7q"
    11121112        filenode = nodemaker.create_from_cap(write_uri)
    class Packing(unittest.TestCase): 
    11681168        return kids
    11691169
    11701170    def test_deep_immutable(self):
    1171         nm = NodeMaker(None, None, None, None, None, None, {"k": 3, "n": 10},
    1172                        None)
     1171        nm = NodeMaker(None, None, None, None, None, {"k": 3, "n": 10}, None)
    11731172        fn = MinimalFakeMutableFile()
    11741173
    11751174        kids = self._make_kids(nm, ["imm", "lit", "write", "read",
    class FakeNodeMaker(NodeMaker): 
    12631262class FakeClient2(Client):
    12641263    def __init__(self):
    12651264        self.nodemaker = FakeNodeMaker(None, None, None,
    1266                                        None, None, None,
     1265                                       None, None,
    12671266                                       {"k":3,"n":10}, None)
    12681267    def create_node_from_uri(self, rwcap, rocap):
    12691268        return self.nodemaker.create_from_cap(rwcap, rocap)
    class Deleter(GridTestMixin, unittest.TestCase): 
    15471546        def _do_delete(ignored):
    15481547            nm = UCWEingNodeMaker(c0.storage_broker, c0._secret_holder,
    15491548                                  c0.get_history(), c0.getServiceNamed("uploader"),
    1550                                   c0.downloader,
    1551                                   c0.download_cache_dirman,
     1549                                  c0.terminator,
    15521550                                  c0.get_encoding_parameters(),
    15531551                                  c0._key_generator)
    15541552            n = nm.create_from_cap(self.root_uri)
  • src/allmydata/test/test_download.py

    diff --git a/src/allmydata/test/test_download.py b/src/allmydata/test/test_download.py
    index b54bf01..cffa132 100644
    a b class DownloadTest(GridTestMixin, unittest.TestCase): 
    178178        def _got_data(data):
    179179            self.failUnlessEqual(data, plaintext)
    180180        d.addCallback(_got_data)
     181        # make sure we can use the same node twice
     182        d.addCallback(lambda ign: download_to_data(n))
     183        d.addCallback(_got_data)
    181184        return d
    182185
    183186    def download_mutable(self, ignored=None):
    class DownloadTest(GridTestMixin, unittest.TestCase): 
    188191        d.addCallback(_got_data)
    189192        return d
    190193
     194    def test_download_failover(self):
     195        self.basedir = self.mktemp()
     196        self.set_up_grid()
     197        self.c0 = self.g.clients[0]
     198
     199        self.load_shares()
     200
     201        n = self.c0.create_node_from_uri(immutable_uri)
     202        d = download_to_data(n)
     203        def _got_data(data):
     204            self.failUnlessEqual(data, plaintext)
     205        d.addCallback(_got_data)
     206
     207        def _clobber_shares(ign):
     208            # find the three shares that were used, and delete them. Then
     209            # download again, forcing the downloader to fail over to other
     210            # shares
     211            si = uri.from_string(immutable_uri).get_storage_index()
     212            si_dir = storage_index_to_dir(si)
     213            for s in n._cnode._node._shares:
     214                for clientnum in immutable_shares:
     215                    for shnum in immutable_shares[clientnum]:
     216                        if s._shnum == shnum:
     217                            fn = os.path.join(self.get_serverdir(clientnum),
     218                                              "shares", si_dir, str(shnum))
     219                            os.unlink(fn)
     220        d.addCallback(_clobber_shares)
     221        d.addCallback(lambda ign: download_to_data(n))
     222        d.addCallback(_got_data)
     223        return d
     224
  • src/allmydata/test/test_mutable.py

    diff --git a/src/allmydata/test/test_mutable.py b/src/allmydata/test/test_mutable.py
    index fa29d34..1c3825c 100644
    a b def make_nodemaker(s=None, num_peers=10): 
    197197    keygen = client.KeyGenerator()
    198198    keygen.set_default_keysize(522)
    199199    nodemaker = NodeMaker(storage_broker, sh, None,
    200                           None, None, None,
     200                          None, None,
    201201                          {"k": 3, "n": 10}, keygen)
    202202    return nodemaker
    203203
  • src/allmydata/test/test_system.py

    diff --git a/src/allmydata/test/test_system.py b/src/allmydata/test/test_system.py
    index 5b301b8..d1bc6cb 100644
    a b from allmydata import uri 
    99from allmydata.storage.mutable import MutableShareFile
    1010from allmydata.storage.server import si_a2b
    1111from allmydata.immutable import offloaded, upload
    12 from allmydata.immutable.filenode import ImmutableFileNode, LiteralFileNode
     12from allmydata.immutable.filenode import LiteralFileNode
     13from allmydata.immutable.download2 import ImmutableFileNode
    1314from allmydata.util import idlib, mathutil
    1415from allmydata.util import log, base32
    1516from allmydata.util.consumer import MemoryConsumer, download_to_data
    class SystemTest(SystemTestMixin, unittest.TestCase): 
    11631164        d.addCallback(_got_status)
    11641165        def _got_up(res):
    11651166            return self.GET("status/down-%d" % self._down_status)
    1166         d.addCallback(_got_up)
     1167        #d.addCallback(_got_up)
    11671168        def _got_down(res):
    11681169            return self.GET("status/mapupdate-%d" % self._update_status)
    11691170        d.addCallback(_got_down)
  • src/allmydata/test/test_util.py

    diff --git a/src/allmydata/test/test_util.py b/src/allmydata/test/test_util.py
    index 0a326b3..de4a8ad 100644
    a b from twisted.trial import unittest 
    77from twisted.internet import defer, reactor
    88from twisted.python.failure import Failure
    99from twisted.python import log
     10from hashlib import md5
    1011
    1112from allmydata.util import base32, idlib, humanreadable, mathutil, hashutil
    1213from allmydata.util import assertutil, fileutil, deferredutil, abbreviate
    1314from allmydata.util import limiter, time_format, pollmixin, cachedir
    1415from allmydata.util import statistics, dictutil, pipeline
    1516from allmydata.util import log as tahoe_log
     17from allmydata.util.spans import Spans, overlap, DataSpans
    1618
    1719class Base32(unittest.TestCase):
    1820    def test_b2a_matches_Pythons(self):
    class Log(unittest.TestCase): 
    15371539        tahoe_log.err(format="intentional sample error",
    15381540                      failure=f, level=tahoe_log.OPERATIONAL, umid="wO9UoQ")
    15391541        self.flushLoggedErrors(SampleError)
     1542
     1543
     1544class SimpleSpans:
     1545    # this is a simple+inefficient form of util.spans.Spans . We compare the
     1546    # behavior of this reference model against the real (efficient) form.
     1547
     1548    def __init__(self, _span_or_start=None, length=None):
     1549        self._have = set()
     1550        if length is not None:
     1551            for i in range(_span_or_start, _span_or_start+length):
     1552                self._have.add(i)
     1553        elif _span_or_start:
     1554            for (start,length) in _span_or_start:
     1555                self.add(start, length)
     1556
     1557    def add(self, start, length):
     1558        for i in range(start, start+length):
     1559            self._have.add(i)
     1560        return self
     1561
     1562    def remove(self, start, length):
     1563        for i in range(start, start+length):
     1564            self._have.discard(i)
     1565        return self
     1566
     1567    def each(self):
     1568        return sorted(self._have)
     1569
     1570    def __iter__(self):
     1571        items = sorted(self._have)
     1572        prevstart = None
     1573        prevend = None
     1574        for i in items:
     1575            if prevstart is None:
     1576                prevstart = prevend = i
     1577                continue
     1578            if i == prevend+1:
     1579                prevend = i
     1580                continue
     1581            yield (prevstart, prevend-prevstart+1)
     1582            prevstart = prevend = i
     1583        if prevstart is not None:
     1584            yield (prevstart, prevend-prevstart+1)
     1585
     1586    def __len__(self):
     1587        # this also gets us bool(s)
     1588        return len(self._have)
     1589
     1590    def __add__(self, other):
     1591        s = self.__class__(self)
     1592        for (start, length) in other:
     1593            s.add(start, length)
     1594        return s
     1595
     1596    def __sub__(self, other):
     1597        s = self.__class__(self)
     1598        for (start, length) in other:
     1599            s.remove(start, length)
     1600        return s
     1601
     1602    def __iadd__(self, other):
     1603        for (start, length) in other:
     1604            self.add(start, length)
     1605        return self
     1606
     1607    def __isub__(self, other):
     1608        for (start, length) in other:
     1609            self.remove(start, length)
     1610        return self
     1611
     1612    def __contains__(self, (start,length)):
     1613        for i in range(start, start+length):
     1614            if i not in self._have:
     1615                return False
     1616        return True
     1617
     1618class ByteSpans(unittest.TestCase):
     1619    def test_basic(self):
     1620        s = Spans()
     1621        self.failUnlessEqual(list(s), [])
     1622        self.failIf(s)
     1623        self.failIf((0,1) in s)
     1624        self.failUnlessEqual(len(s), 0)
     1625
     1626        s1 = Spans(3, 4) # 3,4,5,6
     1627        self._check1(s1)
     1628
     1629        s2 = Spans(s1)
     1630        self._check1(s2)
     1631
     1632        s2.add(10,2) # 10,11
     1633        self._check1(s1)
     1634        self.failUnless((10,1) in s2)
     1635        self.failIf((10,1) in s1)
     1636        self.failUnlessEqual(list(s2.each()), [3,4,5,6,10,11])
     1637        self.failUnlessEqual(len(s2), 6)
     1638
     1639        s2.add(15,2).add(20,2)
     1640        self.failUnlessEqual(list(s2.each()), [3,4,5,6,10,11,15,16,20,21])
     1641        self.failUnlessEqual(len(s2), 10)
     1642
     1643        s2.remove(4,3).remove(15,1)
     1644        self.failUnlessEqual(list(s2.each()), [3,10,11,16,20,21])
     1645        self.failUnlessEqual(len(s2), 6)
     1646
     1647    def _check1(self, s):
     1648        self.failUnlessEqual(list(s), [(3,4)])
     1649        self.failUnless(s)
     1650        self.failUnlessEqual(len(s), 4)
     1651        self.failIf((0,1) in s)
     1652        self.failUnless((3,4) in s)
     1653        self.failUnless((3,1) in s)
     1654        self.failUnless((5,2) in s)
     1655        self.failUnless((6,1) in s)
     1656        self.failIf((6,2) in s)
     1657        self.failIf((7,1) in s)
     1658        self.failUnlessEqual(list(s.each()), [3,4,5,6])
     1659
     1660    def test_math(self):
     1661        s1 = Spans(0, 10) # 0,1,2,3,4,5,6,7,8,9
     1662        s2 = Spans(5, 3) # 5,6,7
     1663        s3 = Spans(8, 4) # 8,9,10,11
     1664
     1665        s = s1 - s2
     1666        self.failUnlessEqual(list(s.each()), [0,1,2,3,4,8,9])
     1667        s = s1 - s3
     1668        self.failUnlessEqual(list(s.each()), [0,1,2,3,4,5,6,7])
     1669        s = s2 - s3
     1670        self.failUnlessEqual(list(s.each()), [5,6,7])
     1671
     1672        s = s1 + s2
     1673        self.failUnlessEqual(list(s.each()), [0,1,2,3,4,5,6,7,8,9])
     1674        s = s1 + s3
     1675        self.failUnlessEqual(list(s.each()), [0,1,2,3,4,5,6,7,8,9,10,11])
     1676        s = s2 + s3
     1677        self.failUnlessEqual(list(s.each()), [5,6,7,8,9,10,11])
     1678
     1679        s = Spans(s1)
     1680        s -= s2
     1681        self.failUnlessEqual(list(s.each()), [0,1,2,3,4,8,9])
     1682        s = Spans(s1)
     1683        s -= s3
     1684        self.failUnlessEqual(list(s.each()), [0,1,2,3,4,5,6,7])
     1685        s = Spans(s2)
     1686        s -= s3
     1687        self.failUnlessEqual(list(s.each()), [5,6,7])
     1688
     1689        s = Spans(s1)
     1690        s += s2
     1691        self.failUnlessEqual(list(s.each()), [0,1,2,3,4,5,6,7,8,9])
     1692        s = Spans(s1)
     1693        s += s3
     1694        self.failUnlessEqual(list(s.each()), [0,1,2,3,4,5,6,7,8,9,10,11])
     1695        s = Spans(s2)
     1696        s += s3
     1697        self.failUnlessEqual(list(s.each()), [5,6,7,8,9,10,11])
     1698
     1699    def test_random(self):
     1700        # attempt to increase coverage of corner cases by comparing behavior
     1701        # of a simple-but-slow model implementation against the
     1702        # complex-but-fast actual implementation, in a large number of random
     1703        # operations
     1704        S1 = SimpleSpans
     1705        S2 = Spans
     1706        s1 = S1(); s2 = S2()
     1707        seed = ""
     1708        def _create(subseed):
     1709            ns1 = S1(); ns2 = S2()
     1710            for i in range(10):
     1711                what = md5(subseed+str(i)).hexdigest()
     1712                start = int(what[2:4], 16)
     1713                length = max(1,int(what[5:6], 16))
     1714                ns1.add(start, length); ns2.add(start, length)
     1715            return ns1, ns2
     1716
     1717        #print
     1718        for i in range(1000):
     1719            what = md5(seed+str(i)).hexdigest()
     1720            op = what[0]
     1721            subop = what[1]
     1722            start = int(what[2:4], 16)
     1723            length = max(1,int(what[5:6], 16))
     1724            #print what
     1725            if op in "0":
     1726                if subop in "01234":
     1727                    s1 = S1(); s2 = S2()
     1728                elif subop in "5678":
     1729                    s1 = S1(start, length); s2 = S2(start, length)
     1730                else:
     1731                    s1 = S1(s1); s2 = S2(s2)
     1732                #print "s2 = %s" % s2.dump()
     1733            elif op in "123":
     1734                #print "s2.add(%d,%d)" % (start, length)
     1735                s1.add(start, length); s2.add(start, length)
     1736            elif op in "456":
     1737                #print "s2.remove(%d,%d)" % (start, length)
     1738                s1.remove(start, length); s2.remove(start, length)
     1739            elif op in "78":
     1740                ns1, ns2 = _create(what[7:11])
     1741                #print "s2 + %s" % ns2.dump()
     1742                s1 = s1 + ns1; s2 = s2 + ns2
     1743            elif op in "9a":
     1744                ns1, ns2 = _create(what[7:11])
     1745                #print "%s - %s" % (s2.dump(), ns2.dump())
     1746                s1 = s1 - ns1; s2 = s2 - ns2
     1747            elif op in "bc":
     1748                ns1, ns2 = _create(what[7:11])
     1749                #print "s2 += %s" % ns2.dump()
     1750                s1 += ns1; s2 += ns2
     1751            else:
     1752                ns1, ns2 = _create(what[7:11])
     1753                #print "%s -= %s" % (s2.dump(), ns2.dump())
     1754                s1 -= ns1; s2 -= ns2
     1755            #print "s2 now %s" % s2.dump()
     1756            self.failUnlessEqual(list(s1.each()), list(s2.each()))
     1757            self.failUnlessEqual(len(s1), len(s2))
     1758            self.failUnlessEqual(bool(s1), bool(s2))
     1759            self.failUnlessEqual(list(s1), list(s2))
     1760            for j in range(10):
     1761                what = md5(what[12:14]+str(j)).hexdigest()
     1762                start = int(what[2:4], 16)
     1763                length = max(1, int(what[5:6], 16))
     1764                span = (start, length)
     1765                self.failUnlessEqual(bool(span in s1), bool(span in s2))
     1766
     1767
     1768    # s()
     1769    # s(start,length)
     1770    # s(s0)
     1771    # s.add(start,length) : returns s
     1772    # s.remove(start,length)
     1773    # s.each() -> list of byte offsets, mostly for testing
     1774    # list(s) -> list of (start,length) tuples, one per span
     1775    # (start,length) in s -> True if (start..start+length-1) are all members
     1776    #  NOT equivalent to x in list(s)
     1777    # len(s) -> number of bytes, for testing, bool(), and accounting/limiting
     1778    # bool(s)  (__len__)
     1779    # s = s1+s2, s1-s2, +=s1, -=s1
     1780
     1781    def test_overlap(self):
     1782        for a in range(20):
     1783            for b in range(10):
     1784                for c in range(20):
     1785                    for d in range(10):
     1786                        self._test_overlap(a,b,c,d)
     1787
     1788    def _test_overlap(self, a, b, c, d):
     1789        s1 = set(range(a,a+b))
     1790        s2 = set(range(c,c+d))
     1791        #print "---"
     1792        #self._show_overlap(s1, "1")
     1793        #self._show_overlap(s2, "2")
     1794        o = overlap(a,b,c,d)
     1795        expected = s1.intersection(s2)
     1796        if not expected:
     1797            self.failUnlessEqual(o, None)
     1798        else:
     1799            start,length = o
     1800            so = set(range(start,start+length))
     1801            #self._show(so, "o")
     1802            self.failUnlessEqual(so, expected)
     1803
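    def test_overlap_example(self):
        # Illustrative spot-check (not part of the original patch) of the same
        # overlap() helper exercised exhaustively above: [0,10) and [5,15)
        # intersect in (5, 5); disjoint spans give None.
        self.failUnlessEqual(overlap(0, 10, 5, 10), (5, 5))
        self.failUnlessEqual(overlap(0, 2, 5, 1), None)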
     1804    def _show_overlap(self, s, c):
     1805        import sys
     1806        out = sys.stdout
     1807        if s:
     1808            for i in range(max(s)):
     1809                if i in s:
     1810                    out.write(c)
     1811                else:
     1812                    out.write(" ")
     1813        out.write("\n")
     1814
     1815def extend(s, start, length, fill):
     1816    if len(s) >= start+length:
     1817        return s
     1818    assert len(fill) == 1
     1819    return s + fill*(start+length-len(s))
     1820
     1821def replace(s, start, data):
     1822    assert len(s) >= start+len(data)
     1823    return s[:start] + data + s[start+len(data):]
     1824
     1825class SimpleDataSpans:
     1826    def __init__(self, other=None):
     1827        self.missing = "" # "1" where missing, "0" where found
     1828        self.data = ""
     1829        if other:
     1830            for (start, data) in other.get_chunks():
     1831                self.add(start, data)
     1832
     1833    def __len__(self):
     1834        return len(self.missing.translate(None, "1"))
     1835    def _dump(self):
     1836        return [i for (i,c) in enumerate(self.missing) if c == "0"]
     1837    def _have(self, start, length):
     1838        m = self.missing[start:start+length]
     1839        if not m or len(m)<length or int(m):
     1840            return False
     1841        return True
     1842    def get_chunks(self):
     1843        for i in self._dump():
     1844            yield (i, self.data[i])
     1845    def get_spans(self):
     1846        return SimpleSpans([(start,len(data))
     1847                            for (start,data) in self.get_chunks()])
     1848    def get(self, start, length):
     1849        if self._have(start, length):
     1850            return self.data[start:start+length]
     1851        return None
     1852    def pop(self, start, length):
     1853        data = self.get(start, length)
     1854        if data:
     1855            self.remove(start, length)
     1856        return data
     1857    def remove(self, start, length):
     1858        self.missing = replace(extend(self.missing, start, length, "1"),
     1859                               start, "1"*length)
     1860    def add(self, start, data):
     1861        self.missing = replace(extend(self.missing, start, len(data), "1"),
     1862                               start, "0"*len(data))
     1863        self.data = replace(extend(self.data, start, len(data), " "),
     1864                            start, data)
     1865
     1866
     1867class StringSpans(unittest.TestCase):
     1868    def do_basic(self, klass):
     1869        ds = klass()
     1870        self.failUnlessEqual(len(ds), 0)
     1871        self.failUnlessEqual(list(ds._dump()), [])
     1872        self.failUnlessEqual(sum([len(d) for (s,d) in ds.get_chunks()]), 0)
     1873        s = ds.get_spans()
     1874        self.failUnlessEqual(ds.get(0, 4), None)
     1875        self.failUnlessEqual(ds.pop(0, 4), None)
     1876        ds.remove(0, 4)
     1877
     1878        ds.add(2, "four")
     1879        self.failUnlessEqual(len(ds), 4)
     1880        self.failUnlessEqual(list(ds._dump()), [2,3,4,5])
     1881        self.failUnlessEqual(sum([len(d) for (s,d) in ds.get_chunks()]), 4)
     1882        s = ds.get_spans()
     1883        self.failUnless((2,2) in s)
     1884        self.failUnlessEqual(ds.get(0, 4), None)
     1885        self.failUnlessEqual(ds.pop(0, 4), None)
     1886        self.failUnlessEqual(ds.get(4, 4), None)
     1887
     1888        ds2 = klass(ds)
     1889        self.failUnlessEqual(len(ds2), 4)
     1890        self.failUnlessEqual(list(ds2._dump()), [2,3,4,5])
     1891        self.failUnlessEqual(sum([len(d) for (s,d) in ds2.get_chunks()]), 4)
     1892        self.failUnlessEqual(ds2.get(0, 4), None)
     1893        self.failUnlessEqual(ds2.pop(0, 4), None)
     1894        self.failUnlessEqual(ds2.pop(2, 3), "fou")
     1895        self.failUnlessEqual(sum([len(d) for (s,d) in ds2.get_chunks()]), 1)
     1896        self.failUnlessEqual(ds2.get(2, 3), None)
     1897        self.failUnlessEqual(ds2.get(5, 1), "r")
     1898        self.failUnlessEqual(ds.get(2, 3), "fou")
     1899        self.failUnlessEqual(sum([len(d) for (s,d) in ds.get_chunks()]), 4)
     1900
     1901        ds.add(0, "23")
     1902        self.failUnlessEqual(len(ds), 6)
     1903        self.failUnlessEqual(list(ds._dump()), [0,1,2,3,4,5])
     1904        self.failUnlessEqual(sum([len(d) for (s,d) in ds.get_chunks()]), 6)
     1905        self.failUnlessEqual(ds.get(0, 4), "23fo")
     1906        self.failUnlessEqual(ds.pop(0, 4), "23fo")
     1907        self.failUnlessEqual(sum([len(d) for (s,d) in ds.get_chunks()]), 2)
     1908        self.failUnlessEqual(ds.get(0, 4), None)
     1909        self.failUnlessEqual(ds.pop(0, 4), None)
     1910
     1911        ds = klass()
     1912        ds.add(2, "four")
     1913        ds.add(3, "ea")
     1914        self.failUnlessEqual(ds.get(2, 4), "fear")
     1915
     1916    def do_scan(self, klass):
     1917        # do a test with gaps and spans of size 1 and 2
     1918        #  left=(1,11) * right=(1,11) * gapsize=(1,2)
     1919        # 111, 112, 121, 122, 211, 212, 221, 222
     1920        #    211
     1921        #      121
     1922        #         112
     1923        #            212
     1924        #               222
     1925        #                   221
     1926        #                      111
     1927        #                        122
     1928        #  11 1  1 11 11  11  1 1  111
     1929        # 0123456789012345678901234567
     1930        # abcdefghijklmnopqrstuvwxyz-=
     1931        pieces = [(1, "bc"),
     1932                  (4, "e"),
     1933                  (7, "h"),
     1934                  (9, "jk"),
     1935                  (12, "mn"),
     1936                  (16, "qr"),
     1937                  (20, "u"),
     1938                  (22, "w"),
     1939                  (25, "z-="),
     1940                  ]
     1941        p_elements = set([1,2,4,7,9,10,12,13,16,17,20,22,25,26,27])
     1942        S = "abcdefghijklmnopqrstuvwxyz-="
     1943        # TODO: when adding data, add capital letters, to make sure we aren't
     1944        # just leaving the old data in place
     1945        l = len(S)
     1946        def base():
     1947            ds = klass()
     1948            for start, data in pieces:
     1949                ds.add(start, data)
     1950            return ds
     1951        def dump(s):
     1952            p = set(s._dump())
     1953            # wow, this is the first time I've ever wanted ?: in python
     1954            # note: this requires python2.5
     1955            d = "".join([(S[i] if i in p else " ") for i in range(l)])
     1956            assert len(d) == l
     1957            return d
     1958        DEBUG = False
     1959        for start in range(0, l):
     1960            for end in range(start+1, l):
     1961                # add [start-end) to the baseline
     1962                which = "%d-%d" % (start, end-1)
     1963                p_added = set(range(start, end))
     1964                b = base()
     1965                if DEBUG:
     1966                    print
     1967                    print dump(b), which
     1968                    add = klass(); add.add(start, S[start:end])
     1969                    print dump(add)
     1970                b.add(start, S[start:end])
     1971                if DEBUG:
     1972                    print dump(b)
     1973                # check that the new span is there
     1974                d = b.get(start, end-start)
     1975                self.failUnlessEqual(d, S[start:end], which)
     1976                # check that all the original pieces are still there
     1977                for t_start, t_data in pieces:
     1978                    t_len = len(t_data)
     1979                    self.failUnlessEqual(b.get(t_start, t_len),
     1980                                         S[t_start:t_start+t_len],
     1981                                         "%s %d+%d" % (which, t_start, t_len))
     1982                # check that a lot of subspans are mostly correct
     1983                for t_start in range(l):
     1984                    for t_len in range(1,4):
     1985                        d = b.get(t_start, t_len)
     1986                        if d is not None:
     1987                            which2 = "%s+(%d-%d)" % (which, t_start,
     1988                                                     t_start+t_len-1)
     1989                            self.failUnlessEqual(d, S[t_start:t_start+t_len],
     1990                                                 which2)
     1991                        # check that removing a subspan gives the right value
     1992                        b2 = klass(b)
     1993                        b2.remove(t_start, t_len)
     1994                        removed = set(range(t_start, t_start+t_len))
     1995                        for i in range(l):
     1996                            exp = (((i in p_elements) or (i in p_added))
     1997                                   and (i not in removed))
     1998                            which2 = "%s-(%d-%d)" % (which, t_start,
     1999                                                     t_start+t_len-1)
     2000                            self.failUnlessEqual(bool(b2.get(i, 1)), exp,
     2001                                                 which2+" %d" % i)
     2002
     2003    def test_test(self):
     2004        self.do_basic(SimpleDataSpans)
     2005        self.do_scan(SimpleDataSpans)
     2006
     2007    def test_basic(self):
     2008        self.do_basic(DataSpans)
     2009        self.do_scan(DataSpans)
     2010
     2011    def test_random(self):
     2012        # attempt to increase coverage of corner cases by comparing behavior
     2013        # of a simple-but-slow model implementation against the
     2014        # complex-but-fast actual implementation, in a large number of random
     2015        # operations
     2016        S1 = SimpleDataSpans
     2017        S2 = DataSpans
     2018        s1 = S1(); s2 = S2()
     2019        seed = ""
     2020        def _randstr(length, seed):
     2021            created = 0
     2022            pieces = []
     2023            while created < length:
     2024                piece = md5(seed + str(created)).hexdigest()
     2025                pieces.append(piece)
     2026                created += len(piece)
     2027            return "".join(pieces)[:length]
     2028        def _create(subseed):
     2029            ns1 = S1(); ns2 = S2()
     2030            for i in range(10):
     2031                what = md5(subseed+str(i)).hexdigest()
     2032                start = int(what[2:4], 16)
     2033                length = max(1,int(what[5:6], 16))
     2034                ns1.add(start, _randstr(length, what[7:9]));
     2035                ns2.add(start, _randstr(length, what[7:9]))
     2036            return ns1, ns2
     2037
     2038        #print
     2039        for i in range(1000):
     2040            what = md5(seed+str(i)).hexdigest()
     2041            op = what[0]
     2042            subop = what[1]
     2043            start = int(what[2:4], 16)
     2044            length = max(1,int(what[5:6], 16))
     2045            #print what
     2046            if op in "0":
     2047                if subop in "0123456":
     2048                    s1 = S1(); s2 = S2()
     2049                else:
     2050                    s1, s2 = _create(what[7:11])
     2051                #print "s2 = %s" % list(s2._dump())
     2052            elif op in "123456":
     2053                #print "s2.add(%d,%d)" % (start, length)
     2054                s1.add(start, _randstr(length, what[7:9]));
     2055                s2.add(start, _randstr(length, what[7:9]))
     2056            elif op in "789abc":
     2057                #print "s2.remove(%d,%d)" % (start, length)
     2058                s1.remove(start, length); s2.remove(start, length)
     2059            else:
     2060                #print "s2.pop(%d,%d)" % (start, length)
     2061                d1 = s1.pop(start, length); d2 = s2.pop(start, length)
     2062                self.failUnlessEqual(d1, d2)
     2063            #print "s1 now %s" % list(s1._dump())
     2064            #print "s2 now %s" % list(s2._dump())
     2065            self.failUnlessEqual(len(s1), len(s2))
     2066            self.failUnlessEqual(list(s1._dump()), list(s2._dump()))
     2067            for j in range(100):
     2068                what = md5(what[12:14]+str(j)).hexdigest()
     2069                start = int(what[2:4], 16)
     2070                length = max(1, int(what[5:6], 16))
     2071                d1 = s1.get(start, length); d2 = s2.get(start, length)
     2072                self.failUnlessEqual(d1, d2, "%d+%d" % (start, length))
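
The loop above is a differential test: each randomly chosen operation is applied to both the simple reference implementation (SimpleDataSpans) and the optimized DataSpans, and their observable state is compared after every step. A minimal sketch of the same pattern, assuming only the add/remove/__len__/_dump API the test already exercises; the helper name and the fixed operation list are illustrative, not part of the patch:

    def apply_and_compare(model, real, operations):
        # mirror each mutation on the model and the real implementation,
        # then insist that their contents agree exactly
        for (method, args) in operations:
            getattr(model, method)(*args)
            getattr(real, method)(*args)
            assert len(model) == len(real)
            assert list(model._dump()) == list(real._dump())

    apply_and_compare(SimpleDataSpans(), DataSpans(),
                      [("add", (0, "abc")),
                       ("add", (5, "xy")),
                       ("remove", (1, 3))])
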
  • src/allmydata/test/test_web.py

    diff --git a/src/allmydata/test/test_web.py b/src/allmydata/test/test_web.py
    index b148598..d70460f 100644
    a b class FakeClient(Client): 
    105105        self.uploader = FakeUploader()
    106106        self.uploader.setServiceParent(self)
    107107        self.nodemaker = FakeNodeMaker(None, self._secret_holder, None,
    108                                        self.uploader, None, None,
     108                                       self.uploader, None,
    109109                                       None, None)
    110110
    111111    def startService(self):
  • src/allmydata/util/dictutil.py

    diff --git a/src/allmydata/util/dictutil.py b/src/allmydata/util/dictutil.py
    index 3dc815b..91785ac 100644
    a b class DictOfSets(dict): 
    5757        if not self[key]:
    5858            del self[key]
    5959
     60    def allvalues(self):
     61        # return a set that merges all value sets
     62        r = set()
     63        for key in self:
     64            r.update(self[key])
     65        return r
     66
    6067class UtilDict:
    6168    def __init__(self, initialdata={}):
    6269        self.d = {}
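
A quick illustration of the new allvalues() helper, which merges every value set into a single set. The add() calls assume the DictOfSets.add(key, value) method from the rest of dictutil.py (not shown in this hunk); the server names and share numbers are made up:

    shares = DictOfSets()
    shares.add("server-A", 0)   # server A holds share 0
    shares.add("server-A", 3)   # ...and share 3
    shares.add("server-B", 3)   # server B also holds share 3
    assert shares.allvalues() == set([0, 3])
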
  • new file src/allmydata/util/spans.py

    diff --git a/src/allmydata/util/spans.py b/src/allmydata/util/spans.py
    new file mode 100755
    index 0000000..853d207
    - +  
     1
     2class Spans:
     3    """I represent a compressed list of booleans, one per index (an integer).
     4    Typically, each index represents an offset into a large string, pointing
     5    to a specific byte of a share. In this context, True means that byte has
     6    been received, or has been requested.
     7
     8    Another way to look at this is maintaining a set of integers, optimized
     9    for operations on spans like 'add range to set' and 'is range in set?'.
     10
     11    This is a python equivalent of perl's Set::IntSpan module, frequently
     12    used to represent .newsrc contents.
     13
     14    Rather than storing an actual (large) list or dictionary, I represent my
     15    internal state as a sorted list of spans, each with a start and a length.
     16    My API is presented in terms of start+length pairs. I provide set
     17    arithmetic operators, to efficiently answer questions like 'I want bytes
     18    XYZ, I already requested bytes ABC, and I've already received bytes DEF:
     19    what bytes should I request now?'.
     20
      21    The new downloader will use me to keep track of which bytes it has already
      22    requested or received.
     23    """
     24
     25    def __init__(self, _span_or_start=None, length=None):
     26        self._spans = list()
     27        if length is not None:
     28            self._spans.append( (_span_or_start, length) )
     29        elif _span_or_start:
     30            for (start,length) in _span_or_start:
     31                self.add(start, length)
     32        self._check()
     33
     34    def _check(self):
     35        assert sorted(self._spans) == self._spans
     36        prev_end = None
     37        try:
     38            for (start,length) in self._spans:
     39                if prev_end is not None:
     40                    assert start > prev_end
     41                prev_end = start+length
     42        except AssertionError:
     43            print "BAD:", self.dump()
     44            raise
     45
     46    def add(self, start, length):
     47        assert start >= 0
     48        assert length > 0
     49        #print " ADD [%d+%d -%d) to %s" % (start, length, start+length, self.dump())
     50        first_overlap = last_overlap = None
     51        for i,(s_start,s_length) in enumerate(self._spans):
     52            #print "  (%d+%d)-> overlap=%s adjacent=%s" % (s_start,s_length, overlap(s_start, s_length, start, length), adjacent(s_start, s_length, start, length))
     53            if (overlap(s_start, s_length, start, length)
     54                or adjacent(s_start, s_length, start, length)):
     55                last_overlap = i
     56                if first_overlap is None:
     57                    first_overlap = i
     58                continue
     59            # no overlap
     60            if first_overlap is not None:
     61                break
     62        #print "  first_overlap", first_overlap, last_overlap
     63        if first_overlap is None:
     64            # no overlap, so just insert the span and sort by starting
     65            # position.
     66            self._spans.insert(0, (start,length))
     67            self._spans.sort()
     68        else:
     69            # everything from [first_overlap] to [last_overlap] overlapped
     70            first_start,first_length = self._spans[first_overlap]
     71            last_start,last_length = self._spans[last_overlap]
     72            newspan_start = min(start, first_start)
     73            newspan_end = max(start+length, last_start+last_length)
     74            newspan_length = newspan_end - newspan_start
     75            newspan = (newspan_start, newspan_length)
     76            self._spans[first_overlap:last_overlap+1] = [newspan]
     77        #print "  ADD done: %s" % self.dump()
     78        self._check()
     79
     80        return self
     81
     82    def remove(self, start, length):
     83        assert start >= 0
     84        assert length > 0
     85        #print " REMOVE [%d+%d -%d) from %s" % (start, length, start+length, self.dump())
     86        first_complete_overlap = last_complete_overlap = None
     87        for i,(s_start,s_length) in enumerate(self._spans):
     88            s_end = s_start + s_length
     89            o = overlap(s_start, s_length, start, length)
     90            if o:
     91                o_start, o_length = o
     92                o_end = o_start+o_length
     93                if o_start == s_start and o_end == s_end:
     94                    # delete this span altogether
     95                    if first_complete_overlap is None:
     96                        first_complete_overlap = i
     97                    last_complete_overlap = i
     98                elif o_start == s_start:
     99                    # we only overlap the left side, so trim the start
     100                    #    1111
     101                    #  rrrr
     102                    #    oo
     103                    # ->   11
     104                    new_start = o_end
     105                    new_end = s_end
     106                    assert new_start > s_start
     107                    new_length = new_end - new_start
     108                    self._spans[i] = (new_start, new_length)
     109                elif o_end == s_end:
     110                    # we only overlap the right side
     111                    #    1111
     112                    #      rrrr
     113                    #      oo
     114                    # -> 11
     115                    new_start = s_start
     116                    new_end = o_start
     117                    assert new_end < s_end
     118                    new_length = new_end - new_start
     119                    self._spans[i] = (new_start, new_length)
     120                else:
     121                    # we overlap the middle, so create a new span. No need to
     122                    # examine any other spans.
     123                    #    111111
     124                    #      rr
     125                    #    LL  RR
     126                    left_start = s_start
     127                    left_end = o_start
     128                    left_length = left_end - left_start
     129                    right_start = o_end
     130                    right_end = s_end
     131                    right_length = right_end - right_start
     132                    self._spans[i] = (left_start, left_length)
     133                    self._spans.append( (right_start, right_length) )
     134                    self._spans.sort()
     135                    break
     136        if first_complete_overlap is not None:
     137            del self._spans[first_complete_overlap:last_complete_overlap+1]
     138        #print "  REMOVE done: %s" % self.dump()
     139        self._check()
     140        return self
     141
     142    def dump(self):
     143        return "len=%d: %s" % (len(self),
     144                               ",".join(["[%d-%d]" % (start,start+l-1)
     145                                         for (start,l) in self._spans]) )
     146
     147    def each(self):
     148        for start, length in self._spans:
     149            for i in range(start, start+length):
     150                yield i
     151
     152    def __iter__(self):
     153        for s in self._spans:
     154            yield s
     155
     156    def __len__(self):
     157        # this also gets us bool(s)
     158        return sum([length for start,length in self._spans])
     159
     160    def __add__(self, other):
     161        s = self.__class__(self)
     162        for (start, length) in other:
     163            s.add(start, length)
     164        return s
     165
     166    def __sub__(self, other):
     167        s = self.__class__(self)
     168        for (start, length) in other:
     169            s.remove(start, length)
     170        return s
     171
     172    def __iadd__(self, other):
     173        for (start, length) in other:
     174            self.add(start, length)
     175        return self
     176
     177    def __isub__(self, other):
     178        for (start, length) in other:
     179            self.remove(start, length)
     180        return self
     181
     182    def __contains__(self, (start,length)):
     183        for span_start,span_length in self._spans:
     184            o = overlap(start, length, span_start, span_length)
     185            if o:
     186                o_start,o_length = o
     187                if o_start == start and o_length == length:
     188                    return True
     189        return False
     190
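
A small sketch of the set arithmetic described in the Spans docstring, using the constructor and operators defined above; the specific offsets are illustrative:

    wanted    = Spans(10, 100)        # we need bytes [10,110)
    requested = Spans(10, 30)         # we already asked for [10,40)
    received  = Spans(40, 20)         # we already hold [40,60)

    ask_for = wanted - requested - received
    assert list(ask_for) == [(60, 50)]   # still need to request [60,110)
    assert (10, 30) in requested         # __contains__ means full coverage
    assert len(received) == 20           # number of bytes in the set
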
     191def overlap(start0, length0, start1, length1):
     192    # return start2,length2 of the overlapping region, or None
     193    #  00      00   000   0000  00  00 000  00   00  00      00
     194    #     11    11   11    11   111 11 11  1111 111 11    11
     195    left = max(start0, start1)
     196    right = min(start0+length0, start1+length1)
     197    # if there is overlap, 'left' will be its start, and right-1 will
      198    # be its end
     199    if left < right:
     200        return (left, right-left)
     201    return None
     202
     203def adjacent(start0, length0, start1, length1):
     204    if (start0 < start1) and start0+length0 == start1:
     205        return True
     206    elif (start1 < start0) and start1+length1 == start0:
     207        return True
     208    return False
     209
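
Worked examples for the two helpers above; the values are illustrative:

    # [10,15) and [12,22) share the three bytes [12,15)
    assert overlap(10, 5, 12, 10) == (12, 3)
    # [10,15) and [20,25) do not intersect at all
    assert overlap(10, 5, 20, 5) is None
    # [10,15) ends exactly where [15,20) begins, so they are adjacent
    assert adjacent(10, 5, 15, 5)
    # overlapping spans are not considered adjacent
    assert not adjacent(10, 5, 12, 10)
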
     210class DataSpans:
     211    """I represent portions of a large string. Equivalently, I can be said to
     212    maintain a large array of characters (with gaps of empty elements). I can
     213    be used to manage access to a remote share, where some pieces have been
     214    retrieved, some have been requested, and others have not been read.
     215    """
     216
     217    def __init__(self, other=None):
     218        self.spans = [] # (start, data) tuples, non-overlapping, merged
     219        if other:
     220            for (start, data) in other.get_chunks():
     221                self.add(start, data)
     222
     223    def __len__(self):
     224        # return number of bytes we're holding
     225        return sum([len(data) for (start,data) in self.spans])
     226
     227    def _dump(self):
     228        # return iterator of sorted list of offsets, one per byte
     229        for (start,data) in self.spans:
     230            for i in range(start, start+len(data)):
     231                yield i
     232
     233    def dump(self):
     234        return "len=%d: %s" % (len(self),
     235                               ",".join(["[%d-%d]" % (start,start+len(data)-1)
     236                                         for (start,data) in self.spans]) )
     237
     238    def get_chunks(self):
     239        return list(self.spans)
     240
     241    def get_spans(self):
     242        """Return a Spans object with a bit set for each byte I hold"""
     243        return Spans([(start, len(data)) for (start,data) in self.spans])
     244
     245    def assert_invariants(self):
     246        if not self.spans:
     247            return
      248        prev_end = self.spans[0][0] + len(self.spans[0][1])
      249        for start, data in self.spans[1:]:
      250            if not start > prev_end:
      251                # adjacent or overlapping: bad
      252                print "ASSERTION FAILED", self.spans
      253                raise AssertionError
      254            prev_end = start + len(data)
     255
     256    def get(self, start, length):
      257        # return exactly LENGTH bytes starting at START, or None if any byte is missing
     258        #print "get", start, length, self.spans
     259        end = start+length
     260        for (s_start,s_data) in self.spans:
     261            s_end = s_start+len(s_data)
     262            #print " ",s_start,s_end
     263            if s_start <= start < s_end:
     264                # we want some data from this span. Because we maintain
     265                # strictly merged and non-overlapping spans, everything we
     266                # want must be in this span.
     267                offset = start - s_start
     268                if offset + length > len(s_data):
     269                    #print " None, span falls short"
     270                    return None # span falls short
     271                #print " some", s_data[offset:offset+length]
     272                return s_data[offset:offset+length]
     273            if s_start >= end:
     274                # we've gone too far: no further spans will overlap
     275                #print " None, gone too far"
     276                return None
     277        #print " None, ran out of spans"
     278        return None
     279
     280    def add(self, start, data):
     281        # first: walk through existing spans, find overlap, modify-in-place
     282        #  create list of new spans
     283        #  add new spans
     284        #  sort
     285        #  merge adjacent spans
     286        #print "add", start, data, self.spans
     287        end = start + len(data)
     288        i = 0
     289        while len(data):
     290            #print " loop", start, data, i, len(self.spans), self.spans
     291            if i >= len(self.spans):
     292                #print " append and done"
     293                # append a last span
     294                self.spans.append( (start, data) )
     295                break
     296            (s_start,s_data) = self.spans[i]
     297            # five basic cases:
     298            #  a: OLD  b:OLDD  c1:OLD  c2:OLD   d1:OLDD  d2:OLD  e: OLLDD
     299            #    NEW     NEW      NEW     NEWW      NEW      NEW     NEW
     300            #
     301            # we handle A by inserting a new segment (with "N") and looping,
     302            # turning it into B or C. We handle B by replacing a prefix and
     303            # terminating. We handle C (both c1 and c2) by replacing the
     304            # segment (and, for c2, looping, turning it into A). We handle D
     305            # by replacing a suffix (and, for d2, looping, turning it into
     306            # A). We handle E by replacing the middle and terminating.
     307            if start < s_start:
     308                # case A: insert a new span, then loop with the remainder
      309                #print " insert new span"
     310                s_len = s_start-start
     311                self.spans.insert(i, (start, data[:s_len]))
     312                i += 1
     313                start = s_start
     314                data = data[s_len:]
     315                continue
     316            s_len = len(s_data)
     317            s_end = s_start+s_len
     318            if s_start <= start < s_end:
     319                #print " modify this span", s_start, start, s_end
     320                # we want to modify some data in this span: a prefix, a
     321                # suffix, or the whole thing
     322                if s_start == start:
     323                    if s_end <= end:
     324                        #print " replace whole segment"
     325                        # case C: replace this segment
     326                        self.spans[i] = (s_start, data[:s_len])
     327                        i += 1
     328                        start += s_len
     329                        data = data[s_len:]
     330                        # C2 is where len(data)>0
     331                        continue
     332                    # case B: modify the prefix, retain the suffix
     333                    #print " modify prefix"
     334                    self.spans[i] = (s_start, data + s_data[len(data):])
     335                    break
     336                if start > s_start and end < s_end:
     337                    # case E: modify the middle
     338                    #print " modify middle"
     339                    prefix_len = start - s_start # we retain this much
     340                    suffix_len = s_end - end # and retain this much
     341                    newdata = s_data[:prefix_len] + data + s_data[-suffix_len:]
     342                    self.spans[i] = (s_start, newdata)
     343                    break
     344                # case D: retain the prefix, modify the suffix
     345                #print " modify suffix"
     346                prefix_len = start - s_start # we retain this much
     347                suffix_len = s_len - prefix_len # we replace this much
     348                #print "  ", s_data, prefix_len, suffix_len, s_len, data
     349                self.spans[i] = (s_start,
     350                                 s_data[:prefix_len] + data[:suffix_len])
     351                i += 1
     352                start += suffix_len
     353                data = data[suffix_len:]
     354                #print "  now", start, data
     355                # D2 is where len(data)>0
     356                continue
     357            # else we're not there yet
     358            #print " still looking"
     359            i += 1
     360            continue
     361        # now merge adjacent spans
     362        #print " merging", self.spans
     363        newspans = []
     364        for (s_start,s_data) in self.spans:
     365            if newspans and adjacent(newspans[-1][0], len(newspans[-1][1]),
     366                                     s_start, len(s_data)):
     367                newspans[-1] = (newspans[-1][0], newspans[-1][1] + s_data)
     368            else:
     369                newspans.append( (s_start, s_data) )
     370        self.spans = newspans
     371        self.assert_invariants()
     372        #print " done", self.spans
     373
     374    def remove(self, start, length):
     375        i = 0
     376        end = start + length
     377        #print "remove", start, length, self.spans
     378        while i < len(self.spans):
     379            (s_start,s_data) = self.spans[i]
     380            if s_start >= end:
     381                # this segment is entirely right of the removed region, and
     382                # all further segments are even further right. We're done.
     383                break
     384            s_len = len(s_data)
     385            s_end = s_start + s_len
     386            o = overlap(start, length, s_start, s_len)
     387            if not o:
     388                i += 1
     389                continue
     390            o_start, o_len = o
     391            o_end = o_start + o_len
     392            if o_len == s_len:
     393                # remove the whole segment
     394                del self.spans[i]
     395                continue
     396            if o_start == s_start:
     397                # remove a prefix, leaving the suffix from o_end to s_end
     398                prefix_len = o_end - o_start
     399                self.spans[i] = (o_end, s_data[prefix_len:])
     400                i += 1
     401                continue
     402            elif o_end == s_end:
     403                # remove a suffix, leaving the prefix from s_start to o_start
     404                prefix_len = o_start - s_start
     405                self.spans[i] = (s_start, s_data[:prefix_len])
     406                i += 1
     407                continue
     408            # remove the middle, creating a new segment
     409            # left is s_start:o_start, right is o_end:s_end
     410            left_len = o_start - s_start
     411            left = s_data[:left_len]
     412            right_len = s_end - o_end
     413            right = s_data[-right_len:]
     414            self.spans[i] = (s_start, left)
     415            self.spans.insert(i+1, (o_end, right))
     416            break
     417        #print " done", self.spans
     418
     419    def pop(self, start, length):
     420        data = self.get(start, length)
     421        if data:
     422            self.remove(start, length)
     423        return data
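
To close out the new module, a brief sketch of how the DataSpans methods above compose; the offsets and strings are illustrative:

    ds = DataSpans()
    ds.add(0, "abc")
    ds.add(3, "def")                  # adjacent chunks are merged
    assert ds.get_chunks() == [(0, "abcdef")]
    assert ds.get(2, 3) == "cde"
    assert ds.get(4, 10) is None      # falls short: no partial answers
    assert ds.pop(0, 2) == "ab"       # get() plus remove() in one step
    assert ds.get_chunks() == [(2, "cdef")]
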