source: trunk/src/allmydata/hashtree.py

Last change on this file was 1cfe843d, checked in by Alexandre Detiste <alexandre.detiste@…>, at 2024-02-22T23:40:25Z

more python2 removal

  • Property mode set to 100644
File size: 19.1 KB
Line 
1# -*- test-case-name: allmydata.test.test_hashtree -*-
2"""
3Read and write chunks from files.
4
5Version 1.0.0.
6
7A file is divided into blocks, each of which has size L{BLOCK_SIZE}
8(except for the last block, which may be smaller).  Blocks are encoded
9into chunks.  One publishes the hash of the entire file.  Clients
10who want to download the file first obtain the hash, then the clients
11can receive chunks in any order.  Cryptographic hashing is used to
12verify each received chunk before writing to disk.  Thus it is
13impossible to download corrupt data if one has the correct file hash.
14
15One obtains the hash of a complete file via
16L{CompleteChunkFile.file_hash}.  One can read chunks from a complete
17file by the sequence operations of C{len()} and subscripting on a
18L{CompleteChunkFile} object.  One can open an empty or partially
19downloaded file with L{PartialChunkFile}, and read and write chunks
20to this file.  A chunk will fail to write if its contents and index
21are not consistent with the overall file hash passed to
22L{PartialChunkFile} when the partial chunk file was first created.
23
24The chunks have an overhead of less than 4% for files of size
25less than C{10**20} bytes.
26
27Benchmarks:
28
29 - On a 3 GHz Pentium 3, it took 3.4 minutes to first make a
30   L{CompleteChunkFile} object for a 4 GB file.  Up to 10 MB of
31   memory was used as the constructor ran.  A metafile filename
32   was passed to the constructor, and so the hash information was
33   written to the metafile.  The object used a negligible amount
34   of memory after the constructor was finished.
35 - Creation of L{CompleteChunkFile} objects in future runs of the
36   program took negligible time, since the hash information was
37   already stored in the metafile.
38
39@var BLOCK_SIZE:     Size of a block.  See L{BlockFile}.
40@var MAX_CHUNK_SIZE: Upper bound on the size of a chunk.
41                     See L{CompleteChunkFile}.
42
43free (adj.): unencumbered; not under the control of others
44Written by Connelly Barnes in 2005 and released into the
45public domain  with no warranty of any kind, either expressed
46or implied.  It probably won't make your computer catch on fire,
47or eat  your children, but it might.  Use at your own risk.
48
49Ported to Python 3.
50"""
51
52from allmydata.util import mathutil # from the pyutil library
53
54from allmydata.util import base32
55from allmydata.util.hashutil import tagged_hash, tagged_pair_hash
56
57__version__ = '1.0.0-allmydata'
58
59BLOCK_SIZE     = 65536
60MAX_CHUNK_SIZE = BLOCK_SIZE + 4096
61
62def roundup_pow2(x):
63    """
64    Round integer C{x} up to the nearest power of 2.
65    """
66    ans = 1
67    while ans < x:
68        ans *= 2
69    return ans
70
71
72class CompleteBinaryTreeMixin(object):
73    """
74    Adds convenience methods to a complete binary tree.
75
76    Assumes the total number of elements in the binary tree may be
77    accessed via C{__len__}, and that each element can be retrieved
78    using list subscripting.
79
80    Tree is indexed like so::
81
82
83                        0
84                   /        \
85                1               2
86             /    \          /    \
87           3       4       5       6
88          / \     / \     / \     / \
89         7   8   9   10  11  12  13  14
90
91    """
92
93    def parent(self, i):
94        """
95        Index of the parent of C{i}.
96        """
97        if i < 1 or (hasattr(self, '__len__') and i >= len(self)):
98            raise IndexError('index out of range: ' + repr(i))
99        return (i - 1) // 2
100
101    def lchild(self, i):
102        """
103        Index of the left child of C{i}.
104        """
105        ans = 2 * i + 1
106        if i < 0 or (hasattr(self, '__len__') and ans >= len(self)):
107            raise IndexError('index out of range: ' + repr(i))
108        return ans
109
110    def rchild(self, i):
111        """
112        Index of right child of C{i}.
113        """
114        ans = 2 * i + 2
115        if i < 0 or (hasattr(self, '__len__') and ans >= len(self)):
116            raise IndexError('index out of range: ' + repr(i))
117        return ans
118
119    def sibling(self, i):
120        """
121        Index of sibling of C{i}.
122        """
123        parent = self.parent(i)
124        if self.lchild(parent) == i:
125            return self.rchild(parent)
126        else:
127            return self.lchild(parent)
128
129    def needed_for(self, i):
130        """
131        Return a list of node indices that are necessary for the hash chain.
132        """
133        if i < 0 or i >= len(self):
134            raise IndexError('index out of range: 0 >= %s < %s' % (i, len(self)))
135        needed = []
136        here = i
137        while here != 0:
138            needed.append(self.sibling(here))
139            here = self.parent(here)
140        return needed
141
142    def depth_first(self, i=0):
143        yield i, 0
144        try:
145            for child,childdepth in self.depth_first(self.lchild(i)):
146                yield child, childdepth+1
147        except IndexError:
148            pass
149        try:
150            for child,childdepth in self.depth_first(self.rchild(i)):
151                yield child, childdepth+1
152        except IndexError:
153            pass
154
155    def dump(self):
156        lines = []
157        for i,depth in self.depth_first():
158            value = base32.b2a_or_none(self[i])
159            if value is not None:
160                value = str(value, "utf-8")
161            lines.append("%s%3d: %s" % ("  "*depth, i, value))
162        return "\n".join(lines) + "\n"
163
164    def get_leaf_index(self, leafnum):
165        return self.first_leaf_num + leafnum
166
167    def get_leaf(self, leafnum):
168        return self[self.first_leaf_num + leafnum]
169
170def depth_of(i):
171    """Return the depth or level of the given node. Level 0 contains node 0
172    Level 1 contains nodes 1 and 2. Level 2 contains nodes 3,4,5,6."""
173    return mathutil.log_floor(i+1, 2)
174
175def empty_leaf_hash(i):
176    return tagged_hash(b'Merkle tree empty leaf', b"%d" % i)
177
178def pair_hash(a, b):
179    return tagged_pair_hash(b'Merkle tree internal node', a, b)
180
181class HashTree(CompleteBinaryTreeMixin, list):
182    """
183    Compute Merkle hashes at any node in a complete binary tree.
184
185    Tree is indexed like so::
186
187
188                        0
189                   /        \
190                1               2
191             /    \          /    \
192           3       4       5       6
193          / \     / \     / \     / \
194         7   8   9   10  11  12  13  14  <- List passed to constructor.
195
196    """
197
198    def __init__(self, L):
199        """
200        Create complete binary tree from list of hash strings.
201
202        The list is augmented by hashes so its length is a power of 2, and
203        then this is used as the bottom row of the hash tree.
204
205        The augmenting is done so that if the augmented element is at index
206        C{i}, then its value is C{hash(tagged_hash('Merkle tree empty leaf',
207        '%d'%i))}.
208        """
209
210        # Augment the list.
211        start = len(L)
212        end   = roundup_pow2(len(L))
213        self.first_leaf_num = end - 1
214        L     = L + [None] * (end - start)
215        for i in range(start, end):
216            L[i] = empty_leaf_hash(i)
217        # Form each row of the tree.
218        rows = [L]
219        while len(rows[-1]) != 1:
220            last = rows[-1]
221            rows += [[pair_hash(last[2*i], last[2*i+1])
222                                for i in range(len(last)//2)]]
223        # Flatten the list of rows into a single list.
224        rows.reverse()
225        self[:] = sum(rows, [])
226
227    def needed_hashes(self, leafnum, include_leaf=False):
228        """Which hashes will someone need to validate a given data block?
229
230        I am used to answer a question: supposing you have the data block
231        that is used to form leaf hash N, and you want to validate that it,
232        which hashes would you need?
233
234        I accept a leaf number and return a set of 'hash index' values, which
235        are integers from 0 to len(self). In the 'hash index' number space,
236        hash[0] is the root hash, while hash[len(self)-1] is the last leaf
237        hash.
238
239        This method can be used to find out which hashes you should request
240        from some untrusted source (usually the same source that provides the
241        data block), so you can minimize storage or transmission overhead. It
242        can also be used to determine which hashes you should send to a
243        remote data store so that it will be able to provide validatable data
244        in the future.
245
246        I will not include '0' (the root hash) in the result, since the root
247        is generally stored somewhere that is more trusted than the source of
248        the remaining hashes. I will include the leaf hash itself only if you
249        ask me to, by passing include_leaf=True.
250        """
251
252        needed = set(self.needed_for(self.first_leaf_num + leafnum))
253        if include_leaf:
254            needed.add(self.first_leaf_num + leafnum)
255        return needed
256
257
258class NotEnoughHashesError(Exception):
259    pass
260
261class BadHashError(Exception):
262    pass
263
264class IncompleteHashTree(CompleteBinaryTreeMixin, list):
265    """I am a hash tree which may or may not be complete. I can be used to
266    validate inbound data from some untrustworthy provider who has a subset
267    of leaves and a sufficient subset of internal nodes.
268
269    Initially I am completely unpopulated. Over time, I will become filled
270    with hashes, just enough to validate particular leaf nodes.
271
272    If you desire to validate leaf number N, first find out which hashes I
273    need by calling needed_hashes(N). This will return a list of node numbers
274    (which will nominally be the sibling chain between the given leaf and the
275    root, but if I already have some of those nodes, needed_hashes(N) will
276    only return a subset). Obtain these hashes from the data provider, then
277    tell me about them with set_hash(i, HASH). Once I have enough hashes, you
278    can tell me the hash of the leaf with set_leaf_hash(N, HASH), and I will
279    either return None or raise BadHashError.
280
281    The first hash to be set will probably be 0 (the root hash), since this
282    is the one that will come from someone more trustworthy than the data
283    provider.
284
285    """
286
287    def __init__(self, num_leaves):
288        L = [None] * num_leaves
289        start = len(L)
290        end   = roundup_pow2(len(L))
291        self.first_leaf_num = end - 1
292        L     = L + [None] * (end - start)
293        rows = [L]
294        while len(rows[-1]) != 1:
295            last = rows[-1]
296            rows += [[None for i in range(len(last)//2)]]
297        # Flatten the list of rows into a single list.
298        rows.reverse()
299        self[:] = sum(rows, [])
300
301
302    def needed_hashes(self, leafnum, include_leaf=False):
303        """Which new hashes do I need to validate a given data block?
304
305        I am much like HashTree.needed_hashes(), except that I don't include
306        hashes that I already know about. When needed_hashes() is called on
307        an empty IncompleteHashTree, it will return the same set as a
308        HashTree of the same size. But later, once hashes have been added
309        with set_hashes(), I will ask for fewer hashes, since some of the
310        necessary ones have already been set.
311        """
312
313        maybe_needed = set(self.needed_for(self.first_leaf_num + leafnum))
314        if include_leaf:
315            maybe_needed.add(self.first_leaf_num + leafnum)
316        return set([i for i in maybe_needed if self[i] is None])
317
318    def _name_hash(self, i):
319        name = "[%d of %d]" % (i, len(self))
320        if i >= self.first_leaf_num:
321            leafnum = i - self.first_leaf_num
322            numleaves = len(self) - self.first_leaf_num
323            name += " (leaf [%d] of %d)" % (leafnum, numleaves)
324        return name
325
326    def set_hashes(self, hashes=None, leaves=None):
327        """Add a bunch of hashes to the tree.
328
329        I will validate these to the best of my ability. If I already have a
330        copy of any of the new hashes, the new values must equal the existing
331        ones, or I will raise BadHashError. If adding a hash allows me to
332        compute a parent hash, those parent hashes must match or I will raise
333        BadHashError. If I raise BadHashError, I will forget about all the
334        hashes that you tried to add, leaving my state exactly the same as
335        before I was called. If I return successfully, I will remember all
336        those hashes.
337
338        I insist upon being able to validate all of the hashes that were
339        given to me. If I cannot do this because I'm missing some hashes, I
340        will raise NotEnoughHashesError (and forget about all the hashes that
341        you tried to add). Note that this means that the root hash must
342        either be included in 'hashes', or it must have been provided at some
343        point in the past.
344
345        'leaves' is a dictionary uses 'leaf index' values, which range from 0
346        (the left-most leaf) to num_leaves-1 (the right-most leaf), and form
347        the base of the tree. 'hashes' uses 'hash_index' values, which range
348        from 0 (the root of the tree) to 2*num_leaves-2 (the right-most
349        leaf). leaf[i] is the same as hash[num_leaves-1+i].
350
351        The best way to use me is to start by obtaining the root hash from
352        some 'good' channel and populate me with it:
353
354         iht = IncompleteHashTree(numleaves)
355         roothash = trusted_channel.get_roothash()
356         iht.set_hashes(hashes={0: roothash})
357
358        Then use the 'bad' channel to obtain data block 0 and the
359        corresponding hash chain (a dict with the same hashes that
360        needed_hashes(0) tells you, e.g. {0:h0, 2:h2, 4:h4, 8:h8} when
361        len(L)=8). Hash the data block to create leaf0, then feed everything
362        into set_hashes() and see if it raises an exception or not::
363
364         otherhashes = untrusted_channel.get_hashes()
365         # otherhashes.keys() should == iht.needed_hashes(leaves=[0])
366         datablock0 = untrusted_channel.get_data(0)
367         leaf0 = HASH(datablock0)
368         # HASH() is probably hashutil.tagged_hash(tag, datablock0)
369         iht.set_hashes(otherhashes, leaves={0: leaf0})
370
371        If the set_hashes() call doesn't raise an exception, the data block
372        was valid. If it raises BadHashError, then either the data block was
373        corrupted or one of the received hashes was corrupted. If it raises
374        NotEnoughHashesError, then the otherhashes dictionary was incomplete.
375        """
376        if hashes is None:
377            hashes = {}
378        if leaves is None:
379            leaves = {}
380        assert isinstance(hashes, dict)
381        for h in hashes.values():
382            assert isinstance(h, bytes)
383        assert isinstance(leaves, dict)
384        for h in leaves.values():
385            assert isinstance(h, bytes)
386        new_hashes = hashes.copy()
387        for leafnum,leafhash in leaves.items():
388            hashnum = self.first_leaf_num + leafnum
389            if hashnum in new_hashes:
390                if new_hashes[hashnum] != leafhash:
391                    raise BadHashError("got conflicting hashes in my "
392                                       "arguments: leaves[%d] != hashes[%d]"
393                                       % (leafnum, hashnum))
394            new_hashes[hashnum] = leafhash
395
396        remove_upon_failure = set() # we'll remove these if the check fails
397
398        # visualize this method in the following way:
399        #  A: start with the empty or partially-populated tree as shown in
400        #     the HashTree docstring
401        #  B: add all of our input hashes to the tree, filling in some of the
402        #     holes. Don't overwrite anything, but new values must equal the
403        #     existing ones. Mark everything that was added with a red dot
404        #     (meaning "not yet validated")
405        #  C: start with the lowest/deepest level. Pick any red-dotted node,
406        #     hash it with its sibling to compute the parent hash. Add the
407        #     parent to the tree just like in step B (if the parent already
408        #     exists, the values must be equal; if not, add our computed
409        #     value with a red dot). If we have no sibling, throw
410        #     NotEnoughHashesError, since we won't be able to validate this
411        #     node. Remove the red dot. If there was a red dot on our
412        #     sibling, remove it too.
413        #  D: finish all red-dotted nodes in one level before moving up to
414        #     the next.
415        #  E: if we hit NotEnoughHashesError or BadHashError before getting
416        #     to the root, discard every hash we've added.
417
418        try:
419            num_levels = depth_of(len(self)-1)
420            # hashes_to_check[level] is set(index). This holds the "red dots"
421            # described above
422            hashes_to_check = [set() for level in range(num_levels+1)]
423
424            # first we provisionally add all hashes to the tree, comparing
425            # any duplicates
426            for i,h in new_hashes.items():
427                if self[i]:
428                    if self[i] != h:
429                        raise BadHashError("new hash %r does not match "
430                                           "existing hash %r at %r"
431                                           % (base32.b2a(h),
432                                              base32.b2a(self[i]),
433                                              self._name_hash(i)))
434                else:
435                    level = depth_of(i)
436                    hashes_to_check[level].add(i)
437                    self[i] = h
438                    remove_upon_failure.add(i)
439
440            for level in reversed(range(len(hashes_to_check))):
441                this_level = hashes_to_check[level]
442                while this_level:
443                    i = this_level.pop()
444                    if i == 0:
445                        # The root has no sibling. How lonely. You can't
446                        # really *check* the root; you either accept it
447                        # because the caller told you what it is by including
448                        # it in hashes, or you accept it because you
449                        # calculated it from its two children. You probably
450                        # want to set the root (from a trusted source) before
451                        # adding any children from an untrusted source.
452                        continue
453                    siblingnum = self.sibling(i)
454                    if self[siblingnum] is None:
455                        # without a sibling, we can't compute a parent, and
456                        # we can't verify this node
457                        raise NotEnoughHashesError("unable to validate [%d]"%i)
458                    parentnum = self.parent(i)
459                    # make sure we know right from left
460                    leftnum, rightnum = sorted([i, siblingnum])
461                    new_parent_hash = pair_hash(self[leftnum], self[rightnum])
462                    if self[parentnum]:
463                        if self[parentnum] != new_parent_hash:
464                            raise BadHashError("h([%d]+[%d]) != h[%d]" %
465                                               (leftnum, rightnum, parentnum))
466                    else:
467                        self[parentnum] = new_parent_hash
468                        remove_upon_failure.add(parentnum)
469                        parent_level = depth_of(parentnum)
470                        assert parent_level == level-1
471                        hashes_to_check[parent_level].add(parentnum)
472
473                    # our sibling is now as valid as this node
474                    this_level.discard(siblingnum)
475            # we're done!
476
477        except (BadHashError, NotEnoughHashesError):
478            for i in remove_upon_failure:
479                self[i] = None
480            raise
Note: See TracBrowser for help on using the repository browser.