1 | # -*- test-case-name: allmydata.test.test_hashtree -*- |
---|
2 | """ |
---|
3 | Read and write chunks from files. |
---|
4 | |
---|
5 | Version 1.0.0. |
---|
6 | |
---|
7 | A file is divided into blocks, each of which has size L{BLOCK_SIZE} |
---|
8 | (except for the last block, which may be smaller). Blocks are encoded |
---|
9 | into chunks. One publishes the hash of the entire file. Clients |
---|
10 | who want to download the file first obtain the hash, then the clients |
---|
11 | can receive chunks in any order. Cryptographic hashing is used to |
---|
12 | verify each received chunk before writing to disk. Thus it is |
---|
13 | impossible to download corrupt data if one has the correct file hash. |
---|
14 | |
---|
15 | One obtains the hash of a complete file via |
---|
16 | L{CompleteChunkFile.file_hash}. One can read chunks from a complete |
---|
17 | file by the sequence operations of C{len()} and subscripting on a |
---|
18 | L{CompleteChunkFile} object. One can open an empty or partially |
---|
19 | downloaded file with L{PartialChunkFile}, and read and write chunks |
---|
20 | to this file. A chunk will fail to write if its contents and index |
---|
21 | are not consistent with the overall file hash passed to |
---|
22 | L{PartialChunkFile} when the partial chunk file was first created. |
---|
23 | |
---|
24 | The chunks have an overhead of less than 4% for files of size |
---|
25 | less than C{10**20} bytes. |
---|
26 | |
---|
27 | Benchmarks: |
---|
28 | |
---|
29 | - On a 3 GHz Pentium 3, it took 3.4 minutes to first make a |
---|
30 | L{CompleteChunkFile} object for a 4 GB file. Up to 10 MB of |
---|
31 | memory was used as the constructor ran. A metafile filename |
---|
32 | was passed to the constructor, and so the hash information was |
---|
33 | written to the metafile. The object used a negligible amount |
---|
34 | of memory after the constructor was finished. |
---|
35 | - Creation of L{CompleteChunkFile} objects in future runs of the |
---|
36 | program took negligible time, since the hash information was |
---|
37 | already stored in the metafile. |
---|
38 | |
---|
39 | @var BLOCK_SIZE: Size of a block. See L{BlockFile}. |
---|
40 | @var MAX_CHUNK_SIZE: Upper bound on the size of a chunk. |
---|
41 | See L{CompleteChunkFile}. |
---|
42 | |
---|
43 | free (adj.): unencumbered; not under the control of others |
---|
44 | Written by Connelly Barnes in 2005 and released into the |
---|
45 | public domain with no warranty of any kind, either expressed |
---|
46 | or implied. It probably won't make your computer catch on fire, |
---|
47 | or eat your children, but it might. Use at your own risk. |
---|
48 | |
---|
49 | Ported to Python 3. |
---|
50 | """ |
---|
51 | |
---|
52 | from allmydata.util import mathutil # from the pyutil library |
---|
53 | |
---|
54 | from allmydata.util import base32 |
---|
55 | from allmydata.util.hashutil import tagged_hash, tagged_pair_hash |
---|
56 | |
---|
__version__ = '1.0.0-allmydata'

# Size of a data block, in bytes (see the module docstring's @var BLOCK_SIZE).
BLOCK_SIZE = 65536
# Upper bound on the size of an encoded chunk (see @var MAX_CHUNK_SIZE above).
MAX_CHUNK_SIZE = BLOCK_SIZE + 4096
---|
61 | |
---|
def roundup_pow2(x):
    """
    Round integer C{x} up to the nearest power of 2.

    Values less than 1 round up to 1 (the smallest power of 2 returned).
    """
    power = 1
    while power < x:
        power <<= 1
    return power
---|
70 | |
---|
71 | |
---|
class CompleteBinaryTreeMixin(object):
    """
    Adds convenience methods to a complete binary tree.

    Assumes the total number of elements in the binary tree may be
    accessed via C{__len__}, and that each element can be retrieved
    using list subscripting.

    Tree is indexed like so::


              0
            /   \
          1       2
         / \     / \
        3   4   5   6
       /|  /|   |\  |\
      7 8 9 10 11 12 13 14

    """

    def parent(self, i):
        """
        Index of the parent of C{i}.

        Raises IndexError for the root (i == 0), negative indices, and
        (when the concrete class is sized) indices beyond the tree.
        """
        # hasattr guard: allow use on classes that don't define __len__.
        if i < 1 or (hasattr(self, '__len__') and i >= len(self)):
            raise IndexError('index out of range: ' + repr(i))
        return (i - 1) // 2

    def lchild(self, i):
        """
        Index of the left child of C{i}.

        Raises IndexError if C{i} is negative or the child would fall
        outside a sized tree.
        """
        ans = 2 * i + 1
        if i < 0 or (hasattr(self, '__len__') and ans >= len(self)):
            raise IndexError('index out of range: ' + repr(i))
        return ans

    def rchild(self, i):
        """
        Index of right child of C{i}.

        Raises IndexError if C{i} is negative or the child would fall
        outside a sized tree.
        """
        ans = 2 * i + 2
        if i < 0 or (hasattr(self, '__len__') and ans >= len(self)):
            raise IndexError('index out of range: ' + repr(i))
        return ans

    def sibling(self, i):
        """
        Index of sibling of C{i}.

        Raises IndexError (via parent()) for the root, which has no sibling.
        """
        parent = self.parent(i)
        if self.lchild(parent) == i:
            return self.rchild(parent)
        else:
            return self.lchild(parent)

    def needed_for(self, i):
        """
        Return a list of node indices that are necessary for the hash chain.

        These are the siblings along the path from node C{i} up to (but not
        including) the root, ordered bottom-up. The root itself is excluded.
        """
        if i < 0 or i >= len(self):
            # BUGFIX: the message previously read '0 >= %s < %s', which
            # contradicts the check performed above (0 <= i < len(self)).
            raise IndexError('index out of range: 0 <= %s < %s' % (i, len(self)))
        needed = []
        here = i
        while here != 0:
            needed.append(self.sibling(here))
            here = self.parent(here)
        return needed

    def depth_first(self, i=0):
        """
        Generate (index, depth) pairs in pre-order, starting at node C{i}.

        Depth is relative to the starting node. The IndexError raised by
        lchild()/rchild() at the bottom of a sized tree terminates each
        branch of the recursion.
        """
        yield i, 0
        try:
            for child,childdepth in self.depth_first(self.lchild(i)):
                yield child, childdepth+1
        except IndexError:
            pass
        try:
            for child,childdepth in self.depth_first(self.rchild(i)):
                yield child, childdepth+1
        except IndexError:
            pass

    def dump(self):
        """
        Return a multi-line string of the tree's node values, one node per
        line, indented by depth. Values are rendered via base32 (or None
        for unset nodes).
        """
        lines = []
        for i,depth in self.depth_first():
            value = base32.b2a_or_none(self[i])
            if value is not None:
                value = str(value, "utf-8")
            lines.append("%s%3d: %s" % (" "*depth, i, value))
        return "\n".join(lines) + "\n"

    def get_leaf_index(self, leafnum):
        """
        Map a leaf number (0 = left-most leaf) to its node index.

        Assumes the concrete class sets C{first_leaf_num}.
        """
        return self.first_leaf_num + leafnum

    def get_leaf(self, leafnum):
        """
        Return the value stored at the given leaf number.

        Assumes the concrete class sets C{first_leaf_num}.
        """
        return self[self.first_leaf_num + leafnum]
---|
169 | |
---|
def depth_of(i):
    """Return the depth or level of the given node. Level 0 contains node 0.
    Level 1 contains nodes 1 and 2. Level 2 contains nodes 3,4,5,6.
    """
    # floor(log2(i+1)) computed with int.bit_length(); equivalent to the
    # previous mathutil.log_floor(i+1, 2) for all valid node indices
    # (i >= 0), without the project-helper dependency.
    return (i + 1).bit_length() - 1
---|
174 | |
---|
def empty_leaf_hash(i):
    """
    Hash used to pad the leaf row out to a power-of-two length.

    The value for position C{i} is domain-separated by tag and by the
    decimal ASCII rendering of the index itself.
    """
    index_bytes = b"%d" % i
    return tagged_hash(b'Merkle tree empty leaf', index_bytes)
---|
177 | |
---|
def pair_hash(a, b):
    # Combine two child hashes into their parent's hash; the tag
    # domain-separates internal nodes from leaf hashes.
    return tagged_pair_hash(b'Merkle tree internal node', a, b)
---|
180 | |
---|
class HashTree(CompleteBinaryTreeMixin, list):
    """
    Compute Merkle hashes at any node in a complete binary tree.

    Tree is indexed like so::


              0
            /   \
          1       2
         / \     / \
        3   4   5   6
       /|  /|   |\  |\
      7 8 9 10 11 12 13 14  <- List passed to constructor.

    """

    def __init__(self, L):
        """
        Create complete binary tree from list of hash strings.

        The list is augmented by hashes so its length is a power of 2, and
        then this is used as the bottom row of the hash tree.

        The augmenting is done so that if the augmented element is at index
        C{i}, then its value is C{hash(tagged_hash('Merkle tree empty leaf',
        '%d'%i))}.
        """

        # Augment the list with deterministic, index-dependent padding
        # hashes so the leaf row is a power of 2 wide.
        start = len(L)
        end = roundup_pow2(len(L))
        self.first_leaf_num = end - 1
        L = L + [None] * (end - start)
        for i in range(start, end):
            L[i] = empty_leaf_hash(i)
        # Form each row of the tree: pair-hash adjacent entries of the
        # row below, halving the width each time, until one root remains.
        rows = [L]
        while len(rows[-1]) != 1:
            last = rows[-1]
            rows += [[pair_hash(last[2*i], last[2*i+1])
                      for i in range(len(last)//2)]]
        # Flatten the list of rows (root first) into this list.
        # (A flat comprehension avoids the quadratic sum(rows, []) idiom.)
        rows.reverse()
        self[:] = [h for row in rows for h in row]

    def needed_hashes(self, leafnum, include_leaf=False):
        """Which hashes will someone need to validate a given data block?

        I am used to answer a question: supposing you have the data block
        that is used to form leaf hash N, and you want to validate it:
        which hashes would you need?

        I accept a leaf number and return a set of 'hash index' values,
        which are integers from 0 to len(self)-1. In the 'hash index'
        number space, hash[0] is the root hash, while hash[len(self)-1] is
        the last leaf hash.

        This method can be used to find out which hashes you should request
        from some untrusted source (usually the same source that provides the
        data block), so you can minimize storage or transmission overhead. It
        can also be used to determine which hashes you should send to a
        remote data store so that it will be able to provide validatable data
        in the future.

        I will not include '0' (the root hash) in the result, since the root
        is generally stored somewhere that is more trusted than the source of
        the remaining hashes. I will include the leaf hash itself only if you
        ask me to, by passing include_leaf=True.
        """

        needed = set(self.needed_for(self.first_leaf_num + leafnum))
        if include_leaf:
            needed.add(self.first_leaf_num + leafnum)
        return needed
---|
256 | |
---|
257 | |
---|
class NotEnoughHashesError(Exception):
    """Raised when the known hashes are insufficient to validate a node."""
    pass
---|
260 | |
---|
class BadHashError(Exception):
    """Raised when a supplied or computed hash conflicts with a known hash."""
    pass
---|
263 | |
---|
class IncompleteHashTree(CompleteBinaryTreeMixin, list):
    """I am a hash tree which may or may not be complete. I can be used to
    validate inbound data from some untrustworthy provider who has a subset
    of leaves and a sufficient subset of internal nodes.

    Initially I am completely unpopulated. Over time, I will become filled
    with hashes, just enough to validate particular leaf nodes.

    If you desire to validate leaf number N, first find out which hashes I
    need by calling needed_hashes(N). This will return a list of node numbers
    (which will nominally be the sibling chain between the given leaf and the
    root, but if I already have some of those nodes, needed_hashes(N) will
    only return a subset). Obtain these hashes from the data provider, then
    tell me about them with set_hashes(i, HASH). Once I have enough hashes,
    you can tell me the hash of the leaf with set_hashes(leaves={N: HASH}),
    and I will either return None or raise BadHashError.

    The first hash to be set will probably be 0 (the root hash), since this
    is the one that will come from someone more trustworthy than the data
    provider.

    """

    def __init__(self, num_leaves):
        # Build the same power-of-two-wide tree shape as HashTree, but with
        # every node unpopulated (None).
        L = [None] * num_leaves
        start = len(L)
        end = roundup_pow2(len(L))
        self.first_leaf_num = end - 1
        L = L + [None] * (end - start)
        rows = [L]
        while len(rows[-1]) != 1:
            last = rows[-1]
            rows += [[None for i in range(len(last)//2)]]
        # Flatten the list of rows into a single list.
        rows.reverse()
        self[:] = sum(rows, [])


    def needed_hashes(self, leafnum, include_leaf=False):
        """Which new hashes do I need to validate a given data block?

        I am much like HashTree.needed_hashes(), except that I don't include
        hashes that I already know about. When needed_hashes() is called on
        an empty IncompleteHashTree, it will return the same set as a
        HashTree of the same size. But later, once hashes have been added
        with set_hashes(), I will ask for fewer hashes, since some of the
        necessary ones have already been set.
        """

        maybe_needed = set(self.needed_for(self.first_leaf_num + leafnum))
        if include_leaf:
            maybe_needed.add(self.first_leaf_num + leafnum)
        # Only nodes still unpopulated (None) are actually needed.
        return set([i for i in maybe_needed if self[i] is None])

    def _name_hash(self, i):
        # Human-readable description of node i for error messages; notes
        # the leaf number too when i falls in the leaf row.
        name = "[%d of %d]" % (i, len(self))
        if i >= self.first_leaf_num:
            leafnum = i - self.first_leaf_num
            numleaves = len(self) - self.first_leaf_num
            name += " (leaf [%d] of %d)" % (leafnum, numleaves)
        return name

    def set_hashes(self, hashes=None, leaves=None):
        """Add a bunch of hashes to the tree.

        I will validate these to the best of my ability. If I already have a
        copy of any of the new hashes, the new values must equal the existing
        ones, or I will raise BadHashError. If adding a hash allows me to
        compute a parent hash, those parent hashes must match or I will raise
        BadHashError. If I raise BadHashError, I will forget about all the
        hashes that you tried to add, leaving my state exactly the same as
        before I was called. If I return successfully, I will remember all
        those hashes.

        I insist upon being able to validate all of the hashes that were
        given to me. If I cannot do this because I'm missing some hashes, I
        will raise NotEnoughHashesError (and forget about all the hashes that
        you tried to add). Note that this means that the root hash must
        either be included in 'hashes', or it must have been provided at some
        point in the past.

        'leaves' is a dictionary using 'leaf index' values, which range from
        0 (the left-most leaf) to num_leaves-1 (the right-most leaf), and
        form the base of the tree. 'hashes' uses 'hash_index' values, which
        range from 0 (the root of the tree) to 2*num_leaves-2 (the
        right-most leaf). leaf[i] is the same as hash[num_leaves-1+i].

        The best way to use me is to start by obtaining the root hash from
        some 'good' channel and populate me with it:

         iht = IncompleteHashTree(numleaves)
         roothash = trusted_channel.get_roothash()
         iht.set_hashes(hashes={0: roothash})

        Then use the 'bad' channel to obtain data block 0 and the
        corresponding hash chain (a dict with the same hashes that
        needed_hashes(0) tells you, e.g. {0:h0, 2:h2, 4:h4, 8:h8} when
        len(L)=8). Hash the data block to create leaf0, then feed everything
        into set_hashes() and see if it raises an exception or not::

         otherhashes = untrusted_channel.get_hashes()
         # otherhashes.keys() should == iht.needed_hashes(leaves=[0])
         datablock0 = untrusted_channel.get_data(0)
         leaf0 = HASH(datablock0)
         # HASH() is probably hashutil.tagged_hash(tag, datablock0)
         iht.set_hashes(otherhashes, leaves={0: leaf0})

        If the set_hashes() call doesn't raise an exception, the data block
        was valid. If it raises BadHashError, then either the data block was
        corrupted or one of the received hashes was corrupted. If it raises
        NotEnoughHashesError, then the otherhashes dictionary was incomplete.
        """
        if hashes is None:
            hashes = {}
        if leaves is None:
            leaves = {}
        assert isinstance(hashes, dict)
        for h in hashes.values():
            assert isinstance(h, bytes)
        assert isinstance(leaves, dict)
        for h in leaves.values():
            assert isinstance(h, bytes)
        # Merge 'leaves' into the 'hashes' numbering space, rejecting any
        # direct contradiction between the two arguments up front.
        new_hashes = hashes.copy()
        for leafnum,leafhash in leaves.items():
            hashnum = self.first_leaf_num + leafnum
            if hashnum in new_hashes:
                if new_hashes[hashnum] != leafhash:
                    raise BadHashError("got conflicting hashes in my "
                                       "arguments: leaves[%d] != hashes[%d]"
                                       % (leafnum, hashnum))
            new_hashes[hashnum] = leafhash

        remove_upon_failure = set() # we'll remove these if the check fails

        # visualize this method in the following way:
        # A: start with the empty or partially-populated tree as shown in
        #    the HashTree docstring
        # B: add all of our input hashes to the tree, filling in some of the
        #    holes. Don't overwrite anything, but new values must equal the
        #    existing ones. Mark everything that was added with a red dot
        #    (meaning "not yet validated")
        # C: start with the lowest/deepest level. Pick any red-dotted node,
        #    hash it with its sibling to compute the parent hash. Add the
        #    parent to the tree just like in step B (if the parent already
        #    exists, the values must be equal; if not, add our computed
        #    value with a red dot). If we have no sibling, throw
        #    NotEnoughHashesError, since we won't be able to validate this
        #    node. Remove the red dot. If there was a red dot on our
        #    sibling, remove it too.
        # D: finish all red-dotted nodes in one level before moving up to
        #    the next.
        # E: if we hit NotEnoughHashesError or BadHashError before getting
        #    to the root, discard every hash we've added.

        try:
            num_levels = depth_of(len(self)-1)
            # hashes_to_check[level] is set(index). This holds the "red dots"
            # described above
            hashes_to_check = [set() for level in range(num_levels+1)]

            # first we provisionally add all hashes to the tree, comparing
            # any duplicates
            for i,h in new_hashes.items():
                if self[i]:
                    if self[i] != h:
                        raise BadHashError("new hash %r does not match "
                                           "existing hash %r at %r"
                                           % (base32.b2a(h),
                                              base32.b2a(self[i]),
                                              self._name_hash(i)))
                else:
                    level = depth_of(i)
                    hashes_to_check[level].add(i)
                    self[i] = h
                    remove_upon_failure.add(i)

            # Validate level by level, deepest first (steps C and D above).
            for level in reversed(range(len(hashes_to_check))):
                this_level = hashes_to_check[level]
                while this_level:
                    i = this_level.pop()
                    if i == 0:
                        # The root has no sibling. How lonely. You can't
                        # really *check* the root; you either accept it
                        # because the caller told you what it is by including
                        # it in hashes, or you accept it because you
                        # calculated it from its two children. You probably
                        # want to set the root (from a trusted source) before
                        # adding any children from an untrusted source.
                        continue
                    siblingnum = self.sibling(i)
                    if self[siblingnum] is None:
                        # without a sibling, we can't compute a parent, and
                        # we can't verify this node
                        raise NotEnoughHashesError("unable to validate [%d]"%i)
                    parentnum = self.parent(i)
                    # make sure we know right from left
                    leftnum, rightnum = sorted([i, siblingnum])
                    new_parent_hash = pair_hash(self[leftnum], self[rightnum])
                    if self[parentnum]:
                        if self[parentnum] != new_parent_hash:
                            raise BadHashError("h([%d]+[%d]) != h[%d]" %
                                               (leftnum, rightnum, parentnum))
                    else:
                        self[parentnum] = new_parent_hash
                        remove_upon_failure.add(parentnum)
                        parent_level = depth_of(parentnum)
                        assert parent_level == level-1
                        hashes_to_check[parent_level].add(parentnum)

                    # our sibling is now as valid as this node
                    this_level.discard(siblingnum)
            # we're done!

        except (BadHashError, NotEnoughHashesError):
            # Roll back (step E): forget every hash added during this call
            # so the tree is exactly as it was before, then re-raise.
            for i in remove_upon_failure:
                self[i] = None
            raise
---|