1 | |
---|
2 | |
---|
3 | class Spans(object): |
---|
4 | """I represent a compressed list of booleans, one per index (an integer). |
---|
5 | Typically, each index represents an offset into a large string, pointing |
---|
6 | to a specific byte of a share. In this context, True means that byte has |
---|
7 | been received, or has been requested. |
---|
8 | |
---|
9 | Another way to look at this is maintaining a set of integers, optimized |
---|
10 | for operations on spans like 'add range to set' and 'is range in set?'. |
---|
11 | |
---|
12 | This is a python equivalent of perl's Set::IntSpan module, frequently |
---|
13 | used to represent .newsrc contents. |
---|
14 | |
---|
15 | Rather than storing an actual (large) list or dictionary, I represent my |
---|
16 | internal state as a sorted list of spans, each with a start and a length. |
---|
17 | My API is presented in terms of start+length pairs. I provide set |
---|
18 | arithmetic operators, to efficiently answer questions like 'I want bytes |
---|
19 | XYZ, I already requested bytes ABC, and I've already received bytes DEF: |
---|
20 | what bytes should I request now?'. |
---|
21 | |
---|
22 | The new downloader will use it to keep track of which bytes we've requested |
---|
23 | or received already. |
---|
24 | """ |
---|
25 | |
---|
26 | def __init__(self, _span_or_start=None, length=None): |
---|
27 | self._spans = list() |
---|
28 | if length is not None: |
---|
29 | self._spans.append( (_span_or_start, length) ) |
---|
30 | elif _span_or_start: |
---|
31 | for (start,length) in _span_or_start: |
---|
32 | self.add(start, length) |
---|
33 | self._check() |
---|
34 | |
---|
35 | def _check(self): |
---|
36 | assert sorted(self._spans) == self._spans |
---|
37 | prev_end = None |
---|
38 | try: |
---|
39 | for (start,length) in self._spans: |
---|
40 | if prev_end is not None: |
---|
41 | assert start > prev_end |
---|
42 | prev_end = start+length |
---|
43 | except AssertionError: |
---|
44 | print("BAD:", self.dump()) |
---|
45 | raise |
---|
46 | |
---|
47 | def add(self, start, length): |
---|
48 | assert start >= 0 |
---|
49 | assert length > 0 |
---|
50 | #print(" ADD [%d+%d -%d) to %s" % (start, length, start+length, self.dump())) |
---|
51 | first_overlap = last_overlap = None |
---|
52 | for i,(s_start,s_length) in enumerate(self._spans): |
---|
53 | #print(" (%d+%d)-> overlap=%s adjacent=%s" % (s_start,s_length, overlap(s_start, s_length, start, length), adjacent(s_start, s_length, start, length))) |
---|
54 | if (overlap(s_start, s_length, start, length) |
---|
55 | or adjacent(s_start, s_length, start, length)): |
---|
56 | last_overlap = i |
---|
57 | if first_overlap is None: |
---|
58 | first_overlap = i |
---|
59 | continue |
---|
60 | # no overlap |
---|
61 | if first_overlap is not None: |
---|
62 | break |
---|
63 | #print(" first_overlap", first_overlap, last_overlap) |
---|
64 | if first_overlap is None: |
---|
65 | # no overlap, so just insert the span and sort by starting |
---|
66 | # position. |
---|
67 | self._spans.insert(0, (start,length)) |
---|
68 | self._spans.sort() |
---|
69 | else: |
---|
70 | # everything from [first_overlap] to [last_overlap] overlapped |
---|
71 | first_start,first_length = self._spans[first_overlap] |
---|
72 | last_start,last_length = self._spans[last_overlap] |
---|
73 | newspan_start = min(start, first_start) |
---|
74 | newspan_end = max(start+length, last_start+last_length) |
---|
75 | newspan_length = newspan_end - newspan_start |
---|
76 | newspan = (newspan_start, newspan_length) |
---|
77 | self._spans[first_overlap:last_overlap+1] = [newspan] |
---|
78 | #print(" ADD done: %s" % self.dump()) |
---|
79 | self._check() |
---|
80 | |
---|
81 | return self |
---|
82 | |
---|
83 | def remove(self, start, length): |
---|
84 | assert start >= 0 |
---|
85 | assert length > 0 |
---|
86 | #print(" REMOVE [%d+%d -%d) from %s" % (start, length, start+length, self.dump())) |
---|
87 | first_complete_overlap = last_complete_overlap = None |
---|
88 | for i,(s_start,s_length) in enumerate(self._spans): |
---|
89 | s_end = s_start + s_length |
---|
90 | o = overlap(s_start, s_length, start, length) |
---|
91 | if o: |
---|
92 | o_start, o_length = o |
---|
93 | o_end = o_start+o_length |
---|
94 | if o_start == s_start and o_end == s_end: |
---|
95 | # delete this span altogether |
---|
96 | if first_complete_overlap is None: |
---|
97 | first_complete_overlap = i |
---|
98 | last_complete_overlap = i |
---|
99 | elif o_start == s_start: |
---|
100 | # we only overlap the left side, so trim the start |
---|
101 | # 1111 |
---|
102 | # rrrr |
---|
103 | # oo |
---|
104 | # -> 11 |
---|
105 | new_start = o_end |
---|
106 | new_end = s_end |
---|
107 | assert new_start > s_start |
---|
108 | new_length = new_end - new_start |
---|
109 | self._spans[i] = (new_start, new_length) |
---|
110 | elif o_end == s_end: |
---|
111 | # we only overlap the right side |
---|
112 | # 1111 |
---|
113 | # rrrr |
---|
114 | # oo |
---|
115 | # -> 11 |
---|
116 | new_start = s_start |
---|
117 | new_end = o_start |
---|
118 | assert new_end < s_end |
---|
119 | new_length = new_end - new_start |
---|
120 | self._spans[i] = (new_start, new_length) |
---|
121 | else: |
---|
122 | # we overlap the middle, so create a new span. No need to |
---|
123 | # examine any other spans. |
---|
124 | # 111111 |
---|
125 | # rr |
---|
126 | # LL RR |
---|
127 | left_start = s_start |
---|
128 | left_end = o_start |
---|
129 | left_length = left_end - left_start |
---|
130 | right_start = o_end |
---|
131 | right_end = s_end |
---|
132 | right_length = right_end - right_start |
---|
133 | self._spans[i] = (left_start, left_length) |
---|
134 | self._spans.append( (right_start, right_length) ) |
---|
135 | self._spans.sort() |
---|
136 | break |
---|
137 | if first_complete_overlap is not None: |
---|
138 | del self._spans[first_complete_overlap:last_complete_overlap+1] |
---|
139 | #print(" REMOVE done: %s" % self.dump()) |
---|
140 | self._check() |
---|
141 | return self |
---|
142 | |
---|
143 | def dump(self): |
---|
144 | return "len=%d: %s" % (self.len(), |
---|
145 | ",".join(["[%d-%d]" % (start,start+l-1) |
---|
146 | for (start,l) in self._spans]) ) |
---|
147 | |
---|
148 | def each(self): |
---|
149 | for start, length in self._spans: |
---|
150 | for i in range(start, start+length): |
---|
151 | yield i |
---|
152 | |
---|
153 | def __iter__(self): |
---|
154 | for s in self._spans: |
---|
155 | yield s |
---|
156 | |
---|
157 | def __bool__(self): # this gets us bool() |
---|
158 | return bool(self.len()) |
---|
159 | |
---|
160 | #__nonzero__ = __bool__ # Python 2 backwards compatibility |
---|
161 | |
---|
162 | def len(self): |
---|
163 | # guess what! python doesn't allow __len__ to return a long, only an |
---|
164 | # int. So we stop using len(spans), use spans.len() instead. |
---|
165 | return sum([length for start,length in self._spans]) |
---|
166 | |
---|
167 | def __add__(self, other): |
---|
168 | s = self.__class__(self) |
---|
169 | for (start, length) in other: |
---|
170 | s.add(start, length) |
---|
171 | return s |
---|
172 | |
---|
173 | def __sub__(self, other): |
---|
174 | s = self.__class__(self) |
---|
175 | for (start, length) in other: |
---|
176 | s.remove(start, length) |
---|
177 | return s |
---|
178 | |
---|
179 | def __iadd__(self, other): |
---|
180 | for (start, length) in other: |
---|
181 | self.add(start, length) |
---|
182 | return self |
---|
183 | |
---|
184 | def __isub__(self, other): |
---|
185 | for (start, length) in other: |
---|
186 | self.remove(start, length) |
---|
187 | return self |
---|
188 | |
---|
189 | def __and__(self, other): |
---|
190 | if not self._spans: |
---|
191 | return self.__class__() |
---|
192 | bounds = self.__class__(self._spans[0][0], |
---|
193 | self._spans[-1][0]+self._spans[-1][1]) |
---|
194 | not_other = bounds - other |
---|
195 | return self - not_other |
---|
196 | |
---|
197 | def __contains__(self, start_and_length): |
---|
198 | (start, length) = start_and_length |
---|
199 | for span_start,span_length in self._spans: |
---|
200 | o = overlap(start, length, span_start, span_length) |
---|
201 | if o: |
---|
202 | o_start,o_length = o |
---|
203 | if o_start == start and o_length == length: |
---|
204 | return True |
---|
205 | return False |
---|
206 | |
---|
207 | def overlap(start0, length0, start1, length1): |
---|
208 | # return start2,length2 of the overlapping region, or None |
---|
209 | # 00 00 000 0000 00 00 000 00 00 00 00 |
---|
210 | # 11 11 11 11 111 11 11 1111 111 11 11 |
---|
211 | left = max(start0, start1) |
---|
212 | right = min(start0+length0, start1+length1) |
---|
213 | # if there is overlap, 'left' will be its start, and right-1 will |
---|
214 | # be the end' |
---|
215 | if left < right: |
---|
216 | return (left, right-left) |
---|
217 | return None |
---|
218 | |
---|
219 | def adjacent(start0, length0, start1, length1): |
---|
220 | if (start0 < start1) and start0+length0 == start1: |
---|
221 | return True |
---|
222 | elif (start1 < start0) and start1+length1 == start0: |
---|
223 | return True |
---|
224 | return False |
---|
225 | |
---|
226 | class DataSpans(object): |
---|
227 | """I represent portions of a large string. Equivalently, I can be said to |
---|
228 | maintain a large array of characters (with gaps of empty elements). I can |
---|
229 | be used to manage access to a remote share, where some pieces have been |
---|
230 | retrieved, some have been requested, and others have not been read. |
---|
231 | """ |
---|
232 | |
---|
233 | def __init__(self, other=None): |
---|
234 | self.spans = [] # (start, data) tuples, non-overlapping, merged |
---|
235 | if other: |
---|
236 | for (start, data) in other.get_chunks(): |
---|
237 | self.add(start, data) |
---|
238 | |
---|
239 | def __bool__(self): # this gets us bool() |
---|
240 | return bool(self.len()) |
---|
241 | |
---|
242 | def len(self): |
---|
243 | # return number of bytes we're holding |
---|
244 | return sum([len(data) for (start,data) in self.spans]) |
---|
245 | |
---|
246 | def _dump(self): |
---|
247 | # return iterator of sorted list of offsets, one per byte |
---|
248 | for (start,data) in self.spans: |
---|
249 | for i in range(start, start+len(data)): |
---|
250 | yield i |
---|
251 | |
---|
252 | def dump(self): |
---|
253 | return "len=%d: %s" % (self.len(), |
---|
254 | ",".join(["[%d-%d]" % (start,start+len(data)-1) |
---|
255 | for (start,data) in self.spans]) ) |
---|
256 | |
---|
257 | def get_chunks(self): |
---|
258 | return list(self.spans) |
---|
259 | |
---|
260 | def get_spans(self): |
---|
261 | """Return a Spans object with a bit set for each byte I hold""" |
---|
262 | return Spans([(start, len(data)) for (start,data) in self.spans]) |
---|
263 | |
---|
264 | def assert_invariants(self): |
---|
265 | if not self.spans: |
---|
266 | return |
---|
267 | prev_start = self.spans[0][0] |
---|
268 | prev_end = prev_start + len(self.spans[0][1]) |
---|
269 | for start, data in self.spans[1:]: |
---|
270 | if not start > prev_end: |
---|
271 | # adjacent or overlapping: bad |
---|
272 | print("ASSERTION FAILED", self.spans) |
---|
273 | raise AssertionError |
---|
274 | |
---|
275 | def get(self, start, length): |
---|
276 | # returns a string of LENGTH, or None |
---|
277 | #print("get", start, length, self.spans) |
---|
278 | end = start+length |
---|
279 | for (s_start,s_data) in self.spans: |
---|
280 | s_end = s_start+len(s_data) |
---|
281 | #print(" ",s_start,s_end) |
---|
282 | if s_start <= start < s_end: |
---|
283 | # we want some data from this span. Because we maintain |
---|
284 | # strictly merged and non-overlapping spans, everything we |
---|
285 | # want must be in this span. |
---|
286 | offset = start - s_start |
---|
287 | if offset + length > len(s_data): |
---|
288 | #print(" None, span falls short") |
---|
289 | return None # span falls short |
---|
290 | #print(" some", s_data[offset:offset+length]) |
---|
291 | return s_data[offset:offset+length] |
---|
292 | if s_start >= end: |
---|
293 | # we've gone too far: no further spans will overlap |
---|
294 | #print(" None, gone too far") |
---|
295 | return None |
---|
296 | #print(" None, ran out of spans") |
---|
297 | return None |
---|
298 | |
---|
299 | def add(self, start, data): |
---|
300 | # first: walk through existing spans, find overlap, modify-in-place |
---|
301 | # create list of new spans |
---|
302 | # add new spans |
---|
303 | # sort |
---|
304 | # merge adjacent spans |
---|
305 | #print("add", start, data, self.spans) |
---|
306 | end = start + len(data) |
---|
307 | i = 0 |
---|
308 | while len(data): |
---|
309 | #print(" loop", start, data, i, len(self.spans), self.spans) |
---|
310 | if i >= len(self.spans): |
---|
311 | #print(" append and done") |
---|
312 | # append a last span |
---|
313 | self.spans.append( (start, data) ) |
---|
314 | break |
---|
315 | (s_start,s_data) = self.spans[i] |
---|
316 | # five basic cases: |
---|
317 | # a: OLD b:OLDD c1:OLD c2:OLD d1:OLDD d2:OLD e: OLLDD |
---|
318 | # NEW NEW NEW NEWW NEW NEW NEW |
---|
319 | # |
---|
320 | # we handle A by inserting a new segment (with "N") and looping, |
---|
321 | # turning it into B or C. We handle B by replacing a prefix and |
---|
322 | # terminating. We handle C (both c1 and c2) by replacing the |
---|
323 | # segment (and, for c2, looping, turning it into A). We handle D |
---|
324 | # by replacing a suffix (and, for d2, looping, turning it into |
---|
325 | # A). We handle E by replacing the middle and terminating. |
---|
326 | if start < s_start: |
---|
327 | # case A: insert a new span, then loop with the remainder |
---|
328 | #print(" insert new span") |
---|
329 | s_len = s_start-start |
---|
330 | self.spans.insert(i, (start, data[:s_len])) |
---|
331 | i += 1 |
---|
332 | start = s_start |
---|
333 | data = data[s_len:] |
---|
334 | continue |
---|
335 | s_len = len(s_data) |
---|
336 | s_end = s_start+s_len |
---|
337 | if s_start <= start < s_end: |
---|
338 | #print(" modify this span", s_start, start, s_end) |
---|
339 | # we want to modify some data in this span: a prefix, a |
---|
340 | # suffix, or the whole thing |
---|
341 | if s_start == start: |
---|
342 | if s_end <= end: |
---|
343 | #print(" replace whole segment") |
---|
344 | # case C: replace this segment |
---|
345 | self.spans[i] = (s_start, data[:s_len]) |
---|
346 | i += 1 |
---|
347 | start += s_len |
---|
348 | data = data[s_len:] |
---|
349 | # C2 is where len(data)>0 |
---|
350 | continue |
---|
351 | # case B: modify the prefix, retain the suffix |
---|
352 | #print(" modify prefix") |
---|
353 | self.spans[i] = (s_start, data + s_data[len(data):]) |
---|
354 | break |
---|
355 | if start > s_start and end < s_end: |
---|
356 | # case E: modify the middle |
---|
357 | #print(" modify middle") |
---|
358 | prefix_len = start - s_start # we retain this much |
---|
359 | suffix_len = s_end - end # and retain this much |
---|
360 | newdata = s_data[:prefix_len] + data + s_data[-suffix_len:] |
---|
361 | self.spans[i] = (s_start, newdata) |
---|
362 | break |
---|
363 | # case D: retain the prefix, modify the suffix |
---|
364 | #print(" modify suffix") |
---|
365 | prefix_len = start - s_start # we retain this much |
---|
366 | suffix_len = s_len - prefix_len # we replace this much |
---|
367 | #print(" ", s_data, prefix_len, suffix_len, s_len, data) |
---|
368 | self.spans[i] = (s_start, |
---|
369 | s_data[:prefix_len] + data[:suffix_len]) |
---|
370 | i += 1 |
---|
371 | start += suffix_len |
---|
372 | data = data[suffix_len:] |
---|
373 | #print(" now", start, data) |
---|
374 | # D2 is where len(data)>0 |
---|
375 | continue |
---|
376 | # else we're not there yet |
---|
377 | #print(" still looking") |
---|
378 | i += 1 |
---|
379 | continue |
---|
380 | # now merge adjacent spans |
---|
381 | #print(" merging", self.spans) |
---|
382 | newspans = [] |
---|
383 | for (s_start,s_data) in self.spans: |
---|
384 | if newspans and adjacent(newspans[-1][0], len(newspans[-1][1]), |
---|
385 | s_start, len(s_data)): |
---|
386 | newspans[-1] = (newspans[-1][0], newspans[-1][1] + s_data) |
---|
387 | else: |
---|
388 | newspans.append( (s_start, s_data) ) |
---|
389 | self.spans = newspans |
---|
390 | self.assert_invariants() |
---|
391 | #print(" done", self.spans) |
---|
392 | |
---|
393 | def remove(self, start, length): |
---|
394 | i = 0 |
---|
395 | end = start + length |
---|
396 | #print("remove", start, length, self.spans) |
---|
397 | while i < len(self.spans): |
---|
398 | (s_start,s_data) = self.spans[i] |
---|
399 | if s_start >= end: |
---|
400 | # this segment is entirely right of the removed region, and |
---|
401 | # all further segments are even further right. We're done. |
---|
402 | break |
---|
403 | s_len = len(s_data) |
---|
404 | s_end = s_start + s_len |
---|
405 | o = overlap(start, length, s_start, s_len) |
---|
406 | if not o: |
---|
407 | i += 1 |
---|
408 | continue |
---|
409 | o_start, o_len = o |
---|
410 | o_end = o_start + o_len |
---|
411 | if o_len == s_len: |
---|
412 | # remove the whole segment |
---|
413 | del self.spans[i] |
---|
414 | continue |
---|
415 | if o_start == s_start: |
---|
416 | # remove a prefix, leaving the suffix from o_end to s_end |
---|
417 | prefix_len = o_end - o_start |
---|
418 | self.spans[i] = (o_end, s_data[prefix_len:]) |
---|
419 | i += 1 |
---|
420 | continue |
---|
421 | elif o_end == s_end: |
---|
422 | # remove a suffix, leaving the prefix from s_start to o_start |
---|
423 | prefix_len = o_start - s_start |
---|
424 | self.spans[i] = (s_start, s_data[:prefix_len]) |
---|
425 | i += 1 |
---|
426 | continue |
---|
427 | # remove the middle, creating a new segment |
---|
428 | # left is s_start:o_start, right is o_end:s_end |
---|
429 | left_len = o_start - s_start |
---|
430 | left = s_data[:left_len] |
---|
431 | right_len = s_end - o_end |
---|
432 | right = s_data[-right_len:] |
---|
433 | self.spans[i] = (s_start, left) |
---|
434 | self.spans.insert(i+1, (o_end, right)) |
---|
435 | break |
---|
436 | #print(" done", self.spans) |
---|
437 | |
---|
438 | def pop(self, start, length): |
---|
439 | data = self.get(start, length) |
---|
440 | if data: |
---|
441 | self.remove(start, length) |
---|
442 | return data |
---|