source: trunk/src/allmydata/scripts/tahoe_backup.py

Last change on this file was 1cfe843d, checked in by Alexandre Detiste <alexandre.detiste@…>, at 2024-02-22T23:40:25Z

more python2 removal

  • Property mode set to 100644
File size: 20.2 KB
Line 
1"""
2Ported to Python 3.
3"""
4
5import os.path
6import time
7from urllib.parse import quote as url_quote
8import datetime
9
10from allmydata.scripts.common import get_alias, escape_path, DEFAULT_ALIAS, \
11                                     UnknownAliasError
12from allmydata.scripts.common_http import do_http, HTTPError, format_http_error
13from allmydata.util import time_format, jsonbytes as json
14from allmydata.scripts import backupdb
15from allmydata.util.encodingutil import listdir_unicode, quote_output, \
16     quote_local_unicode_path, to_bytes, FilenameEncodingError, unicode_to_url
17from allmydata.util.assertutil import precondition
18from allmydata.util.fileutil import abspath_expanduser_unicode, precondition_abspath
19
20
def get_local_metadata(path):
    """
    Return a dict of stat(2)-derived metadata for the local file *path*.

    "ctime" and "mtime" are always present; the remaining fields are
    included only when the platform's stat result actually provides them.
    """
    # posix stat(2) metadata, depends on the platform
    s = os.stat(path)
    metadata = {
        "ctime": s.st_cctime if False else s.st_ctime,
        "mtime": s.st_mtime,
    }

    optional_fields = (
        # common posix fields
        "st_mode", "st_ino", "st_dev", "st_uid", "st_gid",
        # macOS-specific fields
        "st_rsize", "st_creator", "st_type",
    )
    _missing = object()
    for name in optional_fields:
        value = getattr(s, name, _missing)
        if value is not _missing:
            metadata[name] = value

    # TODO: extended attributes, like on OS-X's HFS+
    return metadata
37
def mkdir(contents, options):
    """
    Create an immutable directory on the grid holding *contents*.

    :param contents: dict mapping child name to a (childtype, ro_uri,
        metadata) tuple, as accumulated by BackupProgress.
    :param options: the command options; only "node-url" is read.

    :return: the new directory's capability, as bytes.
    :raises HTTPError: if the node answers with a non-2xx status.
    """
    # Reshape into the JSON structure the t=mkdir-immutable endpoint
    # expects: {name: [childtype, {"ro_uri": ..., "metadata": ...}]}.
    kids = {
        childname: (childtype, {"ro_uri": ro_uri,
                                "metadata": metadata,
                                })
        for childname, (childtype, ro_uri, metadata) in contents.items()
    }
    body = json.dumps(kids).encode("utf-8")
    url = options['node-url'] + "uri?t=mkdir-immutable"
    resp = do_http("POST", url, body)
    if resp.status < 200 or resp.status >= 300:
        raise HTTPError("Error during mkdir", resp)

    dircap = to_bytes(resp.read().strip())
    return dircap
53
def put_child(dirurl, childname, childcap):
    """Link *childcap* under *childname* inside *dirurl* via a t=uri PUT."""
    assert dirurl[-1] != "/"
    encoded_name = url_quote(unicode_to_url(childname))
    url = "%s/%s?t=uri" % (dirurl, encoded_name)
    resp = do_http("PUT", url, childcap)
    if resp.status not in (200, 201):
        raise HTTPError("Error during put_child", resp)
60
class BackerUpper(object):
    """
    Create a new immutable snapshot of a local directory tree on the grid,
    re-using previously-uploaded files and directories when the backupdb
    says they are still healthy.

    :ivar int _files_checked: The number of files which the backup process has
        so-far inspected on the grid to determine if they need to be
        re-uploaded.

    :ivar int _directories_checked: The number of directories which the backup
        process has so-far inspected on the grid to determine if they need to
        be re-uploaded.
    """
    def __init__(self, options):
        self.options = options
        self._files_checked = 0
        self._directories_checked = 0

    def run(self):
        """
        Perform the backup.

        :return: a process exit code: 0 on success, 1 on a fatal error, 2
            when the backup completed but some files or directories were
            skipped.
        """
        options = self.options
        nodeurl = options['node-url']
        self.verbosity = 1
        if options['quiet']:
            self.verbosity = 0
        if options['verbose']:
            self.verbosity = 2
        stdout = options.stdout
        stderr = options.stderr

        start_timestamp = datetime.datetime.now()
        bdbfile = os.path.join(options["node-directory"],
                               "private", "backupdb.sqlite")
        bdbfile = abspath_expanduser_unicode(bdbfile)
        self.backupdb = backupdb.get_backupdb(bdbfile, stderr)
        if not self.backupdb:
            print("ERROR: Unable to load backup db.", file=stderr)
            return 1

        try:
            rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS)
        except UnknownAliasError as e:
            e.display(stderr)
            return 1
        to_url = nodeurl + "uri/%s/" % url_quote(rootcap)
        if path:
            to_url += escape_path(path)
        if not to_url.endswith("/"):
            to_url += "/"

        archives_url = to_url + "Archives/"

        archives_url = archives_url.rstrip("/")
        to_url = to_url.rstrip("/")

        # first step: make sure the target directory exists, as well as the
        # Archives/ subdirectory.
        resp = do_http("GET", archives_url + "?t=json")
        if resp.status == 404:
            resp = do_http("POST", archives_url + "?t=mkdir")
            if resp.status != 200:
                print(format_http_error("Unable to create target directory", resp), file=stderr)
                return 1

        # second step: process the tree
        targets = list(collect_backup_targets(
            options.from_dir,
            listdir_unicode,
            self.options.filter_listdir,
        ))
        completed = run_backup(
            warn=self.warn,
            upload_file=self.upload,
            upload_directory=self.upload_directory,
            targets=targets,
            start_timestamp=start_timestamp,
            stdout=stdout,
        )
        new_backup_dircap = completed.dircap

        # third: attach the new backup to the list
        now = time_format.iso_utc(int(time.time()), sep="_") + "Z"

        put_child(archives_url, now, new_backup_dircap)
        put_child(to_url, "Latest", new_backup_dircap)
        print(completed.report(
            self.verbosity,
            self._files_checked,
            self._directories_checked,
        ), file=stdout)

        # The command exits with code 2 if files or directories were skipped
        if completed.any_skips():
            return 2

        # done!
        return 0

    def verboseprint(self, msg):
        """Print *msg* to stdout, but only at verbosity >= 2."""
        precondition(isinstance(msg, str), msg)
        if self.verbosity >= 2:
            print(msg, file=self.options.stdout)

    def warn(self, msg):
        """Print a warning to stderr regardless of verbosity."""
        precondition(isinstance(msg, str), msg)
        print(msg, file=self.options.stderr)

    def upload_directory(self, path, compare_contents, create_contents):
        """
        Create an immutable dirnode for the local directory *path*, or
        re-use the one recorded in the backupdb when it is unchanged.

        :return: (did_create, dircap)
        """
        must_create, r = self.check_backupdb_directory(compare_contents)
        if must_create:
            self.verboseprint(" creating directory for %s" % quote_local_unicode_path(path))
            newdircap = mkdir(create_contents, self.options)
            assert isinstance(newdircap, bytes)
            if r:
                r.did_create(newdircap)
            return True, newdircap
        else:
            self.verboseprint(" re-using old directory for %s" % quote_local_unicode_path(path))
            return False, r.was_created()

    def _check_cap_health(self, cap):
        """
        Ask the node to check *cap* (t=check).

        :return: the parsed check-results dict, or None when the check
            request itself failed (non-200 response), in which case the
            caller must assume the object is bad.
        """
        self.verboseprint("checking %s" % quote_output(cap))
        nodeurl = self.options['node-url']
        checkurl = nodeurl + "uri/%s?t=check&output=JSON" % url_quote(cap)
        resp = do_http("POST", checkurl)
        if resp.status != 200:
            return None
        return json.loads(resp.read())

    def check_backupdb_file(self, childpath):
        """
        Decide whether *childpath* must be (re-)uploaded.

        :return: (must_upload, backupdb_result_or_None)
        """
        if not self.backupdb:
            return True, None
        use_timestamps = not self.options["ignore-timestamps"]
        r = self.backupdb.check_file(childpath, use_timestamps)

        if not r.was_uploaded():
            return True, r

        if not r.should_check():
            # the file was uploaded or checked recently, so we can just use
            # it
            return False, r

        # we must check the file before using the results
        filecap = r.was_uploaded()
        self._files_checked += 1
        cr = self._check_cap_health(filecap)
        if cr is None or not cr["results"]["healthy"]:
            # can't check, or the file is unhealthy: must upload
            return True, r
        # file is healthy, no need to upload
        r.did_check_healthy(cr)
        return False, r

    def check_backupdb_directory(self, compare_contents):
        """
        Decide whether a directory with *compare_contents* must be created.

        :return: (must_create, backupdb_result_or_None)
        """
        if not self.backupdb:
            return True, None
        r = self.backupdb.check_directory(compare_contents)

        if not r.was_created():
            return True, r

        if not r.should_check():
            # the directory was created or checked recently, so we can just
            # use it
            return False, r

        # we must check the directory before re-using it
        dircap = r.was_created()
        self._directories_checked += 1
        cr = self._check_cap_health(dircap)
        if cr is None or not cr["results"]["healthy"]:
            # can't check, or the directory is unhealthy: must re-create
            return True, r
        # directory is healthy, no need to upload
        r.did_check_healthy(cr)
        return False, r

    def upload(self, childpath):
        """
        Upload the file at *childpath*, unless the backupdb shows a healthy
        previous upload that can be re-used.

        :return: (did_upload, filecap, metadata)
        :raises IOError: when called on an unreadable file.
        """
        precondition_abspath(childpath)

        metadata = get_local_metadata(childpath)

        # we can use the backupdb here
        must_upload, bdb_results = self.check_backupdb_file(childpath)

        if must_upload:
            self.verboseprint("uploading %s.." % quote_local_unicode_path(childpath))
            # use a context manager so the file handle is closed even when
            # the PUT fails (previously it was never closed at all)
            with open(childpath, "rb") as infileobj:
                url = self.options['node-url'] + "uri"
                resp = do_http("PUT", url, infileobj)
                if resp.status not in (200, 201):
                    raise HTTPError("Error during file PUT", resp)

                filecap = resp.read().strip()
            self.verboseprint(" %s -> %s" % (quote_local_unicode_path(childpath, quotemarks=False),
                                             quote_output(filecap, quotemarks=False)))

            if bdb_results:
                bdb_results.did_upload(filecap)

            return True, filecap, metadata

        else:
            self.verboseprint("skipping %s.." % quote_local_unicode_path(childpath))
            return False, bdb_results.was_uploaded(), metadata
276
277
def backup(options):
    """Entry point for the "tahoe backup" command; return its exit code."""
    return BackerUpper(options).run()
281
282
def collect_backup_targets(root, listdir, filter_children):
    """
    Yield BackupTargets in a suitable order for processing (deepest targets
    before their parents).

    :param root: local path of the directory tree to walk
    :param listdir: callable mapping a directory path to its child names
    :param filter_children: callable filtering a list of child names
    """
    try:
        children = listdir(root)
    except EnvironmentError:
        yield PermissionDeniedTarget(root, isdir=True)
    except FilenameEncodingError:
        yield FilenameUndecodableTarget(root, isdir=True)
    else:
        for child in filter_children(children):
            assert isinstance(child, str), child
            childpath = os.path.join(root, child)
            # The symlink test comes first: a link to a directory must be
            # reported (and skipped) as a link, not descended into.
            if os.path.islink(childpath):
                yield LinkTarget(childpath, isdir=False)
            elif os.path.isdir(childpath):
                # Recurse, so a directory's children are yielded before the
                # directory itself.
                yield from collect_backup_targets(
                    childpath,
                    listdir,
                    filter_children,
                )
            elif os.path.isfile(childpath):
                yield FileTarget(childpath)
            else:
                yield SpecialTarget(childpath)
        yield DirectoryTarget(root)
313
314
def run_backup(
        warn,
        upload_file,
        upload_directory,
        targets,
        start_timestamp,
        stdout,
):
    """
    Back up every target in *targets*, printing a progress line to *stdout*
    after each one.

    :return: a BackupComplete summarizing the whole run.
    """
    progress = BackupProgress(warn, start_timestamp, len(targets))
    for backup_target in targets:
        # BackupProgress is mutable today and each target mutates it in
        # place, but we still rebind the returned value so that an immutable
        # progress implementation could be dropped in later without touching
        # this loop.
        progress = backup_target.backup(progress, upload_file, upload_directory)
        print(progress.report(datetime.datetime.now()), file=stdout)
    return progress.backup_finished()
333
334
class FileTarget(object):
    """A regular local file to be backed up."""

    def __init__(self, path):
        self._path = path

    def __repr__(self):
        return "<File {}>".format(self._path)

    def backup(self, progress, upload_file, upload_directory):
        """
        Upload (or re-use) this file and record the outcome on *progress*;
        an unreadable file is downgraded to a permission-denied skip.
        """
        try:
            created, childcap, metadata = upload_file(self._path)
        except EnvironmentError:
            # upload_file raises IOError (an EnvironmentError) on an
            # unreadable file; delegate to the error target.
            denied = PermissionDeniedTarget(self._path, isdir=False)
            return denied.backup(progress, upload_file, upload_directory)
        assert isinstance(childcap, bytes)
        if created:
            return progress.created_file(self._path, childcap, metadata)
        return progress.reused_file(self._path, childcap, metadata)
353
354
class DirectoryTarget(object):
    """A local directory to be represented by an immutable dirnode."""

    def __init__(self, path):
        self._path = path

    def __repr__(self):
        return "<Directory {}>".format(self._path)

    def backup(self, progress, upload_file, upload_directory):
        """
        Create (or re-use) the grid directory for this path and record the
        outcome on *progress*.
        """
        metadata = get_local_metadata(self._path)
        progress, create, compare = progress.consume_directory(self._path)
        did_create, dircap = upload_directory(self._path, compare, create)
        record = (
            progress.created_directory if did_create
            else progress.reused_directory
        )
        return record(self._path, dircap, metadata)
369
370
class _ErrorTarget(object):
    """
    Base for targets that cannot be backed up; remembers the path, a quoted
    form of it for warning messages, and whether it is a directory.
    """

    def __init__(self, path, isdir=False):
        self._isdir = isdir
        self._path = path
        self._quoted_path = quote_local_unicode_path(path)
376
377
class PermissionDeniedTarget(_ErrorTarget):
    """A file or directory that could not be read."""

    def backup(self, progress, upload_file, upload_directory):
        # Nothing to upload; just record the skip and warn.
        return progress.permission_denied(self._isdir, self._quoted_path)
381
382
class FilenameUndecodableTarget(_ErrorTarget):
    """A directory whose listing could not be decoded to text."""

    def backup(self, progress, upload_file, upload_directory):
        # Nothing to upload; just record the skip and warn.
        return progress.decoding_failed(self._isdir, self._quoted_path)
386
387
class LinkTarget(_ErrorTarget):
    """A symbolic link, which tahoe backup does not support."""

    def backup(self, progress, upload_file, upload_directory):
        return progress.unsupported_filetype(
            self._isdir, self._quoted_path, "symlink",
        )
395
396
class SpecialTarget(_ErrorTarget):
    """A special file (device, fifo, socket, ...), which is not supported."""

    def backup(self, progress, upload_file, upload_directory):
        return progress.unsupported_filetype(
            self._isdir, self._quoted_path, "special",
        )
404
405
class BackupComplete(object):
    """
    Summary of a finished backup run: counters, the start/end timestamps,
    and the cap of the newly-created backup directory (``dircap``).
    """
    def __init__(self,
                 start_timestamp,
                 end_timestamp,
                 files_created,
                 files_reused,
                 files_skipped,
                 directories_created,
                 directories_reused,
                 directories_skipped,
                 dircap,
    ):
        self._start_timestamp = start_timestamp
        self._end_timestamp = end_timestamp
        self._files_created = files_created
        self._files_reused = files_reused
        self._files_skipped = files_skipped
        self._directories_created = directories_created
        self._directories_reused = directories_reused
        self._directories_skipped = directories_skipped
        # public: run() links this cap into Archives/ and Latest
        self.dircap = dircap

    def any_skips(self):
        """True-ish when any file or directory was skipped."""
        return self._files_skipped or self._directories_skipped

    def report(self, verbosity, files_checked, directories_checked):
        """Render a multi-line, human-readable summary at *verbosity*."""
        lines = []

        if verbosity >= 1:
            counts = (
                self._files_created,
                self._files_reused,
                self._files_skipped,
                self._directories_created,
                self._directories_reused,
                self._directories_skipped,
            )
            lines.append(
                " %d files uploaded (%d reused),"
                " %d files skipped,"
                " %d directories created (%d reused),"
                " %d directories skipped" % counts
            )

        if verbosity >= 2:
            lines.append(
                " %d files checked, %d directories checked"
                % (files_checked, directories_checked)
            )

        # str() of a timedelta looks like "H:MM:SS.ffffff"; chop off the
        # microseconds for the report.
        duration = self._end_timestamp - self._start_timestamp
        elapsed_time = str(duration).split('.')[0]
        lines.append(" backup done, elapsed time: %s" % (elapsed_time,))

        return "\n".join(lines)
463
464
class BackupProgress(object):
    """
    Mutable running state of a backup: per-kind counters, the child entries
    accumulated for directories not yet created on the grid, and the cap of
    the most recent directory.  Would be nice if this were immutable and its
    methods were transformations producing new instances; not there yet.
    """
    def __init__(self, warn, start_timestamp, target_count):
        self._warn = warn
        self._start_timestamp = start_timestamp
        self._target_count = target_count
        self._files_created = 0
        self._files_reused = 0
        self._files_skipped = 0
        self._directories_created = 0
        self._directories_reused = 0
        self._directories_skipped = 0
        # cap of the most recently recorded directory; after the walk this
        # holds the root of the backup
        self.last_dircap = None
        # local path -> ("filenode"/"dirnode", cap, metadata), consumed when
        # the parent directory is created
        self._create_contents = {}
        # local path -> cap, consumed for backupdb comparison
        self._compare_contents = {}

    def report(self, now):
        """Return a one-line progress report as of *now* (a datetime)."""
        finished = sum((
            self._files_created,
            self._files_reused,
            self._files_skipped,
            self._directories_created,
            self._directories_reused,
            self._directories_skipped,
        ))
        template = (
            "Backing up {target_progress}/{target_total}... {elapsed} elapsed..."
        )
        return template.format(
            target_progress=finished,
            target_total=self._target_count,
            elapsed=self._format_elapsed(now - self._start_timestamp),
        )

    def _format_elapsed(self, elapsed):
        """Render the timedelta *elapsed* as "Hh Mm Ss"."""
        total_minutes, secs = divmod(int(elapsed.total_seconds()), 60)
        hrs, mins = divmod(total_minutes, 60)
        return "{}h {}m {}s".format(hrs, mins, secs)

    def backup_finished(self):
        """Freeze the counters into a BackupComplete and return it."""
        return BackupComplete(
            self._start_timestamp,
            datetime.datetime.now(),
            self._files_created,
            self._files_reused,
            self._files_skipped,
            self._directories_created,
            self._directories_reused,
            self._directories_skipped,
            self.last_dircap,
        )

    def consume_directory(self, dirpath):
        """
        Return (progress, create-children, compare-children) for *dirpath*.
        The two dicts map basenames to the entries previously recorded for
        paths that are direct children of *dirpath*.
        """
        def direct_children(recorded):
            return {
                os.path.basename(child_path): value
                for child_path, value in list(recorded.items())
                if os.path.dirname(child_path) == dirpath
            }
        return (
            self,
            direct_children(self._create_contents),
            direct_children(self._compare_contents),
        )

    def _record_directory(self, path, dircap, metadata):
        # remember this dirnode so the parent directory can include it
        self._create_contents[path] = ("dirnode", dircap, metadata)
        self._compare_contents[path] = dircap
        self.last_dircap = dircap

    def created_directory(self, path, dircap, metadata):
        """Record that a new dirnode was created for *path*."""
        self._record_directory(path, dircap, metadata)
        self._directories_created += 1
        return self

    def reused_directory(self, path, dircap, metadata):
        """Record that an existing dirnode was re-used for *path*."""
        self._record_directory(path, dircap, metadata)
        self._directories_reused += 1
        return self

    def _record_file(self, path, cap, metadata):
        # remember this filenode so the parent directory can include it
        self._create_contents[path] = ("filenode", cap, metadata)
        self._compare_contents[path] = cap

    def created_file(self, path, cap, metadata):
        """Record that *path* was newly uploaded as *cap*."""
        self._record_file(path, cap, metadata)
        self._files_created += 1
        return self

    def reused_file(self, path, cap, metadata):
        """Record that the existing upload *cap* of *path* was re-used."""
        self._record_file(path, cap, metadata)
        self._files_reused += 1
        return self

    def permission_denied(self, isdir, quoted_path):
        """Record a skip caused by an unreadable file or directory."""
        return self._skip(
            "WARNING: permission denied on {kind} {path}",
            isdir,
            path=quoted_path,
        )

    def decoding_failed(self, isdir, quoted_path):
        """Record a skip caused by an undecodable filename."""
        return self._skip(
            "WARNING: could not list {kind} {path} due to a filename encoding error",
            isdir,
            path=quoted_path,
        )

    def unsupported_filetype(self, isdir, quoted_path, filetype):
        """Record a skip of a symlink or other special file."""
        return self._skip(
            "WARNING: cannot backup {filetype} {path}",
            isdir,
            path=quoted_path,
            filetype=filetype,
        )

    def _skip(self, message, isdir, **kw):
        # bump the right counter and emit the warning
        if isdir:
            kind = "directory"
            self._directories_skipped += 1
        else:
            kind = "file"
            self._files_skipped += 1
        self._warn(message.format(kind=kind, **kw))
        # Pretend we're a persistent data structure being transformed.
        return self
Note: See TracBrowser for help on using the repository browser.