# Options for the 'tahoe censor' subcommand (added to
# src/allmydata/scripts/cli.py).
#
# 'tahoe censor' allows users to quickly and easily remove furls, IP
# addresses and (eventually) storage indices from log files, so that
# they can be posted in bug reports, mailing list messages, and other
# places without much effort.
class CensorOptions(BaseOptions, usage.Options):
    optFlags = [
        ["verbose", "v", "Print information about what is going on"],
    ]

    def parseArgs(self, oldfile, newfile=None):
        # SOURCE-LOG is required. DEST-LOG is optional; when it is left
        # out the destination is the source itself, i.e. the log is
        # censored in place.
        self['oldfile'] = oldfile
        if newfile is None:
            newfile = oldfile
        self['newfile'] = newfile

    def getSynopsis(self):
        # DEST-LOG is bracketed because parseArgs treats it as optional.
        return "%s censor SOURCE-LOG [DEST-LOG]" % (os.path.basename(sys.argv[0]),)

    longdesc = """
    Censor IP addresses, node addresses and storage indices from
    SOURCE-LOG, writing the results to DEST-LOG. If DEST-LOG is
    omitted, SOURCE-LOG is censored in place."""


# Registered in the subCommands table as:
#     ["censor", None, CensorOptions, "Purge sensitive information from a log file"],
import sys
import pickle
import bz2
import re
import shutil
import os

# NOTE(review): the original module also carried
# "from twisted.python import usage"; nothing in this module refers to
# 'usage', so the import is not repeated here.


class Censorer:
    """
    I use pickle to read events in a foolscap log, remove information
    that people might not want to be public knowledge (e.g., IP
    addresses, node furls, storage indices) from the messages of those
    logs, and write out my results.

    SECURITY: I call pickle.load on the source file, and unpickling can
    execute arbitrary code. Only point me at log files that you trust.
    """

    NONEXISTENT_FILE_MSG = ("Error: Either '%s' doesn't exist, or I "
                            "can't read it")
    UNWRITABLE_FILE_MSG = ("Error: I can't write to '%s'")

    IN_PLACE_LOG_MSG = ("Info: I'm censoring the file '%s' in-place")
    NEW_DEST_LOG_MSG = ("Info: I'm censoring '%s', and writing the "
                        "results to '%s'")
    STATUS_MSG = ("Info: I'm processing the message '%s'")
    INVALID_FILE_MSG = ("Error: Either the file '%s' is unreadable, "
                        "or it is in an invalid format")
    DONE_MSG = ("Info: Done. Your censored log files are now in "
                "'%s'. I processed a total of %d messages. I "
                "found and removed the following:\n  "
                "IP Addresses: %d\n  "
                "Storage Indices: %d\n  "
                "Node URLs: %d\n")

    # The patterns are compiled once, here, instead of on every run().
    IP_EXPRESSION = re.compile(r"\d+\.\d+\.\d+\.\d+")
    IP_REPLACEMENT = "xxx.xxx.xxx.xxx"
    # The previous pattern was ".*", which replaced every message in its
    # entirety. This one assumes that a full storage index is printed as
    # a free-standing run of 26 base32 characters -- TODO confirm
    # against real logs. Abbreviated storage indices are NOT caught.
    SI_EXPRESSION = re.compile(r"\b[a-z2-7]{26}\b", flags=re.IGNORECASE)
    SI_REPLACEMENT = "xxxxxx"
    # Match the whole furl up to the next whitespace. The previous
    # greedy "pb://.*/" stopped at the last '/', so whatever followed it
    # (presumably the furl's secret name) stayed in the output.
    FURL_EXPRESSION = re.compile(r"pb://\S+", flags=re.IGNORECASE)
    FURL_REPLACEMENT = "pb://xxxx/"

    def run(self, options):
        """
        Censor options['oldfile'] and write the result to
        options['newfile'] (the same path means: censor in place).

        I read options['verbose'] and, if present, options['quiet'], and
        I write my messages to options.stdout and options.stderr. I
        return 0 on success and 1 if the source could not be read, the
        destination could not be written, or the source was not a valid
        log. When I fail in in-place mode, the original file is left
        untouched.
        """
        self.options = options
        self.rc = 0
        self.censored_logs = 0
        self.censored_ips = 0
        self.censored_sis = 0
        self.censored_furls = 0
        newfile = options['newfile']
        oldfile = options['oldfile']

        # Does the source file exist? Is it readable?
        if not os.path.exists(oldfile) or not os.access(oldfile, os.R_OK):
            self._say(options.stderr, self.NONEXISTENT_FILE_MSG % oldfile)
            return 1
        # Does the destination file exist? If so, can I write to it?
        if os.path.exists(newfile) and not os.access(newfile, os.W_OK):
            self._say(options.stderr, self.UNWRITABLE_FILE_MSG % newfile)
            return 1

        # Compare resolved paths, so that two spellings of the same file
        # don't make me truncate the file I am about to read.
        in_place = os.path.realpath(newfile) == os.path.realpath(oldfile)
        verbose = options['verbose']
        # 'quiet' is not declared by the censor options themselves;
        # presumably the base options supply it. If it is missing, I
        # behave as if it were off.
        quiet = options.get('quiet', False)

        if verbose:
            if in_place:
                self._say(options.stdout, self.IN_PLACE_LOG_MSG % newfile)
            else:
                # source first, then destination, as the message reads
                self._say(options.stdout,
                          self.NEW_DEST_LOG_MSG % (oldfile, newfile))

        if in_place:
            # Write next to the original and swap at the end. The
            # ".bz2" suffix is repeated so that the temporary file is
            # compressed whenever the original is.
            newfilename = newfile + ".tmp"
            if newfile.endswith(".bz2"):
                newfilename += ".bz2"
        else:
            newfilename = newfile

        if newfilename.endswith(".bz2"):
            newfilef = bz2.BZ2File(newfilename, "wb")
        else:
            newfilef = open(newfilename, "wb")

        completed = False
        try:
            for e in self.get_events(oldfile):
                self._censor_event(e, verbose)
                # Every event is written back, whether or not it carried
                # a message.
                pickle.dump(e, newfilef, 2)
                self.censored_logs += 1
            # get_events sets self.rc when it rejects the source
            completed = (self.rc == 0)
        finally:
            newfilef.close()
            if in_place and not completed and os.path.exists(newfilename):
                # Don't leave a partial temporary file behind.
                os.remove(newfilename)

        if not completed:
            return self.rc

        if in_place:
            shutil.move(newfilename, newfile)
        if not quiet:
            self._say(options.stdout, self.DONE_MSG % (newfile,
                                                       self.censored_logs,
                                                       self.censored_ips,
                                                       self.censored_sis,
                                                       self.censored_furls))
        return self.rc

    def _say(self, stream, text):
        # Write one line of output, like "print >>stream, text" did.
        stream.write(text + "\n")

    def _invalid(self, source):
        # Record the failure and tell the user which file was rejected.
        self.rc = 1
        self._say(self.options.stderr, self.INVALID_FILE_MSG % source)

    def _censor_event(self, e, verbose):
        # Censor e['d']['message'] in place and update my counters.
        # NOTE(review): only the 'message' entry is rewritten; anything
        # else stored in the event is copied through unchanged.
        if not ("d" in e and "message" in e['d']):
            return
        original = e['d']["message"]
        message, ip_matches = self.IP_EXPRESSION.subn(self.IP_REPLACEMENT,
                                                      original)
        message, si_matches = self.SI_EXPRESSION.subn(self.SI_REPLACEMENT,
                                                      message)
        message, furl_matches = self.FURL_EXPRESSION.subn(
            self.FURL_REPLACEMENT, message)
        self.censored_ips += ip_matches
        self.censored_sis += si_matches
        self.censored_furls += furl_matches

        if verbose and (ip_matches or si_matches or furl_matches):
            # This shows the message as it was before censoring.
            self._say(self.options.stdout, self.STATUS_MSG % original)
        e['d']["message"] = message

    def get_events(self, source):
        """
        I am a generator that yields the events (dicts) pickled in
        source, which is read through bz2 if its name ends in '.bz2'.

        If source turns out not to be a valid log, I set self.rc to 1,
        report that on self.options.stderr, and stop.
        """
        if source.endswith(".bz2"):
            f = bz2.BZ2File(source, "rb")
        else:
            f = open(source, "rb")

        events_read = 0
        try:
            while True:
                try:
                    # SECURITY: unpickling untrusted data is unsafe.
                    e = pickle.load(f)
                except EOFError:
                    # pickle.load will raise an EOFError when it is done
                    # reading the contents of a file, but also when
                    # there aren't any valid contents to read. The first
                    # condition is okay; the latter isn't. So I keep
                    # track of how many events I have read, and use that
                    # to figure out which condition I'm in.
                    if events_read == 0:
                        self._invalid(source)
                    break
                except Exception:
                    # Sometimes, pickle will raise other exceptions to
                    # tell us that it doesn't like the format of what it
                    # is reading.
                    self._invalid(source)
                    break
                if not isinstance(e, dict):
                    # An explicit check, because an assert would vanish
                    # under "python -O".
                    self._invalid(source)
                    break
                events_read += 1
                yield e
        finally:
            f.close()


def censor(options):
    """Censor a log file as described by options; return the exit code."""
    c = Censorer()
    rc = c.run(options)
    return rc
] [TAG allmydata-tahoe-1.6.1 david-sarah@jacaranda.org**20100228062314 Ignore-this: eb5f03ada8ea953ee7780e7fe068539 ] Patch bundle hash: cc205da6be53515218cbe01c0e349b63f076a4d1