Ticket #562: censor.darcspatch.txt

File censor.darcspatch.txt, 9.8 KB (added by kevan, at 2010-05-01T23:49:00Z)

implementation of 'tahoe censor'

Line 
1Sun Apr 25 13:08:26 PDT 2010  kevan@isnotajoke.com
2  * Add code to support 'tahoe censor'
3 
4  'tahoe censor' allows users to quickly and easily remove furls, IP
5  addresses and (eventually) storage indices from log files, so that they
6  can be posted in bug reports, mailing list messages, and other places
7  without much effort.
8
9New patches:
10
11[Add code to support 'tahoe censor'
12kevan@isnotajoke.com**20100425200826
13 Ignore-this: 4e8c27d53f766c82e18bf5c7f5df0c01
14 
15 'tahoe censor' allows users to quickly and easily remove furls, IP
16 addresses and (eventually) storage indices from log files, so that they
17 can be posted in bug reports, mailing list messages, and other places
18 without much effort.
19] {
20hunk ./src/allmydata/scripts/cli.py 411
21     (which must be a directory), like 'tahoe check' but for multiple files.
22     Optionally repair any problems found."""
23 
24+
class CensorOptions(BaseOptions, usage.Options):
    # Command-line options for 'tahoe censor'.
    #
    # After parsing, self['oldfile'] holds the log to read and
    # self['newfile'] holds the path to write the censored log to.
    optFlags = [
        ["verbose", "v", "Print information about what is going on"]
    ]

    def parseArgs(self, oldfile, newfile=None):
        # oldfile: path of the log file to censor (required).
        # newfile: path to write the censored log to (optional). When it
        #          is omitted the destination is the source itself, i.e.
        #          the log is censored in place.
        self['oldfile'] = oldfile
        if newfile is None:
            newfile = oldfile
        self['newfile'] = newfile

    def getSynopsis(self):
        # DEST-LOG is shown in brackets because parseArgs treats it as
        # optional.
        return "%s censor SOURCE-LOG [DEST-LOG]" % (os.path.basename(sys.argv[0]),)

    longdesc = """
    Censor IP addresses, node addresses and storage indices from
    SOURCE-LOG, writing the results to DEST-LOG. If DEST-LOG is not
    given, SOURCE-LOG is censored in place."""
43+
44+
45 subCommands = [
46     ["mkdir", None, MakeDirectoryOptions, "Create a new directory"],
47     ["add-alias", None, AddAliasOptions, "Add a new alias cap"],
48hunk ./src/allmydata/scripts/cli.py 450
49     ["stats", None, StatsOptions, "Print statistics about all files/directories in a subtree"],
50     ["check", None, CheckOptions, "Check a single file or directory"],
51     ["deep-check", None, DeepCheckOptions, "Check all files/directories reachable from a starting point"],
52+    ["censor", None, CensorOptions, "Purge sensitive information from a log file"],
53     ]
54 
55 def mkdir(options):
56hunk ./src/allmydata/scripts/cli.py 547
57     rc = tahoe_check.deepcheck(options)
58     return rc
59 
def censor(options):
    """Dispatch 'tahoe censor': hand the parsed options to tahoe_censor.

    Returns whatever tahoe_censor.censor(options) returns, which is used
    as the command's result code.
    """
    # The implementation module is imported here, inside the function,
    # rather than at the top of the file.
    from allmydata.scripts import tahoe_censor
    return tahoe_censor.censor(options)
64+
65 dispatch = {
66     "mkdir": mkdir,
67     "add-alias": add_alias,
68hunk ./src/allmydata/scripts/cli.py 570
69     "stats": stats,
70     "check": check,
71     "deep-check": deepcheck,
72+    "censor": censor,
73     }
74addfile ./src/allmydata/scripts/tahoe_censor.py
75hunk ./src/allmydata/scripts/tahoe_censor.py 1
76+import sys, pickle, bz2, re, shutil, os
77+
78+from twisted.python import usage
79+
class Censorer:
    """
    I use pickle to read events in a foolscap log, remove information
    that people might not want to be public knowledge (e.g., IP
    addresses, node furls, storage indices) from the messages of those
    logs, and write out my results.

    Only events that have a 'message' entry inside their 'd' dict are
    censored and written out; every other event is left out of the
    destination file.

    SECURITY NOTE: the source file is parsed with pickle.load, which can
    execute arbitrary code embedded in a malicious file. Only censor log
    files that come from a source you trust.
    """


    NONEXISTENT_FILE_MSG = ("Error: Either '%s' doesn't exist, or I "
                            "can't read it")
    UNWRITABLE_FILE_MSG  = ("Error: I can't write to '%s'")

    IN_PLACE_LOG_MSG     = ("Info: I'm censoring the file '%s' in-place")
    NEW_DEST_LOG_MSG     = ("Info: I'm censoring '%s', and writing the "
                            "results to '%s'")
    STATUS_MSG           = ("Info: I'm processing the message '%s'")
    INVALID_FILE_MSG     = ("Error: Either the file '%s' is unreadable, "
                            "or it is in an invalid format")
    DONE_MSG             = ("Info: Done. Your censored log files are now in "
                            "'%s'. I processed a total of %d messages. I "
                            "found and removed the following:\n "
                            "IP Addresses: %d\n "
                            "Storage Indices: %d\n "
                            "Node URLs: %d\n")

    # Patterns are compiled once, at class-definition time.
    #
    # Four dot-separated groups of digits. This also matches other
    # four-part dotted numbers, such as some version strings.
    IP_EXPRESSION    = re.compile(r"\d+\.\d+\.\d+\.\d+")
    IP_REPLACEMENT   = "xxx.xxx.xxx.xxx"
    # Non-greedy, so that two tags in one message are censored (and
    # counted) separately instead of swallowing the text between them.
    SI_EXPRESSION    = re.compile(r"<si>.*?</si>", flags=re.IGNORECASE)
    SI_REPLACEMENT   = "<si>xxxxxx</si>"
    # Everything from 'pb://' up to the first '/', plus the name that
    # follows it, so that the part after the slash is censored too and
    # unrelated text later in the message is left alone.
    FURL_EXPRESSION  = re.compile(r"pb://[^\s/]*/[\w\-]*",
                                  flags=re.IGNORECASE)
    FURL_REPLACEMENT = "pb://xxxx/"


    def run(self, options):
        """
        Censor options['oldfile'] and write the result to
        options['newfile'] (the same path means: censor in place).

        I also read options['verbose'] and options['quiet'], and I print
        to options.stdout and options.stderr. I return 0 on success and
        1 if the source can't be read, the destination can't be written,
        or the source isn't a valid log.
        """
        self.options = options
        self.rc = 0
        self.censored_logs = 0
        self.censored_ips = 0
        self.censored_sis = 0
        self.censored_furls = 0
        newfile = options['newfile']
        oldfile = options['oldfile']
        in_place = (newfile == oldfile)

        # Does the source file exist? Is it readable?
        if not os.path.exists(oldfile) or not os.access(oldfile, os.R_OK):
            print >>options.stderr, self.NONEXISTENT_FILE_MSG % oldfile
            return 1
        # Does the destination file exist? If so, can I write to it?
        if os.path.exists(newfile) and not os.access(newfile, os.W_OK):
            print >>options.stderr, self.UNWRITABLE_FILE_MSG % newfile
            return 1

        if options['verbose']:
            if in_place:
                print >>options.stdout, self.IN_PLACE_LOG_MSG % newfile
            else:
                # The message names the source first, then the
                # destination.
                print >>options.stdout, self.NEW_DEST_LOG_MSG % (oldfile,
                                                                 newfile)

        # When censoring in place, write to a temporary sibling file
        # first. It keeps a '.bz2' suffix so that the compression choice
        # below matches the final name.
        if in_place:
            newfilename = newfile + ".tmp"
            if newfile.endswith(".bz2"):
                newfilename += ".bz2"
        else:
            newfilename = newfile

        newfilef = self._open(newfilename, "wb")
        try:
            for e in self.get_events(oldfile):
                self._censor_event(e, newfilef)
        finally:
            # Close the output even if reading or writing blew up.
            newfilef.close()

        if self.rc != 0:
            # get_events has already reported the problem. Never replace
            # the original log with a truncated result.
            if in_place:
                os.remove(newfilename)
            return self.rc

        if in_place:
            shutil.move(newfilename, newfile)
        if not options['quiet']:
            print >>options.stdout, self.DONE_MSG % (newfile,
                                                     self.censored_logs,
                                                     self.censored_ips,
                                                     self.censored_sis,
                                                     self.censored_furls)
        return self.rc


    def _open(self, filename, mode):
        """
        Open filename with the given mode, through bz2 if its name ends
        in '.bz2'.
        """
        if filename.endswith(".bz2"):
            return bz2.BZ2File(filename, mode)
        return open(filename, mode)


    def _censor_event(self, e, newfilef):
        """
        Censor the message of the event e, update my counters, and
        pickle the event to newfilef. Events without a message are
        skipped and not written.
        """
        if "d" not in e or "message" not in e['d']:
            return

        original = e['d']["message"]
        # IP addresses are replaced first, so by the time the furl
        # pattern runs, any addresses inside a furl are already masked.
        message, ip_matches = self.IP_EXPRESSION.subn(self.IP_REPLACEMENT,
                                                      original)
        message, si_matches = self.SI_EXPRESSION.subn(self.SI_REPLACEMENT,
                                                      message)
        message, furl_matches = self.FURL_EXPRESSION.subn(
            self.FURL_REPLACEMENT, message)
        self.censored_ips += ip_matches
        self.censored_sis += si_matches
        self.censored_furls += furl_matches

        # Note that verbose mode echoes the *uncensored* message.
        if self.options['verbose'] \
        and (ip_matches or si_matches or furl_matches):
            print >>self.options.stdout, self.STATUS_MSG % original
        e['d']["message"] = message
        pickle.dump(e, newfilef, 2)
        self.censored_logs += 1


    def _invalid_source(self, source):
        """Record a failure and tell the user that source is unusable."""
        self.rc = 1
        print >>self.options.stderr, self.INVALID_FILE_MSG % source


    def get_events(self, source):
        """
        Generator: yield each pickled event dict stored in source.

        If source holds no events at all, or holds something that isn't
        a pickled dict, I set self.rc to 1, print INVALID_FILE_MSG to
        self.options.stderr, and stop.
        """
        f = self._open(source, "rb")
        events_read = 0

        while True:
            # Only the load is guarded; the yield stays outside the try
            # so that closing or interrupting the generator is never
            # mistaken for a corrupt file.
            try:
                # WARNING: unpickling can run arbitrary code; see the
                # class docstring.
                e = pickle.load(f)
            except EOFError:
                # pickle.load will raise an EOFError when it is done
                # reading the contents of a file, but also when there
                # aren't any more valid contents to read. The first
                # condition is okay; it means there are no more events
                # to read; the latter isn't. So we keep track of how
                # many events we have read, and use that to figure out
                # which condition we're in.
                if events_read == 0:
                    self._invalid_source(source)
                break
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception:
                # Sometimes, pickle will raise other exceptions to tell
                # us that it doesn't like the format of what it is
                # reading.
                self._invalid_source(source)
                break

            if not isinstance(e, dict):
                # A well-formed pickle, but not an event.
                self._invalid_source(source)
                break
            events_read += 1
            yield e
        f.close()
216+
217+
def censor(options):
    """
    Entry point for 'tahoe censor': build a Censorer, run it with the
    given options, and return its result code.
    """
    return Censorer().run(options)
222}
223
224Context:
225
226[setup: add licensing declaration for setuptools (noticed by the FSF compliance folks)
227zooko@zooko.com**20100309184415
228 Ignore-this: 2dfa7d812d65fec7c72ddbf0de609ccb
229]
230[setup: fix error in licensing declaration from Shawn Willden, as noted by the FSF compliance division
231zooko@zooko.com**20100309163736
232 Ignore-this: c0623d27e469799d86cabf67921a13f8
233]
234[CREDITS to Jacob Appelbaum
235zooko@zooko.com**20100304015616
236 Ignore-this: 70db493abbc23968fcc8db93f386ea54
237]
238[desert-island-build-with-proper-versions
239jacob@appelbaum.net**20100304013858]
240[docs: a few small edits to try to guide newcomers through the docs
241zooko@zooko.com**20100303231902
242 Ignore-this: a6aab44f5bf5ad97ea73e6976bc4042d
243 These edits were suggested by my watching over Jake Appelbaum's shoulder as he completely ignored/skipped/missed install.html and also as he decided that debian.txt wouldn't help him with basic installation. Then I threw in a few docs edits that have been sitting around in my sandbox asking to be committed for months.
244]
245[TAG allmydata-tahoe-1.6.1
246david-sarah@jacaranda.org**20100228062314
247 Ignore-this: eb5f03ada8ea953ee7780e7fe068539
248]
249Patch bundle hash:
250cc205da6be53515218cbe01c0e349b63f076a4d1