source: trunk/src/allmydata/webish.py

Last change on this file was 79512a9, checked in by Jean-Paul Calderone <exarkun@…>, at 2023-07-11T20:30:54Z

Adjust the temp factory return type

BinaryIO is a subclass of IO[bytes] so it doesn't check out as
the return type of a callable we pass around.

Switch to the superclass instead.

  • Property mode set to 100644
File size: 12.4 KB
Line 
1"""
2General web server-related utilities.
3"""
4from __future__ import annotations
5
6from six import ensure_str
7from typing import IO, Callable, Optional
8import re, time, tempfile
9from urllib.parse import parse_qsl, urlencode
10
11from cgi import (
12    FieldStorage,
13)
14from io import (
15    BytesIO,
16)
17
18from twisted.application import service, strports, internet
19from twisted.web import static
20from twisted.web.http import (
21    parse_qs,
22)
23from twisted.web.server import (
24    Request,
25    Site,
26)
27from twisted.internet import defer
28from twisted.internet.address import (
29    IPv4Address,
30    IPv6Address,
31)
32from allmydata.util import log, fileutil
33
34from allmydata.web import introweb, root
35from allmydata.web.operations import OphandleTable
36
37from .web.storage_plugins import (
38    StoragePlugins,
39)
40
41
class FileUploadFieldStorage(FieldStorage):
    """
    A ``FieldStorage`` that keeps uploaded file contents as bytes.

    Python 2 always handed uploaded files back as bytes.  Python 3 decides
    with a heuristic instead: a field that carries a filename is treated as
    a file upload (bytes), and a field without one is treated as text
    (Unicode).

    Tahoe-LAFS always wants bytes, and it also lets the filename be given in
    a separate field called "name" rather than in the MIME filename.  An
    upload can therefore arrive with no MIME filename at all.

    The workaround is to shadow ``filename`` with a property that reports a
    placeholder in that situation.  Mypy doesn't like it either, thus the
    ``# type: ignore`` below.

    Source for idea:
    https://mail.python.org/pipermail/python-dev/2017-February/147402.html
    """
    @property  # type: ignore
    def filename(self):
        mime_filename = self._mime_filename
        if mime_filename or self.name != "file":
            # Either a real MIME filename was supplied, or this is not the
            # upload field; report exactly what was stored.
            return mime_filename
        # We use the file field to upload files, see directory.py's
        # _POST_upload.  With no MIME filename we have to trick FieldStorage
        # into thinking there is one so that it returns bytes.
        return "unknown-filename"

    @filename.setter
    def filename(self, value):
        # Keep whatever the base class assigns; the getter decides what to
        # report.
        self._mime_filename = value
73
74
class TahoeLAFSRequest(Request, object):
    """
    A Twisted Web ``Request`` extended with behavior Tahoe-LAFS relies on.

    :ivar NoneType|FieldStorage fields: For POST requests, a structured
        representation of the contents of the request body.  For anything
        else, ``None``.
    """
    fields = None

    def requestReceived(self, command, path, version):
        """
        Called by channel when all data has been received.

        This replaces the base implementation so that site-wide policies are
        applied to every request and so that multipart/form-post bodies are
        parsed in a less memory-intensive way for large file uploads.

        :param command: The request method; compared against ``b'POST'``.
        :param path: The request URI as bytes, optionally with a query string.
        :param version: The client protocol version, stored on ``clientproto``.
        """
        self.content.seek(0)
        self.args = {}
        self.stack = []

        self.method, self.uri = command, path
        self.clientproto = version

        # Everything before the first "?" is the path; the remainder, if
        # there is one, is the query string.
        self.path, separator, argstring = self.uri.partition(b'?')
        if separator:
            self.args = parse_qs(argstring, 1)

        raw_content_types = self.requestHeaders.getRawHeaders("content-type")
        content_type = (raw_content_types or [""])[0]
        form_media_types = (
            "multipart/form-data",
            "application/x-www-form-urlencoded",
        )
        if self.method == b'POST' and content_type.split(";")[0] in form_media_types:
            # We use FieldStorage here because it performs better than
            # cgi.parse_multipart(self.content, pdict) which is what
            # twisted.web.http.Request uses.

            # Flatten the headers to {lower-cased name: last value}.
            headers = {}
            for (name, value) in self.requestHeaders.getAllRawHeaders():
                key = ensure_str(name.lower())
                headers[key] = ensure_str(value[-1])

            if 'content-length' not in headers:
                # Python 3's cgi module would really, really like us to set
                # Content-Length, so measure the body: seek to the end
                # (whence=2), note the offset, and rewind.
                self.content.seek(0, 2)
                headers['content-length'] = str(self.content.tell())
                self.content.seek(0)

            self.fields = FileUploadFieldStorage(
                self.content, headers, environ={'REQUEST_METHOD': 'POST'})
            # Rewind again after FieldStorage has been handed the body.
            self.content.seek(0)

        self._tahoeLAFSSecurityPolicy()

        self.processing_started_timestamp = time.time()
        self.process()

    def _tahoeLAFSSecurityPolicy(self):
        """
        Set the response headers that make up the Tahoe-LAFS security policy.

        ``requestReceived`` calls this for every request before processing
        it, so the policy applies regardless of which resource ends up
        handling the request.
        """
        # See https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Frame-Options
        self.responseHeaders.setRawHeaders("X-Frame-Options", ["DENY"])
        # See https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referrer-Policy
        self.setHeader("Referrer-Policy", "no-referrer")
146
147
148def _get_client_ip(request):
149    try:
150        get = request.getClientAddress
151    except AttributeError:
152        return request.getClientIP()
153    else:
154        client_addr = get()
155        if isinstance(client_addr, (IPv4Address, IPv6Address)):
156            return client_addr.host
157        return None
158
159
def _logFormatter(logDateTime, request):
    """
    Build the access-log line for ``request`` with capabilities hidden.

    ``TahoeLAFSSite`` passes this function to ``Site`` as ``logFormatter``.

    :param logDateTime: Not used; accepted because it is part of the
        formatter call signature.
    :param request: The request to describe.  Its ``uri`` and ``method`` are
        bytes that originate from the client.

    :return: The log line as a ``str``.
    """
    # we build up a log string that hides most of the cap, to preserve
    # user privacy. We retain the query args so we can identify things
    # like t=json. Then we send it to the flog. We make no attempt to
    # match apache formatting. TODO: when we move to DSA dirnodes and
    # shorter caps, consider exposing a few characters of the cap, or
    # maybe a few characters of its hash.
    x = request.uri.split(b"?", 1)
    if len(x) == 1:
        # no query args
        path = request.uri
        queryargs = b""
    else:
        path, queryargs = x
        queryargs = b"?" + censor(queryargs)
    if path.startswith(b"/uri/"):
        path = b"/uri/[CENSORED]"
    elif path.startswith(b"/file/"):
        path = b"/file/[CENSORED]"
    elif path.startswith(b"/named/"):
        path = b"/named/[CENSORED]"

    uri = path + queryargs

    template = "web: %(clientip)s %(method)s %(uri)s %(code)s %(length)s"
    return template % dict(
        clientip=_get_client_ip(request),
        # The method and URI are client-controlled bytes with no guarantee
        # of being valid UTF-8.  Decode leniently so that a malformed
        # request still produces a log line instead of raising
        # UnicodeDecodeError from inside the formatter.
        method=str(request.method, "utf-8", errors="replace"),
        uri=str(uri, "utf-8", errors="replace"),
        code=request.code,
        length=(request.sentLength or "-"),
        # The template above does not reference these two keys, so they do
        # not appear in the output.
        facility="tahoe.webish",
        level=log.OPERATIONAL,
    )
194
195
def censor(queryargs: bytes) -> bytes:
    """
    Replace potentially sensitive values in query arguments with a
    constant string.

    :param queryargs: The raw query string, without the leading ``?``.  It
        is client-supplied and therefore not guaranteed to be ASCII.

    :return: The query string re-encoded as ASCII, with the values of
        sensitive fields replaced by ``[CENSORED]``.
    """
    # A strict ASCII decode raises UnicodeDecodeError on any stray non-ASCII
    # byte in the query.  Decode as UTF-8 and substitute U+FFFD for
    # undecodable bytes instead; the result is identical for pure-ASCII
    # input, and urlencode() below percent-encodes anything non-ASCII.
    text = queryargs.decode("utf-8", errors="replace")
    args = parse_qsl(text, keep_blank_values=True, encoding="utf8")

    sensitive_keys = (
        # there is a form handler which redirects POST /uri?uri=FOO into
        # GET /uri/FOO so folks can paste in non-HTTP-prefixed uris. Make
        # sure we censor these.
        "uri",
        # Likewise, sometimes a private key is supplied with mutable
        # creation.
        "private-key",
    )
    result = [
        (k, "[CENSORED]" if k in sensitive_keys else v)
        for (k, v) in args
    ]

    # Customize safe to try to leave our markers intact.
    return urlencode(result, safe="[]").encode("ascii")
218
219
def anonymous_tempfile_factory(tempdir: bytes) -> Callable[[], IO[bytes]]:
    """
    Build a zero-argument callable that opens a fresh temporary file in the
    given directory each time it is called.

    :param tempdir: The directory in which temporary files will be created.

    :return: The callable.  Each call returns the result of
        ``tempfile.TemporaryFile(dir=tempdir)``.
    """
    def make_tempfile() -> IO[bytes]:
        return tempfile.TemporaryFile(dir=tempdir)

    return make_tempfile
230
231
class TahoeLAFSSite(Site, object):
    """
    The HTTP protocol factory used by Tahoe-LAFS.

    On top of ``Site`` it provides:

    * A configurable temporary file factory for large request bodies, so
      they are not kept in memory.

    * A log formatter that writes some access logs but leaves capability
      strings out of them to help keep them secret.
    """
    requestFactory = TahoeLAFSRequest

    def __init__(self, make_tempfile: Callable[[], IO[bytes]], *args, **kwargs):
        """
        :param make_tempfile: A no-argument callable returning a new
            temporary file; used for request bodies that are not buffered
            in memory.

        All other arguments are passed on to ``Site``, together with
        ``logFormatter=_logFormatter``.
        """
        super().__init__(*args, logFormatter=_logFormatter, **kwargs)
        assert callable(make_tempfile)
        # Create one temporary file and close it straight away.
        # NOTE(review): presumably this is here so a factory that cannot
        # create files fails at construction time -- confirm.
        with make_tempfile():
            pass
        self._make_tempfile = make_tempfile

    def getContentFile(self, length: Optional[int]) -> IO[bytes]:
        """
        Return the file-like object that will hold a request body.

        :param length: The size of the body in bytes, or ``None``.

        :return: An in-memory ``BytesIO`` when ``length`` is given and is
            below one MiB; otherwise a file from the temporary file factory.
        """
        in_memory_limit = 1024 * 1024
        if length is not None and length < in_memory_limit:
            return BytesIO()
        return self._make_tempfile()
257
class WebishServer(service.MultiService):
    """
    The service that serves the web-API: it builds the resource tree, wraps
    it in a ``TahoeLAFSSite`` and listens on the configured port.
    """
    # The type in Twisted for services is wrong in 22.10...
    # https://github.com/twisted/twisted/issues/10135
    name = "webish"  # type: ignore[assignment]

    def __init__(self, client, webport, make_tempfile, nodeurl_path=None, staticdir=None,
                 clock=None, now_fn=time.time):
        """
        :param client: Passed to ``root.Root`` and to ``StoragePlugins``.
        :param webport: A strports description of where to listen.  A value
            that starts with a digit is treated as a bare TCP port number.
        :param make_tempfile: The temporary file factory handed to
            ``TahoeLAFSSite``.
        :param nodeurl_path: If set, the server's URL is written to this
            path once the listening port is known.
        :param staticdir: If set, this directory is served under ``static``.
        :param clock: If set, a ``twisted.internet.task.Clock`` supplied by
            the unit tests so they can test time-dependent features
            deterministically.
        :param now_fn: Passed to ``root.Root``.
        """
        service.MultiService.__init__(self)
        # the 'data' argument to all render() methods default to the Client
        # the 'clock' argument to root.Root is, if set, a
        # twisted.internet.task.Clock that is provided by the unit tests
        # so that they can test features that involve the passage of
        # time in a deterministic manner.

        self.root = root.Root(client, clock, now_fn)
        self.buildServer(webport, make_tempfile, nodeurl_path, staticdir)

        # If set, clock is a twisted.internet.task.Clock that the tests
        # use to test ophandle expiration.
        self._operations = OphandleTable(clock)
        self._operations.setServiceParent(self)
        self.root.putChild(b"operations", self._operations)

        self.root.putChild(b"storage-plugins", StoragePlugins(client))

    def buildServer(self, webport, make_tempfile, nodeurl_path, staticdir):
        """
        Create the site and its listening service.  ``self.root`` must
        already be set.

        :param webport: See ``__init__``.
        :param make_tempfile: See ``__init__``.
        :param nodeurl_path: See ``__init__``.
        :param staticdir: See ``__init__``.
        """
        self.webport = webport
        self.site = TahoeLAFSSite(make_tempfile, self.root)
        self.staticdir = staticdir # so tests can check
        if staticdir:
            self.root.putChild(b"static", static.File(staticdir))
        # strports must be native strings.  Normalise *before* inspecting
        # the value: the regex and the concatenation below only work on
        # str, so converting afterwards would let a bytes port fail with
        # TypeError before it was ever converted.
        webport = ensure_str(webport)
        if re.search(r'^\d', webport):
            webport = "tcp:"+webport # twisted warns about bare "0" or "3456"
        s = strports.service(webport, self.site)
        s.setServiceParent(self)

        # These three are filled in by startService once the port is known.
        self._scheme = None
        self._portnum = None
        self._url = None
        self._listener = s # stash it so we can query for the portnum

        # Fired by startService: called back once the URL is known, or
        # errbacked if listening fails.
        self._started = defer.Deferred()
        if nodeurl_path:
            def _write_nodeurl_file(ign):
                # this file will be created with default permissions
                line = self.getURL() + "\n"
                fileutil.write_atomically(nodeurl_path, line, mode="")
            self._started.addCallback(_write_nodeurl_file)

    def getURL(self):
        """
        :return: The server's URL.  Only available once the listening port
            is known; asserts otherwise.
        """
        assert self._url
        return self._url

    def getPortnum(self):
        """
        :return: The port number being listened on.  Only available once
            the listening port is known; asserts otherwise.
        """
        assert self._portnum
        return self._portnum

    def startService(self):
        """
        Start the child services, then work out the scheme and port of the
        listener and fire ``self._started`` accordingly.
        """
        def _got_port(lp):
            self._portnum = lp.getHost().port
            # what is our webport?
            assert self._scheme
            self._url = "%s://127.0.0.1:%d/" % (self._scheme, self._portnum)
            self._started.callback(None)
            return lp
        def _fail(f):
            self._started.errback(f)
            return f

        service.MultiService.startService(self)
        s = self._listener
        if hasattr(s, 'endpoint') and hasattr(s, '_waitingForPort'):
            # Twisted 10.2 gives us a StreamServerEndpointService. This is
            # ugly but should do for now.
            classname = s.endpoint.__class__.__name__
            if classname.startswith('SSL'):
                self._scheme = 'https'
            else:
                self._scheme = 'http'
            s._waitingForPort.addCallbacks(_got_port, _fail)
        elif isinstance(s, internet.TCPServer):
            # Twisted <= 10.1
            self._scheme = 'http'
            _got_port(s._port)
        elif isinstance(s, internet.SSLServer):
            # Twisted <= 10.1
            self._scheme = 'https'
            _got_port(s._port)
        else:
            # who knows, probably some weirdo future version of Twisted
            self._started.errback(AssertionError("couldn't find out the scheme or port for the web-API server"))

    def get_operations(self):
        """
        :return: a reference to our "active operations" tracker
        """
        return self._operations
357
358
class IntroducerWebishServer(WebishServer):
    """
    A ``WebishServer`` whose resource tree is rooted at
    ``introweb.IntroducerRoot`` and which always uses
    ``tempfile.TemporaryFile`` as its temporary file factory.
    """
    def __init__(self, introducer, webport, nodeurl_path=None, staticdir=None):
        """
        :param introducer: Passed to ``introweb.IntroducerRoot``.
        :param webport: Where to listen; handed to ``buildServer``.
        :param nodeurl_path: Handed to ``buildServer``.
        :param staticdir: Handed to ``buildServer``.
        """
        # Only MultiService is initialised here.  WebishServer.__init__ is
        # not called, so the operations tracker and the "operations" and
        # "storage-plugins" children it sets up do not exist on this server.
        service.MultiService.__init__(self)
        self.root = introweb.IntroducerRoot(introducer)
        self.buildServer(
            webport=webport,
            make_tempfile=tempfile.TemporaryFile,
            nodeurl_path=nodeurl_path,
            staticdir=staticdir,
        )
Note: See TracBrowser for help on using the repository browser.