[tahoe-dev] [tahoe-lafs] #534: "tahoe cp" command encoding issue

Tue Apr 28 23:11:02 PDT 2009

#534: "tahoe cp" command encoding issue
-----------------------------------+----------------------------------------
     Reporter:  francois           |       Owner:  francois                          
         Type:  defect             |      Status:  assigned                          
     Priority:  minor              |   Milestone:  1.5.0                             
    Component:  code-frontend-cli  |     Version:  1.2.0                             
   Resolution:                     |    Keywords:  cp encoding unicode filename utf-8
Launchpad_bug:                     |  
-----------------------------------+----------------------------------------

Comment(by zooko):

 I was referencing this ticket and my (untested) example code here in the
 PEP 383 thread on python-dev
 (http://mail.python.org/pipermail/python-dev/2009-April/089170.html ), and
 I realized that I forgot to set the "failed_decode" flag in my example
 code.  Here is a new version of it that sets the "failed_decode" flag, and
 that uses {{{utf-8}}} for the attempted decode and only uses {{{utf-8b}}}
 when doing the fallback (for clarity -- this should make no difference,
 since the attempted decode uses error handling mode 'strict').

 I will also attach it.

 {{{
 # A wrapper around the Python Standard Library's filename access functions
 # to provide a uniform API for all platforms and to prevent lossy
 # en/de-coding.

 class Fname:
     """ A filename bundled with bookkeeping about how it was decoded.

     Attributes:
       name -- the filename itself, stored exactly as given
       failed_decode -- flag saying whether decoding the original bytes
           failed (defaults to False)
       alleged_encoding -- the encoding the name was supposedly in, or
           None if none was recorded (the default)
     """
     def __init__(self, name, failed_decode=False, alleged_encoding=None):
         # Plain value holder: nothing is validated or converted here.
         self.name, self.failed_decode, self.alleged_encoding = (
             name, failed_decode, alleged_encoding)

 if platform.system() in ('Linux', 'Solaris'):
     # on byte-oriented filesystems, such as Linux and Solaris

     def unicode_to_fs(fn):
         """ Encode the unicode name held by an Fname into bytes.

         fn must be an Fname whose .name is a unicode object (both are
         checked with precondition()).  Returns the encoded bytes.

         Raises usage.UsageError if the name cannot be encoded with the
         encoding chosen for the filesystem.
         """
         precondition(isinstance(fn, Fname), fn)
         precondition(isinstance(fn.name, unicode), fn.name)

         if fn.failed_decode:
             # This means that the unicode string in .name is not
             # actually the result of a successful decoding with a
             # suggested codec, but is instead the result of stuffing the
             # bytes into a unicode by dint of the utf-8b trick.  This
             # means that on a byte-oriented system, you shouldn't treat
             # the .name as a unicode string containing chars, but
             # instead you should get the original bytes back out of it.
             #
             # NOTE(review): every other call here uses the
             # 'python-escape' error handler; 'python-replace' may be a
             # typo for it -- confirm against the utf-8b codec in use.
             return fn.name.encode('utf-8b', 'python-replace')
         else:
             fsencoding = sys.getfilesystemencoding()
             if fsencoding in (None, '', 'ascii', 'utf-8'):
                 fsencoding = 'utf-8b'
             try:
                 # Encode with the encoding selected above (this used to
                 # reference an undefined name, 'encoding').
                 return fn.name.encode(fsencoding, 'python-escape')
             except UnicodeEncodeError:
                 # Report the offending name itself (this used to
                 # reference an undefined name, 's').  Adjacent literals
                 # are used instead of backslash continuations so that no
                 # stray indentation leaks into the message.
                 raise usage.UsageError(
                     "Filename '%s' cannot be "
                     "encoded using the current encoding of your "
                     "filesystem (%s). Please "
                     "configure your locale correctly or rename this file." %
                     (fn.name, sys.getfilesystemencoding()))

     def fs_to_unicode(bytesfn):
         """ Decode bytes from the filesystem into an Fname.

         bytesfn must be a str (i.e. bytes), checked with precondition().

         The bytes are first decoded strictly with the filesystem encoding
         (utf-8 when that is unset or 'ascii').  On success the result is
         NFC-normalized, and the encoding is recorded in
         .alleged_encoding unless it was utf-8.  If the strict decode
         fails, the bytes are decoded with 'utf-8b' instead and the
         returned Fname has failed_decode=True.
         """
         # Pass the offending value (not the type 'str') so that a failed
         # precondition shows what was actually received.
         precondition(isinstance(bytesfn, str), bytesfn)

         alleged_encoding = sys.getfilesystemencoding()
         if alleged_encoding in (None, '', 'ascii'):
             alleged_encoding = 'utf-8'

         try:
             unicodefn = bytesfn.decode(alleged_encoding, 'strict')
         except UnicodeDecodeError:
             # .decode() raises UnicodeDecodeError (not UnicodeEncodeError)
             # on undecodable bytes; catching the wrong class meant this
             # fallback was never taken.
             unicodefn = bytesfn.decode('utf-8b', 'python-escape')
             return Fname(unicodefn, failed_decode=True)
         else:
             unicodefn = unicodedata.normalize('NFC', unicodefn)
             if alleged_encoding == 'utf-8':
                 return Fname(unicodefn)
             else:
                 # Must be passed by keyword: positionally it would land
                 # in the failed_decode parameter of Fname.
                 return Fname(unicodefn, alleged_encoding=alleged_encoding)

     def listdir(fn):
         """ List a directory on a byte-oriented filesystem.

         fn must be an Fname whose .name is a unicode object.  Returns a
         list with one Fname per entry reported by os.listdir().
         """
         assert isinstance(fn, Fname), fn
         assert isinstance(fn.name, unicode), fn.name
         # unicode_to_fs() takes the Fname itself (it needs the
         # failed_decode flag), not the bare unicode name.
         bytesfn = unicode_to_fs(fn)
         res = os.listdir(bytesfn)
         # Use a distinct loop variable so the 'fn' parameter is not
         # rebound by the comprehension.
         return [fs_to_unicode(child) for child in res]

 else:
     # on unicode-oriented filesystems, such as Mac and Windows
     def listdir(fn):
         """ List a directory on a unicode-oriented filesystem.

         fn must be an Fname whose .name is a unicode object.  Each entry
         returned by os.listdir() is wrapped in an Fname, and the list of
         those is returned.
         """
         assert isinstance(fn, Fname), fn
         assert isinstance(fn.name, unicode), fn.name
         entries = os.listdir(fn.name)
         return [Fname(entry) for entry in entries]
 }}}

-- 
Ticket URL: <http://allmydata.org/trac/tahoe/ticket/534#comment:60>
tahoe-lafs <http://allmydata.org>
secure decentralized file storage grid