Skip to content

Unify the code to recursively walk structures and make compatible with Python 3. #93

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 39 additions & 20 deletions mig/shared/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@
from past.builtins import basestring

import base64
import codecs
import os
import re

# IMPORTANT: do not import any other MiG modules here - to avoid import loops
from mig.shared.compat import PY2
from mig.shared.defaults import default_str_coding, default_fs_coding, \
keyword_all, keyword_auto, sandbox_names, _user_invisible_files, \
_user_invisible_dirs, _vgrid_xgi_scripts, cert_field_order, csrf_field, \
Expand Down Expand Up @@ -488,12 +490,28 @@ def verify_local_url(configuration, req_url):
return False


def is_bytes_type(thetype):
"""Return boolean indicating if val is a unicode string. We avoid
Copy link
Contributor

@jonasbardino jonasbardino Aug 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean "if thetype corresponds to a byte sequence / string" ... looks like unfinished copy / paste.

Comparing to bytes works but perhaps it would be slightly clearer to compare with type(b"") here for symmetry with the is_unicode_type helper. One could even refactor them into one, then, but that's another matter :-)

`isinstance(val, unicode)`
and the like since it breaks when combined with python-future and futurize.
"""
return (thetype == bytes)


def is_unicode(val):
"""Return boolean indicating if val is a unicode string. We avoid
`isinstance(val, unicode)`
and the like since it breaks when combined with python-future and futurize.
"""
return (type(u"") == type(val))
return is_unicode_type(type(val))


def is_unicode_type(thetype):
    """Return boolean indicating if thetype is the unicode string type, i.e.
    the type of a u"" literal. We compare against that literal's type and
    avoid
    `isinstance(val, unicode)`
    and the like since it breaks when combined with python-future and futurize.
    """
    # Derive the reference type from a literal instead of naming it directly.
    unicode_type = type(u"")
    return thetype == unicode_type


def force_utf8(val, highlight=''):
Expand All @@ -505,23 +523,32 @@ def force_utf8(val, highlight=''):
val = "%s" % val
if not is_unicode(val):
return val
return "%s%s%s" % (highlight, val.encode("utf8"), highlight)
return codecs.encode("%s%s%s" % (highlight, val, highlight), 'utf8')


def _walk_and_convert_recursive(input_obj, highlight='', _is_primitive=None, _force_primitive=None, _force_recursive=None):
assert _is_primitive is not None
assert _force_primitive is not None
assert _force_recursive is not None

thetype = type(input_obj)
if issubclass(thetype, dict):
return {_force_recursive(i, highlight): _force_recursive(j, highlight) for (i, j) in
input_obj.items()}
elif issubclass(thetype, (list, tuple)):
return thetype((_force_recursive(i, highlight) for i in input_obj))
elif not _is_primitive(thetype):
return _force_primitive(input_obj, highlight)
else:
return input_obj


def force_utf8_rec(input_obj, highlight=''):
"""Recursive object conversion from unicode to utf8: useful to convert e.g.
dictionaries with nested unicode strings to a pure utf8 version. Actual
changes are marked out with the highlight string if given.
"""
if isinstance(input_obj, dict):
return {force_utf8_rec(i, highlight): force_utf8_rec(j, highlight) for (i, j) in
input_obj.items()}
elif isinstance(input_obj, list):
return [force_utf8_rec(i, highlight) for i in input_obj]
elif is_unicode(input_obj):
return force_utf8(input_obj, highlight)
else:
return input_obj
return _walk_and_convert_recursive(input_obj, highlight, _is_primitive=is_bytes_type, _force_primitive=force_utf8, _force_recursive=force_utf8_rec)


def force_unicode(val, highlight=''):
Expand All @@ -541,15 +568,7 @@ def force_unicode_rec(input_obj, highlight=''):
dictionaries with nested utf8 strings to a pure unicode version. Actual
changes are marked out with the highlight string if given.
"""
if isinstance(input_obj, dict):
return {force_unicode_rec(i, highlight): force_unicode_rec(j, highlight) for (i, j) in
input_obj.items()}
elif isinstance(input_obj, list):
return [force_unicode_rec(i, highlight) for i in input_obj]
elif not is_unicode(input_obj):
return force_unicode(input_obj, highlight)
else:
return input_obj
return _walk_and_convert_recursive(input_obj, highlight, _is_primitive=is_unicode_type, _force_primitive=force_unicode, _force_recursive=force_unicode_rec)


def _force_default_coding(input_obj, kind, highlight=''):
Expand Down
5 changes: 3 additions & 2 deletions mig/shared/safeinput.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@
from html import escape as escape_html
assert escape_html is not None

from mig.shared.base import force_unicode, force_utf8
from mig.shared.base import force_unicode
from mig.shared.compat import ensure_native_string
from mig.shared.defaults import src_dst_sep, username_charset, \
username_max_length, session_id_charset, session_id_length, \
subject_id_charset, subject_id_min_length, subject_id_max_length, \
Expand Down Expand Up @@ -2294,7 +2295,7 @@ def __init__(self, value):
def __str__(self):
"""Return string representation"""

return force_utf8(force_unicode(self.value))
return ensure_native_string(self.value)


def main(_exit=sys.exit, _print=print):
Expand Down
117 changes: 117 additions & 0 deletions tests/test_mig_shared_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
#
# --- BEGIN_HEADER ---
#
# test_mig_shared_base - unit test of the corresponding mig shared module
# Copyright (C) 2003-2024 The MiG Project by the Science HPC Center at UCPH
#
# This file is part of MiG.
#
# MiG is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# MiG is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
# --- END_HEADER ---
#

"""Unit tests for the migrid module pointed to in the filename"""

import binascii
import os
import sys

from tests.support import PY2, MigTestCase, testmain

from mig.shared.base import force_default_fs_coding_rec, \
force_default_fs_coding, force_default_str_coding_rec, \
force_default_str_coding, force_utf8, force_unicode

# Sample values, presumably intended as inputs for the coding helper tests;
# none of the tests visible in this excerpt reference them.
# The eight ASCII hex digits kept as a byte string.
DUMMY_BYTECHARS = b'DEADBEEF'
# The same hex digits decoded into raw binary.
DUMMY_BYTESRAW = binascii.unhexlify('DEADBEEF') # 4 bytes
# Text literal mixing ASCII with non-ASCII characters.
DUMMY_UNICODE = u'UniCode123½¾µßðþđŋħĸþł@ª€£$¥©®'


class MigSharedBase__force_default_fs_coding_rec(MigTestCase):
"""Unit tests of mig.shared.base force_default_fs_coding_rec()"""
Copy link
Contributor

@jonasbardino jonasbardino Aug 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This version of the test suite looks a lot better in terms of correctness and coverage, as we discussed previously.
I still think we should either emphasize in the docstring or a comment that we explicitly test output contents to be byte values (b"XYZ") because we already know that the force_default_fs_coding* functions always must return bytes. Without that information one should really compare the recursive output parts against the plain force_default_fs_coding(val) in one test series and perhaps add another test series to check that the non-recursive version returns bytes to complete the chain.


def test_encode_a_string(self):
output = force_default_fs_coding_rec('foobar')

self.assertEqual(output, b'foobar')

def test_encode_within_a_dict(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might want to duplicate the bytes/unicode permutations from the tuple tests below for these other data types for the sake of completeness if nothing else.

output = force_default_fs_coding_rec({ 'key': 'value' })

self.assertEqual(output, { b'key': b'value' })

def test_encode_within_a_list(self):
output = force_default_fs_coding_rec(['foo', 'bar', 'baz'])

self.assertEqual(output, [b'foo', b'bar', b'baz'])

def test_encode_within_a_tuple_string(self):
output = force_default_fs_coding_rec(('foo', 'bar', 'baz'))

self.assertEqual(output, (b'foo', b'bar', b'baz'))

def test_encode_within_a_tuple_bytes(self):
output = force_default_fs_coding_rec((b'foo', b'bar', b'baz'))

self.assertEqual(output, (b'foo', b'bar', b'baz'))

def test_encode_within_a_tuple_unicode(self):
output = force_default_fs_coding_rec((u'foo', u'bar', u'baz'))

self.assertEqual(output, (b'foo', b'bar', b'baz'))


class MigSharedBase__force_utf8(MigTestCase):
    """Unit tests of mig.shared.base force_utf8()"""

    # Every case below expects the same byte string result no matter which
    # string flavour of 'foobar' is passed in.

    def test_encode_string(self):
        # native string literal as input
        expected = b'foobar'
        self.assertEqual(force_utf8('foobar'), expected)

    def test_encode_bytes(self):
        # byte string literal as input
        expected = b'foobar'
        self.assertEqual(force_utf8(b'foobar'), expected)

    def test_encode_unicode(self):
        # explicit unicode literal as input
        expected = b'foobar'
        self.assertEqual(force_utf8(u'foobar'), expected)


class MigSharedBase__force_unicode(MigTestCase):
    """Unit tests of mig.shared.base force_unicode()"""

    # Every case below expects the same unicode result no matter which
    # string flavour of 'foobar' is passed in.

    def test_encode_string(self):
        # native string literal as input
        expected = u'foobar'
        self.assertEqual(force_unicode('foobar'), expected)

    def test_encode_bytes(self):
        # byte string literal as input
        expected = u'foobar'
        self.assertEqual(force_unicode(b'foobar'), expected)

    def test_encode_unicode(self):
        # explicit unicode literal as input
        expected = u'foobar'
        self.assertEqual(force_unicode(u'foobar'), expected)

if __name__ == '__main__':
testmain(failfast=True)
Loading