From a8ae9e5421f28248c2de40ac25281b10055bdc38 Mon Sep 17 00:00:00 2001 From: Alex Burke Date: Fri, 31 May 2024 13:56:11 +0200 Subject: [PATCH 01/14] WIP: safename 1/x --- mig/shared/sanitize.py | 44 ++++++++++++++++++++++++++++++ tests/test_mig_shared_sanitize.py | 45 +++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 mig/shared/sanitize.py create mode 100644 tests/test_mig_shared_sanitize.py diff --git a/mig/shared/sanitize.py b/mig/shared/sanitize.py new file mode 100644 index 000000000..5fd498914 --- /dev/null +++ b/mig/shared/sanitize.py @@ -0,0 +1,44 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# +# --- BEGIN_HEADER --- +# +# safeeval - Safe evaluation of expressions and commands +# Copyright (C) 2003-2023 The MiG Project +# +# This file is part of MiG. +# +# MiG is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# MiG is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+# +# -- END_HEADER --- +# + +import codecs +import sys + +PY2 = sys.version_info[0] == 2 + +if PY2: + def _as_ascii_string(value): return value +else: + def _as_ascii_string(value): return codecs.decode(value, 'ascii') + + +def safename_encode(value): + return _as_ascii_string(codecs.encode(value, 'punycode')) + + +def safename_decode(value): + return codecs.decode(value, 'punycode') diff --git a/tests/test_mig_shared_sanitize.py b/tests/test_mig_shared_sanitize.py new file mode 100644 index 000000000..ca95d06a3 --- /dev/null +++ b/tests/test_mig_shared_sanitize.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- + +import importlib +import os +import sys + +sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), "."))) + +from support import MigTestCase, testmain + +from mig.shared.sanitize import safename_encode, safename_decode + +DUMMY_EXOTIC = u'UniCode123½¾µßðþđŋħĸþł@ª€£$¥©®' + + +class MigSharedSanitize_safename(MigTestCase): + def test_executes_encode(self): + safename_encode("") + + def test_encode_exotic(self): + encoded = safename_encode(DUMMY_EXOTIC) + + self.assertEqual( + encoded, "UniCode123@$-lna3a4dm6e3ftgua80ewlwka88boszo7i7iv930g") + + def test_executes_decode(self): + safename_decode("") + + def test_roundtrip_empty(self): + inputvalue = "" + + outputvalue = safename_decode(safename_encode(inputvalue)) + + self.assertEqual(outputvalue, inputvalue) + + def test_roundtrip_ascii(self): + inputvalue = "abcde123467890" + + outputvalue = safename_decode(safename_encode(inputvalue)) + + self.assertEqual(outputvalue, inputvalue) + + +if __name__ == '__main__': + testmain() From 9279de4d3cc8464060b110ec0c5a8138efea558a Mon Sep 17 00:00:00 2001 From: Alex Burke Date: Mon, 3 Jun 2024 16:46:45 +0200 Subject: [PATCH 02/14] start to encode special chars with shell meaning --- mig/shared/sanitize.py | 41 +++++++++++++++++++++++++++++-- tests/test_mig_shared_sanitize.py | 10 +++++--- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git 
a/mig/shared/sanitize.py b/mig/shared/sanitize.py index 5fd498914..0adb85bd2 100644 --- a/mig/shared/sanitize.py +++ b/mig/shared/sanitize.py @@ -26,8 +26,19 @@ # import codecs +import os +import string import sys +sys.path.append(os.path.realpath( + os.path.join(os.path.dirname(__file__), "../.."))) + +from mig.shared.defaults import username_charset + +UNSAFE_CHARS = sorted(list(set(string.printable) - set(username_charset))) +UNSAFE_CHARS_ORD = list(ord(c) for c in UNSAFE_CHARS) +UNSAFE_CHARS_NAMES = list(str(o).zfill(3) for o in UNSAFE_CHARS_ORD) +UNSAFE_SUBSTIUTIONS = dict(zip(UNSAFE_CHARS_ORD, UNSAFE_CHARS_NAMES)) PY2 = sys.version_info[0] == 2 if PY2: @@ -37,8 +48,34 @@ def _as_ascii_string(value): return codecs.decode(value, 'ascii') def safename_encode(value): - return _as_ascii_string(codecs.encode(value, 'punycode')) + punycoded = _as_ascii_string(codecs.encode(value, 'punycode')) + characters = list(punycoded) + + for index, character in enumerate(characters): + character_ordinal = ord(character) + character_substitute = UNSAFE_SUBSTIUTIONS.get(character_ordinal, None) + if character_substitute is not None: + characters[index] = ":%s" % character_substitute + + return ''.join(characters) def safename_decode(value): - return codecs.decode(value, 'punycode') + chunked = value.split(':') + + if len(chunked) > 1: + for index, chunk in enumerate(chunked): + if chunk == '': + continue + trailer = chunk[3:] + character_substitute = chunk[0:3] + character_ordinal = int(character_substitute) + chunked[index] = "%s%s" % (chr(character_ordinal), trailer) + + return codecs.decode(''.join(chunked), 'punycode') + + +if __name__ == '__main__': + d = dict(zip(UNSAFE_CHARS_ORD, UNSAFE_CHARS_NAMES)) + print(len(d)) + print(d) diff --git a/tests/test_mig_shared_sanitize.py b/tests/test_mig_shared_sanitize.py index ca95d06a3..11c95bb17 100644 --- a/tests/test_mig_shared_sanitize.py +++ b/tests/test_mig_shared_sanitize.py @@ -21,7 +21,7 @@ def test_encode_exotic(self): 
encoded = safename_encode(DUMMY_EXOTIC) self.assertEqual( - encoded, "UniCode123@$-lna3a4dm6e3ftgua80ewlwka88boszo7i7iv930g") + encoded, "UniCode123@:036-lna3a4dm6e3ftgua80ewlwka88boszo7i7iv930g") def test_executes_decode(self): safename_decode("") @@ -34,12 +34,16 @@ def test_roundtrip_empty(self): self.assertEqual(outputvalue, inputvalue) def test_roundtrip_ascii(self): - inputvalue = "abcde123467890" + inputvalue = "$abcde$123467890$" outputvalue = safename_decode(safename_encode(inputvalue)) self.assertEqual(outputvalue, inputvalue) -if __name__ == '__main__': +def main(): testmain() + + +if __name__ == '__main__': + main() From 808f6a45abaef0337cb955d8cae4902c8771a6e1 Mon Sep 17 00:00:00 2001 From: Alex Burke Date: Tue, 4 Jun 2024 11:11:19 +0200 Subject: [PATCH 03/14] visibly print unsafe chars --- mig/shared/sanitize.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/mig/shared/sanitize.py b/mig/shared/sanitize.py index 0adb85bd2..e503cb765 100644 --- a/mig/shared/sanitize.py +++ b/mig/shared/sanitize.py @@ -76,6 +76,21 @@ def safename_decode(value): if __name__ == '__main__': - d = dict(zip(UNSAFE_CHARS_ORD, UNSAFE_CHARS_NAMES)) - print(len(d)) - print(d) + def visibly_print(characters): + pieces = [] + for c in UNSAFE_CHARS: + c_ord = ord(c) + if c == ' ': + pieces.append("\\N{SPACE}") + elif c == '"': + pieces.append('\\"') + elif c_ord < 10: + # single digit control chars + pieces.append("\\x0%d" % c_ord) + elif c_ord < 32: + # double digit control chars + pieces.append("\\x%s" % c_ord) + else: + pieces.append(c) + return ''.join(pieces) + print("%d username chars: %s" % (len(UNSAFE_CHARS), visibly_print(UNSAFE_CHARS))) From cef4e9431b380d2feaf59b8f9ddb7511907708e0 Mon Sep 17 00:00:00 2001 From: Alex Burke Date: Tue, 4 Jun 2024 11:14:02 +0200 Subject: [PATCH 04/14] explicitly test ascii encode --- tests/test_mig_shared_sanitize.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git 
a/tests/test_mig_shared_sanitize.py b/tests/test_mig_shared_sanitize.py index 11c95bb17..6e8bcc1f9 100644 --- a/tests/test_mig_shared_sanitize.py +++ b/tests/test_mig_shared_sanitize.py @@ -5,11 +5,12 @@ import sys sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), "."))) - from support import MigTestCase, testmain from mig.shared.sanitize import safename_encode, safename_decode +DUMMY_ASCII = u'abcde123467890' +DUMMY_ASCII_WITH_REPLACE = "$abcde$123467890$" DUMMY_EXOTIC = u'UniCode123½¾µßðþđŋħĸþł@ª€£$¥©®' @@ -17,6 +18,12 @@ class MigSharedSanitize_safename(MigTestCase): def test_executes_encode(self): safename_encode("") + def test_encode_ascii(self): + encoded = safename_encode(DUMMY_ASCII) + + self.assertEqual( + encoded, "abcde123467890-") + def test_encode_exotic(self): encoded = safename_encode(DUMMY_EXOTIC) @@ -34,7 +41,7 @@ def test_roundtrip_empty(self): self.assertEqual(outputvalue, inputvalue) def test_roundtrip_ascii(self): - inputvalue = "$abcde$123467890$" + inputvalue = DUMMY_ASCII_WITH_REPLACE outputvalue = safename_decode(safename_encode(inputvalue)) From a7833ab119cb787741837b851a7cf2bfb9ffdf76 Mon Sep 17 00:00:00 2001 From: Alex Burke Date: Tue, 4 Jun 2024 16:24:36 +0200 Subject: [PATCH 05/14] wrap logic around the punycode hyphen to encode it --- mig/shared/sanitize.py | 78 +++++++++++++++++++++++++------ tests/test_mig_shared_sanitize.py | 18 +++++-- 2 files changed, 79 insertions(+), 17 deletions(-) diff --git a/mig/shared/sanitize.py b/mig/shared/sanitize.py index e503cb765..1dada2704 100644 --- a/mig/shared/sanitize.py +++ b/mig/shared/sanitize.py @@ -35,6 +35,9 @@ from mig.shared.defaults import username_charset +INDICATOR_CH = ':' +MARKER = INDICATOR_CH * 2 +MARKER_LENGTH = len(MARKER) UNSAFE_CHARS = sorted(list(set(string.printable) - set(username_charset))) UNSAFE_CHARS_ORD = list(ord(c) for c in UNSAFE_CHARS) UNSAFE_CHARS_NAMES = list(str(o).zfill(3) for o in UNSAFE_CHARS_ORD) @@ -47,32 +50,81 @@ def 
_as_ascii_string(value): return value def _as_ascii_string(value): return codecs.decode(value, 'ascii') +# TODO +# - swap to converting the ord char value to hex as a way to save bytes + def safename_encode(value): punycoded = _as_ascii_string(codecs.encode(value, 'punycode')) + + if len(punycoded) == 0: + return '' + + was_ascii = False + was_encoded = False + idx_trailer = -1 + + if punycoded[-1] == '-': + # the value is punycoded ascii - record this fact and + # remove this trailing character which will be added + # back later bsaed on the indication character + was_ascii = True + if not was_ascii: + idx_trailer = punycoded.rindex('-') + was_encoded = idx_trailer > -1 + if not (was_ascii or was_encoded): + raise NotImplementedError() + + characters = list(punycoded) for index, character in enumerate(characters): character_ordinal = ord(character) character_substitute = UNSAFE_SUBSTIUTIONS.get(character_ordinal, None) if character_substitute is not None: - characters[index] = ":%s" % character_substitute + characters[index] = "%s%s" % (INDICATOR_CH, character_substitute) + + if was_ascii: + # replace punycode single hyphen trailer with an escaped indicator + characters[-1] = INDICATOR_CH + characters.append(INDICATOR_CH) + + if was_encoded: + # replace punycode single hyphen trailer with an escaped indicator + characters[idx_trailer] = INDICATOR_CH + characters.insert(idx_trailer, INDICATOR_CH) return ''.join(characters) def safename_decode(value): - chunked = value.split(':') - - if len(chunked) > 1: - for index, chunk in enumerate(chunked): - if chunk == '': - continue - trailer = chunk[3:] - character_substitute = chunk[0:3] - character_ordinal = int(character_substitute) - chunked[index] = "%s%s" % (chr(character_ordinal), trailer) - - return codecs.decode(''.join(chunked), 'punycode') + if value == '': + return value + + value_to_decode = None + try: + idx = value.rindex(MARKER) + value_to_decode = ''.join((value[:idx + 1], '045', value[idx + 2:])) + except 
ValueError: + raise RuntimeError() + + chunked = value_to_decode.split(INDICATOR_CH) + + skip_first_chunk = chunked[0] != '' + index = 1 if skip_first_chunk else 0 + + while index < len(chunked): + chunk = chunked[index] + if chunk == '': + index += 1 + continue + character_substitute = chr(int(chunk[0:3])) + chunked[index] = "%s%s" % (character_substitute, chunk[3:]) + index += 1 + + try: + return codecs.decode(''.join(chunked), 'punycode') + except Exception as e: + raise e if __name__ == '__main__': diff --git a/tests/test_mig_shared_sanitize.py b/tests/test_mig_shared_sanitize.py index 6e8bcc1f9..a5e1ae175 100644 --- a/tests/test_mig_shared_sanitize.py +++ b/tests/test_mig_shared_sanitize.py @@ -15,24 +15,34 @@ class MigSharedSanitize_safename(MigTestCase): - def test_executes_encode(self): + def test_encode_basic(self): safename_encode("") def test_encode_ascii(self): encoded = safename_encode(DUMMY_ASCII) self.assertEqual( - encoded, "abcde123467890-") + encoded, "abcde123467890::") def test_encode_exotic(self): encoded = safename_encode(DUMMY_EXOTIC) self.assertEqual( - encoded, "UniCode123@:036-lna3a4dm6e3ftgua80ewlwka88boszo7i7iv930g") + encoded, "UniCode123@:036::lna3a4dm6e3ftgua80ewlwka88boszo7i7iv930g") - def test_executes_decode(self): + def test_decode_basic(self): safename_decode("") + def test_decode_ascii(self): + decoded = safename_decode("abcde123467890::") + + self.assertEqual(decoded, DUMMY_ASCII) + + def test_decode_exotic(self): + decoded = safename_decode("UniCode123@:036::lna3a4dm6e3ftgua80ewlwka88boszo7i7iv930g") + + self.assertEqual(decoded, DUMMY_EXOTIC) + def test_roundtrip_empty(self): inputvalue = "" From cfce1f713b44d0674ca38d9fae69e96d8de8cc62 Mon Sep 17 00:00:00 2001 From: Alex Burke Date: Thu, 6 Jun 2024 14:48:59 +0200 Subject: [PATCH 06/14] make two branches look the same --- mig/shared/sanitize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mig/shared/sanitize.py b/mig/shared/sanitize.py index 
1dada2704..3dee6e166 100644 --- a/mig/shared/sanitize.py +++ b/mig/shared/sanitize.py @@ -86,7 +86,7 @@ def safename_encode(value): if was_ascii: # replace punycode single hyphen trailer with an escaped indicator characters[-1] = INDICATOR_CH - characters.append(INDICATOR_CH) + characters.insert(-1, INDICATOR_CH) if was_encoded: # replace punycode single hyphen trailer with an escaped indicator From 9080468476ab28d435b006450b4bc74857725963 Mon Sep 17 00:00:00 2001 From: Alex Burke Date: Thu, 6 Jun 2024 15:01:35 +0200 Subject: [PATCH 07/14] simplify the code and collapse the indicator insertion into one block --- mig/shared/sanitize.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/mig/shared/sanitize.py b/mig/shared/sanitize.py index 3dee6e166..4acebac08 100644 --- a/mig/shared/sanitize.py +++ b/mig/shared/sanitize.py @@ -36,8 +36,8 @@ from mig.shared.defaults import username_charset INDICATOR_CH = ':' +INVALID_INSERTION_POINT = -2 MARKER = INDICATOR_CH * 2 -MARKER_LENGTH = len(MARKER) UNSAFE_CHARS = sorted(list(set(string.printable) - set(username_charset))) UNSAFE_CHARS_ORD = list(ord(c) for c in UNSAFE_CHARS) UNSAFE_CHARS_NAMES = list(str(o).zfill(3) for o in UNSAFE_CHARS_ORD) @@ -59,20 +59,22 @@ def safename_encode(value): if len(punycoded) == 0: return '' - was_ascii = False - was_encoded = False - idx_trailer = -1 + insertion_point = INVALID_INSERTION_POINT if punycoded[-1] == '-': # the value is punycoded ascii - record this fact and # remove this trailing character which will be added # back later bsaed on the indication character - was_ascii = True - if not was_ascii: - idx_trailer = punycoded.rindex('-') - was_encoded = idx_trailer > -1 - if not (was_ascii or was_encoded): - raise NotImplementedError() + insertion_point = -1 + else: + try: + insertion_point = punycoded.rindex('-') + except ValueError: + # the marker could not be located so the insertion + # point as not updated and thus remains set 
invalid + pass + if insertion_point == INVALID_INSERTION_POINT: + raise AssertionError(None) characters = list(punycoded) @@ -83,15 +85,10 @@ def safename_encode(value): if character_substitute is not None: characters[index] = "%s%s" % (INDICATOR_CH, character_substitute) - if was_ascii: + if insertion_point != INVALID_INSERTION_POINT: # replace punycode single hyphen trailer with an escaped indicator - characters[-1] = INDICATOR_CH - characters.insert(-1, INDICATOR_CH) - - if was_encoded: - # replace punycode single hyphen trailer with an escaped indicator - characters[idx_trailer] = INDICATOR_CH - characters.insert(idx_trailer, INDICATOR_CH) + characters[insertion_point] = INDICATOR_CH + characters.insert(insertion_point, INDICATOR_CH) return ''.join(characters) From 96a9c304f518b13a8efc2ddbb17b5054de63e1fe Mon Sep 17 00:00:00 2001 From: Alex Burke Date: Fri, 7 Jun 2024 10:39:46 +0200 Subject: [PATCH 08/14] convert to hex digits for substitutions of unsafe characters --- mig/shared/sanitize.py | 23 +++++++++++++++-------- tests/test_mig_shared_sanitize.py | 4 ++-- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/mig/shared/sanitize.py b/mig/shared/sanitize.py index 4acebac08..c260e7d00 100644 --- a/mig/shared/sanitize.py +++ b/mig/shared/sanitize.py @@ -25,6 +25,7 @@ # -- END_HEADER --- # +import base64 import codecs import os import string @@ -38,10 +39,10 @@ INDICATOR_CH = ':' INVALID_INSERTION_POINT = -2 MARKER = INDICATOR_CH * 2 +MARKER_HEXDIGIT_WIDTH = 2 UNSAFE_CHARS = sorted(list(set(string.printable) - set(username_charset))) -UNSAFE_CHARS_ORD = list(ord(c) for c in UNSAFE_CHARS) -UNSAFE_CHARS_NAMES = list(str(o).zfill(3) for o in UNSAFE_CHARS_ORD) -UNSAFE_SUBSTIUTIONS = dict(zip(UNSAFE_CHARS_ORD, UNSAFE_CHARS_NAMES)) +UNSAFE_CHARS_HEXDIGITS = None +UNSAFE_SUBSTIUTIONS = None PY2 = sys.version_info[0] == 2 if PY2: @@ -49,6 +50,11 @@ def _as_ascii_string(value): return value else: def _as_ascii_string(value): return codecs.decode(value, 
'ascii') +def _as_hexdigit(ch): + return _as_ascii_string(base64.b16encode(bytes(ch, 'ascii'))) + +UNSAFE_CHARS_HEXDIGITS = list(_as_hexdigit(c) for c in UNSAFE_CHARS) +UNSAFE_SUBSTIUTIONS = dict(zip(UNSAFE_CHARS, UNSAFE_CHARS_HEXDIGITS)) # TODO # - swap to converting the ord char value to hex as a way to save bytes @@ -80,8 +86,7 @@ def safename_encode(value): characters = list(punycoded) for index, character in enumerate(characters): - character_ordinal = ord(character) - character_substitute = UNSAFE_SUBSTIUTIONS.get(character_ordinal, None) + character_substitute = UNSAFE_SUBSTIUTIONS.get(character, None) if character_substitute is not None: characters[index] = "%s%s" % (INDICATOR_CH, character_substitute) @@ -100,7 +105,8 @@ def safename_decode(value): value_to_decode = None try: idx = value.rindex(MARKER) - value_to_decode = ''.join((value[:idx + 1], '045', value[idx + 2:])) + character_substitute = _as_hexdigit('-') + value_to_decode = ''.join((value[:idx + 1], character_substitute, value[idx + 2:])) except ValueError: raise RuntimeError() @@ -114,8 +120,9 @@ def safename_decode(value): if chunk == '': index += 1 continue - character_substitute = chr(int(chunk[0:3])) - chunked[index] = "%s%s" % (character_substitute, chunk[3:]) + hexdigit = chunk[0:MARKER_HEXDIGIT_WIDTH] + character_substitute = _as_ascii_string(base64.b16decode(hexdigit)) + chunked[index] = chunked[index].replace(hexdigit, character_substitute, 1) index += 1 try: diff --git a/tests/test_mig_shared_sanitize.py b/tests/test_mig_shared_sanitize.py index a5e1ae175..e41b2afa2 100644 --- a/tests/test_mig_shared_sanitize.py +++ b/tests/test_mig_shared_sanitize.py @@ -28,7 +28,7 @@ def test_encode_exotic(self): encoded = safename_encode(DUMMY_EXOTIC) self.assertEqual( - encoded, "UniCode123@:036::lna3a4dm6e3ftgua80ewlwka88boszo7i7iv930g") + encoded, "UniCode123@:24::lna3a4dm6e3ftgua80ewlwka88boszo7i7iv930g") def test_decode_basic(self): safename_decode("") @@ -39,7 +39,7 @@ def 
test_decode_ascii(self): self.assertEqual(decoded, DUMMY_ASCII) def test_decode_exotic(self): - decoded = safename_decode("UniCode123@:036::lna3a4dm6e3ftgua80ewlwka88boszo7i7iv930g") + decoded = safename_decode("UniCode123@:24::lna3a4dm6e3ftgua80ewlwka88boszo7i7iv930g") self.assertEqual(decoded, DUMMY_EXOTIC) From 83d4cef09509faeabb5e00aae02d032ae5ce52b2 Mon Sep 17 00:00:00 2001 From: Alex Burke Date: Mon, 17 Jun 2024 09:31:55 +0200 Subject: [PATCH 09/14] fixup --- mig/shared/sanitize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mig/shared/sanitize.py b/mig/shared/sanitize.py index c260e7d00..f8e270d1e 100644 --- a/mig/shared/sanitize.py +++ b/mig/shared/sanitize.py @@ -134,7 +134,7 @@ def safename_decode(value): if __name__ == '__main__': def visibly_print(characters): pieces = [] - for c in UNSAFE_CHARS: + for c in characters: c_ord = ord(c) if c == ' ': pieces.append("\\N{SPACE}") From 66b4671af237139bc4c0655df923b0ed3b5a7d40 Mon Sep 17 00:00:00 2001 From: Alex Burke Date: Mon, 17 Jun 2024 09:34:32 +0200 Subject: [PATCH 10/14] fixup - pretty print --- mig/shared/sanitize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mig/shared/sanitize.py b/mig/shared/sanitize.py index f8e270d1e..3187e83cd 100644 --- a/mig/shared/sanitize.py +++ b/mig/shared/sanitize.py @@ -145,8 +145,9 @@ def visibly_print(characters): pieces.append("\\x0%d" % c_ord) elif c_ord < 32: # double digit control chars - pieces.append("\\x%s" % c_ord) + pieces.append(str(hex(27)).replace('0', '\\', 1)) else: pieces.append(c) return ''.join(pieces) + print("%d username chars: %s" % (len(UNSAFE_CHARS), visibly_print(UNSAFE_CHARS))) From 9367fe84b8a46098e701ed4359eec03d60241b82 Mon Sep 17 00:00:00 2001 From: Alex Burke Date: Mon, 17 Jun 2024 09:36:32 +0200 Subject: [PATCH 11/14] fixup - insertion point comment --- mig/shared/sanitize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mig/shared/sanitize.py b/mig/shared/sanitize.py index 
3187e83cd..ab8380492 100644 --- a/mig/shared/sanitize.py +++ b/mig/shared/sanitize.py @@ -77,7 +77,7 @@ def safename_encode(value): insertion_point = punycoded.rindex('-') except ValueError: # the marker could not be located so the insertion - # point as not updated and thus remains set invalid + # point is not updated and thus remains set invalid pass if insertion_point == INVALID_INSERTION_POINT: raise AssertionError(None) From 44e698bff2b9b16cbdcdb18c4065f78bb57be211 Mon Sep 17 00:00:00 2001 From: Alex Burke Date: Mon, 17 Jun 2024 10:08:17 +0200 Subject: [PATCH 12/14] raise a specific identifiable error when decoding a non-safename string --- mig/shared/sanitize.py | 5 ++++- tests/test_mig_shared_sanitize.py | 11 +++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/mig/shared/sanitize.py b/mig/shared/sanitize.py index ab8380492..78187a02d 100644 --- a/mig/shared/sanitize.py +++ b/mig/shared/sanitize.py @@ -56,6 +56,9 @@ def _as_hexdigit(ch): UNSAFE_CHARS_HEXDIGITS = list(_as_hexdigit(c) for c in UNSAFE_CHARS) UNSAFE_SUBSTIUTIONS = dict(zip(UNSAFE_CHARS, UNSAFE_CHARS_HEXDIGITS)) +class NotAnExistingSafenameError(RuntimeError): + pass + # TODO # - swap to converting the ord char value to hex as a way to save bytes @@ -108,7 +111,7 @@ def safename_decode(value): character_substitute = _as_hexdigit('-') value_to_decode = ''.join((value[:idx + 1], character_substitute, value[idx + 2:])) except ValueError: - raise RuntimeError() + raise NotAnExistingSafenameError() chunked = value_to_decode.split(INDICATOR_CH) diff --git a/tests/test_mig_shared_sanitize.py b/tests/test_mig_shared_sanitize.py index e41b2afa2..e4c0250d3 100644 --- a/tests/test_mig_shared_sanitize.py +++ b/tests/test_mig_shared_sanitize.py @@ -7,7 +7,7 @@ sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), "."))) from support import MigTestCase, testmain -from mig.shared.sanitize import safename_encode, safename_decode +from mig.shared.sanitize import 
safename_encode, safename_decode, NotAnExistingSafenameError DUMMY_ASCII = u'abcde123467890' DUMMY_ASCII_WITH_REPLACE = "$abcde$123467890$" @@ -30,6 +30,13 @@ def test_encode_exotic(self): self.assertEqual( encoded, "UniCode123@:24::lna3a4dm6e3ftgua80ewlwka88boszo7i7iv930g") + def test_decode_a_non_safename(self): + with self.assertRaises(Exception) as asserted: + safename_decode("foobar") + + the_exception = asserted.exception + self.assertIsInstance(the_exception, NotAnExistingSafenameError) + def test_decode_basic(self): safename_decode("") @@ -59,7 +66,7 @@ def test_roundtrip_ascii(self): def main(): - testmain() + testmain(failfast=True) if __name__ == '__main__': From 5fe6092e90480a23785571ce6c8e23358113f380 Mon Sep 17 00:00:00 2001 From: Alex Burke Date: Mon, 17 Jun 2024 10:49:19 +0200 Subject: [PATCH 13/14] specifically pretty print escape given we are about to lean on it --- mig/shared/sanitize.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mig/shared/sanitize.py b/mig/shared/sanitize.py index 78187a02d..69c4d175d 100644 --- a/mig/shared/sanitize.py +++ b/mig/shared/sanitize.py @@ -141,6 +141,8 @@ def visibly_print(characters): c_ord = ord(c) if c == ' ': pieces.append("\\N{SPACE}") + elif c_ord == 27: + pieces.append("\\N{ESCAPE}") elif c == '"': pieces.append('\\"') elif c_ord < 10: From 12d04a90d4b9348e2ee04efff74da271333e0eea Mon Sep 17 00:00:00 2001 From: Alex Burke Date: Mon, 17 Jun 2024 10:57:36 +0200 Subject: [PATCH 14/14] swap to using an ascii escape (decimal 27, hex 1b) for the indicator by virtue of this not being a previously valid character, making use of it for the escape marker should allow identifying values that need converting simply by encountering of the "known" exception --- mig/shared/sanitize.py | 2 +- tests/test_mig_shared_sanitize.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mig/shared/sanitize.py b/mig/shared/sanitize.py index 69c4d175d..5950cc4c2 100644 --- a/mig/shared/sanitize.py +++ 
b/mig/shared/sanitize.py @@ -36,7 +36,7 @@ from mig.shared.defaults import username_charset -INDICATOR_CH = ':' +INDICATOR_CH = '\x1b' INVALID_INSERTION_POINT = -2 MARKER = INDICATOR_CH * 2 MARKER_HEXDIGIT_WIDTH = 2 diff --git a/tests/test_mig_shared_sanitize.py b/tests/test_mig_shared_sanitize.py index e4c0250d3..513b81c00 100644 --- a/tests/test_mig_shared_sanitize.py +++ b/tests/test_mig_shared_sanitize.py @@ -22,13 +22,13 @@ def test_encode_ascii(self): encoded = safename_encode(DUMMY_ASCII) self.assertEqual( - encoded, "abcde123467890::") + encoded, "abcde123467890\x1b\x1b") def test_encode_exotic(self): encoded = safename_encode(DUMMY_EXOTIC) self.assertEqual( - encoded, "UniCode123@:24::lna3a4dm6e3ftgua80ewlwka88boszo7i7iv930g") + encoded, "UniCode123@\x1b24\x1b\x1blna3a4dm6e3ftgua80ewlwka88boszo7i7iv930g") def test_decode_a_non_safename(self): with self.assertRaises(Exception) as asserted: @@ -41,12 +41,12 @@ def test_decode_basic(self): safename_decode("") def test_decode_ascii(self): - decoded = safename_decode("abcde123467890::") + decoded = safename_decode("abcde123467890\x1b\x1b") self.assertEqual(decoded, DUMMY_ASCII) def test_decode_exotic(self): - decoded = safename_decode("UniCode123@:24::lna3a4dm6e3ftgua80ewlwka88boszo7i7iv930g") + decoded = safename_decode("UniCode123@\x1b24\x1b\x1blna3a4dm6e3ftgua80ewlwka88boszo7i7iv930g") self.assertEqual(decoded, DUMMY_EXOTIC)