Skip to content

Commit 909f3b8

Browse files
authored
[feature] stdlib: strutils.multiReplace for character sets (#24805)
Multiple replacements based on character sets in a single pass. Useful for string sanitization. Follows existing `multiReplace` semantics. Note: initially copied the substring version's logic with a `while` loop and a named block break, but Godbolt showed that it produced slightly larger assembly using higher registers than the final version. - [x] Tests - [x] changelog.md
1 parent d15705e commit 909f3b8

File tree

3 files changed

+57
-4
lines changed

3 files changed

+57
-4
lines changed

changelog.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ errors.
2525
- `setutils.symmetricDifference` along with its operator version
2626
`` setutils.`-+-` `` and in-place version `setutils.toggle` have been added
2727
to more efficiently calculate the symmetric difference of bitsets.
28+
- `strutils.multiReplace` overload for character set replacements in a single pass.
29+
Useful for string sanitization. Follows existing `multiReplace` semantics.
2830

2931
[//]: # "Changes:"
3032
- `std/math` The `^` symbol now supports floating-point as exponent in addition to the Natural type.

lib/pure/strutils.nim

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2202,7 +2202,8 @@ func replace*(s, sub: string, by = ""): string {.rtl,
22022202
## * `replace func<#replace,string,char,char>`_ for replacing
22032203
## single characters
22042204
## * `replaceWord func<#replaceWord,string,string,string>`_
2205-
## * `multiReplace func<#multiReplace,string,varargs[]>`_
2205+
## * `multiReplace func<#multiReplace,string,varargs[]>`_ for substrings
2206+
## * `multiReplace func<#multiReplace,openArray[char],varargs[]>`_ for single characters
22062207
result = ""
22072208
let subLen = sub.len
22082209
if subLen == 0:
@@ -2245,7 +2246,8 @@ func replace*(s: string, sub, by: char): string {.rtl,
22452246
## See also:
22462247
## * `find func<#find,string,char,Natural,int>`_
22472248
## * `replaceWord func<#replaceWord,string,string,string>`_
2248-
## * `multiReplace func<#multiReplace,string,varargs[]>`_
2249+
## * `multiReplace func<#multiReplace,string,varargs[]>`_ for substrings
2250+
## * `multiReplace func<#multiReplace,openArray[char],varargs[]>`_ for single characters
22492251
result = newString(s.len)
22502252
var i = 0
22512253
while i < s.len:
@@ -2330,7 +2332,39 @@ func multiReplace*(s: string, replacements: varargs[(string, string)]): string =
23302332
add result, s[i]
23312333
inc(i)
23322334

2333-
2335+
func multiReplace*(s: openArray[char]; replacements: varargs[(set[char], char)]): string {.noinit.} =
  ## Returns a copy of `s` in which every character has been mapped through
  ## the `replacements` rules during one left-to-right pass.
  ##
  ## Each rule pairs a character set with a substitute character. For every
  ## input character the rules are tried in the order they were given:
  ## - the first rule whose set contains the character supplies the output
  ##   character, and the remaining rules are skipped for that position
  ## - a character matched by no rule is copied unchanged
  ##
  ## The result therefore always has the same length as `s`. Handy for
  ## sanitizing or transforming strings with predefined character mappings.
  ##
  ## See also:
  ## - `multiReplace(s: string; replacements: varargs[(string, string)]) <#multiReplace,string,varargs[]>`_
  runnableExamples:
    const WinSanitationRules = [
      ({'\0'..'\31'}, ' '),
      ({'"'}, '\''),
      ({'/', '\\', ':', '|'}, '-'),
      ({'*', '?', '<', '>'}, '_'),
    ]
    # Sanitize a filename with Windows-incompatible characters
    const file = "a/file:with?invalid*chars.txt"
    doAssert file.multiReplace(WinSanitationRules) == "a-file-with_invalid_chars.txt"
  # `newStringUninit` is not tracked as side-effect free, hence the cast.
  {.cast(noSideEffect).}:
    # Every index is written exactly once below, so the buffer can start
    # out uninitialized.
    result = newStringUninit(s.len)
    for pos, original in s:
      # Default to the input character; overwritten by the first matching rule.
      var mapped = original
      for (charset, substitute) in replacements:
        if original in charset:
          mapped = substitute
          break
      result[pos] = mapped
23342368

23352369
func insertSep*(s: string, sep = '_', digits = 3): string {.rtl,
23362370
extern: "nsuInsertSep".} =

tests/stdlib/tstrutils.nim

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -575,12 +575,29 @@ template main() =
575575
doAssert "-lda-ldz -ld abc".replaceWord("-ld") == "-lda-ldz abc"
576576
doAssert "-lda-ldz -ld abc".replaceWord("") == "-lda-ldz -ld abc"
577577

578-
block: # multiReplace
578+
block: # multiReplace substrings
579579
doAssert "abba".multiReplace(("a", "b"), ("b", "a")) == "baab"
580580
doAssert "Hello World.".multiReplace(("ello", "ELLO"), ("World.",
581581
"PEOPLE!")) == "HELLO PEOPLE!"
582582
doAssert "aaaa".multiReplace(("a", "aa"), ("aa", "bb")) == "aaaaaaaa"
583583

584+
block: # multiReplace characters
585+
# https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions
586+
const SanitationRules = [
587+
({'\0'..'\31'}, ' '),
588+
({'"'}, '\''),
589+
({'/', '\\', ':', '|'}, '-'),
590+
({'*', '?', '<', '>'}, '_'),
591+
]
592+
# Basic character set replacements
593+
doAssert multiReplace("abba", SanitationRules) == "abba"
594+
doAssert multiReplace("a/b\\c:d", SanitationRules) == "a-b-c-d"
595+
doAssert multiReplace("a*b?c", SanitationRules) == "a_b_c"
596+
doAssert multiReplace("\0\3test", SanitationRules) == " test"
597+
doAssert multiReplace("testquote\"", SanitationRules) == "testquote'"
598+
doAssert multiReplace("", SanitationRules) == ""
599+
doAssert multiReplace("/\\:*?\"\0<>", ({'\0'..'\255'}, '.')) == "........."
600+
584601
# `parseEnum`, ref issue #14030
585602
# check enum defined at top level # xxx this is probably irrelevant, and pollutes scope
586603
# for remaining tests

0 commit comments

Comments
 (0)