@@ -75,25 +75,59 @@ size_t mbs_to_wcs(std::wstring &wDest, const std::string &sSrc) {
75
75
return std::mbsrtowcs (wDest.data (), &sSrcP , wDest.length (), &mbState);
76
76
}
77
77
78
// Dump every code unit of the passed string to stdout as zero padded lower
// case hex, prefixed by msgTag and wrapped in square brackets.
//
// TString: any iterable string like type (e.g. std::string, std::wstring),
//          whose code units are 1, 2 or 4 bytes wide.
// sIn    : the string whose code units get dumped.
// msgTag : text printed before the hex dump.
//
// Throws std::runtime_error, if sIn is not empty and its code unit size is
// not one of 1, 2 or 4 bytes.
template <typename TString>
void dumphex_string(const TString &sIn, const std::string &msgTag) {
    std::cout << msgTag << " [ ";
    for (const auto c : sIn) {
        // The code unit width is fixed per TString, so pick the matching
        // format at compile time rather than re-testing it for every char.
        constexpr auto cSize = sizeof(c);
        if constexpr (cSize == 1) {
            // Go through the unsigned type so that a negative (signed) char
            // prints as its raw byte value and not as a sign extended number.
            std::cout << std::format(" {:02x}, ", static_cast<uint8_t>(c));
        } else if constexpr (cSize == 2) {
            std::cout << std::format(" {:04x}, ", static_cast<uint16_t>(c));
        } else if constexpr (cSize == 4) {
            std::cout << std::format(" {:08x}, ", static_cast<uint32_t>(c));
        } else {
            // Kept as a runtime error (and not a static_assert), so that an
            // empty string of an unsupported char type still dumps as before.
            throw std::runtime_error(std::format(" ERRR:{}:Unsupported char type with size [{}]", __func__, cSize));
        }
    }
    // std::endl flushes, so the dump shows up immediately when debugging.
    std::cout << " ]" << std::endl;
}
85
95
86
- // Remove chars from begin and end of the passed string, provided the char belongs
87
- // to one of the chars in trimChars.
88
- // NOTE: Chars being trimmed (ie trimChars) needs to be 1byte encoded chars.
89
- // NOTE: This will work provided the string being trimmed as well the chars being
90
- // trimmed are made up of 1byte encoded chars including in utf8 encoding space.
91
- // If the string being trimmed includes multibyte encoded characters at the end,
92
- // then trimming can mess things up.
96
+ // Remove chars from begin and end of the passed string, provided the char
97
+ // belongs to one of the chars in trimChars.
98
+ //
99
+ // NOTE: This will work perfectly provided the string being trimmed as well as
100
+ // chars being trimmed are made up of FixedSize chars from the same encoded space.
101
+ // For utf-8, this means the ascii equivalent 1byteSized chars of utf8 and not
102
+ // variable length ones.
103
+ // NOTE: It will also work, if at least either end of string have fixedSize chars
104
+ // from their encoding space, rather than variable length based chars if any.
105
+ // And the trimChars are also fixedSize encoded chars.
106
+ //
107
+ // NOTE: Given the way UTF-8 char encoding is designed, where fixedSize 1byte
108
+ // encoded chars are fully unique and dont overlap with any bytes from any of
109
+ // the variable length encoded chars in the utf-8 space, so as long as the
110
+ // trimChars belong to the fixedSize chars subset, the logic should work, even
111
+ // if the string has a mixture of fixed and variable length encoded chars.
112
+ // Chances are utf-16 and utf-32 also have similar characteristics wrt their
113
+ // fixedSize encoded chars, and so equivalent semantic applies to them also.
114
+ //
115
+ // ALERT: Given that this simple-minded logic works at the individual byte level
115
+ // only, if trimChars involve variable length encoded chars, then
117
+ // * because different bytes from different trim chars when clubbed together
118
+ // can map to some other new char, if there is that new char at either end
119
+ // of the string, it may get trimmed, because of the possibility of mix up
120
+ // mentioned.
121
+ // * given that different variable length encoded chars may have some common
122
+ // bytes between them, if one of these chars is at either end of the string
123
+ // and another char is in trimChars, then string may get partially trimmed.
124
+ //
93
125
template <typename TString>
94
126
TString str_trim_dumb (TString sin, const TString &trimChars=" \t\n " ) {
127
+ #ifdef SC_DEBUG
95
128
dumphex_string (sin, " DBUG:TrimDumb:Str:" );
96
129
dumphex_string (trimChars, " DBUG:TrimDumb:Tim:" );
130
+ #endif
97
131
sin.erase (sin.find_last_not_of (trimChars)+1 );
98
132
sin.erase (0 , sin.find_first_not_of (trimChars));
99
133
return sin;
@@ -102,8 +136,9 @@ TString str_trim_dumb(TString sin, const TString &trimChars=" \t\n") {
102
136
// Remove chars from begin and end of the passed string, provided the char belongs
103
137
// to one of the chars in trimChars.
104
138
// NOTE: Internally converts to wchar/wstring to try and support proper trimming,
105
- // wrt possibly more languages, to some extent, ie even if the passed string
106
- // contains multibyte encoded characters in it.
139
+ // wrt possibly more languages, to some extent, i.e. even if the passed string
140
+ // contains multibyte encoded characters in it in utf-8 space, it may get converted
141
+ // to fixed size chars in the expanded wchar_t encoding space.
107
142
std::string str_trim_oversmart (std::string sIn , const std::string &trimChars=" \t\n " ) {
108
143
std::wstring wIn;
109
144
mbs_to_wcs (wIn, sIn );
@@ -118,11 +153,13 @@ std::string str_trim_oversmart(std::string sIn, const std::string &trimChars=" \
118
153
119
154
// Remove at most 1 char at the beginning and 1 char at the end of the passed string,
120
155
// provided the char belongs to one of the chars in trimChars.
121
- // NOTE: Chars being trimmed (ie trimChars) needs to be 1byte encoded chars.
156
+ // NOTE: Chars being trimmed (ie in trimChars) need to be 1byte encoded chars, to
157
+ // avoid mix up when working with utf-8/variable length encoded strings.
122
158
// NOTE: This will work provided the string being trimmed as well as the chars being
123
159
// trimmed are made up of 1byte encoded chars including in utf8 encoding space.
124
160
// If the string being trimmed includes multibyte encoded characters at the end,
125
- // then trimming can mess things up.
161
+ // then trimming can mess things up, if you have multibyte encoded utf-8 chars
162
+ // in the trimChars set.
126
163
std::string str_trim_single (std::string sin, std::string trimChars=" \t\n " ) {
127
164
if (sin.empty ()) return sin;
128
165
for (auto c: trimChars) {
@@ -462,11 +499,18 @@ void check_nonenglish() {
462
499
std::string sGotOSmart = str_trim_oversmart (sTest , {" \n\t " });
463
500
std::cout << std::format (" {}: Test1[{}] Dumb[{}] OverSmart[{}]" , __func__, sTest , sGotDumb , sGotOSmart ) << std::endl;
464
501
}
465
- std::vector<std::string> vTest2 = { " \n\t this र remove 0s at end 000 " , " \n\t this र remove 0s and अs at end 000रअ0अ " };
502
+ std::vector<std::string> vTest2 = { " \n\t this र remove 0s at end 000 " , " \n\t this र remove 0s and अs at end 000रअ0अ " , " \n\t this र remove 0s and अs at end 000रअ0 \xa4 अ " };
466
503
for (auto sTest : vTest2) {
467
504
std::string sGotDumb = str_trim_dumb (sTest , {" \n\t 0अ" });
505
+ std::cout << std::format (" {}: Test2[{}] Dumb[{}]" , __func__, sTest , sGotDumb ) << std::endl;
506
+ }
507
+ // This partly invalid utf8 string will mess up str_trim_dumb "\n\tthis र remove 0s and अs at end 000रअ0\xa4अ "
508
+ // but will trigger an exception with oversmart.
509
+ // std::vector<std::string> vTest3 = { "\n\t this र remove 0s at end 000 ", "\n\tthis र remove 0s and अs at end 000रअ0अ ", "\n\tthis र remove 0s and अs at end 000रअ0\xa4अ "};
510
+ std::vector<std::string> vTest3 = { " \n\t this र remove 0s at end 000 " , " \n\t this र remove 0s and अs at end 000रअ0अ " , " \n\t this र remove 0s and अs at end 000रअ0\xe0\xa4\x30 अ " }; // \xe0\xa4
511
+ for (auto sTest : vTest3) {
468
512
std::string sGotOSmart = str_trim_oversmart (sTest , {" \n\t 0अ" });
469
- std::cout << std::format (" {}: Test2 [{}] Dumb[{}] OverSmart[{}]" , __func__, sTest , sGotDumb , sGotOSmart ) << std::endl;
513
+ std::cout << std::format (" {}: Test3 [{}] OverSmart[{}]" , __func__, sTest , sGotOSmart ) << std::endl;
470
514
}
471
515
}
472
516
0 commit comments