Skip to content

Commit dea84a5

Browse files
committed
SimpCfg:Cleanup, updated notes, templated code
Update the notes to match the templated flow now and some of the nitty gritties involved. Update DumpHexString to be templated. Split check nonenglish flow wrt trim dumb and oversmart testing, so that things which work with one, but not the other, can be differentiated in the flow.
1 parent 6f7f5e6 commit dea84a5

File tree

1 file changed

+59
-15
lines changed

1 file changed

+59
-15
lines changed

common/simpcfg.hpp

Lines changed: 59 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -75,25 +75,59 @@ size_t mbs_to_wcs(std::wstring &wDest, const std::string &sSrc) {
7575
return std::mbsrtowcs(wDest.data(), &sSrcP, wDest.length(), &mbState);
7676
}
7777

78-
void dumphex_string(const std::string &sIn, const std::string &msgTag){
78+
template <typename TString>
79+
void dumphex_string(const TString &sIn, const std::string &msgTag){
7980
std::cout << msgTag << "[ ";
8081
for(auto c: sIn) {
81-
std::cout << std::format("{:02x}, ", (uint8_t)c);
82+
auto cSize = sizeof(c);
83+
if (cSize == 1) {
84+
std::cout << std::format("{:02x}, ", (uint8_t)c);
85+
} else if (cSize == 2) {
86+
std::cout << std::format("{:04x}, ", (uint16_t)c);
87+
} else if (cSize == 4) {
88+
std::cout << std::format("{:08x}, ", (uint32_t)c);
89+
} else {
90+
throw std::runtime_error( std::format("ERRR:{}:Unsupported char type with size [{}]", __func__, cSize) );
91+
}
8292
}
8393
std::cout << " ]" << std::endl;
8494
}
8595

86-
// Remove chars from begin and end of the passed string, provided the char belongs
87-
// to one of the chars in trimChars.
88-
// NOTE: Chars being trimmed (ie trimChars) needs to be 1byte encoded chars.
89-
// NOTE: This will work provided the string being trimmed as well the chars being
90-
// trimmed are made up of 1byte encoded chars including in utf8 encoding space.
91-
// If the string being trimmed includes multibyte encoded characters at the end,
92-
// then trimming can mess things up.
96+
// Remove chars from begin and end of the passed string, provided the char
97+
// belongs to one of the chars in trimChars.
98+
//
99+
// NOTE: This will work perfectly provided the string being trimmed as well as
100+
// chars being trimmed are made up of FixedSize chars from the same encoded space.
101+
// For utf-8, this means the ascii equivalent 1byteSized chars of utf8 and not
102+
// variable length ones.
103+
// NOTE: It will also work, if at least either end of string has fixedSize chars
104+
// from their encoding space, rather than variable length based chars if any.
105+
// And the trimChars are also fixedSize encoded chars.
106+
//
107+
// NOTE: Given the way UTF-8 char encoding is designed, where fixedSize 1byte
108+
// encoded chars are fully unique and dont overlap with any bytes from any of
109+
// the variable length encoded chars in the utf-8 space, so as long as the
110+
// trimChars belong to the fixedSize chars subset, the logic should work, even
111+
// if the string has a mixture of fixed and variable length encoded chars.
112+
// Chances are utf-16 and utf-32 also have similar characteristics wrt their
113+
// fixedSize encoded chars, and so equivalent semantic applies to them also.
114+
//
115+
// ALERT: Given that this simple minded logic, works at individual bytes level
116+
// only, if trimChars involve variable length encoded chars, then
117+
// * because different bytes from different trim chars when clubbed together
118+
// can map to some other new char, if there is that new char at either end
119+
// of the string, it may get trimmed, because of the possibility of mix up
120+
// mentioned.
121+
// * given that different variable length encoded chars may have some common
122+
// bytes between them, if one of these chars is at either end of the string
123+
// and another char is in trimChars, then string may get partially trimmed.
124+
//
93125
template <typename TString>
94126
TString str_trim_dumb(TString sin, const TString &trimChars=" \t\n") {
127+
#ifdef SC_DEBUG
95128
dumphex_string(sin, "DBUG:TrimDumb:Str:");
96129
dumphex_string(trimChars, "DBUG:TrimDumb:Tim:");
130+
#endif
97131
sin.erase(sin.find_last_not_of(trimChars)+1);
98132
sin.erase(0, sin.find_first_not_of(trimChars));
99133
return sin;
@@ -102,8 +136,9 @@ TString str_trim_dumb(TString sin, const TString &trimChars=" \t\n") {
102136
// Remove chars from begin and end of the passed string, provided the char belongs
103137
// to one of the chars in trimChars.
104138
// NOTE: Internally converts to wchar/wstring to try and support proper trimming,
105-
// wrt possibly more languages, to some extent, ie even if the passed string
106-
// contains multibyte encoded characters in it.
139+
// wrt possibly more languages, to some extent. IE even if the passed string
140+
// contains multibyte encoded characters in it in utf-8 space, it may get converted
141+
// to fixed size chars in the expanded wchar_t encoding space.
107142
std::string str_trim_oversmart(std::string sIn, const std::string &trimChars=" \t\n") {
108143
std::wstring wIn;
109144
mbs_to_wcs(wIn, sIn);
@@ -118,11 +153,13 @@ std::string str_trim_oversmart(std::string sIn, const std::string &trimChars=" \
118153

119154
// Remove at most 1 char at the begin and 1 char at the end of the passed string,
120155
// provided the char belongs to one of the chars in trimChars.
121-
// NOTE: Chars being trimmed (ie trimChars) needs to be 1byte encoded chars.
156+
// NOTE: Chars being trimmed (ie in trimChars) needs to be 1byte encoded chars, to
157+
// avoid mix up when working with utf-8/variable length encoded strings.
122158
// NOTE: This will work provided the string being trimmed as well as the chars being
123159
// trimmed are made up of 1byte encoded chars including in utf8 encoding space.
124160
// If the string being trimmed includes multibyte encoded characters at the end,
125-
// then trimming can mess things up.
161+
// then trimming can mess things up, if you have multibyte encoded utf-8 chars
162+
// in the trimChars set.
126163
std::string str_trim_single(std::string sin, std::string trimChars=" \t\n") {
127164
if (sin.empty()) return sin;
128165
for(auto c: trimChars) {
@@ -462,11 +499,18 @@ void check_nonenglish() {
462499
std::string sGotOSmart = str_trim_oversmart(sTest, {" \n\t"});
463500
std::cout << std::format("{}: Test1[{}] Dumb[{}] OverSmart[{}]", __func__, sTest, sGotDumb, sGotOSmart) << std::endl;
464501
}
465-
std::vector<std::string> vTest2 = { "\n\t this र remove 0s at end 000 ", "\n\tthis र remove 0s and अs at end 000रअ0अ "};
502+
std::vector<std::string> vTest2 = { "\n\t this र remove 0s at end 000 ", "\n\tthis र remove 0s and अs at end 000रअ0अ ", "\n\tthis र remove 0s and अs at end 000रअ0\xa4"};
466503
for (auto sTest: vTest2) {
467504
std::string sGotDumb = str_trim_dumb(sTest, {" \n\t0अ"});
505+
std::cout << std::format("{}: Test2[{}] Dumb[{}]", __func__, sTest, sGotDumb) << std::endl;
506+
}
507+
// This partly invalid utf8 string will mess up str_trim_dumb "\n\tthis र remove 0s and अs at end 000रअ0\xa4अ "
508+
// but will trigger an exception with oversmart.
509+
// std::vector<std::string> vTest3 = { "\n\t this र remove 0s at end 000 ", "\n\tthis र remove 0s and अs at end 000रअ0अ ", "\n\tthis र remove 0s and अs at end 000रअ0\xa4अ "};
510+
std::vector<std::string> vTest3 = { "\n\t this र remove 0s at end 000 ", "\n\tthis र remove 0s and अs at end 000रअ0अ ", "\n\tthis र remove 0s and अs at end 000रअ0\xe0\xa4\x30"}; // \xe0\xa4
511+
for (auto sTest: vTest3) {
468512
std::string sGotOSmart = str_trim_oversmart(sTest, {" \n\t0अ"});
469-
std::cout << std::format("{}: Test2[{}] Dumb[{}] OverSmart[{}]", __func__, sTest, sGotDumb, sGotOSmart) << std::endl;
513+
std::cout << std::format("{}: Test3[{}] OverSmart[{}]", __func__, sTest, sGotOSmart) << std::endl;
470514
}
471515
}
472516

0 commit comments

Comments
 (0)