Skip to content

Commit 9278e00

Browse files
committed
Refactor Regex classes further
This commit changes the Regex interface rather drastically. Most importantly, the RegexMatch class now contains a list of matched groups, with group(0) being the entire match, group(1) the first capturing group, and so on. Secondly, searchAll now returns a list of RegexMatch objects instead of a reversed, flattened list of groups from all matches.
1 parent 9e4c8f0 commit 9278e00

File tree

10 files changed

+155
-140
lines changed

10 files changed

+155
-140
lines changed

src/modsecurity.cc

Lines changed: 21 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -228,9 +228,9 @@ int ModSecurity::processContentOffset(const char *content, size_t len,
228228
const unsigned char *buf;
229229
size_t jsonSize;
230230

231-
std::list<regex::RegexMatch> vars = variables.searchAll(matchString);
232-
std::list<regex::RegexMatch> ops = operators.searchAll(matchString);
233-
std::list<regex::RegexMatch> trans = transformations.searchAll(matchString);
231+
auto vars = variables.searchAll(matchString);
232+
auto ops = operators.searchAll(matchString);
233+
auto trans = transformations.searchAll(matchString);
234234

235235
g = yajl_gen_alloc(NULL);
236236
if (g == NULL) {
@@ -255,14 +255,12 @@ int ModSecurity::processContentOffset(const char *content, size_t len,
255255
strlen("highlight"));
256256

257257
yajl_gen_array_open(g);
258-
while (vars.size() > 0) {
258+
259+
for (const auto &m : vars) {
259260
std::string value;
260261
yajl_gen_map_open(g);
261-
vars.pop_back();
262-
const std::string &startingAt = vars.back().str();
263-
vars.pop_back();
264-
const std::string &size = vars.back().str();
265-
vars.pop_back();
262+
const std::string &startingAt = m.group(1).string;
263+
const std::string &size = m.group(2).string;
266264
yajl_gen_string(g,
267265
reinterpret_cast<const unsigned char*>("startingAt"),
268266
strlen("startingAt"));
@@ -298,32 +296,34 @@ int ModSecurity::processContentOffset(const char *content, size_t len,
298296
yajl_gen_map_open(g);
299297
yajl_gen_string(g, reinterpret_cast<const unsigned char*>("value"),
300298
strlen("value"));
301-
yajl_gen_string(g, reinterpret_cast<const unsigned char*>(varValue.c_str()),
299+
yajl_gen_string(g, reinterpret_cast<const unsigned char*>(varValue.data()),
302300
varValue.size());
303301
yajl_gen_map_close(g);
304302

305-
while (trans.size() > 0) {
303+
for (const auto &m : trans) {
306304
modsecurity::actions::transformations::Transformation *t;
307305
std::string varValueRes;
308306
yajl_gen_map_open(g);
309307
yajl_gen_string(g,
310308
reinterpret_cast<const unsigned char*>("transformation"),
311309
strlen("transformation"));
312310

311+
const std::string &transformation = m.group(0).string;
312+
313313
yajl_gen_string(g,
314-
reinterpret_cast<const unsigned char*>(trans.back().str().c_str()),
315-
trans.back().str().size());
314+
reinterpret_cast<const unsigned char*>(transformation.data()),
315+
transformation.size());
316316

317317
t = modsecurity::actions::transformations::Transformation::instantiate(
318-
trans.back().str().c_str());
318+
transformation.c_str());
319319
varValueRes = t->evaluate(varValue, NULL);
320320
varValue.assign(varValueRes);
321-
trans.pop_back();
321+
322322

323323
yajl_gen_string(g, reinterpret_cast<const unsigned char*>("value"),
324324
strlen("value"));
325325
yajl_gen_string(g, reinterpret_cast<const unsigned char*>(
326-
varValue.c_str()),
326+
varValue.data()),
327327
varValue.size());
328328
yajl_gen_map_close(g);
329329

@@ -337,26 +337,23 @@ int ModSecurity::processContentOffset(const char *content, size_t len,
337337

338338
yajl_gen_map_open(g);
339339

340-
while (ops.size() > 0) {
340+
for (const auto &m : ops) {
341341
std::string value;
342342
yajl_gen_string(g, reinterpret_cast<const unsigned char*>("highlight"),
343343
strlen("highlight"));
344344
yajl_gen_map_open(g);
345-
ops.pop_back();
346-
std::string startingAt = ops.back().str();
347-
ops.pop_back();
348-
std::string size = ops.back().str();
349-
ops.pop_back();
345+
const std::string &startingAt = m.group(1).string;
346+
const std::string &size = m.group(2).string;
350347
yajl_gen_string(g,
351348
reinterpret_cast<const unsigned char*>("startingAt"),
352349
strlen("startingAt"));
353350
yajl_gen_string(g,
354-
reinterpret_cast<const unsigned char*>(startingAt.c_str()),
351+
reinterpret_cast<const unsigned char*>(startingAt.data()),
355352
startingAt.size());
356353
yajl_gen_string(g, reinterpret_cast<const unsigned char*>("size"),
357354
strlen("size"));
358355
yajl_gen_string(g,
359-
reinterpret_cast<const unsigned char*>(size.c_str()),
356+
reinterpret_cast<const unsigned char*>(size.data()),
360357
size.size());
361358
yajl_gen_map_close(g);
362359

src/operators/rx.cc

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ bool Rx::init(const std::string &arg, std::string *error) {
3838

3939
bool Rx::evaluate(Transaction *transaction, Rule *rule,
4040
const std::string& input, std::shared_ptr<RuleMessage> ruleMessage) {
41-
std::list<RegexMatch> matches;
4241
Regex *re;
4342

4443
if (m_param.empty() && !m_string->m_containsMacro) {
@@ -52,33 +51,30 @@ bool Rx::evaluate(Transaction *transaction, Rule *rule,
5251
re = m_re;
5352
}
5453

55-
matches = re->searchAll(input);
56-
if (rule && rule->m_containsCaptureAction && transaction) {
57-
int i = 0;
58-
matches.reverse();
59-
for (const RegexMatch& a : matches) {
54+
regex::RegexMatch m;
55+
bool matched = re->search(input, &m, 9);
56+
57+
if (matched && rule && rule->m_containsCaptureAction && transaction) {
58+
for (int i = 0; i < m.num_groups(); i++) {
59+
auto key = std::to_string(i);
60+
const std::string &value = m.group(i).string;
6061
transaction->m_collections.m_tx_collection->storeOrUpdateFirst(
61-
std::to_string(i), a.str());
62+
key, value);
6263
ms_dbg_a(transaction, 7, "Added regex subexpression TX." +
63-
std::to_string(i) + ": " + a.str());
64-
transaction->m_matched.push_back(a.str());
65-
i++;
64+
key + ": " + value);
65+
transaction->m_matched.push_back(value);
6666
}
6767
}
68-
69-
for (const auto & i : matches) {
70-
logOffset(ruleMessage, i.offset(), i.str().size());
68+
for (int i = 0; i < m.num_groups(); i++) {
69+
const regex::MatchGroup &g = m.group(i);
70+
logOffset(ruleMessage, g.offset, g.string.size());
7171
}
7272

7373
if (m_string->m_containsMacro) {
7474
delete re;
7575
}
7676

77-
if (matches.size() > 0) {
78-
return true;
79-
}
80-
81-
return false;
77+
return matched;
8278
}
8379

8480

src/operators/verify_cpf.cc

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,6 @@ bool VerifyCPF::verify(const char *cpfnumber, int len) {
119119

120120
bool VerifyCPF::evaluate(Transaction *t, Rule *rule,
121121
const std::string& input, std::shared_ptr<RuleMessage> ruleMessage) {
122-
std::list<RegexMatch> matches;
123122
bool is_cpf = false;
124123
int i;
125124

@@ -128,18 +127,17 @@ bool VerifyCPF::evaluate(Transaction *t, Rule *rule,
128127
}
129128

130129
for (i = 0; i < input.size() - 1 && is_cpf == false; i++) {
131-
matches = m_re->searchAll(input.substr(i, input.size()));
132-
for (const auto & i : matches) {
133-
is_cpf = verify(i.str().c_str(), i.str().size());
130+
auto matches = m_re->searchAll(input.substr(i, input.size()));
131+
for (const auto &m : matches) {
132+
const regex::MatchGroup &g = m.group(0);
133+
is_cpf = verify(g.string.data(), g.string.size());
134134
if (is_cpf) {
135-
logOffset(ruleMessage, i.offset(), i.str().size());
135+
logOffset(ruleMessage, g.offset, g.string.size());
136136
if (rule && t && rule->m_containsCaptureAction) {
137137
t->m_collections.m_tx_collection->storeOrUpdateFirst(
138-
"0", i.str());
139-
ms_dbg_a(t, 7, "Added VerifyCPF match TX.0: " + \
140-
i.str());
138+
"0", g.string);
139+
ms_dbg_a(t, 7, "Added VerifyCPF match TX.0: " + g.string);
141140
}
142-
143141
goto out;
144142
}
145143
}

src/operators/verify_ssn.cc

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,6 @@ bool VerifySSN::verify(const char *ssnumber, int len) {
110110

111111
bool VerifySSN::evaluate(Transaction *t, Rule *rule,
112112
const std::string& input, std::shared_ptr<RuleMessage> ruleMessage) {
113-
std::list<RegexMatch> matches;
114113
bool is_ssn = false;
115114
int i;
116115

@@ -119,18 +118,17 @@ bool VerifySSN::evaluate(Transaction *t, Rule *rule,
119118
}
120119

121120
for (i = 0; i < input.size() - 1 && is_ssn == false; i++) {
122-
matches = m_re->searchAll(input.substr(i, input.size()));
123-
for (const auto & i : matches) {
124-
is_ssn = verify(i.str().c_str(), i.str().size());
121+
auto matches = m_re->searchAll(input.substr(i, input.size()));
122+
for (const auto &m : matches) {
123+
const regex::MatchGroup &g = m.group(0);
124+
is_ssn = verify(g.string.data(), g.string.size());
125125
if (is_ssn) {
126-
logOffset(ruleMessage, i.offset(), i.str().size());
126+
logOffset(ruleMessage, g.offset, g.string.size());
127127
if (rule && t && rule->m_containsCaptureAction) {
128128
t->m_collections.m_tx_collection->storeOrUpdateFirst(
129-
"0", i.str());
130-
ms_dbg_a(t, 7, "Added VerifySSN match TX.0: " + \
131-
i.str());
129+
"0", g.string);
130+
ms_dbg_a(t, 7, "Added VerifySSN match TX.0: " + g.string);
132131
}
133-
134132
goto out;
135133
}
136134
}

src/regex/backend/pcre.cc

Lines changed: 62 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ Pcre::Pcre(const std::string& pattern_)
4545
&errptr, &erroffset, NULL);
4646

4747
m_pce = pcre_study(m_pc, pcre_study_opt, &errptr);
48+
49+
pcre_fullinfo(m_pc, m_pce, PCRE_INFO_CAPTURECOUNT, &m_capture_count);
4850
}
4951

5052

@@ -63,64 +65,79 @@ Pcre::~Pcre() {
6365
}
6466
}
6567

68+
static bool do_match(
69+
pcre *pc,
70+
pcre_extra *pce,
71+
int pcre_capture_count,
72+
const char *s,
73+
size_t n,
74+
RegexMatch *m,
75+
ssize_t max_groups,
76+
size_t offset)
77+
{
78+
if (m == nullptr) {
79+
max_groups = 0;
80+
}
6681

67-
std::list<RegexMatch> Pcre::searchAll(const std::string& s) const {
68-
const char *subject = s.c_str();
69-
const std::string tmpString = std::string(s.c_str(), s.size());
70-
int ovector[OVECCOUNT];
71-
int rc, i, offset = 0;
72-
std::list<RegexMatch> retList;
73-
74-
do {
75-
rc = pcre_exec(m_pc, m_pce, subject,
76-
s.size(), offset, 0, ovector, OVECCOUNT);
77-
78-
for (i = 0; i < rc; i++) {
79-
size_t start = ovector[2*i];
80-
size_t end = ovector[2*i+1];
81-
size_t len = end - start;
82-
if (end > s.size()) {
83-
rc = 0;
84-
break;
85-
}
86-
std::string match = std::string(tmpString, start, len);
87-
offset = start + len;
88-
retList.push_front(RegexMatch(match, start));
82+
// "+1" is required for full match (aka group 0)
83+
int ovecsize = (pcre_capture_count+1) * 3;
84+
int ovector[ovecsize];
85+
int ret = pcre_exec(pc, pce, s, n, offset, 0, ovector, ovecsize);
8986

90-
if (len == 0) {
91-
rc = 0;
92-
break;
93-
}
87+
if (ret > 0) {
88+
if (max_groups < 0) {
89+
max_groups = ret;
9490
}
95-
} while (rc > 0);
96-
97-
return retList;
98-
}
99-
10091

101-
int Pcre::search(const std::string& s, RegexMatch *match) const {
102-
int ovector[OVECCOUNT];
103-
int ret = pcre_exec(m_pc, m_pce, s.c_str(),
104-
s.size(), 0, 0, ovector, OVECCOUNT) > 0;
92+
if (max_groups > 0) {
93+
size_t ngroups = std::min<size_t>(max_groups, ret);
94+
RegexMatch::MatchGroupContainer groups;
95+
groups.reserve(ngroups);
96+
for (size_t i = 0; i < ngroups; i++) {
97+
size_t start = ovector[2*i];
98+
size_t end = ovector[2*i+1];
99+
std::string group(s + start, end - start);
105100

106-
if (ret > 0) {
107-
*match = RegexMatch(
108-
std::string(s, ovector[ret-1], ovector[ret] - ovector[ret-1]),
109-
0);
101+
groups.push_back(MatchGroup{start, std::move(group)});
102+
}
103+
*m = RegexMatch(std::move(groups));
104+
}
105+
return true;
110106
}
107+
return false;
111108

112-
return ret;
113109
}
114110

111+
std::vector<RegexMatch> Pcre::searchAll(const std::string& s, bool overlapping) const {
112+
std::vector<RegexMatch> res;
113+
size_t offset = 0;
114+
115+
while (1) {
116+
RegexMatch m;
117+
bool match = do_match(m_pc, m_pce, m_capture_count, s.data(), s.size(), &m, -1, offset);
118+
if (!match) break;
119+
120+
if (overlapping) {
121+
// start just after the beginning of the last match
122+
offset = m.group(0).offset + 1;
123+
} else {
124+
// start just at the end of the last match
125+
offset = m.group(0).offset + m.group(0).string.size();
126+
if (offset == m.group(0).offset) {
127+
// empty match - advance by one to not match empty string repeatedly
128+
offset++;
129+
}
130+
}
131+
res.push_back(std::move(m));
132+
}
133+
return res;
134+
}
115135

116-
int Pcre::search(const std::string& s) const {
117-
int ovector[OVECCOUNT];
118-
return pcre_exec(m_pc, m_pce, s.c_str(),
119-
s.size(), 0, 0, ovector, OVECCOUNT) > 0;
136+
bool Pcre::search(const std::string &s, RegexMatch *m, ssize_t max_groups) const {
137+
return do_match(m_pc, m_pce, m_capture_count, s.data(), s.size(), m, max_groups, 0);
120138
}
121139

122140

123141
} // namespace backend
124142
} // namespace regex
125143
} // namespace modsecurity
126-

src/regex/backend/pcre.h

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,6 @@ namespace modsecurity {
2929
namespace regex {
3030
namespace backend {
3131

32-
33-
#define OVECCOUNT 900
34-
35-
3632
class Pcre {
3733
public:
3834
explicit Pcre(const std::string& pattern_);
@@ -42,12 +38,17 @@ class Pcre {
4238
Pcre(const Pcre&) = delete;
4339
Pcre& operator=(const Pcre&) = delete;
4440

45-
std::list<RegexMatch> searchAll(const std::string& s) const;
46-
int search(const std::string &s, RegexMatch *m) const;
47-
int search(const std::string &s) const;
41+
std::vector<RegexMatch> searchAll(const std::string& s, bool overlapping = false) const;
42+
bool search(const std::string &s, RegexMatch *m = nullptr, ssize_t max_groups = -1) const;
4843

49-
const std::string pattern;
44+
const std::string& getPattern() const {
45+
return pattern;
46+
};
5047
private:
48+
const std::string pattern;
49+
50+
int m_capture_count;
51+
5152
pcre *m_pc = NULL;
5253
pcre_extra *m_pce = NULL;
5354
};

0 commit comments

Comments
 (0)