Skip to content

Commit 199f247

Browse files
authored
Merge pull request #20024 from geoffw0/moresensitive2
Shared: Improve sensitive data heuristics
2 parents 5f8c457 + 68f0dfe commit 199f247

File tree

8 files changed

+107
-78
lines changed

8 files changed

+107
-78
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* The regular expressions in `SensitiveDataHeuristics.qll` have been extended to find more instances of sensitive data, such as authentication secrets, financial and health information, and device data. The heuristics have also been refined to produce fewer false positive matches. This will improve results for queries related to sensitive information.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* The regular expressions in `SensitiveDataHeuristics.qll` have been extended to find more instances of sensitive data, such as authentication secrets, financial and health information, and device data. The heuristics have also been refined to produce fewer false positive matches. This will improve results for queries related to sensitive information.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* The regular expressions in `SensitiveDataHeuristics.qll` have been extended to find more instances of sensitive data, such as authentication secrets, financial and health information, and device data. The heuristics have also been refined to produce fewer false positive matches. This will improve results for queries related to sensitive information.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* The regular expressions in `SensitiveDataHeuristics.qll` have been extended to find more instances of sensitive data, such as authentication secrets, financial and health information, and device data. The heuristics have also been refined to produce fewer false positive matches. This will improve results for queries related to sensitive information.
Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,29 @@
11
multipleCallTargets
2-
| test.rs:55:7:55:26 | ... .as_str() |
3-
| test.rs:56:7:56:21 | ... .as_str() |
4-
| test.rs:72:7:72:26 | ... .as_str() |
5-
| test.rs:73:7:73:36 | ... .as_str() |
6-
| test.rs:74:7:74:34 | ... .as_str() |
7-
| test.rs:75:7:75:27 | ... .as_str() |
8-
| test.rs:258:7:258:36 | ... .as_str() |
9-
| test.rs:260:7:260:33 | ... .as_str() |
10-
| test.rs:261:7:261:36 | ... .as_str() |
11-
| test.rs:262:7:262:26 | ... .as_str() |
12-
| test.rs:266:7:266:28 | ... .as_str() |
13-
| test.rs:267:7:267:37 | ... .as_str() |
14-
| test.rs:268:7:268:36 | ... .as_str() |
15-
| test.rs:271:7:271:32 | ... .as_str() |
16-
| test.rs:281:7:281:34 | ... .as_str() |
17-
| test.rs:284:7:284:36 | ... .as_str() |
18-
| test.rs:288:7:288:39 | ... .as_str() |
19-
| test.rs:295:7:295:53 | ... .as_str() |
20-
| test.rs:296:7:296:45 | ... .as_str() |
21-
| test.rs:298:7:298:39 | ... .as_str() |
22-
| test.rs:299:7:299:34 | ... .as_str() |
23-
| test.rs:300:7:300:42 | ... .as_str() |
24-
| test.rs:302:7:302:48 | ... .as_str() |
25-
| test.rs:303:7:303:35 | ... .as_str() |
26-
| test.rs:304:7:304:35 | ... .as_str() |
27-
| test.rs:313:8:313:19 | num.as_str() |
28-
| test.rs:324:8:324:19 | num.as_str() |
29-
| test.rs:343:7:343:39 | ... .as_str() |
2+
| test.rs:56:7:56:26 | ... .as_str() |
3+
| test.rs:57:7:57:21 | ... .as_str() |
4+
| test.rs:73:7:73:26 | ... .as_str() |
5+
| test.rs:74:7:74:36 | ... .as_str() |
6+
| test.rs:75:7:75:34 | ... .as_str() |
7+
| test.rs:76:7:76:27 | ... .as_str() |
8+
| test.rs:262:7:262:36 | ... .as_str() |
9+
| test.rs:264:7:264:33 | ... .as_str() |
10+
| test.rs:265:7:265:36 | ... .as_str() |
11+
| test.rs:266:7:266:26 | ... .as_str() |
12+
| test.rs:270:7:270:28 | ... .as_str() |
13+
| test.rs:271:7:271:37 | ... .as_str() |
14+
| test.rs:272:7:272:36 | ... .as_str() |
15+
| test.rs:275:7:275:32 | ... .as_str() |
16+
| test.rs:285:7:285:34 | ... .as_str() |
17+
| test.rs:288:7:288:36 | ... .as_str() |
18+
| test.rs:292:7:292:39 | ... .as_str() |
19+
| test.rs:299:7:299:53 | ... .as_str() |
20+
| test.rs:300:7:300:45 | ... .as_str() |
21+
| test.rs:302:7:302:39 | ... .as_str() |
22+
| test.rs:303:7:303:34 | ... .as_str() |
23+
| test.rs:304:7:304:42 | ... .as_str() |
24+
| test.rs:306:7:306:48 | ... .as_str() |
25+
| test.rs:307:7:307:35 | ... .as_str() |
26+
| test.rs:308:7:308:35 | ... .as_str() |
27+
| test.rs:317:8:317:19 | num.as_str() |
28+
| test.rs:328:8:328:19 | num.as_str() |
29+
| test.rs:347:7:347:39 | ... .as_str() |

rust/ql/test/library-tests/sensitivedata/test.rs

Lines changed: 42 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ impl MyStruct {
2323
fn get_password() -> String { get_string() }
2424

2525
fn test_passwords(
26-
password: &str, pass_word: &str, passwd: &str, my_password: &str, password_str: &str,
26+
password: &str, pass_word: &str, passwd: &str, my_password: &str, password_str: &str, password_confirmation: &str,
2727
pass_phrase: &str, passphrase: &str, passPhrase: &str, backup_code: &str,
2828
auth_key: &str, authkey: &str, authKey: &str, authentication_key: &str, authenticationkey: &str, authenticationKey: &str, oauth: &str,
2929
one_time_code: &str,
@@ -37,6 +37,7 @@ fn test_passwords(
3737
sink(passwd); // $ sensitive=password
3838
sink(my_password); // $ sensitive=password
3939
sink(password_str); // $ sensitive=password
40+
sink(password_confirmation); // $ sensitive=password
4041
sink(pass_phrase); // $ sensitive=password
4142
sink(passphrase); // $ sensitive=password
4243
sink(passPhrase); // $ sensitive=password
@@ -48,12 +49,12 @@ fn test_passwords(
4849
sink(authentication_key); // $ sensitive=password
4950
sink(authenticationkey); // $ sensitive=password
5051
sink(authenticationKey); // $ sensitive=password
51-
sink(oauth); // $ MISSING: sensitive=password
52+
sink(oauth); // $ sensitive=password
5253
sink(one_time_code); // $ MISSING: sensitive=password
5354

5455
sink(ms); // $ MISSING: sensitive=password
5556
sink(ms.password.as_str()); // $ sensitive=password
56-
sink(ms.mfa.as_str()); // $ MISSING: sensitive=password
57+
sink(ms.mfa.as_str()); // $ sensitive=password
5758

5859
sink(get_password()); // $ sensitive=password
5960
let password2 = get_string();
@@ -67,10 +68,10 @@ fn test_passwords(
6768
sink(harmless);
6869
sink(encrypted_password);
6970
sink(password_hash);
70-
sink(passwordFile); // $ SPURIOUS: sensitive=password
71+
sink(passwordFile);
7172

7273
sink(ms.harmless.as_str());
73-
sink(ms.password_file_path.as_str()); // $ SPURIOUS: sensitive=password
74+
sink(ms.password_file_path.as_str());
7475
sink(ms.password_enabled.as_str()); // $ SPURIOUS: sensitive=password
7576
sink(ms.numfailed.as_str());
7677

@@ -127,11 +128,11 @@ fn test_credentials(
127128

128129
sink(hashkey);
129130
sink(hash_key);
130-
sink(sessionkeypath); // $ SPURIOUS: sensitive=id
131-
sink(account_key_path); // $ SPURIOUS: sensitive=id
131+
sink(sessionkeypath);
132+
sink(account_key_path);
132133

133-
sink(ms.get_certificate_url()); // $ SPURIOUS: sensitive=certificate
134-
sink(ms.get_certificate_file()); // $ SPURIOUS: sensitive=certificate
134+
sink(ms.get_certificate_url());
135+
sink(ms.get_certificate_file());
135136

136137
sink(get_public_key());
137138
sink(get_next_token());
@@ -160,16 +161,19 @@ impl DeviceInfo {
160161
fn test_device_info(&self, other: &DeviceInfo) {
161162
// private device info
162163

163-
sink(&self.api_key); // $ MISSING: sensitive=id
164-
sink(&other.api_key); // $ MISSING: sensitive=id
165-
sink(&self.deviceApiToken); // $ MISSING: sensitive=id
166-
sink(&self.finger_print); // $ MISSING: sensitive=id
167-
sink(&self.ip_address); // $ MISSING: sensitive=id
168-
sink(self.macaddr12); // $ MISSING: sensitive=id
169-
sink(&self.mac_addr); // $ MISSING: sensitive=id
170-
sink(self.mac_addr.values); // $ MISSING: sensitive=id
171-
sink(self.mac_addr.values[0]); // $ MISSING: sensitive=id
172-
sink(&self.networkMacAddress); // $ MISSING: sensitive=id
164+
sink(&self.api_key); // $ sensitive=password
165+
sink(&other.api_key); // $ sensitive=password
166+
sink(&self.deviceApiToken); // $ sensitive=password
167+
sink(self.macaddr12); // $ sensitive=private
168+
sink(&self.mac_addr); // $ sensitive=private
169+
sink(self.mac_addr.values); // $ sensitive=private
170+
sink(self.mac_addr.values[0]); // $ sensitive=private
171+
sink(&self.networkMacAddress); // $ sensitive=private
172+
173+
// dubious (may or may not be private device info, depending on context)
174+
175+
sink(&self.finger_print);
176+
sink(&self.ip_address);
173177

174178
// not private device info
175179

@@ -267,26 +271,26 @@ fn test_private_info(
267271
sink(info.emergency_contact.as_str()); // $ sensitive=private
268272
sink(info.name_of_employer.as_str()); // $ sensitive=private
269273

270-
sink(&info.gender); // $ MISSING: sensitive=private
271-
sink(info.genderString.as_str()); // $ MISSING: sensitive=private
274+
sink(&info.gender); // $ sensitive=private
275+
sink(info.genderString.as_str()); // $ sensitive=private
272276
let sex = "Male";
273277
let gender = Gender::Female;
274278
let a = Gender::Female;
275-
sink(sex); // $ MISSING: sensitive=private
276-
sink(gender); // $ MISSING: sensitive=private
279+
sink(sex); // $ sensitive=private
280+
sink(gender); // $ sensitive=private
277281
sink(a); // $ MISSING: sensitive=private
278282

279-
sink(info.patient_id); // $ MISSING: sensitive=private
280-
sink(info.linkedPatientId); // $ MISSING: sensitive=private
281-
sink(info.patient_record.as_str()); // $ MISSING: sensitive=private
282-
sink(info.patient_record.trim()); // $ MISSING: sensitive=private
283+
sink(info.patient_id); // $ sensitive=private
284+
sink(info.linkedPatientId); // $ sensitive=private
285+
sink(info.patient_record.as_str()); // $ sensitive=private
286+
sink(info.patient_record.trim()); // $ sensitive=private
283287
sink(&info.medical_notes); // $ sensitive=private
284288
sink(info.medical_notes[0].as_str()); // $ sensitive=private
285289
for n in info.medical_notes.iter() {
286290
sink(n.as_str()); // $ MISSING: sensitive=private
287291
}
288-
sink(info.confidentialMessage.as_str()); // $ MISSING: sensitive=private
289-
sink(info.confidentialMessage.to_lowercase()); // $ MISSING: sensitive=private
292+
sink(info.confidentialMessage.as_str()); // $ sensitive=secret
293+
sink(info.confidentialMessage.to_lowercase()); // $ sensitive=secret
290294

291295
sink(info.latitude); // $ sensitive=private
292296
let x = info.longitude.unwrap();
@@ -296,12 +300,12 @@ fn test_private_info(
296300
sink(info.financials.credit_card_no.as_str()); // $ sensitive=private
297301
sink(info.financials.credit_rating); // $ sensitive=private
298302
sink(info.financials.user_ccn.as_str()); // $ sensitive=private
299-
sink(info.financials.cvv.as_str()); // $ MISSING: sensitive=private
300-
sink(info.financials.beneficiary.as_str()); // $ MISSING: sensitive=private
301-
sink(info.financials.routing_number); // $ MISSING: sensitive=private
302-
sink(info.financials.routingNumberText.as_str()); // $ MISSING: sensitive=private
303-
sink(info.financials.iban.as_str()); // $ MISSING: sensitive=private
304-
sink(info.financials.iBAN.as_str()); // $ MISSING: sensitive=private
303+
sink(info.financials.cvv.as_str()); // $ sensitive=private
304+
sink(info.financials.beneficiary.as_str()); // $ sensitive=private
305+
sink(info.financials.routing_number); // $ sensitive=private
306+
sink(info.financials.routingNumberText.as_str()); // $ sensitive=private
307+
sink(info.financials.iban.as_str()); // $ sensitive=private
308+
sink(info.financials.iBAN.as_str()); // $ sensitive=private
305309

306310
sink(ContactDetails::HomePhoneNumber("123".to_string())); // $ sensitive=private
307311
sink(ContactDetails::MobileNumber("123".to_string())); // $ sensitive=private
@@ -343,8 +347,8 @@ fn test_private_info(
343347
sink(info.financials.harmless.as_str());
344348
sink(info.financials.num_accounts); // $ SPURIOUS: sensitive=id
345349
sink(info.financials.total_accounts); // $ SPURIOUS: sensitive=id
346-
sink(info.financials.accounting); // $ SPURIOUS: sensitive=id
347-
sink(info.financials.unaccounted); // $ SPURIOUS: sensitive=id
350+
sink(info.financials.accounting);
351+
sink(info.financials.unaccounted);
348352
sink(info.financials.multiband);
349353

350354
sink(ContactDetails::FavouriteColor("blue".to_string()));
@@ -362,5 +366,5 @@ impl MyArray {
362366

363367
fn test_iterator() {
364368
let iter = std::iter::repeat(1).take(10);
365-
sink(MyArray::from_trusted_iterator(iter)); // $ SPURIOUS: sensitive=secret
369+
sink(MyArray::from_trusted_iterator(iter));
366370
}

shared/concepts/codeql/concepts/internal/SensitiveDataHeuristics.qll

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -56,15 +56,16 @@ module HeuristicNames {
5656
* Gets a regular expression that identifies strings that may indicate the presence of secret
5757
* or trusted data.
5858
*/
59-
string maybeSecret() { result = "(?is).*((?<!is|is_)secret|(?<!un|un_|is|is_)trusted).*" }
59+
string maybeSecret() {
60+
result = "(?is).*((?<!is|is_)secret|(?<!un|un_|is|is_)trusted(?!_iter)|confidential).*"
61+
}
6062

6163
/**
6264
* Gets a regular expression that identifies strings that may indicate the presence of
6365
* user names or other account information.
6466
*/
6567
string maybeAccountInfo() {
66-
result = "(?is).*acc(ou)?nt.*" or
67-
result = "(?is).*(puid|user.?name|user.?id|session.?(id|key)).*" or
68+
result = "(?is).*(acc(ou)?nt|puid|user.?(name|id)|session.?(id|key)).*" or
6869
result = "(?s).*([uU]|^|_|[a-z](?=U))([uU][iI][dD]).*"
6970
}
7071

@@ -73,8 +74,9 @@ module HeuristicNames {
7374
* a password or an authorization key.
7475
*/
7576
string maybePassword() {
76-
result = "(?is).*pass(wd|word|code|.?phrase)(?!.*question).*" or
77-
result = "(?is).*(auth(entication|ori[sz]ation)?).?key.*"
77+
result =
78+
"(?is).*(pass(wd|word|code|.?phrase)(?!.*question)|(auth(entication|ori[sz]ation)?).?key|oauth|"
79+
+ "api.?(key|token)|([_-]|\\b)mfa([_-]|\\b)).*"
7880
}
7981

8082
/**
@@ -90,7 +92,7 @@ module HeuristicNames {
9092
string maybePrivate() {
9193
result =
9294
"(?is).*(" +
93-
// Inspired by the list on https://cwe.mitre.org/data/definitions/359.html
95+
// Inspired by multiple sources including the list on https://cwe.mitre.org/data/definitions/359.html
9496
// Government identifiers, such as Social Security Numbers
9597
"social.?security|employer.?identification|national.?insurance|resident.?id|" +
9698
"passport.?(num|no)|([_-]|\\b)ssn([_-]|\\b)|" +
@@ -102,17 +104,19 @@ module HeuristicNames {
102104
// Geographic location - where the user is (or was)
103105
"latitude|longitude|nationality|" +
104106
// Financial data - such as credit card numbers, salary, bank accounts, and debts
105-
"(credit|debit|bank|visa).?(card|num|no|acc(ou)?nt)|acc(ou)?nt.?(no|num|credit)|" +
106-
"salary|billing|credit.?(rating|score)|([_-]|\\b)ccn([_-]|\\b)|" +
107+
"(credit|debit|bank|visa).?(card|num|no|acc(ou)?nt)|acc(ou)?nt.?(no|num|credit)|routing.?num|"
108+
+ "salary|billing|beneficiary|credit.?(rating|score)|([_-]|\\b)(ccn|cvv|iban)([_-]|\\b)|" +
107109
// Communications - e-mail addresses, private e-mail messages, SMS text messages, chat logs, etc.
108110
// "e(mail|_mail)|" + // this seems too noisy
109111
// Health - medical conditions, insurance status, prescription records
110-
"birth.?da(te|y)|da(te|y).?(of.?)?birth|" +
111-
"medical|(health|care).?plan|healthkit|appointment|prescription|" +
112+
"birth.?da(te|y)|da(te|y).?(of.?)?birth|gender|([_-]|\\b)sex([_-]|\\b)|" +
113+
"medical|(health|care).?plan|healthkit|appointment|prescription|patient.?(id|record)|" +
112114
"blood.?(type|alcohol|glucose|pressure)|heart.?(rate|rhythm)|body.?(mass|fat)|" +
113115
"menstrua|pregnan|insulin|inhaler|" +
114116
// Relationships - work and family
115-
"employ(er|ee)|spouse|maiden.?name" +
117+
"employ(er|ee)|spouse|maiden.?name|" +
118+
// Device information
119+
"mac.?addr" +
116120
// ---
117121
").*"
118122
}
@@ -146,7 +150,8 @@ module HeuristicNames {
146150
*/
147151
string notSensitiveRegexp() {
148152
result =
149-
"(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
153+
"(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|"
154+
+ "certain|concert|secretar|account(ant|ab|ing|ed)|file|path|([_-]|\\b)url).*"
150155
}
151156

152157
/**
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* The regular expressions in `SensitiveDataHeuristics.qll` have been extended to find more instances of sensitive data, such as authentication secrets, financial and health information, and device data. The heuristics have also been refined to produce fewer false positive matches. This will improve results for queries related to sensitive information.

0 commit comments

Comments
 (0)