Skip to content

Commit 3248f7b

Browse files
authored
Merge pull request #9649 from RasmusWL/certificate-modeling
Python/JS/Ruby: Ignore common words (like certain) as sensitive data source
2 parents 9b58784 + 876ba71 commit 3248f7b

File tree

7 files changed

+39
-6
lines changed

7 files changed

+39
-6
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* Improved modeling of sensitive data sources, so common words like `certain` and `secretary` are no longer considered a certificate and a secret (respectively).

javascript/ql/lib/semmle/javascript/security/internal/SensitiveDataHeuristics.qll

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ module HeuristicNames {
5050
* Gets a regular expression that identifies strings that may indicate the presence of secret
5151
* or trusted data.
5252
*/
53-
string maybeSecret() { result = "(?is).*((?<!is)secret|(?<!un|is)trusted).*" }
53+
string maybeSecret() { result = "(?is).*((?<!is|is_)secret|(?<!un|un_|is|is_)trusted).*" }
5454

5555
/**
5656
* Gets a regular expression that identifies strings that may indicate the presence of
@@ -96,10 +96,14 @@ module HeuristicNames {
9696
* Gets a regular expression that identifies strings that may indicate the presence of data
9797
* that is hashed or encrypted, and hence rendered non-sensitive, or contains special characters
9898
* suggesting nouns within the string do not represent the meaning of the whole string (e.g. a URL or a SQL query).
99+
*
100+
* We also filter out common words like `certain` and `concert`, since otherwise these could
101+
* be matched by the certificate regular expressions. Same for `accountable` (account), or
102+
* `secretarial` (secret).
99103
*/
100104
string notSensitiveRegexp() {
101105
result =
102-
"(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|code)).*"
106+
"(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|code)|certain|concert|secretar|accountant|accountab).*"
103107
}
104108

105109
/**
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* Improved modeling of sensitive data sources, so common words like `certain` and `secretary` are no longer considered a certificate and a secret (respectively).

python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ module HeuristicNames {
5050
* Gets a regular expression that identifies strings that may indicate the presence of secret
5151
* or trusted data.
5252
*/
53-
string maybeSecret() { result = "(?is).*((?<!is)secret|(?<!un|is)trusted).*" }
53+
string maybeSecret() { result = "(?is).*((?<!is|is_)secret|(?<!un|un_|is|is_)trusted).*" }
5454

5555
/**
5656
* Gets a regular expression that identifies strings that may indicate the presence of
@@ -96,10 +96,14 @@ module HeuristicNames {
9696
* Gets a regular expression that identifies strings that may indicate the presence of data
9797
* that is hashed or encrypted, and hence rendered non-sensitive, or contains special characters
9898
* suggesting nouns within the string do not represent the meaning of the whole string (e.g. a URL or a SQL query).
99+
*
100+
* We also filter out common words like `certain` and `concert`, since otherwise these could
101+
* be matched by the certificate regular expressions. Same for `accountable` (account), or
102+
* `secretarial` (secret).
99103
*/
100104
string notSensitiveRegexp() {
101105
result =
102-
"(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|code)).*"
106+
"(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|code)|certain|concert|secretar|accountant|accountab).*"
103107
}
104108

105109
/**

python/ql/test/experimental/dataflow/sensitive-data/test.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ def encrypt_password(pwd):
3737
x = f()
3838
print(x) # $ SensitiveUse=password
3939

40+
# some prefixes make us ignore it as a source
41+
not_found.isSecret
42+
not_found.is_secret
43+
4044
def my_func(non_sensitive_name):
4145
x = non_sensitive_name()
4246
print(x) # $ SensitiveUse=password
@@ -56,6 +60,11 @@ def my_func(non_sensitive_name):
5660
def my_func(password): # $ SensitiveDataSource=password
5761
print(password) # $ SensitiveUse=password
5862

63+
# Former false positive (FP): the `cert` in `uncertainty` made us treat it like a certificate
64+
# https://github.com/github/codeql/issues/9632
65+
def my_other_func(uncertainty):
66+
print(uncertainty)
67+
5968
password = some_function() # $ SensitiveDataSource=password
6069
print(password) # $ SensitiveUse=password
6170

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* Improved modeling of sensitive data sources, so common words like `certain` and `secretary` are no longer considered a certificate and a secret (respectively).

ruby/ql/lib/codeql/ruby/security/internal/SensitiveDataHeuristics.qll

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ module HeuristicNames {
5050
* Gets a regular expression that identifies strings that may indicate the presence of secret
5151
* or trusted data.
5252
*/
53-
string maybeSecret() { result = "(?is).*((?<!is)secret|(?<!un|is)trusted).*" }
53+
string maybeSecret() { result = "(?is).*((?<!is|is_)secret|(?<!un|un_|is|is_)trusted).*" }
5454

5555
/**
5656
* Gets a regular expression that identifies strings that may indicate the presence of
@@ -96,10 +96,14 @@ module HeuristicNames {
9696
* Gets a regular expression that identifies strings that may indicate the presence of data
9797
* that is hashed or encrypted, and hence rendered non-sensitive, or contains special characters
9898
* suggesting nouns within the string do not represent the meaning of the whole string (e.g. a URL or a SQL query).
99+
*
100+
* We also filter out common words like `certain` and `concert`, since otherwise these could
101+
* be matched by the certificate regular expressions. Same for `accountable` (account), or
102+
* `secretarial` (secret).
99103
*/
100104
string notSensitiveRegexp() {
101105
result =
102-
"(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|code)).*"
106+
"(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|code)|certain|concert|secretar|accountant|accountab).*"
103107
}
104108

105109
/**

0 commit comments

Comments
 (0)