Skip to content

Commit dc899fb

Browse files
Merge pull request #589 from DataDog/tham/vsc_ecosystem
Add extensions ecosystem support for Guarddog
2 parents 0034a77 + e39c0ec commit dc899fb

File tree

12 files changed

+370
-38
lines changed

12 files changed

+370
-38
lines changed

README.md

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,17 @@
66
<img src="./docs/images/logo.png" alt="GuardDog" width="300" />
77
</p>
88

9-
GuardDog is a CLI tool that allows to identify malicious PyPI and npm packages or Go modules. It runs a set of heuristics on the package source code (through Semgrep rules) and on the package metadata.
9+
GuardDog is a CLI tool that helps identify malicious PyPI and npm packages, Go modules, GitHub Actions, or VSCode extensions. It runs a set of heuristics on the package source code (through Semgrep rules) and on the package metadata.
1010

11-
GuardDog can be used to scan local or remote PyPI and npm packages or Go modules using any of the available [heuristics](#heuristics).
11+
GuardDog can be used to scan local or remote PyPI and npm packages, Go modules, GitHub Actions, or VSCode extensions using any of the available [heuristics](#heuristics).
1212

1313
It downloads and scans code from:
1414

1515
* NPM: Packages hosted in [npmjs.org](https://www.npmjs.com/)
1616
* PyPI: Source files (tar.gz) packages hosted in [PyPI.org](https://pypi.org/)
1717
* Go: GoLang source files of repositories hosted in [GitHub.com](https://github.com)
1818
* GitHub Actions: JavaScript source files of repositories hosted in [GitHub.com](https://github.com)
19+
* VSCode Extensions: Extensions (.vsix) packages hosted in [marketplace.visualstudio.com](https://marketplace.visualstudio.com/)
1920

2021
![GuardDog demo usage](docs/images/demo.png)
2122

@@ -78,6 +79,15 @@ guarddog github_action scan DataDog/synthetics-ci-github-action
7879

7980
guarddog github_action verify /tmp/repo/.github/workflows/main.yml
8081

82+
# Scan VSCode extensions from the marketplace
83+
guarddog extension scan ms-python.python
84+
85+
# Scan a specific version of a VSCode extension
86+
guarddog extension scan ms-python.python --version 2023.20.0
87+
88+
# Scan a local VSCode extension directory or VSIX archive
89+
guarddog extension scan /tmp/my-extension/
90+
8191
# Run in debug mode
8292
guarddog --log-level debug npm scan express
8393
```
@@ -163,9 +173,9 @@ Source code heuristics:
163173
| **Heuristic** | **Description** |
164174
|:-------------:|:---------------:|
165175
| shady-links | Identify when a package contains an URL to a domain with a suspicious extension |
166-
| go-exec-base64 | Identify Base64-decoded content being passed to execution functions |
167-
| go-exec-download | Identify Go code that downloads potentially executable files |
168-
| go-exfiltrate-sensitive-data | Identify Go code that reads and exfiltrates sensitive|
176+
| go-exec-base64 | Identify Base64-decoded content being passed to execution functions in Go |
177+
| go-exfiltrate-sensitive-data | This rule identifies when a package reads and exfiltrates sensitive data from the local system. |
178+
| go-exec-download | This rule identifies when a package downloads and executes a remote binary after setting executable permissions. |
169179

170180
Metadata heuristics:
171181

@@ -176,9 +186,6 @@ Metadata heuristics:
176186

177187
### GitHub Action
178188

179-
For GitHub Actions that are implemented in JavaScript,
180-
GuardDog will run the same source code heuristics as for npm packages.
181-
182189
Source code heuristics:
183190

184191
| **Heuristic** | **Description** |

guarddog/analyzer/analyzer.py

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,13 @@ def __init__(self, ecosystem=ECOSYSTEM.PYPI) -> None:
6767
".semgrep_logs",
6868
]
6969

70-
def analyze(self, path, info=None, rules=None, name: Optional[str] = None, version: Optional[str] = None) -> dict:
70+
def analyze(
71+
self,
72+
path,
73+
info=None,
74+
rules=None,
75+
name: Optional[str] = None,
76+
version: Optional[str] = None) -> dict:
7177
"""
7278
Analyzes a package in the given path
7379
@@ -95,10 +101,19 @@ def analyze(self, path, info=None, rules=None, name: Optional[str] = None, versi
95101
results = metadata_results["results"] | sourcecode_results["results"]
96102
errors = metadata_results["errors"] | sourcecode_results["errors"]
97103

98-
return {"issues": issues, "errors": errors, "results": results, "path": path}
99-
100-
def analyze_metadata(self, path: str, info, rules=None, name: Optional[str] = None,
101-
version: Optional[str] = None) -> dict:
104+
return {
105+
"issues": issues,
106+
"errors": errors,
107+
"results": results,
108+
"path": path}
109+
110+
def analyze_metadata(
111+
self,
112+
path: str,
113+
info,
114+
rules=None,
115+
name: Optional[str] = None,
116+
version: Optional[str] = None) -> dict:
102117
"""
103118
Analyzes the metadata of a given package
104119
@@ -157,7 +172,11 @@ def analyze_sourcecode(self, path, rules=None) -> dict:
157172
results = semgrepscan_results["results"] | yarascan_results["results"]
158173
errors = semgrepscan_results["errors"] | yarascan_results["errors"]
159174

160-
return {"issues": issues, "errors": errors, "results": results, "path": path}
175+
return {
176+
"issues": issues,
177+
"errors": errors,
178+
"results": results,
179+
"path": path}
161180

162181
def analyze_yara(self, path: str, rules: Optional[set] = None) -> dict:
163182
"""
@@ -206,6 +225,7 @@ def analyze_yara(self, path: str, rules: Optional[set] = None) -> dict:
206225

207226
matches = scan_rules.match(scan_file_target_abspath)
208227
for m in matches:
228+
209229
for s in m.strings:
210230
for i in s.instances:
211231
finding = {
@@ -229,7 +249,10 @@ def analyze_yara(self, path: str, rules: Optional[set] = None) -> dict:
229249
except Exception as e:
230250
errors["rules-all"] = f"failed to run rule: {str(e)}"
231251

232-
return {"results": results | rule_results, "errors": errors, "issues": issues}
252+
return {
253+
"results": results | rule_results,
254+
"errors": errors,
255+
"issues": issues}
233256

234257
def analyze_semgrep(self, path, rules=None) -> dict:
235258
"""
@@ -255,9 +278,7 @@ def analyze_semgrep(self, path, rules=None) -> dict:
255278
issues = 0
256279

257280
rules_path = list(map(
258-
lambda rule_name: os.path.join(SOURCECODE_RULES_PATH, f"{rule_name}.yml"),
259-
all_rules
260-
))
281+
lambda rule_name: os.path.join(SOURCECODE_RULES_PATH, f"{rule_name}.yml"), all_rules))
261282

262283
if len(rules_path) == 0:
263284
log.debug("No semgrep code rules to run")
@@ -349,9 +370,9 @@ def _format_semgrep_response(self, response, rule=None, targetpath=None):
349370
file_path = os.path.abspath(result["path"])
350371
code = self.trim_code_snippet(
351372
self.get_snippet(
352-
file_path=file_path, start_line=start_line, end_line=end_line
353-
)
354-
)
373+
file_path=file_path,
374+
start_line=start_line,
375+
end_line=end_line))
355376
if targetpath:
356377
file_path = os.path.relpath(file_path, targetpath)
357378

@@ -370,7 +391,11 @@ def _format_semgrep_response(self, response, rule=None, targetpath=None):
370391

371392
return results
372393

373-
def get_snippet(self, file_path: str, start_line: int, end_line: int) -> str:
394+
def get_snippet(
395+
self,
396+
file_path: str,
397+
start_line: int,
398+
end_line: int) -> str:
374399
"""
375400
Returns the code snippet between start_line and stop_line in a file
376401

guarddog/analyzer/metadata/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,5 @@ def get_metadata_detectors(ecosystem: ECOSYSTEM) -> dict[str, Detector]:
1616
return GO_METADATA_RULES
1717
case ECOSYSTEM.GITHUB_ACTION:
1818
return GITHUB_ACTION_METADATA_RULES
19+
case ECOSYSTEM.EXTENSION:
20+
return {} # No metadata detectors for extensions currently

guarddog/analyzer/sourcecode/__init__.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,13 @@
1111

1212
current_dir = pathlib.Path(__file__).parent.resolve()
1313

14+
EXTENSION_YARA_PREFIX = "extension_"
1415

1516
# These data classes aim to reduce the spreading of the logic
16-
# Instead of using the a dict as a structure and parse it difffently depending on the type
17+
# Instead of using a dict as a structure and parsing it differently
18+
# depending on the type
19+
20+
1721
@dataclass
1822
class SourceCodeRule:
1923
"""
@@ -22,6 +26,7 @@ class SourceCodeRule:
2226
id: str
2327
file: str
2428
description: str
29+
ecosystem: Optional[ECOSYSTEM] # None means "any ecosystem"
2530

2631

2732
@dataclass
@@ -38,13 +43,11 @@ class SempgrepRule(SourceCodeRule):
3843
Semgrep rule are language specific
3944
Content of rule in yaml format is accessible through rule_content
4045
"""
41-
ecosystem: ECOSYSTEM
4246
rule_content: dict
4347

4448

4549
def get_sourcecode_rules(
46-
ecosystem: ECOSYSTEM, kind: Optional[type] = None
47-
) -> Iterable[SourceCodeRule]:
50+
ecosystem: ECOSYSTEM, kind: Optional[type] = None) -> Iterable[SourceCodeRule]:
4851
"""
4952
This function returns the source code rules for a given ecosystem and kind.
5053
Args:
@@ -54,7 +57,8 @@ def get_sourcecode_rules(
5457
for rule in SOURCECODE_RULES:
5558
if kind and not isinstance(rule, kind):
5659
continue
57-
if not (getattr(rule, "ecosystem", ecosystem) == ecosystem):
60+
# Include rules that match the specific ecosystem OR rules that apply to any ecosystem (None)
61+
if rule.ecosystem is not None and rule.ecosystem != ecosystem:
5862
continue
5963
yield rule
6064

@@ -78,13 +82,15 @@ def get_sourcecode_rules(
7882
case "javascript" | "typescript" | "json":
7983
ecosystems.add(ECOSYSTEM.NPM)
8084
ecosystems.add(ECOSYSTEM.GITHUB_ACTION)
85+
ecosystems.add(ECOSYSTEM.EXTENSION)
8186
case "go":
8287
ecosystems.add(ECOSYSTEM.GO)
8388
case _:
8489
continue
8590

8691
for ecosystem in ecosystems:
87-
# avoids duplicates when multiple languages are supported by a rule
92+
# avoids duplicates when multiple languages are supported
93+
# by a rule
8894
if not next(
8995
filter(
9096
lambda r: r.id == rule["id"],
@@ -99,8 +105,7 @@ def get_sourcecode_rules(
99105
description=rule.get("metadata", {}).get("description", ""),
100106
file=file_name,
101107
rule_content=rule,
102-
)
103-
)
108+
))
104109

105110
yara_rule_file_names = list(
106111
filter(lambda x: x.endswith("yar"), os.listdir(current_dir))
@@ -111,9 +116,18 @@ def get_sourcecode_rules(
111116
rule_id = pathlib.Path(file_name).stem
112117
description_regex = fr'\s*rule\s+{rule_id}[^}}]+meta:[^}}]+description\s*=\s*\"(.+?)\"'
113118

119+
# Determine ecosystem based on filename prefix
120+
rule_ecosystem: Optional[ECOSYSTEM] = ECOSYSTEM.EXTENSION if file_name.startswith(EXTENSION_YARA_PREFIX) else None
121+
114122
with open(os.path.join(current_dir, file_name), "r") as fd:
115123
match = re.search(description_regex, fd.read())
116124
rule_description = ""
117125
if match:
118126
rule_description = match.group(1)
119-
SOURCECODE_RULES.append(YaraRule(id=rule_id, file=file_name, description=rule_description))
127+
128+
SOURCECODE_RULES.append(YaraRule(
129+
id=rule_id,
130+
file=file_name,
131+
description=rule_description,
132+
ecosystem=rule_ecosystem
133+
))
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
rule DETECT_FILE_powershell_policy_bypass
2+
{
3+
meta:
4+
author = "T HAMDOUNI, Datadog"
5+
description = "Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting"
6+
7+
strings:
8+
$cli = /(cat|less|more|head|tail)\s+.{0,100}\/etc\/passwd/ nocase
9+
$read = /(readFile|readFileSync)\(\s*['"]\/etc\/passwd/ nocase
10+
condition:
11+
$cli or $read
12+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
rule DETECT_FILE_suspicious_passwd_access_linux
2+
{
3+
meta:
4+
author = "T HAMDOUNI, Datadog"
5+
description = "Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting"
6+
7+
strings:
8+
$cli = /(cat|less|more|head|tail)\s+.{0,100}\/etc\/passwd/ nocase
9+
$read = /(readFile|readFileSync)\(\s*['"]\/etc\/passwd/ nocase
10+
condition:
11+
$cli or $read
12+
}

guarddog/ecosystems.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ class ECOSYSTEM(Enum):
66
NPM = "npm"
77
GO = "go"
88
GITHUB_ACTION = "github-action"
9+
EXTENSION = "extension"
910

1011

1112
def get_friendly_name(ecosystem: ECOSYSTEM) -> str:
@@ -18,5 +19,7 @@ def get_friendly_name(ecosystem: ECOSYSTEM) -> str:
1819
return "go"
1920
case ECOSYSTEM.GITHUB_ACTION:
2021
return "GitHub Action"
22+
case ECOSYSTEM.EXTENSION:
23+
return "Extension"
2124
case _:
2225
return ecosystem.value

guarddog/reporters/human_readable.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,15 @@ def _format_code_line_for_output(code) -> str:
4747
if num_issues == 0:
4848
lines.append(
4949
"Found "
50-
+ colored("0 potentially malicious indicators", "green", attrs=["bold"])
50+
+ colored(
51+
"0 potentially malicious indicators",
52+
"green",
53+
attrs=["bold"])
5154
+ " scanning "
52-
+ colored(identifier, None, attrs=["bold"])
53-
)
55+
+ colored(
56+
identifier,
57+
None,
58+
attrs=["bold"]))
5459
lines.append("")
5560
else:
5661
lines.append(
@@ -69,9 +74,7 @@ def _format_code_line_for_output(code) -> str:
6974
for finding in findings:
7075
description = findings[finding]
7176
if isinstance(description, str): # package metadata
72-
lines.append(
73-
colored(finding, None, attrs=["bold"]) + ": " + description
74-
)
77+
lines.append(colored(finding, None, attrs=["bold"]) + ": " + description)
7578
lines.append("")
7679
elif isinstance(description, list): # semgrep rule result:
7780
source_code_findings = description

guarddog/scanners/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from .go_package_scanner import GoModuleScanner
99
from .go_project_scanner import GoDependenciesScanner
1010
from .github_action_scanner import GithubActionScanner
11+
from .extension_scanner import ExtensionScanner
1112
from .scanner import PackageScanner, ProjectScanner
1213
from ..ecosystems import ECOSYSTEM
1314

@@ -33,6 +34,8 @@ def get_package_scanner(ecosystem: ECOSYSTEM) -> Optional[PackageScanner]:
3334
return GoModuleScanner()
3435
case ECOSYSTEM.GITHUB_ACTION:
3536
return GithubActionScanner()
37+
case ECOSYSTEM.EXTENSION:
38+
return ExtensionScanner()
3639
return None
3740

3841

@@ -57,4 +60,6 @@ def get_project_scanner(ecosystem: ECOSYSTEM) -> Optional[ProjectScanner]:
5760
return GoDependenciesScanner()
5861
case ECOSYSTEM.GITHUB_ACTION:
5962
return GitHubActionDependencyScanner()
63+
case ECOSYSTEM.EXTENSION:
64+
return None # we're not including dependency scanning for this PR
6065
return None

0 commit comments

Comments
 (0)