diff --git a/README.md b/README.md
index 14ff9323..c4becf0c 100644
--- a/README.md
+++ b/README.md
@@ -6,9 +6,9 @@ GuardDog

-GuardDog is a CLI tool that allows to identify malicious PyPI and npm packages or Go modules. It runs a set of heuristics on the package source code (through Semgrep rules) and on the package metadata.
+GuardDog is a CLI tool that allows you to identify malicious PyPI and npm packages, Go modules, GitHub Actions, or VSCode extensions. It runs a set of heuristics on the package source code (through Semgrep rules) and on the package metadata.
 
-GuardDog can be used to scan local or remote PyPI and npm packages or Go modules using any of the available [heuristics](#heuristics).
+GuardDog can be used to scan local or remote PyPI and npm packages, Go modules, GitHub Actions, or VSCode extensions using any of the available [heuristics](#heuristics).
 
 It downloads and scans code from:
 
@@ -16,6 +16,7 @@ It downloads and scans code from:
 * PyPI: Source files (tar.gz) packages hosted in [PyPI.org](https://pypi.org/)
 * Go: GoLang source files of repositories hosted in [GitHub.com](https://github.com)
 * GitHub Actions: Javascript source files of repositories hosted in [GitHub.com](https://github.com)
+* VSCode Extensions: Extension packages (.vsix) hosted in [marketplace.visualstudio.com](https://marketplace.visualstudio.com/)
 
 ![GuardDog demo usage](docs/images/demo.png)
 
@@ -78,6 +79,15 @@ guarddog github_action scan DataDog/synthetics-ci-github-action
 
 guarddog github_action verify /tmp/repo/.github/workflows/main.yml
 
+# Scan VSCode extensions from the marketplace
+guarddog extension scan ms-python.python
+
+# Scan a specific version of a VSCode extension
+guarddog extension scan ms-python.python --version 2023.20.0
+
+# Scan a local VSCode extension directory or VSIX archive
+guarddog extension scan /tmp/my-extension/
+
 # Run in debug mode
 guarddog --log-level debug npm scan express
 ```
 
@@ -163,9 +173,9 @@ Source code heuristics:
 | **Heuristic** | **Description** |
 |:-------------:|:---------------:|
 | shady-links | Identify when a package contains an URL to a domain with a suspicious extension |
-| go-exec-base64 | Identify Base64-decoded content being passed to execution functions |
-| go-exec-download | Identify Go code that downloads potentially executable files |
-| go-exfiltrate-sensitive-data | Identify Go code that reads and exfiltrates sensitive|
+| go-exec-base64 | Identify Base64-decoded content being passed to execution functions in Go |
+| go-exfiltrate-sensitive-data | Identify Go code that reads and exfiltrates sensitive data from the local system |
+| go-exec-download | Identify Go code that downloads and executes a remote binary after setting executable permissions |
 
 Metadata heuristics:
 
@@ -176,9 +186,6 @@ Metadata heuristics:
 
 ### GitHub Action
 
-For GitHub Actions that are implemented in JavaScript,
-GuardDog will run the same source code heuristics as for npm packages.
-
 Source code heuristics:
 
 | **Heuristic** | **Description** |
diff --git a/guarddog/analyzer/analyzer.py b/guarddog/analyzer/analyzer.py
index 4f61512a..7d294bac 100644
--- a/guarddog/analyzer/analyzer.py
+++ b/guarddog/analyzer/analyzer.py
@@ -67,7 +67,13 @@ def __init__(self, ecosystem=ECOSYSTEM.PYPI) -> None:
             ".semgrep_logs",
         ]
 
-    def analyze(self, path, info=None, rules=None, name: Optional[str] = None, version: Optional[str] = None) -> dict:
+    def analyze(
+            self,
+            path,
+            info=None,
+            rules=None,
+            name: Optional[str] = None,
+            version: Optional[str] = None) -> dict:
         """
         Analyzes a package in the given path
 
@@ -95,10 +101,19 @@ def analyze(self, path, info=None, rules=None, name: Optional[str] = None, versi
         results = metadata_results["results"] | sourcecode_results["results"]
         errors = metadata_results["errors"] | sourcecode_results["errors"]
 
-        return {"issues": issues, "errors": errors, "results": results, "path": path}
-
-    def analyze_metadata(self, path: str, info, rules=None, name: Optional[str] = None,
-                         version: Optional[str] = None) -> dict:
+        return {
+            "issues": issues,
+            "errors": errors,
+            "results": results,
+            "path": path}
+
+    def analyze_metadata(
+            self,
+            path: str,
+            info,
+            rules=None,
+            name: Optional[str] = None,
+            version: Optional[str] = None) -> dict:
         """
         Analyzes the metadata of a given package
 
@@ -157,7 +172,11 @@ def analyze_sourcecode(self, path, rules=None) -> dict:
         results = semgrepscan_results["results"] | yarascan_results["results"]
         errors = semgrepscan_results["errors"] | yarascan_results["errors"]
 
-        return {"issues": issues, "errors": errors, "results": results, "path": path}
+        return {
+            "issues": issues,
+            "errors": errors,
+            "results": results,
+            "path": path}
 
     def analyze_yara(self, path: str, rules: Optional[set] = None) -> dict:
         """
@@ -206,6 +225,7 @@ def analyze_yara(self, path: str, rules: Optional[set] = None) -> dict:
                     matches = scan_rules.match(scan_file_target_abspath)
 
                     for m in matches:
+
                         for s in m.strings:
                             for i in s.instances:
                                 finding = {
@@ -229,7 +249,10 @@ def analyze_yara(self, path: str, rules: Optional[set] = None) -> dict:
         except Exception as e:
             errors["rules-all"] = f"failed to run rule: {str(e)}"
 
-        return {"results": results | rule_results, "errors": errors, "issues": issues}
+        return {
+            "results": results | rule_results,
+            "errors": errors,
+            "issues": issues}
 
     def analyze_semgrep(self, path, rules=None) -> dict:
         """
@@ -255,9 +278,7 @@ def analyze_semgrep(self, path, rules=None) -> dict:
         issues = 0
 
         rules_path = list(map(
-            lambda rule_name: os.path.join(SOURCECODE_RULES_PATH, f"{rule_name}.yml"),
-            all_rules
-        ))
+            lambda rule_name: os.path.join(SOURCECODE_RULES_PATH, f"{rule_name}.yml"), all_rules))
 
         if len(rules_path) == 0:
             log.debug("No semgrep code rules to run")
@@ -349,9 +370,9 @@ def _format_semgrep_response(self, response, rule=None, targetpath=None):
             file_path = os.path.abspath(result["path"])
             code = self.trim_code_snippet(
                 self.get_snippet(
-                    file_path=file_path, start_line=start_line, end_line=end_line
-                )
-            )
+                    file_path=file_path,
+                    start_line=start_line,
+                    end_line=end_line))
             if targetpath:
                 file_path = os.path.relpath(file_path, targetpath)
 
@@ -370,7 +391,11 @@ def _format_semgrep_response(self, response, rule=None, targetpath=None):
 
         return results
 
-    def get_snippet(self, file_path: str, start_line: int, end_line: int) -> str:
+    def get_snippet(
+            self,
+            file_path: str,
+            start_line: int,
+            end_line: int) -> str:
         """
         Returns the code snippet between start_line and stop_line in a file
 
diff --git a/guarddog/analyzer/metadata/__init__.py b/guarddog/analyzer/metadata/__init__.py
index 0853e650..8f2686ac 100644
--- a/guarddog/analyzer/metadata/__init__.py
+++ b/guarddog/analyzer/metadata/__init__.py
@@ -16,3 +16,5 @@ def get_metadata_detectors(ecosystem: ECOSYSTEM) -> dict[str, Detector]:
             return GO_METADATA_RULES
         case ECOSYSTEM.GITHUB_ACTION:
             return GITHUB_ACTION_METADATA_RULES
+        case ECOSYSTEM.EXTENSION:
+            return {}  # No metadata detectors for extensions currently
diff --git a/guarddog/analyzer/sourcecode/__init__.py b/guarddog/analyzer/sourcecode/__init__.py
index 0c47b630..86af3674 100644
--- a/guarddog/analyzer/sourcecode/__init__.py
+++ b/guarddog/analyzer/sourcecode/__init__.py
@@ -11,9 +11,13 @@
 
 current_dir = pathlib.Path(__file__).parent.resolve()
 
+EXTENSION_YARA_PREFIX = "extension_"
 
 # These data class aim to reduce the spreading of the logic
-# Instead of using the a dict as a structure and parse it difffently depending on the type
+# Instead of using a dict as a structure and parsing it differently
+# depending on the type
+
+
 @dataclass
 class SourceCodeRule:
     """
@@ -22,6 +26,7 @@ class SourceCodeRule:
     id: str
     file: str
    description: str
+    ecosystem: Optional[ECOSYSTEM]  # None means "any ecosystem"
 
 
 @dataclass
@@ -38,13 +43,11 @@ class SempgrepRule(SourceCodeRule):
    """
     Semgrep rule are language specific
     Content of rule in yaml format is accessible through rule_content
     """
-    ecosystem: ECOSYSTEM
     rule_content: dict
 
 
 def get_sourcecode_rules(
-    ecosystem: ECOSYSTEM, kind: Optional[type] = None
-) -> Iterable[SourceCodeRule]:
+        ecosystem: ECOSYSTEM, kind: Optional[type] = None) -> Iterable[SourceCodeRule]:
     """
     This function returns the source code rules for a given ecosystem and kind.
     Args:
@@ -54,7 +57,8 @@ def get_sourcecode_rules(
     for rule in SOURCECODE_RULES:
         if kind and not isinstance(rule, kind):
             continue
-        if not (getattr(rule, "ecosystem", ecosystem) == ecosystem):
+        # Include rules that match the specific ecosystem OR rules that apply to any ecosystem (None)
+        if rule.ecosystem is not None and rule.ecosystem != ecosystem:
             continue
         yield rule
 
@@ -78,13 +82,15 @@ def get_sourcecode_rules(
             case "javascript" | "typescript" | "json":
                 ecosystems.add(ECOSYSTEM.NPM)
                 ecosystems.add(ECOSYSTEM.GITHUB_ACTION)
+                ecosystems.add(ECOSYSTEM.EXTENSION)
             case "go":
                 ecosystems.add(ECOSYSTEM.GO)
             case _:
                 continue
 
         for ecosystem in ecosystems:
-            # avoids duplicates when multiple languages are supported by a rule
+            # avoids duplicates when multiple languages are supported
+            # by a rule
             if not next(
                 filter(
                     lambda r: r.id == rule["id"],
@@ -99,8 +105,7 @@ def get_sourcecode_rules(
                     description=rule.get("metadata", {}).get("description", ""),
                     file=file_name,
                     rule_content=rule,
-                )
-            )
+                ))
 
 yara_rule_file_names = list(
     filter(lambda x: x.endswith("yar"), os.listdir(current_dir))
@@ -111,9 +116,18 @@ def get_sourcecode_rules(
     rule_id = pathlib.Path(file_name).stem
     description_regex = fr'\s*rule\s+{rule_id}[^}}]+meta:[^}}]+description\s*=\s*\"(.+?)\"'
 
+    # Determine ecosystem based on filename prefix
+    rule_ecosystem: Optional[ECOSYSTEM] = ECOSYSTEM.EXTENSION if file_name.startswith(EXTENSION_YARA_PREFIX) else None
+
     with open(os.path.join(current_dir, file_name), "r") as fd:
         match = re.search(description_regex, fd.read())
         rule_description = ""
         if match:
             rule_description = match.group(1)
-    SOURCECODE_RULES.append(YaraRule(id=rule_id, file=file_name, description=rule_description))
+
+    SOURCECODE_RULES.append(YaraRule(
+        id=rule_id,
+        file=file_name,
+        description=rule_description,
+        ecosystem=rule_ecosystem
+    ))
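
The effect of the new `ecosystem` field is easiest to see with a small example. The sketch below is illustrative only and not part of the patch; it assumes GuardDog from this branch is importable and uses only the names defined in `guarddog/analyzer/sourcecode/__init__.py` above:

```python
# Minimal sketch of the rule-selection behaviour introduced above (assumption:
# GuardDog from this branch is installed locally).
from guarddog.analyzer.sourcecode import YaraRule, get_sourcecode_rules
from guarddog.ecosystems import ECOSYSTEM

# YARA rules whose file name starts with "extension_" are tagged ECOSYSTEM.EXTENSION;
# every other YARA rule keeps ecosystem=None and therefore applies to all ecosystems.
extension_yara = {r.id for r in get_sourcecode_rules(ECOSYSTEM.EXTENSION, YaraRule)}
npm_yara = {r.id for r in get_sourcecode_rules(ECOSYSTEM.NPM, YaraRule)}

assert "extension_suspicious_passwd_access_linux" in extension_yara
assert "extension_suspicious_passwd_access_linux" not in npm_yara
```
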
diff --git a/guarddog/analyzer/sourcecode/extension_powershell_policy_bypass.yar b/guarddog/analyzer/sourcecode/extension_powershell_policy_bypass.yar
new file mode 100644
index 00000000..508df247
--- /dev/null
+++ b/guarddog/analyzer/sourcecode/extension_powershell_policy_bypass.yar
@@ -0,0 +1,12 @@
+rule DETECT_FILE_powershell_policy_bypass
+{
+    meta:
+        author = "T HAMDOUNI, Datadog"
+        description = "Detects PowerShell invocations that bypass the execution policy (e.g. -ExecutionPolicy Bypass), a common way to run unsigned malicious scripts on Windows"
+
+    strings:
+        $cli = /powershell(\.exe)?.{0,200}-ExecutionPolicy\s+(Bypass|Unrestricted)/ nocase
+        $flag = /-ExecutionPolicy['", ]{1,4}(Bypass|Unrestricted)/ nocase
+    condition:
+        $cli or $flag
+}
\ No newline at end of file
diff --git a/guarddog/analyzer/sourcecode/extension_suspicious_passwd_access_linux.yar b/guarddog/analyzer/sourcecode/extension_suspicious_passwd_access_linux.yar
new file mode 100644
index 00000000..72930cb5
--- /dev/null
+++ b/guarddog/analyzer/sourcecode/extension_suspicious_passwd_access_linux.yar
@@ -0,0 +1,12 @@
+rule DETECT_FILE_suspicious_passwd_access_linux
+{
+    meta:
+        author = "T HAMDOUNI, Datadog"
+        description = "Detects suspicious read access to the /etc/passwd file, which is often targeted by malware for credential harvesting"
+
+    strings:
+        $cli = /(cat|less|more|head|tail)\s+.{0,100}\/etc\/passwd/ nocase
+        $read = /(readFile|readFileSync)\(\s*['"]\/etc\/passwd/ nocase
+    condition:
+        $cli or $read
+}
\ No newline at end of file
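
A rule like `extension_suspicious_passwd_access_linux` can be sanity-checked outside of GuardDog with a few lines of yara-python. This is a sketch under the assumption that `yara-python` is installed and the snippet runs from the repository root:

```python
# Sketch: compile the new rule with yara-python and confirm it fires on a
# representative snippet (assumption: yara-python is installed, working
# directory is the repository root).
import yara

rules = yara.compile(
    filepath="guarddog/analyzer/sourcecode/extension_suspicious_passwd_access_linux.yar"
)
sample = b'const creds = require("fs").readFileSync("/etc/passwd", "utf8");'
print([match.rule for match in rules.match(data=sample)])
# expected output: ['DETECT_FILE_suspicious_passwd_access_linux']
```
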
diff --git a/guarddog/ecosystems.py b/guarddog/ecosystems.py
index fcd73c73..d4359b7c 100644
--- a/guarddog/ecosystems.py
+++ b/guarddog/ecosystems.py
@@ -6,6 +6,7 @@ class ECOSYSTEM(Enum):
     NPM = "npm"
     GO = "go"
     GITHUB_ACTION = "github-action"
+    EXTENSION = "extension"
 
 
 def get_friendly_name(ecosystem: ECOSYSTEM) -> str:
@@ -18,5 +19,7 @@ def get_friendly_name(ecosystem: ECOSYSTEM) -> str:
             return "go"
         case ECOSYSTEM.GITHUB_ACTION:
             return "GitHub Action"
+        case ECOSYSTEM.EXTENSION:
+            return "Extension"
         case _:
             return ecosystem.value
diff --git a/guarddog/reporters/human_readable.py b/guarddog/reporters/human_readable.py
index 40c71713..dd519a3d 100644
--- a/guarddog/reporters/human_readable.py
+++ b/guarddog/reporters/human_readable.py
@@ -47,10 +47,15 @@ def _format_code_line_for_output(code) -> str:
     if num_issues == 0:
         lines.append(
             "Found "
-            + colored("0 potentially malicious indicators", "green", attrs=["bold"])
+            + colored(
+                "0 potentially malicious indicators",
+                "green",
+                attrs=["bold"])
             + " scanning "
-            + colored(identifier, None, attrs=["bold"])
-        )
+            + colored(
+                identifier,
+                None,
+                attrs=["bold"]))
         lines.append("")
     else:
         lines.append(
@@ -69,9 +74,7 @@ def _format_code_line_for_output(code) -> str:
     for finding in findings:
         description = findings[finding]
         if isinstance(description, str):  # package metadata
-            lines.append(
-                colored(finding, None, attrs=["bold"]) + ": " + description
-            )
+            lines.append(colored(finding, None, attrs=["bold"]) + ": " + description)
             lines.append("")
         elif isinstance(description, list):  # semgrep rule result:
             source_code_findings = description
diff --git a/guarddog/scanners/__init__.py b/guarddog/scanners/__init__.py
index 55627bc7..0b5a41c5 100644
--- a/guarddog/scanners/__init__.py
+++ b/guarddog/scanners/__init__.py
@@ -8,6 +8,7 @@
 from .go_package_scanner import GoModuleScanner
 from .go_project_scanner import GoDependenciesScanner
 from .github_action_scanner import GithubActionScanner
+from .extension_scanner import ExtensionScanner
 from .scanner import PackageScanner, ProjectScanner
 from ..ecosystems import ECOSYSTEM
 
@@ -33,6 +34,8 @@ def get_package_scanner(ecosystem: ECOSYSTEM) -> Optional[PackageScanner]:
             return GoModuleScanner()
         case ECOSYSTEM.GITHUB_ACTION:
             return GithubActionScanner()
+        case ECOSYSTEM.EXTENSION:
+            return ExtensionScanner()
     return None
 
 
@@ -57,4 +60,6 @@ def get_project_scanner(ecosystem: ECOSYSTEM) -> Optional[ProjectScanner]:
             return GoDependenciesScanner()
         case ECOSYSTEM.GITHUB_ACTION:
             return GitHubActionDependencyScanner()
+        case ECOSYSTEM.EXTENSION:
+            return None  # Dependency scanning for extensions is not supported yet
     return None
diff --git a/guarddog/scanners/extension_scanner.py b/guarddog/scanners/extension_scanner.py
new file mode 100644
index 00000000..51eead9e
--- /dev/null
+++ b/guarddog/scanners/extension_scanner.py
@@ -0,0 +1,146 @@
+import logging
+import os
+import typing
+
+import requests
+
+from guarddog.analyzer.analyzer import Analyzer
+from guarddog.ecosystems import ECOSYSTEM
+from guarddog.scanners.scanner import PackageScanner, noop
+
+log = logging.getLogger("guarddog")
+
+MARKETPLACE_URL = "https://marketplace.visualstudio.com/_apis/public/gallery/extensionquery"
+MARKETPLACE_HEADERS = {
+    "Content-Type": "application/json",
+    "Accept": "application/json;api-version=3.0-preview.1"
+}
+MARKETPLACE_DOWNLOAD_LINK_ASSET_TYPE = "Microsoft.VisualStudio.Services.VSIXPackage"
+VSIX_FILE_EXTENSION = ".vsix"
+
+# VSCode Marketplace API filter types
+# FilterType 7 = publisherName.extensionName (search by exact extension identifier)
+MARKETPLACE_FILTER_TYPE_EXTENSION_NAME = 7
+
+# VSCode Marketplace API flags (bitwise combination)
+# 446 = IncludeVersions | IncludeFiles | IncludeMetadata
+MARKETPLACE_FLAGS_FULL_METADATA = 446
+
+
+class ExtensionScanner(PackageScanner):
+    def __init__(self) -> None:
+        super().__init__(Analyzer(ECOSYSTEM.EXTENSION))
+
+    def download_and_get_package_info(
+            self,
+            directory: str,
+            package_name: str,
+            version=None) -> typing.Tuple[dict, str]:
+        """
+        Downloads a VSCode extension from the marketplace and extracts it
+
+        Args:
+            directory: Directory to download to
+            package_name: Extension identifier (publisher.extension format)
+            version: Specific version to download (defaults to the latest)
+
+        Returns:
+            Tuple of (marketplace API response, extracted_path)
+        """
+        marketplace_data, vsix_url = self._get_marketplace_info_and_url(package_name, version)
+
+        vsix_path = os.path.join(directory, package_name.replace("/", "-") + VSIX_FILE_EXTENSION)
+        extracted_path = vsix_path.removesuffix(VSIX_FILE_EXTENSION)
+
+        log.debug(f"Downloading VSCode extension from {vsix_url}")
+
+        self.download_compressed(vsix_url, vsix_path, extracted_path)
+
+        return marketplace_data, extracted_path
+
+    def _get_marketplace_info_and_url(
+            self,
+            package_name: str,
+            version: typing.Optional[str] = None) -> typing.Tuple[dict, str]:
+        """Get marketplace metadata and the VSIX download URL"""
+        payload = {
+            "filters": [
+                {
+                    "criteria": [
+                        {
+                            "filterType": MARKETPLACE_FILTER_TYPE_EXTENSION_NAME,
+                            "value": package_name
+                        }
+                    ]
+                }
+            ],
+            "flags": MARKETPLACE_FLAGS_FULL_METADATA
+        }
+
+        response = requests.post(
+            MARKETPLACE_URL,
+            headers=MARKETPLACE_HEADERS,
+            json=payload)
+
+        response.raise_for_status()
+
+        data = response.json()
+
+        if not data.get("results") or not data["results"][0].get("extensions"):
+            raise ValueError(
+                f"Extension {package_name} not found in marketplace")
+
+        extension_info = data["results"][0]["extensions"][0]
+        versions = extension_info.get("versions", [])
+
+        if not versions:
+            raise ValueError(
for this extension: {package_name}") + + target_version = None + if version is None: + # if not version is provided, default to latest + target_version = versions[0] + else: + for v in versions: + if v.get("version") == version: + target_version = v + break + if target_version is None: + raise ValueError( + f"Version {version} not found for extension: {package_name}") + + # Extract download URL + files = target_version.get("files", []) + vsix_url = None + for file_info in files: + if file_info.get("assetType") == MARKETPLACE_DOWNLOAD_LINK_ASSET_TYPE: + vsix_url = file_info.get("source") + break + + if not vsix_url: + raise ValueError( + f"No VSIX download link available for this extension: {package_name}") + + return data, vsix_url + + def scan_local(self, path: str, rules=None, callback: typing.Callable[[dict], None] = noop) -> dict: + """ + Scan a local VSCode extension directory + + Args: + path: Path to extension directory containing package.json + rules: Set of rules to use + callback: Callback to apply to analyzer output + + Returns: + Scan results + """ + if rules is not None: + rules = set(rules) + + # Use only sourcecode analysis for local scans, consistent with other ecosystems + results = self.analyzer.analyze_sourcecode(path, rules=rules) + callback(results) + + return results diff --git a/tests/analyzer/sourcecode/obfuscation.py b/tests/analyzer/sourcecode/obfuscation.py index 826e12b5..6b63f923 100644 --- a/tests/analyzer/sourcecode/obfuscation.py +++ b/tests/analyzer/sourcecode/obfuscation.py @@ -54,4 +54,4 @@ def f(): # ruleid: obfuscation exec(''.join(chr(c) for c in [112, 114, 105, 110, 116, 40, 34, 72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 34, 41])) # ruleid: obfuscation - exec(''.join(map(chr, [112, 114, 105, 110, 116, 40, 34, 72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 34, 41]))) + exec(''.join(map(chr, [112, 114, 105, 110, 116, 40, 34, 72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 34, 41]))) \ No newline at end of file diff --git a/tests/core/test_extension_scanner.py b/tests/core/test_extension_scanner.py new file mode 100644 index 00000000..11041148 --- /dev/null +++ b/tests/core/test_extension_scanner.py @@ -0,0 +1,103 @@ +import os.path +import tempfile +import json + +import pytest + +from guarddog.scanners import ExtensionScanner + + +def test_download_and_get_package_info(): + scanner = ExtensionScanner() + with tempfile.TemporaryDirectory() as tmpdirname: + data, path = scanner.download_and_get_package_info(tmpdirname, "wayou.vscode-todo-highlight") + assert path + assert path.endswith("/wayou.vscode-todo-highlight") + + assert os.path.exists(path) + + package_json_found = False + for root, dirs, files in os.walk(path): + if "package.json" in files: + package_json_found = True + break + assert package_json_found, "package.json should exist in the extracted extension" + + # data now contains raw marketplace API response + assert "results" in data + assert len(data["results"]) > 0 + assert "extensions" in data["results"][0] + assert len(data["results"][0]["extensions"]) > 0 + + extension = data["results"][0]["extensions"][0] + assert "extensionName" in extension + assert "publisher" in extension + + +def test_download_and_get_package_info_with_version(): + scanner = ExtensionScanner() + with tempfile.TemporaryDirectory() as tmpdirname: + data, path = scanner.download_and_get_package_info(tmpdirname, "gerane.Theme-FlatlandMonokai", "0.0.6") + assert path + assert path.endswith("/gerane.Theme-FlatlandMonokai") + + # data now 
diff --git a/tests/analyzer/sourcecode/obfuscation.py b/tests/analyzer/sourcecode/obfuscation.py
index 826e12b5..6b63f923 100644
--- a/tests/analyzer/sourcecode/obfuscation.py
+++ b/tests/analyzer/sourcecode/obfuscation.py
@@ -54,4 +54,4 @@ def f():
     # ruleid: obfuscation
     exec(''.join(chr(c) for c in [112, 114, 105, 110, 116, 40, 34, 72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 34, 41]))
     # ruleid: obfuscation
-    exec(''.join(map(chr, [112, 114, 105, 110, 116, 40, 34, 72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 34, 41])))
+    exec(''.join(map(chr, [112, 114, 105, 110, 116, 40, 34, 72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 34, 41])))
\ No newline at end of file
diff --git a/tests/core/test_extension_scanner.py b/tests/core/test_extension_scanner.py
new file mode 100644
index 00000000..11041148
--- /dev/null
+++ b/tests/core/test_extension_scanner.py
@@ -0,0 +1,103 @@
+import os.path
+import tempfile
+import json
+
+import pytest
+
+from guarddog.scanners import ExtensionScanner
+
+
+def test_download_and_get_package_info():
+    scanner = ExtensionScanner()
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        data, path = scanner.download_and_get_package_info(tmpdirname, "wayou.vscode-todo-highlight")
+        assert path
+        assert path.endswith("/wayou.vscode-todo-highlight")
+
+        assert os.path.exists(path)
+
+        package_json_found = False
+        for root, dirs, files in os.walk(path):
+            if "package.json" in files:
+                package_json_found = True
+                break
+        assert package_json_found, "package.json should exist in the extracted extension"
+
+        # data contains the raw marketplace API response
+        assert "results" in data
+        assert len(data["results"]) > 0
+        assert "extensions" in data["results"][0]
+        assert len(data["results"][0]["extensions"]) > 0
+
+        extension = data["results"][0]["extensions"][0]
+        assert "extensionName" in extension
+        assert "publisher" in extension
+
+
+def test_download_and_get_package_info_with_version():
+    scanner = ExtensionScanner()
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        data, path = scanner.download_and_get_package_info(tmpdirname, "gerane.Theme-FlatlandMonokai", "0.0.6")
+        assert path
+        assert path.endswith("/gerane.Theme-FlatlandMonokai")
+
+        # data contains the raw marketplace API response
+        assert "results" in data
+        assert len(data["results"]) > 0
+        extension = data["results"][0]["extensions"][0]
+        assert "versions" in extension
+        assert len(extension["versions"]) > 0
+        # Note: We're using a relatively stable extension that hasn't been updated in a long while
+
+
+def test_download_and_get_package_info_non_existing_package():
+    scanner = ExtensionScanner()
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        with pytest.raises(Exception) as exc_info:
+            scanner.download_and_get_package_info(tmpdirname, "non-existent-publisher.non-existent-extension")
+        assert "not found" in str(exc_info.value).lower()
+
+
+def test_download_and_get_package_info_non_existing_version():
+    scanner = ExtensionScanner()
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        with pytest.raises(Exception) as exc_info:
+            scanner.download_and_get_package_info(tmpdirname, "wayou.vscode-todo-highlight", "999.999.999")
+        assert "version" in str(exc_info.value).lower()
+
+
+def test_scan_local_extension_directory():
+    scanner = ExtensionScanner()
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        # Create a mock extension directory structure
+        extension_dir = os.path.join(tmpdirname, "test-extension")
+        os.makedirs(extension_dir)
+
+        package_json = {
+            "name": "test-extension",
+            "displayName": "Test Extension",
+            "description": "A test extension for GuardDog",
+            "version": "1.0.0",
+            "publisher": "test-publisher",
+            "categories": ["Other"],
+            "activationEvents": ["*"],
+            "contributes": {
+                "commands": [
+                    {
+                        "command": "test.hello",
+                        "title": "Hello World"
+                    }
+                ]
+            }
+        }
+
+        package_json_path = os.path.join(extension_dir, "package.json")
+        with open(package_json_path, 'w') as f:
+            json.dump(package_json, f)
+
+        result = scanner.scan_local(extension_dir)
+
+        assert "issues" in result
+        assert "results" in result