DataDog · tesnim5hamdouni · Jul 23, 2025 · Jul 16, 2025 · Jul 16, 2025 · Jul 17, 2025
diff --git a/README.md b/README.md
@@ -6,16 +6,17 @@
   <img src="./docs/images/logo.png" alt="GuardDog" width="300" />
 </p>
 
-GuardDog is a CLI tool that allows to identify malicious PyPI and npm packages or Go modules. It runs a set of heuristics on the package source code (through Semgrep rules) and on the package metadata.
+GuardDog is a CLI tool that identifies malicious PyPI and npm packages, Go modules, GitHub Actions, or VSCode extensions. It runs a set of heuristics on the package source code (through Semgrep rules) and on the package metadata.
 
-GuardDog can be used to scan local or remote PyPI and npm packages or Go modules using any of the available [heuristics](#heuristics).
+GuardDog can be used to scan local or remote PyPI and npm packages, Go modules, GitHub Actions, or VSCode extensions using any of the available [heuristics](#heuristics).
 
 It downloads and scans code from:
 
 * NPM: Packages hosted in [npmjs.org](https://www.npmjs.com/)
 * PyPI: Source files (tar.gz) packages hosted in [PyPI.org](https://pypi.org/)
 * Go: GoLang source files of repositories hosted in [GitHub.com](https://github.com)
 * GitHub Actions: Javascript source files of repositories hosted in [GitHub.com](https://github.com)
+* VSCode Extensions: Extension packages (.vsix) hosted in [marketplace.visualstudio.com](https://marketplace.visualstudio.com/)
 
 ![GuardDog demo usage](docs/images/demo.png)
 
@@ -78,6 +79,15 @@ guarddog github_action scan DataDog/synthetics-ci-github-action
 
 guarddog github_action verify /tmp/repo/.github/workflows/main.yml
 
+# Scan VSCode extensions from the marketplace
+guarddog extension scan ms-python.python
+
+# Scan a specific version of a VSCode extension
+guarddog extension scan ms-python.python --version 2023.20.0
+
+# Scan a local VSCode extension directory or VSIX archive
+guarddog extension scan /tmp/my-extension/
+
 # Run in debug mode
 guarddog --log-level debug npm scan express
 ```
@@ -163,9 +173,9 @@ Source code heuristics:
 | **Heuristic** | **Description** |
 |:-------------:|:---------------:|
 | shady-links | Identify when a package contains an URL to a domain with a suspicious extension |
-| go-exec-base64 | Identify Base64-decoded content being passed to execution functions |
-| go-exec-download | Identify Go code that downloads potentially executable files |
-| go-exfiltrate-sensitive-data | Identify Go code that reads and exfiltrates sensitive|
+| go-exec-base64 | Identify Base64-decoded content being passed to execution functions in Go |
+| go-exfiltrate-sensitive-data | Identify when a package reads and exfiltrates sensitive data from the local system |
+| go-exec-download | Identify Go code that downloads a remote binary and executes it after setting executable permissions |
 
 Metadata heuristics:
 
@@ -176,9 +186,6 @@ Metadata heuristics:
 
 ### GitHub Action
 
-For GitHub Actions that are implemented in JavaScript,
-GuardDog will run the same source code heuristics as for npm packages.
-
 Source code heuristics:
 
 | **Heuristic** | **Description** |

diff --git a/guarddog/analyzer/analyzer.py b/guarddog/analyzer/analyzer.py
@@ -67,7 +67,13 @@ def __init__(self, ecosystem=ECOSYSTEM.PYPI) -> None:
             ".semgrep_logs",
         ]
 
-    def analyze(self, path, info=None, rules=None, name: Optional[str] = None, version: Optional[str] = None) -> dict:
+    def analyze(
+            self,
+            path,
+            info=None,
+            rules=None,
+            name: Optional[str] = None,
+            version: Optional[str] = None) -> dict:
         """
         Analyzes a package in the given path
 
@@ -95,10 +101,19 @@ def analyze(self, path, info=None, rules=None, name: Optional[str] = None, versi
         results = metadata_results["results"] | sourcecode_results["results"]
         errors = metadata_results["errors"] | sourcecode_results["errors"]
 
-        return {"issues": issues, "errors": errors, "results": results, "path": path}
-
-    def analyze_metadata(self, path: str, info, rules=None, name: Optional[str] = None,
-                         version: Optional[str] = None) -> dict:
+        return {
+            "issues": issues,
+            "errors": errors,
+            "results": results,
+            "path": path}
+
+    def analyze_metadata(
+            self,
+            path: str,
+            info,
+            rules=None,
+            name: Optional[str] = None,
+            version: Optional[str] = None) -> dict:
         """
         Analyzes the metadata of a given package
 
@@ -157,7 +172,11 @@ def analyze_sourcecode(self, path, rules=None) -> dict:
         results = semgrepscan_results["results"] | yarascan_results["results"]
         errors = semgrepscan_results["errors"] | yarascan_results["errors"]
 
-        return {"issues": issues, "errors": errors, "results": results, "path": path}
+        return {
+            "issues": issues,
+            "errors": errors,
+            "results": results,
+            "path": path}
 
     def analyze_yara(self, path: str, rules: Optional[set] = None) -> dict:
         """
@@ -206,6 +225,7 @@ def analyze_yara(self, path: str, rules: Optional[set] = None) -> dict:
 
                     matches = scan_rules.match(scan_file_target_abspath)
                     for m in matches:
+
                         for s in m.strings:
                             for i in s.instances:
                                 finding = {
@@ -229,7 +249,10 @@ def analyze_yara(self, path: str, rules: Optional[set] = None) -> dict:
         except Exception as e:
             errors["rules-all"] = f"failed to run rule: {str(e)}"
 
-        return {"results": results | rule_results, "errors": errors, "issues": issues}
+        return {
+            "results": results | rule_results,
+            "errors": errors,
+            "issues": issues}
 
     def analyze_semgrep(self, path, rules=None) -> dict:
         """
@@ -255,9 +278,7 @@ def analyze_semgrep(self, path, rules=None) -> dict:
         issues = 0
 
         rules_path = list(map(
-            lambda rule_name: os.path.join(SOURCECODE_RULES_PATH, f"{rule_name}.yml"),
-            all_rules
-        ))
+            lambda rule_name: os.path.join(SOURCECODE_RULES_PATH, f"{rule_name}.yml"), all_rules))
 
         if len(rules_path) == 0:
             log.debug("No semgrep code rules to run")
@@ -349,9 +370,9 @@ def _format_semgrep_response(self, response, rule=None, targetpath=None):
             file_path = os.path.abspath(result["path"])
             code = self.trim_code_snippet(
                 self.get_snippet(
-                    file_path=file_path, start_line=start_line, end_line=end_line
-                )
-            )
+                    file_path=file_path,
+                    start_line=start_line,
+                    end_line=end_line))
             if targetpath:
                 file_path = os.path.relpath(file_path, targetpath)
 
@@ -370,7 +391,11 @@ def _format_semgrep_response(self, response, rule=None, targetpath=None):
 
         return results
 
-    def get_snippet(self, file_path: str, start_line: int, end_line: int) -> str:
+    def get_snippet(
+            self,
+            file_path: str,
+            start_line: int,
+            end_line: int) -> str:
         """
         Returns the code snippet between start_line and stop_line in a file
 

diff --git a/guarddog/analyzer/metadata/__init__.py b/guarddog/analyzer/metadata/__init__.py
@@ -16,3 +16,5 @@ def get_metadata_detectors(ecosystem: ECOSYSTEM) -> dict[str, Detector]:
             return GO_METADATA_RULES
         case ECOSYSTEM.GITHUB_ACTION:
             return GITHUB_ACTION_METADATA_RULES
+        case ECOSYSTEM.EXTENSION:
+            return {}  # No metadata detectors for extensions currently
diff --git a/guarddog/analyzer/sourcecode/__init__.py b/guarddog/analyzer/sourcecode/__init__.py
@@ -13,7 +13,8 @@
 
 
 # These data class aim to reduce the spreading of the logic
-# Instead of using the a dict as a structure and parse it difffently depending on the type
+# Instead of using a dict as a structure and parsing it differently
+# depending on the type
 @dataclass
 class SourceCodeRule:
     """
@@ -22,6 +23,7 @@ class SourceCodeRule:
     id: str
     file: str
     description: str
+    ecosystem: Optional[ECOSYSTEM]  # None means "any ecosystem"
 
 
 @dataclass
@@ -38,13 +40,11 @@ class SempgrepRule(SourceCodeRule):
     Semgrep rule are language specific
     Content of rule in yaml format is accessible through rule_content
     """
-    ecosystem: ECOSYSTEM
     rule_content: dict
 
 
 def get_sourcecode_rules(
-    ecosystem: ECOSYSTEM, kind: Optional[type] = None
-) -> Iterable[SourceCodeRule]:
+        ecosystem: ECOSYSTEM, kind: Optional[type] = None) -> Iterable[SourceCodeRule]:
     """
     This function returns the source code rules for a given ecosystem and kind.
     Args:
@@ -54,7 +54,8 @@ def get_sourcecode_rules(
     for rule in SOURCECODE_RULES:
         if kind and not isinstance(rule, kind):
             continue
-        if not (getattr(rule, "ecosystem", ecosystem) == ecosystem):
+        # Include rules that match the specific ecosystem OR rules that apply to any ecosystem (None)
+        if rule.ecosystem is not None and rule.ecosystem != ecosystem:
             continue
         yield rule
 
@@ -78,13 +79,15 @@ def get_sourcecode_rules(
                     case "javascript" | "typescript" | "json":
                         ecosystems.add(ECOSYSTEM.NPM)
                         ecosystems.add(ECOSYSTEM.GITHUB_ACTION)
+                        ecosystems.add(ECOSYSTEM.EXTENSION)
                     case "go":
                         ecosystems.add(ECOSYSTEM.GO)
                     case _:
                         continue
 
                 for ecosystem in ecosystems:
-                    # avoids duplicates when multiple languages are supported by a rule
+                    # avoids duplicates when multiple languages are supported
+                    # by a rule
                     if not next(
                         filter(
                             lambda r: r.id == rule["id"],
@@ -99,8 +102,7 @@ def get_sourcecode_rules(
                                 description=rule.get("metadata", {}).get("description", ""),
                                 file=file_name,
                                 rule_content=rule,
-                            )
-                        )
+                            ))
 
 yara_rule_file_names = list(
     filter(lambda x: x.endswith("yar"), os.listdir(current_dir))
@@ -111,9 +113,23 @@ def get_sourcecode_rules(
     rule_id = pathlib.Path(file_name).stem
     description_regex = fr'\s*rule\s+{rule_id}[^}}]+meta:[^}}]+description\s*=\s*\"(.+?)\"'
 
+    # Determine ecosystem based on filename prefix
+    rule_ecosystem: Optional[ECOSYSTEM]
+    if file_name.startswith("extension_"):
+        rule_ecosystem = ECOSYSTEM.EXTENSION
+    else:
+        # If no specific ecosystem prefix, apply to any ecosystem
+        rule_ecosystem = None
+
     with open(os.path.join(current_dir, file_name), "r") as fd:
         match = re.search(description_regex, fd.read())
         rule_description = ""
         if match:
             rule_description = match.group(1)
-        SOURCECODE_RULES.append(YaraRule(id=rule_id, file=file_name, description=rule_description))
+
+        SOURCECODE_RULES.append(YaraRule(
+            id=rule_id,
+            file=file_name,
+            description=rule_description,
+            ecosystem=rule_ecosystem
+        ))
diff --git a/guarddog/analyzer/sourcecode/extension_powershell_policy_bypass.yar b/guarddog/analyzer/sourcecode/extension_powershell_policy_bypass.yar
@@ -0,0 +1,13 @@
+rule DETECT_FILE_powershell_policy_bypass
+{
+    meta:
+        author = "T HAMDOUNI, Datadog"
+        credits = ""
+        description = "Detects PowerShell invocations that bypass or relax the execution policy, which malware often uses to run untrusted scripts"
+
+    strings:
+        $cli = /(powershell|pwsh)(\.exe)?\s+.{0,100}-(ExecutionPolicy|exec|ep)\s+(Bypass|Unrestricted)/ nocase  // policy overridden on the command line
+        $set = /Set-ExecutionPolicy\s+.{0,100}(Bypass|Unrestricted)/ nocase  // policy changed through the cmdlet
+    condition:
+        $cli or $set  // either form of policy override is enough to flag the file
+}
diff --git a/guarddog/analyzer/sourcecode/extension_suspicious_passwd_access_linux.yar b/guarddog/analyzer/sourcecode/extension_suspicious_passwd_access_linux.yar
@@ -0,0 +1,13 @@
+rule DETECT_FILE_suspicious_passwd_access_linux
+{
+    meta:
+        author = "T HAMDOUNI, Datadog"
+        credits = ""
+        description = "Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting"
+
+    strings:
+        $cli = /(cat|less|more|head|tail)\s+.{0,100}\/etc\/passwd/ nocase  // a file-reading shell command followed, within 100 characters, by /etc/passwd
+        $read = /(readFile|readFileSync)\(\s*['"]\/etc\/passwd/ nocase  // readFile/readFileSync call whose first argument is a quoted path starting with /etc/passwd
+    condition:
+        $cli or $read  // a match on either pattern flags the file
+}
diff --git a/guarddog/ecosystems.py b/guarddog/ecosystems.py
@@ -6,6 +6,7 @@ class ECOSYSTEM(Enum):
     NPM = "npm"
     GO = "go"
     GITHUB_ACTION = "github-action"
+    EXTENSION = "extension"
 
 
 def get_friendly_name(ecosystem: ECOSYSTEM) -> str:
@@ -18,5 +19,7 @@ def get_friendly_name(ecosystem: ECOSYSTEM) -> str:
             return "go"
         case ECOSYSTEM.GITHUB_ACTION:
             return "GitHub Action"
+        case ECOSYSTEM.EXTENSION:
+            return "Extension"
         case _:
             return ecosystem.value
diff --git a/guarddog/reporters/human_readable.py b/guarddog/reporters/human_readable.py
@@ -47,10 +47,15 @@ def _format_code_line_for_output(code) -> str:
         if num_issues == 0:
             lines.append(
                 "Found "
-                + colored("0 potentially malicious indicators", "green", attrs=["bold"])
+                + colored(
+                    "0 potentially malicious indicators",
+                    "green",
+                    attrs=["bold"])
                 + " scanning "
-                + colored(identifier, None, attrs=["bold"])
-            )
+                + colored(
+                    identifier,
+                    None,
+                    attrs=["bold"]))
             lines.append("")
         else:
             lines.append(
@@ -69,9 +74,7 @@ def _format_code_line_for_output(code) -> str:
             for finding in findings:
                 description = findings[finding]
                 if isinstance(description, str):  # package metadata
-                    lines.append(
-                        colored(finding, None, attrs=["bold"]) + ": " + description
-                    )
+                    lines.append(colored(finding, None, attrs=["bold"]) + ": " + description)
                     lines.append("")
                 elif isinstance(description, list):  # semgrep rule result:
                     source_code_findings = description

diff --git a/guarddog/scanners/__init__.py b/guarddog/scanners/__init__.py
@@ -8,6 +8,7 @@
 from .go_package_scanner import GoModuleScanner
 from .go_project_scanner import GoDependenciesScanner
 from .github_action_scanner import GithubActionScanner
+from .extension_scanner import ExtensionScanner
 from .scanner import PackageScanner, ProjectScanner
 from ..ecosystems import ECOSYSTEM
 
@@ -33,6 +34,8 @@ def get_package_scanner(ecosystem: ECOSYSTEM) -> Optional[PackageScanner]:
             return GoModuleScanner()
         case ECOSYSTEM.GITHUB_ACTION:
             return GithubActionScanner()
+        case ECOSYSTEM.EXTENSION:
+            return ExtensionScanner()
     return None
 
 
@@ -57,4 +60,6 @@ def get_project_scanner(ecosystem: ECOSYSTEM) -> Optional[ProjectScanner]:
             return GoDependenciesScanner()
         case ECOSYSTEM.GITHUB_ACTION:
             return GitHubActionDependencyScanner()
+        case ECOSYSTEM.EXTENSION:
+            return None  # we're not including dependency scanning for this PR
     return None