Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 15 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,17 @@
<img src="./docs/images/logo.png" alt="GuardDog" width="300" />
</p>

GuardDog is a CLI tool that allows to identify malicious PyPI and npm packages or Go modules. It runs a set of heuristics on the package source code (through Semgrep rules) and on the package metadata.
GuardDog is a CLI tool that allows you to identify malicious PyPI and npm packages, Go modules, GitHub Actions, or VSCode extensions. It runs a set of heuristics on the package source code (through Semgrep rules) and on the package metadata.

GuardDog can be used to scan local or remote PyPI and npm packages or Go modules using any of the available [heuristics](#heuristics).
GuardDog can be used to scan local or remote PyPI and npm packages, Go modules, GitHub actions, or VSCode extensions using any of the available [heuristics](#heuristics).

It downloads and scans code from:

* NPM: Packages hosted in [npmjs.org](https://www.npmjs.com/)
* PyPI: Source files (tar.gz) packages hosted in [PyPI.org](https://pypi.org/)
* Go: GoLang source files of repositories hosted in [GitHub.com](https://github.com)
* GitHub Actions: JavaScript source files of repositories hosted in [GitHub.com](https://github.com)
* VSCode Extensions: Extensions (.vsix) packages hosted in [marketplace.visualstudio.com](https://marketplace.visualstudio.com/)

![GuardDog demo usage](docs/images/demo.png)

Expand Down Expand Up @@ -78,6 +79,15 @@ guarddog github_action scan DataDog/synthetics-ci-github-action

guarddog github_action verify /tmp/repo/.github/workflows/main.yml

# Scan VSCode extensions from the marketplace
guarddog extension scan ms-python.python

# Scan a specific version of a VSCode extension
guarddog extension scan ms-python.python --version 2023.20.0

# Scan a local VSCode extension directory or VSIX archive
guarddog extension scan /tmp/my-extension/

# Run in debug mode
guarddog --log-level debug npm scan express
```
Expand Down Expand Up @@ -163,9 +173,9 @@ Source code heuristics:
| **Heuristic** | **Description** |
|:-------------:|:---------------:|
| shady-links | Identify when a package contains a URL to a domain with a suspicious extension |
| go-exec-base64 | Identify Base64-decoded content being passed to execution functions |
| go-exec-download | Identify Go code that downloads potentially executable files |
| go-exfiltrate-sensitive-data | Identify Go code that reads and exfiltrates sensitive|
| go-exec-base64 | Identify Base64-decoded content being passed to execution functions in Go |
| go-exfiltrate-sensitive-data | This rule identifies when a package reads and exfiltrates sensitive data from the local system. |
| go-exec-download | Identify Go code that downloads and executes a remote binary after setting executable permissions. |

Metadata heuristics:

Expand All @@ -176,9 +186,6 @@ Metadata heuristics:

### GitHub Action

For GitHub Actions that are implemented in JavaScript,
GuardDog will run the same source code heuristics as for npm packages.

Source code heuristics:

| **Heuristic** | **Description** |
Expand Down
53 changes: 39 additions & 14 deletions guarddog/analyzer/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,13 @@ def __init__(self, ecosystem=ECOSYSTEM.PYPI) -> None:
".semgrep_logs",
]

def analyze(self, path, info=None, rules=None, name: Optional[str] = None, version: Optional[str] = None) -> dict:
def analyze(
self,
path,
info=None,
rules=None,
name: Optional[str] = None,
version: Optional[str] = None) -> dict:
"""
Analyzes a package in the given path

Expand Down Expand Up @@ -95,10 +101,19 @@ def analyze(self, path, info=None, rules=None, name: Optional[str] = None, versi
results = metadata_results["results"] | sourcecode_results["results"]
errors = metadata_results["errors"] | sourcecode_results["errors"]

return {"issues": issues, "errors": errors, "results": results, "path": path}

def analyze_metadata(self, path: str, info, rules=None, name: Optional[str] = None,
version: Optional[str] = None) -> dict:
return {
"issues": issues,
"errors": errors,
"results": results,
"path": path}

def analyze_metadata(
self,
path: str,
info,
rules=None,
name: Optional[str] = None,
version: Optional[str] = None) -> dict:
"""
Analyzes the metadata of a given package

Expand Down Expand Up @@ -157,7 +172,11 @@ def analyze_sourcecode(self, path, rules=None) -> dict:
results = semgrepscan_results["results"] | yarascan_results["results"]
errors = semgrepscan_results["errors"] | yarascan_results["errors"]

return {"issues": issues, "errors": errors, "results": results, "path": path}
return {
"issues": issues,
"errors": errors,
"results": results,
"path": path}

def analyze_yara(self, path: str, rules: Optional[set] = None) -> dict:
"""
Expand Down Expand Up @@ -206,6 +225,7 @@ def analyze_yara(self, path: str, rules: Optional[set] = None) -> dict:

matches = scan_rules.match(scan_file_target_abspath)
for m in matches:

for s in m.strings:
for i in s.instances:
finding = {
Expand All @@ -229,7 +249,10 @@ def analyze_yara(self, path: str, rules: Optional[set] = None) -> dict:
except Exception as e:
errors["rules-all"] = f"failed to run rule: {str(e)}"

return {"results": results | rule_results, "errors": errors, "issues": issues}
return {
"results": results | rule_results,
"errors": errors,
"issues": issues}

def analyze_semgrep(self, path, rules=None) -> dict:
"""
Expand All @@ -255,9 +278,7 @@ def analyze_semgrep(self, path, rules=None) -> dict:
issues = 0

rules_path = list(map(
lambda rule_name: os.path.join(SOURCECODE_RULES_PATH, f"{rule_name}.yml"),
all_rules
))
lambda rule_name: os.path.join(SOURCECODE_RULES_PATH, f"{rule_name}.yml"), all_rules))

if len(rules_path) == 0:
log.debug("No semgrep code rules to run")
Expand Down Expand Up @@ -349,9 +370,9 @@ def _format_semgrep_response(self, response, rule=None, targetpath=None):
file_path = os.path.abspath(result["path"])
code = self.trim_code_snippet(
self.get_snippet(
file_path=file_path, start_line=start_line, end_line=end_line
)
)
file_path=file_path,
start_line=start_line,
end_line=end_line))
if targetpath:
file_path = os.path.relpath(file_path, targetpath)

Expand All @@ -370,7 +391,11 @@ def _format_semgrep_response(self, response, rule=None, targetpath=None):

return results

def get_snippet(self, file_path: str, start_line: int, end_line: int) -> str:
def get_snippet(
self,
file_path: str,
start_line: int,
end_line: int) -> str:
"""
Returns the code snippet between start_line and stop_line in a file

Expand Down
2 changes: 2 additions & 0 deletions guarddog/analyzer/metadata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,5 @@ def get_metadata_detectors(ecosystem: ECOSYSTEM) -> dict[str, Detector]:
return GO_METADATA_RULES
case ECOSYSTEM.GITHUB_ACTION:
return GITHUB_ACTION_METADATA_RULES
case ECOSYSTEM.EXTENSION:
return {} # No metadata detectors for extensions currently
34 changes: 25 additions & 9 deletions guarddog/analyzer/sourcecode/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@


# These data classes aim to reduce the spreading of the logic
# Instead of using the a dict as a structure and parse it difffently depending on the type
# Instead of using a dict as a structure and parsing it differently
# depending on the type
@dataclass
class SourceCodeRule:
"""
Expand All @@ -22,6 +23,7 @@ class SourceCodeRule:
id: str
file: str
description: str
ecosystem: Optional[ECOSYSTEM] # None means "any ecosystem"


@dataclass
Expand All @@ -38,13 +40,11 @@ class SempgrepRule(SourceCodeRule):
Semgrep rule are language specific
Content of rule in yaml format is accessible through rule_content
"""
ecosystem: ECOSYSTEM
rule_content: dict


def get_sourcecode_rules(
ecosystem: ECOSYSTEM, kind: Optional[type] = None
) -> Iterable[SourceCodeRule]:
ecosystem: ECOSYSTEM, kind: Optional[type] = None) -> Iterable[SourceCodeRule]:
"""
This function returns the source code rules for a given ecosystem and kind.
Args:
Expand All @@ -54,7 +54,8 @@ def get_sourcecode_rules(
for rule in SOURCECODE_RULES:
if kind and not isinstance(rule, kind):
continue
if not (getattr(rule, "ecosystem", ecosystem) == ecosystem):
# Include rules that match the specific ecosystem OR rules that apply to any ecosystem (None)
if rule.ecosystem is not None and rule.ecosystem != ecosystem:
continue
yield rule

Expand All @@ -78,13 +79,15 @@ def get_sourcecode_rules(
case "javascript" | "typescript" | "json":
ecosystems.add(ECOSYSTEM.NPM)
ecosystems.add(ECOSYSTEM.GITHUB_ACTION)
ecosystems.add(ECOSYSTEM.EXTENSION)
case "go":
ecosystems.add(ECOSYSTEM.GO)
case _:
continue

for ecosystem in ecosystems:
# avoids duplicates when multiple languages are supported by a rule
# avoids duplicates when multiple languages are supported
# by a rule
if not next(
filter(
lambda r: r.id == rule["id"],
Expand All @@ -99,8 +102,7 @@ def get_sourcecode_rules(
description=rule.get("metadata", {}).get("description", ""),
file=file_name,
rule_content=rule,
)
)
))

yara_rule_file_names = list(
filter(lambda x: x.endswith("yar"), os.listdir(current_dir))
Expand All @@ -111,9 +113,23 @@ def get_sourcecode_rules(
rule_id = pathlib.Path(file_name).stem
description_regex = fr'\s*rule\s+{rule_id}[^}}]+meta:[^}}]+description\s*=\s*\"(.+?)\"'

# Determine ecosystem based on filename prefix
rule_ecosystem: Optional[ECOSYSTEM]
if file_name.startswith("extension_"):
rule_ecosystem = ECOSYSTEM.EXTENSION
else:
# If no specific ecosystem prefix, apply to any ecosystem
rule_ecosystem = None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rule_ecosystem = None
if file_name.startswith("extension_"):
    rule_ecosystem = ECOSYSTEM.EXTENSION

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You may also want to move the "extension_" piece to a constant.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could even do rule_ecosystem = ECOSYSTEM.EXTENSION if file_name.startswith("extension_") else None

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll wait for Sebastian's review and apply this change

Copy link
Contributor

@sobregosodd sobregosodd Jul 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mmh... actually, we don't use the name's prefix to do this, we use the metadata of the rule.
similar to description_regex up there.
in some ideal future world, we should remove the prefixes of the rules (there are problems if we try to do this now)


with open(os.path.join(current_dir, file_name), "r") as fd:
match = re.search(description_regex, fd.read())
rule_description = ""
if match:
rule_description = match.group(1)
SOURCECODE_RULES.append(YaraRule(id=rule_id, file=file_name, description=rule_description))

SOURCECODE_RULES.append(YaraRule(
id=rule_id,
file=file_name,
description=rule_description,
ecosystem=rule_ecosystem
))
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
rule DETECT_FILE_powershell_policy_bypass
{
meta:
author = "T HAMDOUNI, Datadog"
credits = ""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

empty?

description = "Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting"

strings:
$cli = /(cat|less|more|head|tail)\s+.{0,100}\/etc\/passwd/ nocase
$read = /(readFile|readFileSync)\(\s*['"]\/etc\/passwd/ nocase
condition:
$cli or $read
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
rule DETECT_FILE_suspicious_passwd_access_linux
{
meta:
author = "T HAMDOUNI, Datadog"
credits = ""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

empty?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes no sources for this one

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll remove it then

description = "Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting"

strings:
$cli = /(cat|less|more|head|tail)\s+.{0,100}\/etc\/passwd/ nocase
$read = /(readFile|readFileSync)\(\s*['"]\/etc\/passwd/ nocase
condition:
$cli or $read
}
3 changes: 3 additions & 0 deletions guarddog/ecosystems.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ class ECOSYSTEM(Enum):
NPM = "npm"
GO = "go"
GITHUB_ACTION = "github-action"
EXTENSION = "extension"


def get_friendly_name(ecosystem: ECOSYSTEM) -> str:
Expand All @@ -18,5 +19,7 @@ def get_friendly_name(ecosystem: ECOSYSTEM) -> str:
return "go"
case ECOSYSTEM.GITHUB_ACTION:
return "GitHub Action"
case ECOSYSTEM.EXTENSION:
return "Extension"
case _:
return ecosystem.value
15 changes: 9 additions & 6 deletions guarddog/reporters/human_readable.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,15 @@ def _format_code_line_for_output(code) -> str:
if num_issues == 0:
lines.append(
"Found "
+ colored("0 potentially malicious indicators", "green", attrs=["bold"])
+ colored(
"0 potentially malicious indicators",
"green",
attrs=["bold"])
+ " scanning "
+ colored(identifier, None, attrs=["bold"])
)
+ colored(
identifier,
None,
attrs=["bold"]))
lines.append("")
else:
lines.append(
Expand All @@ -69,9 +74,7 @@ def _format_code_line_for_output(code) -> str:
for finding in findings:
description = findings[finding]
if isinstance(description, str): # package metadata
lines.append(
colored(finding, None, attrs=["bold"]) + ": " + description
)
lines.append(colored(finding, None, attrs=["bold"]) + ": " + description)
lines.append("")
elif isinstance(description, list): # semgrep rule result:
source_code_findings = description
Expand Down
5 changes: 5 additions & 0 deletions guarddog/scanners/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .go_package_scanner import GoModuleScanner
from .go_project_scanner import GoDependenciesScanner
from .github_action_scanner import GithubActionScanner
from .extension_scanner import ExtensionScanner
from .scanner import PackageScanner, ProjectScanner
from ..ecosystems import ECOSYSTEM

Expand All @@ -33,6 +34,8 @@ def get_package_scanner(ecosystem: ECOSYSTEM) -> Optional[PackageScanner]:
return GoModuleScanner()
case ECOSYSTEM.GITHUB_ACTION:
return GithubActionScanner()
case ECOSYSTEM.EXTENSION:
return ExtensionScanner()
return None


Expand All @@ -57,4 +60,6 @@ def get_project_scanner(ecosystem: ECOSYSTEM) -> Optional[ProjectScanner]:
return GoDependenciesScanner()
case ECOSYSTEM.GITHUB_ACTION:
return GitHubActionDependencyScanner()
case ECOSYSTEM.EXTENSION:
return None # we're not including dependency scanning for this PR
Comment on lines +63 to +64
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

was this suggested by the lint? is adding no value

return None
Loading