Skip to content

Commit 4b89d44

Browse files
committed
add stevenblack list support
1 parent 119fbef commit 4b89d44

File tree

12 files changed

+409
-29
lines changed

12 files changed

+409
-29
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,4 @@ docs
1313
*.gz
1414
*.gz
1515
rdomains_0.3.0.tar.gz
16+
*.csv

.lintr

Lines changed: 0 additions & 6 deletions
This file was deleted.

DESCRIPTION

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,7 @@ Imports:
2828
Suggests:
2929
testthat,
3030
rmarkdown,
31-
knitr (>= 1.11),
32-
lintr
31+
knitr (>= 1.11)
3332
VignetteBuilder: knitr
3433
License: MIT + file LICENSE
3534
Encoding: UTF-8

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,12 @@ export(dmoz_cat)
66
export(get_alexa_data)
77
export(get_dmoz_data)
88
export(get_shalla_data)
9+
export(get_stevenblack_data)
910
export(glm_shalla)
1011
export(not_news)
1112
export(openai_cat)
1213
export(shalla_cat)
14+
export(stevenblack_cat)
1315
export(uni_cat)
1416
export(virustotal_cat)
1517
importFrom(Matrix,Matrix)

R/get_shalla_data.R

Lines changed: 57 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,63 @@
1818

1919
get_shalla_data <- function(outdir = "./", overwrite = FALSE) {
  # Downloads the gzipped Shallalist dump and extracts it to
  # <outdir>/shalla_domain_category.csv. Returns that path invisibly.
  # Stops if the directory cannot be created, if the file already exists
  # and overwrite is FALSE, or if the download/extraction fails.

  outdir <- normalizePath(outdir, mustWork = FALSE)

  # dir.create() signals failure by returning FALSE (plus a warning), not by
  # raising an error, so the result is verified with dir.exists() instead of
  # relying on an error handler that would never run.
  if (!dir.exists(outdir)) {
    dir.create(outdir, recursive = TRUE, showWarnings = FALSE)
    if (!dir.exists(outdir)) {
      stop("Cannot create output directory: ", outdir, "\n",
           "Please check directory permissions.", call. = FALSE)
    }
  }

  output_file <- file.path(outdir, "shalla_domain_category.csv")

  if (!overwrite && file.exists(output_file)) {
    stop("File already exists: ", output_file, "\n",
         "Set overwrite=TRUE to replace it or choose a different location.",
         call. = FALSE)
  }

  # Register cleanup once so the temp file is removed on every exit path.
  tmp <- tempfile(fileext = ".gz")
  on.exit(unlink(tmp, force = TRUE), add = TRUE)

  tryCatch({
    cat("Downloading Shallalist data...\n")
    curl::curl_download(
      "https://raw.githubusercontent.com/themains/rdomains/master/data-raw/shallalist/accomplist/shallalist.gz",
      tmp
    )
    R.utils::gunzip(tmp, destname = output_file, overwrite = overwrite)
  }, error = function(e) {
    msg <- conditionMessage(e)
    if (grepl("permission", msg, ignore.case = TRUE)) {
      stop("Permission denied. Please ensure you have write access to: ",
           outdir, "\n",
           "On Windows, try running R as administrator or choose a different output directory.",
           call. = FALSE)
    }
    stop("Error downloading or extracting Shallalist data: ", msg,
         call. = FALSE)
  })

  # Checked outside the tryCatch so this message is not re-wrapped by the
  # download/extraction error handler above.
  if (!file.exists(output_file)) {
    stop("Failed to create output file. Please check write permissions for: ",
         outdir, call. = FALSE)
  }

  cat("Shallalist data saved to:", output_file, "\n")
  invisible(output_file)
}

R/get_stevenblack_data.R

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#' Get Steven Black's Host List Data
#'
#' Downloads the latest version of Steven Black's unified hosts file.
#' The hosts file contains domains known for serving ads, malware, and tracking.
#'
#' @param outdir Optional; folder to which you want to save the file; Default is current directory
#' @param variant Optional; which variant to download. Options: "base", "porn", "social", "gambling", "all"
#' @param overwrite Optional; default is FALSE. If TRUE, the file is overwritten.
#'
#' @return Invisibly, the path of the downloaded hosts file
#'   (\code{stevenblack_hosts_<variant>.txt} inside \code{outdir}).
#'
#' @export
#'
#' @references \url{https://github.com/StevenBlack/hosts}
#'
#' @examples \dontrun{
#' get_stevenblack_data()
#' get_stevenblack_data(variant = "all")
#' }

get_stevenblack_data <- function(outdir = "./", variant = "base", overwrite = FALSE) {

  # Variant name -> raw GitHub URL of the corresponding hosts file
  variants <- c(
    base = "https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts",
    porn = "https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/porn/hosts",
    social = "https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/social/hosts",
    gambling = "https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/gambling/hosts",
    all = "https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/porn-social-gambling/hosts"
  )

  # Require a single, non-missing string first: a vector-valued `variant`
  # would otherwise make the `if` condition non-scalar and fail with an
  # unrelated "condition has length > 1" error.
  valid_variant <- is.character(variant) && length(variant) == 1L &&
    !is.na(variant) && variant %in% names(variants)
  if (!valid_variant) {
    stop("Invalid variant. Choose from: ",
         paste(names(variants), collapse = ", "), call. = FALSE)
  }

  output_file <- file.path(outdir, paste0("stevenblack_hosts_", variant, ".txt"))

  if (!overwrite && file.exists(output_file)) {
    stop("File already exists: ", output_file,
         "\nSet overwrite=TRUE to replace it.", call. = FALSE)
  }

  # dir.create() returns FALSE instead of erroring, so confirm the result.
  if (!dir.exists(outdir)) {
    dir.create(outdir, recursive = TRUE, showWarnings = FALSE)
    if (!dir.exists(outdir)) {
      stop("Cannot create output directory: ", outdir, call. = FALSE)
    }
  }

  tryCatch({
    cat(paste0("Downloading Steven Black's hosts file (", variant,
               " variant)...\n"))
    curl::curl_download(variants[[variant]], output_file)
  }, error = function(e) {
    stop("Failed to download hosts file: ", conditionMessage(e), call. = FALSE)
  })
  cat("Steven Black's hosts data saved to:", output_file, "\n")

  # The statistics are informational only. A failure to re-read the file is
  # reported as a warning rather than mislabelled as a download failure.
  hosts_lines <- tryCatch(
    readLines(output_file, warn = FALSE),
    error = function(e) {
      warning("Could not read downloaded hosts file for statistics: ",
              conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (!is.null(hosts_lines)) {
    blocked_count <- sum(
      grepl("^(0\\.0\\.0\\.0|127\\.0\\.0\\.1)\\s+", hosts_lines)
    )
    cat("Total blocked domains:", blocked_count, "\n")
  }

  invisible(output_file)
}

R/stevenblack_cat.R

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
#' Get Category from Steven Black's Host List
#'
#' Classifies domains based on Steven Black's unified host list which blocks
#' ads, malware, and tracking domains. The function checks if a domain appears
#' in the blocklist and categorizes it accordingly.
#'
#' Steven Black's host list is a consolidated list from multiple sources including
#' adaway.org, mvps.org, malwaredomainlist.com, and someonewhocares.org.
#'
#' Input domains are lower-cased and stripped of a leading \code{http(s)://},
#' a leading \code{www.} and any path before being looked up. A domain found in
#' the blocklist is labelled \code{"ads"}, \code{"malware"} or \code{"tracking"}
#' when its name matches the corresponding keyword pattern (checked in that
#' order), and \code{"blocked"} otherwise. Domains not in the list are
#' \code{"safe"}.
#'
#' @param domain domain names as character vector
#' @param use_file path to a local Steven Black hosts file. If NULL, downloads from GitHub
#'
#' @return data.frame with original domain name and category
#'
#' @export
#' @references \url{https://github.com/StevenBlack/hosts}
#'
#' @examples \dontrun{
#' stevenblack_cat("doubleclick.net")
#' stevenblack_cat(c("google.com", "googleadservices.com", "malware-example.com"))
#' }

stevenblack_cat <- function(domain = NULL, use_file = NULL) {

  if (is.null(domain)) {
    stop("Please provide at least one domain.")
  }

  # Normalise input: host names are case-insensitive, so fold case before
  # stripping the scheme, the "www." prefix and any path.
  clean_domains <- tolower(domain)
  clean_domains <- gsub("^https?://", "", clean_domains)
  clean_domains <- gsub("^www\\.", "", clean_domains)
  clean_domains <- gsub("/.*$", "", clean_domains)

  # Get or load hosts data
  if (is.null(use_file)) {
    hosts_file <- tempfile()
    # Registered up front so the temp file is removed on every exit path,
    # including a failed download or read.
    on.exit(unlink(hosts_file), add = TRUE)
    tryCatch({
      cat("Downloading Steven Black's hosts file...\n")
      curl::curl_download(
        "https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts",
        hosts_file
      )
    }, error = function(e) {
      stop("Failed to download hosts file: ", conditionMessage(e),
           call. = FALSE)
    })
  } else {
    if (!file.exists(use_file)) {
      stop("File not found: ", use_file)
    }
    hosts_file <- use_file
  }

  hosts_lines <- tryCatch(
    readLines(hosts_file, warn = FALSE),
    error = function(e) {
      stop("Failed to read hosts file: ", conditionMessage(e), call. = FALSE)
    }
  )

  # Keep only blocking entries (lines starting with 0.0.0.0 or 127.0.0.1)
  ip_prefix <- "^(0\\.0\\.0\\.0|127\\.0\\.0\\.1)\\s+"
  blocked_lines <- hosts_lines[grepl(ip_prefix, hosts_lines)]

  # Drop the IP column and any trailing "# comment", then split on whitespace
  # because a hosts line may list several host names after the address.
  host_fields <- sub(ip_prefix, "", blocked_lines)
  host_fields <- sub("\\s*#.*$", "", host_fields)
  blocked_domains <- unlist(strsplit(trimws(host_fields), "\\s+"),
                            use.names = FALSE)
  blocked_domains <- tolower(blocked_domains)
  blocked_domains <- blocked_domains[
    blocked_domains != "" & blocked_domains != "localhost"
  ]

  # Vectorised classification. Labels are assigned from lowest to highest
  # priority so that ads > malware > tracking > blocked, as in an if/else chain.
  is_blocked <- clean_domains %in% blocked_domains
  is_ads <- grepl("(ad|ads|doubleclick|googleadservices|googlesyndication)",
                  clean_domains, ignore.case = TRUE)
  is_malware <- grepl("(malware|virus|trojan|phishing)",
                      clean_domains, ignore.case = TRUE)
  is_tracking <- grepl("(track|analytics|metric|stats)",
                       clean_domains, ignore.case = TRUE)

  # rep() keeps the column the same length as `domain`, so zero-length input
  # yields a zero-row data.frame instead of a row-count mismatch error.
  category <- rep("safe", length(domain))
  category[is_blocked] <- "blocked"
  category[is_blocked & is_tracking] <- "tracking"
  category[is_blocked & is_malware] <- "malware"
  category[is_blocked & is_ads] <- "ads"

  data.frame(
    domain = domain,
    stevenblack = category,
    stringsAsFactors = FALSE
  )
}

man/get_stevenblack_data.Rd

Lines changed: 28 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/stevenblack_cat.Rd

Lines changed: 34 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-pkg-style.R

Lines changed: 0 additions & 7 deletions
This file was deleted.

0 commit comments

Comments
 (0)