Skip to content
This repository was archived by the owner on Oct 12, 2023. It is now read-only.

Commit 02c5eac

Browse files
authored
v0.3.0 Release (#20)
* Added set chunk size * Adding resource files on pool creation * renaming generate file functions * Moved worker/merger scripts to doAzureParallel and created common job env * Added stdout and stderr logs in uploads * added to docs / README * Switched params for cluster and added examples * setCreds, resizeCluster, job management * cred generator update * Added samples, moved autoscale, and low-pri/output files * Added documentation on methods for ??R feature * Added export for makeCluster * Namespace missing export * clusterSetting param name * cluster id param name * NumOfNodes param for wait nodes completion fix * Added proper naming for registerDoAzureParallel * readme update' * typo readme * low pri in readme * monte carlo simulation * Added new sample for sas resource files * caret + annotation on montecarlo sim * samples readme.md * samples readme * Fixed the resource files to use proper storage account for example * Update README.md * Update 11-autoscale.md * Fixed autoscale formula for task queue to take maxTaskPerNode * Added named args to createSasToken * Update resource-files-example.R * Update 21-distributing-data.md * Renamed samples files to underscore format * Update 21-distributing-data.md * Update README.md * Update README.md * Update README.md * Edited changelog file * Update plyr_example.R * Update README.md
1 parent 96ad39a commit 02c5eac

31 files changed

+1480
-191
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
0.3.0
2+
- [BREAKING CHANGE] Two configuration files for easier debugging - credentials and cluster settings
3+
- [BREAKING CHANGE] Added low priority virtual machine support for additional cost saving
4+
- Added external method for setting chunk size (setChunkSize)
5+
- Added getJobList function to check the status of user's jobs
6+
- Added resizeCluster function to allow users to change their autoscale formulas on the fly

DESCRIPTION

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,21 @@
11
Package: doAzureParallel
22
Type: Package
33
Title: doAzureParallel
4-
Version: 0.2.2
4+
Version: 0.3.0
55
Author: Brian Hoang
6-
Maintainer: Who to complain to <yourfault@somewhere.net>
7-
Description: More about what it does (maybe more than one line)
8-
License: What license is it under?
6+
Maintainer: Brian Hoang <brhoan@microsoft.com>
7+
Description: The project is for data experts who use R at scale. The project
8+
comes together as an R package that will allow users to run their R code in
9+
parallel across a cluster hosted on Azure. The cluster will be created and
10+
maintained by Azure Batch and, for the initial version, will be a public/
11+
communal pool. The orchestration for each job that needs to be parallelized in
12+
the cluster will be done by a middle layer that schedules each request.
13+
License: Microsoft Corporation
914
LazyData: TRUE
1015
Depends:
1116
foreach (>= 1.4.3),
1217
iterators (>= 1.0.8),
13-
rAzureBatch (>= 0.1.0)
18+
rAzureBatch (>= 0.2.4)
1419
Suggests:
15-
testthat
20+
testthat, caret, plyr
1621
RoxygenNote: 5.0.1

NAMESPACE

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,13 @@
11
# Generated by roxygen2: do not edit by hand
2-
exportPattern("^[^\\.]")
2+
3+
export(generateClusterConfig)
4+
export(generateCredentialsConfig)
5+
export(getJobList)
6+
export(getJobResult)
7+
export(makeCluster)
8+
export(registerDoAzureParallel)
9+
export(setChunkSize)
10+
export(setCredentials)
11+
export(setVerbose)
12+
export(stopCluster)
13+
export(waitForNodesToComplete)

R/autoscale.R

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Azure Batch autoscale formula templates.
#
# Each template is a sprintf() format string; getAutoscaleFormula() fills the
# %s placeholders with the node limits supplied by the caller.

# WORKDAY: the first %s is the dedicated-node target while
# $isWorkingWeekdayHour is true (hour 8-17 on weekday 1-5), the second %s is
# the target otherwise.
AUTOSCALE_WORKDAY_FORMULA <- paste0(
  "$curTime = time();",
  "$workHours = $curTime.hour >= 8 && $curTime.hour < 18;",
  "$isWeekday = $curTime.weekday >= 1 && $curTime.weekday <= 5;",
  "$isWorkingWeekdayHour = $workHours && $isWeekday;",
  "$TargetDedicatedNodes = $isWorkingWeekdayHour ? %s:%s;")

# WEEKEND: the first %s is the dedicated-node target while $isWeekend is true,
# the second %s is the target otherwise.
# $curTime must be assigned inside this formula as well: every formula is
# submitted on its own, so it cannot rely on the WORKDAY template defining it.
# NOTE(review): assumes Batch reports Saturday/Sunday as weekday 6/7 - confirm.
AUTOSCALE_WEEKEND_FORMULA <- paste0(
  "$curTime = time();",
  "$isWeekend = $curTime.weekday >= 6 && $curTime.weekday <= 7;",
  "$TargetDedicatedNodes = $isWeekend ? %s:%s;")

# MAX_CPU: grows the pool by 10% when the minimum CPU sample of the last 10
# minutes is above 0.7, shrinks it by 10% when the 60 minute average is below
# 0.2, and caps the result at the single %s value.
AUTOSCALE_MAX_CPU_FORMULA <- "$totalNodes =
(min($CPUPercent.GetSample(TimeInterval_Minute * 10)) > 0.7) ?
($CurrentDedicated * 1.1) : $CurrentDedicated; $totalNodes =
(avg($CPUPercent.GetSample(TimeInterval_Minute * 60)) < 0.2) ?
($CurrentDedicated * 0.9) : $totalNodes;
$TargetDedicatedNodes = min(%s, $totalNodes)"

# QUEUE: sizes the pool from the number of active tasks. Placeholders, in
# order: maxTasksPerNode, dedicated min, dedicated max, low priority min,
# low priority max.
AUTOSCALE_QUEUE_FORMULA <- paste0(
  "$samples = $ActiveTasks.GetSamplePercent(TimeInterval_Minute * 15);",
  "$tasks = $samples < 70 ? max(0,$ActiveTasks.GetSample(1)) : max( $ActiveTasks.GetSample(1), avg($ActiveTasks.GetSample(TimeInterval_Minute * 15)));",
  "$maxTasksPerNode = %s;",
  "$round = $maxTasksPerNode - 1;",
  "$targetVMs = $tasks > 0? (($tasks + $round)/ $maxTasksPerNode) : max(0, $TargetDedicated/2) + 0.5;",
  "$TargetDedicatedNodes = max(%s, min($targetVMs, %s));",
  "$TargetLowPriorityNodes = max(%s, min($targetVMs, %s));",
  "$NodeDeallocationOption = taskcompletion;"
)

# Lookup table from the public algorithm name to its formula template.
AUTOSCALE_FORMULA <- list("WEEKEND" = AUTOSCALE_WEEKEND_FORMULA,
                          "WORKDAY" = AUTOSCALE_WORKDAY_FORMULA,
                          "MAX_CPU" = AUTOSCALE_MAX_CPU_FORMULA,
                          "QUEUE" = AUTOSCALE_QUEUE_FORMULA)
34+
35+
# Builds the autoscale formula string for one of the built-in algorithms by
# filling the matching template from AUTOSCALE_FORMULA with sprintf().
#
# formulaName      Name of the algorithm: "QUEUE", "MAX_CPU", "WEEKEND" or
#                  "WORKDAY".
# dedicatedMin     Minimum number of dedicated nodes.
# dedicatedMax     Maximum number of dedicated nodes.
# lowPriorityMin   Minimum number of low priority nodes (QUEUE only).
# lowPriorityMax   Maximum number of low priority nodes (QUEUE only).
# maxTasksPerNode  Tasks a node runs concurrently (QUEUE only).
#
# Returns the formula as a single character string. Stops with
# "Incorrect autoscale formula: ..." for any other formulaName, including
# NULL, NA and vectors of length other than one.
getAutoscaleFormula <- function(formulaName, dedicatedMin, dedicatedMax, lowPriorityMin, lowPriorityMax, maxTasksPerNode = 1){
  invalidFormula <- "Incorrect autoscale formula: QUEUE, MAX_CPU, WEEKEND, WORKDAY"

  # Reject non-scalar input up front so the caller sees the intended message
  # rather than an error about the length of an `if` condition.
  if(!is.character(formulaName) || length(formulaName) != 1 || is.na(formulaName)){
    stop(invalidFormula)
  }

  # Dispatch on the name itself so the result does not depend on the order of
  # the elements in AUTOSCALE_FORMULA.
  switch(
    formulaName,
    "WEEKEND" = sprintf(AUTOSCALE_FORMULA[["WEEKEND"]], dedicatedMin, dedicatedMax),
    "WORKDAY" = sprintf(AUTOSCALE_FORMULA[["WORKDAY"]], dedicatedMin, dedicatedMax),
    # NOTE(review): the MAX_CPU template uses its value as the cap in
    # min(%s, $totalNodes), yet dedicatedMin is passed here - confirm whether
    # dedicatedMax was intended.
    "MAX_CPU" = sprintf(AUTOSCALE_FORMULA[["MAX_CPU"]], dedicatedMin),
    "QUEUE" = sprintf(AUTOSCALE_FORMULA[["QUEUE"]], maxTasksPerNode, dedicatedMin, dedicatedMax, lowPriorityMin, lowPriorityMax),
    stop(invalidFormula)
  )
}
54+
55+
#' Resize an Azure cloud-enabled cluster.
#'
#' Looks up the cluster's pool and applies a new autoscale formula to it. The
#' pool's own \code{maxTasksPerNode} setting is passed on to the formula.
#'
#' @param cluster Cluster object that was returned by \code{makeCluster}
#' @param dedicatedMin The minimum number of dedicated nodes
#' @param dedicatedMax The maximum number of dedicated nodes
#' @param lowPriorityMin The minimum number of low priority nodes
#' @param lowPriorityMax The maximum number of low priority nodes
#' @param algorithm Current built-in autoscale formulas: QUEUE, MAX_CPU, WEEKEND, WORKDAY
#' @param timeInterval Value sent as the pool's autoscale interval, written as a duration string such as "PT5M"
#'
#' @examples
#' \dontrun{
#' resizeCluster(cluster, dedicatedMin = 2, dedicatedMax = 6, lowPriorityMin = 2, lowPriorityMax = 6, algorithm = "QUEUE", timeInterval = "PT10M")
#' }
#' @export
resizeCluster <- function(cluster,
                          dedicatedMin,
                          dedicatedMax,
                          lowPriorityMin,
                          lowPriorityMax,
                          algorithm = "QUEUE",
                          timeInterval = "PT5M"){
  # The pool is fetched only to read its maxTasksPerNode setting.
  pool <- getPool(cluster$poolId)

  resizePool(cluster$poolId,
             autoscaleFormula = getAutoscaleFormula(algorithm, dedicatedMin, dedicatedMax, lowPriorityMin, lowPriorityMax, maxTasksPerNode = pool$maxTasksPerNode),
             autoscaleInterval = timeInterval)
}

R/cluster.R

Lines changed: 96 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
#' Creates a configuration file for the user's cluster setup.
1+
#' Creates a credentials file for rAzureBatch package authentication
22
#'
3-
#' @param fileName Cluster configuration's file name.
3+
#' @param fileName Credentials file name
44
#' @param ... Further named parameters
55
#' \itemize{
66
#' \item{"batchAccount"}: {The name of the Azure Batch account.}
@@ -11,10 +11,11 @@
1111
#'}
1212
#' @return The request to the Batch service was successful.
1313
#' @examples {
14-
#' generateClusterConfig("test_config.json")
15-
#' generateClusterConfig("test_config.json", batchAccount = "testbatchaccount", batchKey = "test_batch_account_key", batchUrl = "http://testbatchaccount.azure.com", storageAccount = "teststorageaccount", storageKey = "test_storage_account_key")
14+
#' generateCredentialsConfig("test_config.json")
15+
#' generateCredentialsConfig("test_config.json", batchAccount = "testbatchaccount", batchKey = "test_batch_account_key", batchUrl = "http://testbatchaccount.azure.com", storageAccount = "teststorageaccount", storageKey = "test_storage_account_key")
1616
#' }
17-
generateClusterConfig <- function(fileName, ...){
17+
#' @export
18+
generateCredentialsConfig <- function(fileName, ...){
1819
args <- list(...)
1920

2021
batchAccount <- ifelse(is.null(args$batchAccount), "batch_account_name", args$batchAccount)
@@ -31,35 +32,64 @@ generateClusterConfig <- function(fileName, ...){
3132
batchAccount = list(
3233
name = batchAccount,
3334
key = batchKey,
34-
url = batchUrl,
35-
pool = list(
36-
name = "myPoolName",
37-
vmSize = "Standard_D2_v2",
38-
maxTasksPerNode = 1,
39-
poolSize = list(
40-
minNodes = 3,
41-
maxNodes = 10,
42-
autoscaleFormula = "QUEUE"
43-
)
44-
),
45-
rPackages = list(
46-
cran = vector(),
47-
github = vector()
48-
)
35+
url = batchUrl
4936
),
5037
storageAccount = list(
5138
name = storageName,
5239
key = storageKey
53-
),
54-
settings = list(
55-
verbose = FALSE
5640
)
5741
)
5842

5943
configJson <- jsonlite::toJSON(config, auto_unbox = TRUE, pretty = TRUE)
6044
write(configJson, file=paste0(getwd(), "/", fileName))
6145

6246
print(sprintf("A config file has been generated %s. Please enter your Batch credentials.", paste0(getwd(), "/", fileName)))
47+
}
48+
}
49+
50+
#' Creates a configuration file for the user's cluster setup.
#'
#' Writes a template cluster settings JSON file into the current working
#' directory. Nothing is written when that file already exists.
#'
#' @param fileName Cluster settings file name
#' @param ... Further named parameters (currently unused)
#' @return No meaningful value; called for its side effect of writing the file.
#' @examples {
#' generateClusterConfig("test_config.json")
#' }
#'
#' @export
generateClusterConfig <- function(fileName, ...){
  # The file is always written below the working directory, so that exact
  # path is the one checked before writing.
  filePath <- paste0(getwd(), "/", fileName)

  if(!file.exists(filePath)){
    config <- list(
      pool = list(
        name = "myPoolName",
        vmSize = "Standard_D2_v2",
        maxTasksPerNode = 1,
        poolSize = list(
          dedicatedNodes = list(
            min = 3,
            max = 3
          ),
          lowPriorityNodes = list(
            min = 3,
            max = 3
          ),
          autoscaleFormula = "QUEUE"
        )
      ),
      rPackages = list(
        cran = vector(),
        github = vector()
      )
    )

    configJson <- jsonlite::toJSON(config, auto_unbox = TRUE, pretty = TRUE)
    write(configJson, file = filePath)

    print(sprintf("A cluster settings has been generated %s. Please enter your cluster specification.", filePath))
    print("Note: To maximize all CPU cores, set the maxTasksPerNode property up to 4x the number of cores for the VM size.")
  }
}
@@ -69,35 +99,48 @@ generateClusterConfig <- function(fileName, ...){
6999
#' @param clusterSetting Cluster configuration's file name
70100
#' @param fullName A boolean flag for checking the file full name
71101
#' @param wait A boolean flag to wait for all nodes to boot up
102+
#' @param resourceFiles A list of files that Batch will download to the compute node before running the command line
72103
#'
73104
#' @return The request to the Batch service was successful.
74105
#' @examples
75106
#' cluster <- makeCluster("cluster_config.json", fullName = TRUE, wait = TRUE)
76-
makeCluster <- function(fileName = "az_config.json", fullName = FALSE, wait = TRUE, resourceFiles = list()){
77-
setPoolOption(fileName, fullName)
107+
#' @export
108+
makeCluster <- function(clusterSetting = "cluster_settings.json", fullName = FALSE, wait = TRUE, resourceFiles = list()){
109+
if(fullName){
110+
pool <- rjson::fromJSON(file=paste0(clusterSetting))
111+
}
112+
else{
113+
pool <- rjson::fromJSON(file=paste0(getwd(), "/", clusterSetting))
114+
}
115+
78116
config <- getOption("az_config")
79-
pool <- config$batchAccount$pool
117+
if(is.null(config)){
118+
stop("Credentials were not set.")
119+
}
120+
121+
config$poolId = pool$pool$name
122+
options("az_config" = config)
80123

81124
packages <- NULL
82-
if(!is.null(config$batchAccount$rPackages) && !is.null(config$batchAccount$rPackages$cran) && length(config$batchAccount$rPackages$cran) > 0){
83-
packages <- getInstallationCommand(config$batchAccount$rPackages$cran)
125+
if(!is.null(pool$rPackages) && !is.null(pool$rPackages$cran) && length(pool$rPackages$cran) > 0){
126+
packages <- getInstallationCommand(pool$rPackages$cran)
84127
}
85128

86-
if(!is.null(config$batchAccount$rPackages) && !is.null(config$batchAccount$rPackages$github) && length(config$batchAccount$rPackages$github) > 0){
129+
if(!is.null(pool$rPackages) && !is.null(pool$rPackages$github) && length(pool$rPackages$github) > 0){
87130
if(is.null(packages)){
88-
packages <- getGithubInstallationCommand(config$batchAccount$rPackages$github)
131+
packages <- getGithubInstallationCommand(pool$rPackages$github)
89132
}
90133
else{
91-
packages <- paste0(packages, ";", getGithubInstallationCommand(config$batchAccount$rPackages$github))
134+
packages <- paste0(packages, ";", getGithubInstallationCommand(pool$rPackages$github))
92135
}
93136
}
94137

95138
response <- .addPool(
96-
pool = pool,
139+
pool = pool$pool,
97140
packages = packages,
98141
resourceFiles = resourceFiles)
99142

100-
pool <- getPool(pool$name)
143+
pool <- getPool(pool$pool$name)
101144

102145
if(grepl("AuthenticationFailed", response)){
103146
stop("Check your credentials and try again.");
@@ -108,36 +151,49 @@ makeCluster <- function(fileName = "az_config.json", fullName = FALSE, wait = TR
108151
}
109152
else{
110153
if(wait){
111-
waitForNodesToComplete(pool$id, 60000, targetDedicated = pool$targetDedicated)
154+
waitForNodesToComplete(pool$id, 60000)
112155
}
113156
}
114157

115158
print("Your pool has been registered.")
116-
print(sprintf("Node Count: %i", pool$targetDedicated))
159+
print(sprintf("Dedicated Node Count: %i", pool$targetDedicatedNodes))
160+
print(sprintf("Low Priority Node Count: %i", pool$targetLowPriorityNodes))
117161
return(getOption("az_config"))
118162
}
119163

120164
#' Deletes the cluster from your Azure account.
121165
#'
122166
#' @param cluster The cluster configuration that was created in \code{makeCluster}
123167
#'
124-
#' @return The request to the Batch service was successful.
125168
#' @examples
126-
#' clusterConfiguration <- makeCluster("pool_configuration.json")
169+
#' clusterConfiguration <- makeCluster("cluster_settings.json")
127170
#' stopCluster(clusterConfiguration)
171+
#' @export
128172
stopCluster <- function(cluster){
129-
deletePool(pool$batchAccount$pool$name)
173+
deletePool(cluster$poolId)
174+
175+
print(sprintf("Your %s cluster has been destroyed.", cluster$poolId))
130176
}
131177

132-
setPoolOption <- function(fileName = "az_config.json", fullName = FALSE){
133-
if(fullName){
178+
#' Sets the Azure credentials used by the package.
#'
#' Reads a credentials JSON file and stores its parsed contents in the
#' \code{az_config} option.
#'
#' @param fileName Credentials file name. It is tried as given first and then relative to the current working directory.
#'
#' @examples
#' \dontrun{
#' generateCredentialsConfig("credentials.json")
#' setCredentials("credentials.json")
#' }
#' @export
setCredentials <- function(fileName = "az_config.json"){
  if(file.exists(fileName)){
    configPath <- fileName
  }
  else{
    configPath <- paste0(getwd(), "/", fileName)
  }

  # Fail with a clear message instead of a low-level read error from the
  # JSON parser when neither location holds the file.
  if(!file.exists(configPath)){
    stop(sprintf("Credentials file not found: %s", fileName))
  }

  config <- rjson::fromJSON(file = configPath)

  options("az_config" = config)
  print("Your azure credentials have been set.")
}
142198

143199
getPoolWorkers <- function(poolId, ...){

0 commit comments

Comments
 (0)