Skip to content

Commit 75a780e

Browse files
GCP support, @status, @config, minor bug corrections
1 parent 745aacd commit 75a780e

File tree

15 files changed

+542
-436
lines changed

15 files changed

+542
-436
lines changed

CCconfig.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,20 @@ mpiflags = ""
1818

1919
[ec2]
2020

21-
imageid = "ami-0b869698add04fbdc" # found at us-east-1 (North Virginia). To use in other regions, copy it.
21+
imageid = "ami-0bec2868f8f28086f" # found at us-east-1 (North Virginia). To use in other regions, copy it.
22+
#security_group_id = "sg-09e2e7c3eebd45160"
2223

2324
# placement_group = "pg-XXXXXXXXXXXX" or "automatic"
2425
# security_group_id = "sg-XXXXXXXXXXXX" or "automatic"
2526
# subnet_id = "subnet-XXXXXXXXXXXX"
2627

2728
[gcp]
2829

29-
imageid = "hpc-shelf-311900/global/images/cloudclusters-basic-v3"
30+
imageid = "hpc-shelf-311900/global/images/cloudclusters-basic-v5"
3031
zone = "us-central1-a"
3132
project = "hpc-shelf-311900"
3233
user = "heron"
3334
exename = "/home/heron/.juliaup/bin/julia"
3435
directory = "/home/heron"
3536
mpiflags = "--map-by node --hostfile /home/heron/hostfile"
37+
# network_interface = "default"

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "CloudClusters"
22
uuid = "4ca6f12b-c8f1-4945-b50f-6bb73234c039"
33
authors = ["Francisco Heron de Carvalho Junior <heron@dc.ufc.br> e João Marcelo Uchôa de Alencar <joao.marcelo@ufc.br>"]
4-
version = "0.1.2"
4+
version = "0.2.0"
55

66
[deps]
77
AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc"

README.md

Lines changed: 199 additions & 188 deletions
Large diffs are not rendered by default.

docs/src/index.md

Lines changed: 199 additions & 188 deletions
Large diffs are not rendered by default.

src/CloudClusters.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,10 @@ export cluster_create, @cluster,
4646
cluster_restart, @restart,
4747
cluster_features, @features,
4848
cluster_nodes, @nodes,
49-
cluster_defaultconfig,
50-
cluster_providers,
51-
cluster_features
52-
49+
cluster_status, @status,
50+
cluster_config, @config,
51+
cluster_providers, @providers
52+
5353
# Cluster types
5454
export ManagerWorkers, PeerWorkers, PeerWorkersMPI, Localhost
5555

src/cluster.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ function cluster_list(;from = DateTime(0), cluster_type = :AnyCluster)
6666
path_contents = readdir(configpath; join = true)
6767

6868
for cluster_file in path_contents
69-
if occursin(r"\s*.cluster", cluster_file)
69+
if file_extension(cluster_file) == "cluster"
7070
cluster_data = load_cluster(cluster_file; from=from, cluster_type=cluster_type)
7171
!isempty(cluster_data) && push!(result, cluster_data)
7272
end

src/cluster_providers/ec2/ec2_deploy.jl

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,3 +135,24 @@ cluster_isrunning(_::Type{AmazonEC2}, cluster_handle) = ec2_cluster_info[cluster
135135
# Report whether the EC2 cluster registered under `cluster_handle` is stopped:
# look the cluster up in `ec2_cluster_info` and hand it to `ec2_cluster_isstopped`.
cluster_isstopped(_::Type{AmazonEC2}, cluster_handle) = ec2_cluster_isstopped(ec2_cluster_info[cluster_handle])
136136

137137

138+
"""
    cluster_status(_::Type{AmazonEC2}, cluster_handle)

Log the status of every node of the EC2 cluster registered under `cluster_handle`
in `ec2_cluster_info`, then log a summary line.

Each node's status is obtained with `ec2_get_instance_status` and logged with
`@info`. If any two consecutive nodes report different statuses, an `@error` is
logged saying the cluster is inconsistent; otherwise the common status is logged
with `@info`.
"""
function cluster_status(_::Type{AmazonEC2}, cluster_handle)
    cluster = ec2_cluster_info[cluster_handle]
    inconsistent = false
    previous_status = nothing
    for (nodeid, instanceid) in cluster.cluster_nodes
        node_status = ec2_get_instance_status(instanceid)
        @info "$nodeid ($instanceid) is $node_status"
        # Accumulate the flag: once a mismatch has been seen it must stay set,
        # otherwise a later matching pair would hide an earlier inconsistency.
        inconsistent = inconsistent || (!isnothing(previous_status) && previous_status != node_status)
        previous_status = node_status
    end
    if inconsistent
        @error "The EC2 cluster is in a inconsistent status (all nodes must be in the same status)"
    else
        @info "The cluster $cluster_handle at EC2 is in $previous_status status"
    end
end
155+
156+
# Delete the EC2 cluster identified by `cluster_handle` by delegating to
# `ec2_delete_cluster`; the delegate's result is returned unchanged.
cluster_delete(_::Type{AmazonEC2}, cluster_handle) = ec2_delete_cluster(cluster_handle)

src/cluster_providers/ec2/ec2_persist.jl

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@ function cluster_load(_::Type{AmazonEC2}, _::Type{<:ManagerWorkers}, cluster_han
108108
ec2_cluster_info[cluster_handle] = cluster
109109
return cluster.features
110110
else
111-
ec2_delete_cluster(cluster_handle)
112111
return nothing
113112
end
114113
end
@@ -155,7 +154,6 @@ function cluster_load(_::Type{AmazonEC2}, _::Type{<:PeerWorkers}, cluster_handle
155154
ec2_cluster_info[cluster_handle] = cluster
156155
return cluster.features
157156
else
158-
ec2_delete_cluster(cluster_handle)
159157
return nothing
160158
end
161159
end

src/cluster_providers/gcp/gcp_backend.jl

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ mutable struct GCPManagerWorkers <: ManagerWorkers #Cluster
3434
user_worker::String
3535
zone::String
3636
project::String
37+
network_interface::String
3738
cluster_nodes::Union{Dict{Symbol, String}, Nothing}
3839
features::Dict{Symbol, Any}
3940
end
@@ -47,6 +48,7 @@ mutable struct GCPPeerWorkers <: PeerWorkers # Cluster
4748
user::String
4849
zone::String
4950
project::String
51+
network_interface::String
5052
cluster_nodes::Union{Dict{Symbol, String}, Nothing}
5153
features::Dict{Symbol, Any}
5254
end
@@ -59,6 +61,7 @@ mutable struct GCPPeerWorkersMPI <: PeerWorkersMPI # Cluster
5961
user::String
6062
zone::String
6163
project::String
64+
network_interface::String
6265
cluster_nodes::Union{Dict{Symbol, String}, Nothing}
6366
features::Dict{Symbol, Any}
6467
end
@@ -189,7 +192,7 @@ function gcp_create_params(cluster::ManagerWorkers, cluster_nodes, internal_key_
189192
"name" => "external-nat",
190193
"type" => "ONE_TO_ONE_NAT"
191194
)],
192-
"network" => "https://www.googleapis.com/compute/v1/projects/$(cluster.project)/global/networks/default"
195+
"network" => "https://www.googleapis.com/compute/v1/projects/$(cluster.project)/global/networks/$(cluster.network_interface)"
193196
)],
194197
"metadata" =>
195198
"items" => [Dict(
@@ -224,7 +227,7 @@ function gcp_create_params(cluster::ManagerWorkers, cluster_nodes, internal_key_
224227
"name" => "external-nat",
225228
"type" => "ONE_TO_ONE_NAT"
226229
)],
227-
"network" => "https://www.googleapis.com/compute/v1/projects/$(cluster.project)/global/networks/default"
230+
"network" => "https://www.googleapis.com/compute/v1/projects/$(cluster.project)/global/networks/$(cluster.network_interface)"
228231
)],
229232
"metadata" =>
230233
"items" => [Dict(
@@ -270,7 +273,7 @@ function gcp_create_params(cluster::PeerWorkers, cluster_nodes, internal_key_nam
270273
"name" => "external-nat",
271274
"type" => "ONE_TO_ONE_NAT"
272275
)],
273-
"network" => "https://www.googleapis.com/compute/v1/projects/$(cluster.project)/global/networks/default"
276+
"network" => "https://www.googleapis.com/compute/v1/projects/$(cluster.project)/global/networks/$(cluster.network_interface)"
274277
)],
275278
"metadata" =>
276279
"items" => [Dict(
@@ -377,7 +380,7 @@ function gcp_create_instances(cluster::ManagerWorkers)
377380

378381
internal_key_name = cluster.name
379382

380-
try gcp_allow_ssh(cluster.project) catch end
383+
try gcp_allow_ssh(cluster) catch end
381384

382385
# Criando as instâncias
383386
params_manager, params_workers = gcp_create_params(cluster, cluster_nodes, internal_key_name, (user_data_manager, user_data_worker), private_key, public_key)
@@ -412,7 +415,7 @@ function gcp_create_instances(cluster::PeerWorkers)
412415

413416
internal_key_name = cluster.name
414417

415-
try gcp_allow_ssh(cluster.project) catch end
418+
try gcp_allow_ssh(cluster) catch end
416419

417420
# Criando as instâncias
418421
params = gcp_create_params(new_cluster, cluster_nodes, internal_key_name, user_data, private_key, public_key)
@@ -568,18 +571,20 @@ function gcp_get_instance_dict(cluster::Cluster, name)
568571
end
569572

570573

571-
function gcp_allow_ssh(project)
574+
function gcp_allow_ssh(cluster)
572575
firewall_rule = Dict(
573-
"allowed" => [
574-
Dict("IPProtocol" => "tcp",
575-
"ports" => ["22"])],
576-
"direction" => "INGRESS",
576+
# "allowed" => [
577+
# Dict("IPProtocol" => "tcp",
578+
# "ports" => ["22"])],
579+
# "direction" => "INGRESS",
577580
"kind" => "compute#firewall",
578-
"name" => "allow-ssh",
579-
"network" => "projects/$project/global/networks/default",
580-
"priority" => 1000,
581-
"selfLink" => "projects/$project/global/firewalls/allow-ssh",
582-
"sourceRanges" => ["0.0.0.0/0"]
581+
# "name" => "allow-ssh",
582+
"name" => "hpcshelf-virtualplatform-network-rules",
583+
"network" => "projects/$(cluster.project)/global/networks/$(cluster.network_interface)",
584+
# "priority" => 1000,
585+
# "selfLink" => "projects/$(cluster.project)/global/firewalls/allow-ssh",
586+
"selfLink" => "projects/$(cluster.project)/global/firewalls/hpcshelf-virtualplatform-network-rules",
587+
"sourceRanges" => ["0.0.0.0/0"]
583588
)
584589

585590
GCPAPI.compute(:Firewall, :insert, project; data=firewall_rule)

src/cluster_providers/gcp/gcp_deploy.jl

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ function deploy_cluster(gcptype::Type{GoogleCloud},
3737
#image_id_workers = get(cluster_features, :image_id, defaults_dict[GoogleCloud][:image_id])
3838
#image_id_manager = get(cluster_features, :image_id_manager, defaults_dict[GoogleCloud][:image_id_manager])
3939
zone = get(cluster_features, :zone, defaults_dict[GoogleCloud][:zone])
40-
project = defaults_dict[GoogleCloud][:project]
40+
project = get(cluster_features, :project, defaults_dict[GoogleCloud][:project])
41+
network_interface = get(cluster_features, :network_interface, get(defaults_dict[GoogleCloud], :network_interface, "default"))
4142
instance_type_manager = instance_type[1]
4243
instance_type_worker = instance_type[2]
4344

@@ -51,6 +52,7 @@ function deploy_cluster(gcptype::Type{GoogleCloud},
5152
user_worker,
5253
zone,
5354
project,
55+
network_interface,
5456
nothing,
5557
cluster_features)
5658

@@ -76,7 +78,11 @@ function deploy_cluster(gcptype::Type{GoogleCloud},
7678
imageid = get(cluster_features, :imageid, defaults_dict[GoogleCloud][:imageid])
7779
user = get(cluster_features, :user, defaults_dict[GoogleCloud][:user])
7880
zone = get(cluster_features, :zone, defaults_dict[GoogleCloud][:zone])
79-
project = defaults_dict[GoogleCloud][:project]
81+
project = get(cluster_features, :project, defaults_dict[GoogleCloud][:project])
82+
network_interface = get(cluster_features, :network_interface, get(defaults_dict[GoogleCloud],:network_interface, "default"))
83+
84+
85+
# get(cluster_features, :placement_group, get(defaults_dict[AmazonEC2], :placement_group, nothing))
8086

8187
cluster = gcp_build_clusterobj(cluster_type,
8288
string(cluster_handle),
@@ -86,6 +92,7 @@ function deploy_cluster(gcptype::Type{GoogleCloud},
8692
user,
8793
zone,
8894
project,
95+
network_interface,
8996
nothing,
9097
cluster_features)
9198

@@ -98,11 +105,11 @@ function deploy_cluster(gcptype::Type{GoogleCloud},
98105
return cluster
99106
end
100107

101-
gcp_build_clusterobj(_::Type{<:PeerWorkers}, name, image_id, count, instance_type, user, zone, project, cluster_nodes, features) =
102-
GCPPeerWorkers(name, image_id, count, instance_type, user, zone, project, cluster_nodes, features)
108+
# Build a `GCPPeerWorkers` cluster object for any `PeerWorkers` cluster type.
# All arguments are forwarded positionally, in the same order, to the constructor.
function gcp_build_clusterobj(_::Type{<:PeerWorkers}, name, image_id, count, instance_type,
                              user, zone, project, network_interface, cluster_nodes, features)
    return GCPPeerWorkers(name, image_id, count, instance_type, user, zone, project,
                          network_interface, cluster_nodes, features)
end
103110

104-
gcp_build_clusterobj(_::Type{<:PeerWorkersMPI}, name, image_id, count, instance_type, user, zone, project, cluster_nodes, features) =
105-
GCPPeerWorkersMPI(name, image_id, count, instance_type, user, zone, project, cluster_nodes, features)
111+
# Build a `GCPPeerWorkersMPI` cluster object for any `PeerWorkersMPI` cluster type.
# All arguments are forwarded positionally, in the same order, to the constructor.
function gcp_build_clusterobj(_::Type{<:PeerWorkersMPI}, name, image_id, count, instance_type,
                              user, zone, project, network_interface, cluster_nodes, features)
    return GCPPeerWorkersMPI(name, image_id, count, instance_type, user, zone, project,
                             network_interface, cluster_nodes, features)
end
106113

107114
function launch_processes(_::Type{GoogleCloud}, cluster_type::Type{<:Cluster}, cluster_handle, ips)
108115
cluster = gcp_cluster_info[cluster_handle]
@@ -151,4 +158,27 @@ function cluster_isrunning(_::Type{GoogleCloud}, cluster_handle)
151158
@warn "Erro ao verificar o status do cluster: ", e
152159
return false
153160
end
161+
end
162+
163+
164+
"""
    cluster_status(_::Type{GoogleCloud}, cluster_handle)

Log the status of every node of the GCP cluster registered under `cluster_handle`
in `gcp_cluster_info`, then log a summary line.

Each node's status is obtained with `gcp_get_instance_status(cluster, instanceid)`
and logged with `@info`. If any two consecutive nodes report different statuses,
an `@error` is logged saying the cluster is inconsistent; otherwise the common
status is logged with `@info`.
"""
function cluster_status(_::Type{GoogleCloud}, cluster_handle)
    cluster = gcp_cluster_info[cluster_handle]
    inconsistent = false
    previous_status = nothing
    for (nodeid, instanceid) in cluster.cluster_nodes
        node_status = gcp_get_instance_status(cluster, instanceid)
        @info "$nodeid ($instanceid) is $node_status"
        # Accumulate the flag: once a mismatch has been seen it must stay set,
        # otherwise a later matching pair would hide an earlier inconsistency.
        inconsistent = inconsistent || (!isnothing(previous_status) && previous_status != node_status)
        previous_status = node_status
    end
    if inconsistent
        @error "The GCP cluster is in a inconsistent status (all nodes must be in the same status)"
    else
        @info "The cluster $cluster_handle at GCP is in $previous_status status"
    end
end
181+
182+
# Delete the GCP cluster identified by `cluster_handle` by delegating to
# `gcp_delete_cluster`; the delegate's result is returned unchanged.
cluster_delete(_::Type{GoogleCloud}, cluster_handle) = gcp_delete_cluster(cluster_handle)

0 commit comments

Comments
 (0)