Skip to content

Commit 596446c

Browse files
author
vietanhduong
committed
fix: timeout when cluster has too many worker node
Signed-off-by: vietanhduong <anh.duong@kyber.network>
1 parent 3532c36 commit 596446c

File tree

3 files changed

+119
-66
lines changed

3 files changed

+119
-66
lines changed

cmd/gke/unpause/cmd.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ func NewCommand() *cobra.Command {
1818
rm bool
1919
)
2020

21-
var cmd = &cobra.Command{
21+
cmd := &cobra.Command{
2222
Use: "unpause [STATE_FILE]",
2323
Short: "Unpause a GKE cluster",
2424
Long: `Unpause a GKE cluster.
@@ -52,7 +52,7 @@ $ pause-gcp gke unpause gs://bucket/path/json_file.json --rm
5252
return err
5353
}
5454
}
55-
log.Printf("INFO: retrieve cluster state completed!\n")
55+
log.Printf("INFO: retrieved cluster state!\n")
5656
var cluster apis.Cluster
5757
if err = protoutil.Unmarshal(b, &cluster); err != nil {
5858
return err

pkg/gcloud/gke/gke.go

Lines changed: 90 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -127,23 +127,6 @@ func GetCluster(project, location, name string) (*apis.Cluster, error) {
127127
}
128128

129129
func PauseCluster(cluster *apis.Cluster, dryRun bool) error {
130-
resize := func(cluster *apis.Cluster, pool *apis.Cluster_NodePool) error {
131-
_, err := exec.Run(exec.Command("gcloud",
132-
"container",
133-
"clusters",
134-
"resize", cluster.Name,
135-
"--project", cluster.Project,
136-
"--location", cluster.Location,
137-
"--node-pool", pool.Name,
138-
"--num-nodes", "0",
139-
"--quiet",
140-
))
141-
if err != nil {
142-
return errors.Wrapf(err, "pause cluster: resize pool '%s/%s'", cluster.Name, pool.Name)
143-
}
144-
return nil
145-
}
146-
147130
pause := func(cluster *apis.Cluster, pool *apis.Cluster_NodePool) error {
148131
if pool.GetAutoscaling() != nil || pool.GetAutoscaling().GetEnabled() {
149132
_, err := exec.Run(exec.Command("gcloud",
@@ -166,40 +149,10 @@ func PauseCluster(cluster *apis.Cluster, dryRun bool) error {
166149
return nil
167150
}
168151

169-
ticker := time.NewTicker(time.Second)
170-
171-
// resize node pool isn't completed at the first time. After disable the autoscaling, GCP set the nodeCount is
172-
// the initialNodeCount. Currently, we can't change the initialNodeCount setting.
173-
if err := resize(cluster, pool); err != nil {
174-
return err
175-
}
176-
// this will ensure that the nodeCount will be 0
177-
for {
178-
select {
179-
case <-ticker.C:
180-
if size := getNodePoolSize(cluster.Project, cluster.Name, pool.Name); size == 0 {
181-
log.Printf("INFO: resized pool for '%s/%s' is completed!\n", cluster.Name, pool.Name)
182-
return nil
183-
} else {
184-
log.Printf("INFO: current size of '%s/%s': %d\n", cluster.Name, pool.Name, size)
185-
}
186-
_, err := exec.Run(exec.Command("gcloud",
187-
"container",
188-
"clusters",
189-
"resize", cluster.Name,
190-
"--project", cluster.Project,
191-
"--location", cluster.Location,
192-
"--node-pool", pool.Name,
193-
"--num-nodes", "0",
194-
"--quiet",
195-
))
196-
if err != nil {
197-
return errors.Wrapf(err, "pause cluster: resize pool '%s/%s'", cluster.Name, pool.Name)
198-
}
199-
case <-context.Background().Done():
200-
return nil
201-
}
152+
if err := resize(cluster, pool.Name, 0, func(currentSize int) bool { return currentSize == 0 }); err != nil {
153+
return errors.Wrap(err, "pause cluster")
202154
}
155+
return nil
203156
}
204157

205158
var err error
@@ -222,6 +175,7 @@ func PauseCluster(cluster *apis.Cluster, dryRun bool) error {
222175

223176
func UnpauseCluster(cluster *apis.Cluster) error {
224177
unpause := func(cluster *apis.Cluster, p *apis.Cluster_NodePool) error {
178+
// re-enable autoscaling if the setting is presented
225179
if p.GetAutoscaling() != nil && p.GetAutoscaling().GetEnabled() {
226180
_, err := exec.Run(exec.Command("gcloud",
227181
"container",
@@ -240,21 +194,10 @@ func UnpauseCluster(cluster *apis.Cluster) error {
240194
}
241195
log.Printf("INFO: enabled autoscaling for '%s/%s'\n", cluster.Name, p.Name)
242196
}
243-
244-
_, err := exec.Run(exec.Command("gcloud",
245-
"container",
246-
"clusters",
247-
"resize", cluster.Name,
248-
"--project", cluster.Project,
249-
"--location", cluster.Location,
250-
"--node-pool", p.GetName(),
251-
"--num-nodes", strconv.Itoa(int(p.GetCurrentSize())),
252-
"--quiet",
253-
))
254-
if err != nil {
255-
return errors.Wrapf(err, "unpause cluster: resize pool '%s/%s'", cluster.Name, p.Name)
197+
size := int(p.GetCurrentSize())
198+
if err := resize(cluster, p.GetName(), size, func(currentSize int) bool { return currentSize >= size }); err != nil {
199+
return errors.Wrap(err, "unpause cluster")
256200
}
257-
log.Printf("INFO: resized '%s/%s' to %d\n", cluster.Name, p.GetName(), p.GetCurrentSize())
258201
return nil
259202
}
260203

@@ -353,6 +296,59 @@ func RefreshCluster(cluster *apis.Cluster, recreate bool) error {
353296
return eg.Wait()
354297
}
355298

299+
func resize(cluster *apis.Cluster, pool string, size int, sizeCondition func(currentSize int) bool) error {
300+
ticker := time.NewTicker(time.Second)
301+
// resize a node pool might take up to 4 hours
302+
ctx, cancel := context.WithTimeout(context.Background(), 4*time.Hour)
303+
defer cancel()
304+
305+
for {
306+
select {
307+
case <-ticker.C:
308+
if currentSize := getNodePoolSize(cluster.Project, cluster.Name, pool); sizeCondition(currentSize) {
309+
log.Printf("INFO: resized pool for '%s/%s' is completed!\n", cluster.Name, pool)
310+
return nil
311+
} else {
312+
log.Printf("INFO: current size of '%s/%s': %d\n", cluster.Name, pool, currentSize)
313+
}
314+
315+
var op *Operation
316+
var err error
317+
if op, err = getOperation(cluster, pool, OperationFilter{Status: Running, OperationType: SetNodePoolSize}); err != nil {
318+
log.Printf("WARN: get container operation got error: %v", err)
319+
}
320+
321+
// if the cluster already in an operation, we must to need all operations complete
322+
if op != nil {
323+
log.Printf("INFO: cluster %q already in an operation. Tell the process to wait until the operation complete.\n", cluster.GetName())
324+
_, err = exec.Run(exec.Command("gcloud", "container", "operations", "wait", op.Name, "--location", op.Zone, "--project", cluster.GetProject()))
325+
if err != nil {
326+
log.Printf("WARN: cluster %q: wait operation %q incomplete: error: %v\n", cluster.GetName(), op.Name, err)
327+
}
328+
// break the select; we will retry the whole process even the op success or not.
329+
break
330+
}
331+
332+
_, err = exec.Run(exec.Command("gcloud",
333+
"container",
334+
"clusters",
335+
"resize", cluster.Name,
336+
"--project", cluster.Project,
337+
"--location", cluster.Location,
338+
"--node-pool", pool,
339+
"--num-nodes", strconv.Itoa(size),
340+
"--quiet",
341+
"--async",
342+
))
343+
if err != nil {
344+
return errors.Wrapf(err, "pause cluster: resize pool '%s/%s'", cluster.Name, pool)
345+
}
346+
case <-ctx.Done():
347+
return nil
348+
}
349+
}
350+
}
351+
356352
func getNodePoolSize(project, cluster, pool string) int {
357353
out, err := exec.Run(exec.Command("gcloud",
358354
"compute",
@@ -369,3 +365,33 @@ func getNodePoolSize(project, cluster, pool string) int {
369365
val, _ := strconv.Atoi(out)
370366
return val
371367
}
368+
369+
func getOperation(cluster *apis.Cluster, pool string, filter OperationFilter) (*Operation, error) {
370+
filterQuery := fmt.Sprintf("target_link:*/%s/nodePools/%s", cluster.GetName(), pool)
371+
if filter.Status != "" {
372+
filterQuery = fmt.Sprintf("%s AND status:%s", filterQuery, filter.Status)
373+
}
374+
if filter.OperationType != "" {
375+
filterQuery = fmt.Sprintf("%s AND operation_type:%s", filterQuery, filter.OperationType)
376+
}
377+
378+
raw, err := exec.Run(exec.Command("gcloud",
379+
"container",
380+
"operations",
381+
"list",
382+
"--filter", filterQuery,
383+
"--project", cluster.GetProject(),
384+
"--location", cluster.GetLocation(),
385+
"--format", "json",
386+
))
387+
if err != nil {
388+
return nil, errors.Wrapf(err, "get operation")
389+
}
390+
391+
var ops []*Operation
392+
_ = json.UnmarshalFromString(raw, &ops)
393+
if len(ops) == 0 {
394+
return nil, nil
395+
}
396+
return ops[0], nil
397+
}

pkg/gcloud/gke/types.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package gke
2+
3+
const (
4+
// Operation Type
5+
6+
SetNodePoolSize = "SET_NODE_POOL_SIZE"
7+
8+
// Operation Status
9+
10+
Done = "DONE"
11+
Running = "RUNNING"
12+
)
13+
14+
type Operation struct {
15+
Name string `json:"name,omitempty"`
16+
OperationType string `json:"operation_type,omitempty"`
17+
StartTime string `json:"start_time,omitempty"`
18+
EndTime string `json:"end_time,omitempty"`
19+
TargetLink string `json:"target_link,omitempty"`
20+
Zone string `json:"zone,omitempty"`
21+
Status string `json:"status,omitempty"`
22+
}
23+
24+
type OperationFilter struct {
25+
Status string
26+
OperationType string
27+
}

0 commit comments

Comments
 (0)