Skip to content

Commit 1d3fdf9

Browse files
authored
Update ForceLeave Prune (#580)
* Add a Prune flag to ForceLeave. Refactor the `reap` function to call `eraseNode`.
1 parent d014479 commit 1d3fdf9

File tree

4 files changed

+486
-22
lines changed

4 files changed

+486
-22
lines changed

serf/serf.go

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1102,6 +1102,10 @@ func (s *Serf) handleNodeLeaveIntent(leaveMsg *messageLeave) bool {
11021102
case StatusAlive:
11031103
member.Status = StatusLeaving
11041104
member.statusLTime = leaveMsg.LTime
1105+
1106+
if leaveMsg.Prune {
1107+
s.handlePrune(member)
1108+
}
11051109
return true
11061110
case StatusFailed:
11071111
member.Status = StatusLeft
@@ -1127,26 +1131,38 @@ func (s *Serf) handleNodeLeaveIntent(leaveMsg *messageLeave) bool {
11271131
}
11281132

11291133
if leaveMsg.Prune {
1130-
s.logger.Printf("[INFO] serf: EventMemberReap (forced): %s %s", member.Name, member.Member.Addr)
1131-
s.leftMembers = removeOldMember(s.leftMembers, member.Name)
1132-
s.eraseNode(member)
1134+
s.handlePrune(member)
11331135
}
11341136

11351137
return true
11361138

1137-
case StatusLeft:
1139+
case StatusLeaving, StatusLeft:
11381140
if leaveMsg.Prune {
1139-
s.logger.Printf("[INFO] serf: EventMemberReap (forced): %s %s", member.Name, member.Member.Addr)
1140-
s.leftMembers = removeOldMember(s.leftMembers, member.Name)
1141-
s.eraseNode(member)
1141+
s.handlePrune(member)
11421142
}
11431143
return true
1144-
11451144
default:
11461145
return false
11471146
}
11481147
}
11491148

1149+
// handlePrune forcibly erases a member from the list of members. If the
1150+
// member is still leaving, it first sleeps for BroadcastTimeout plus LeavePropagateDelay.
1151+
func (s *Serf) handlePrune(member *memberState) {
1152+
if member.Status == StatusLeaving {
1153+
time.Sleep(s.config.BroadcastTimeout + s.config.LeavePropagateDelay)
1154+
}
1155+
1156+
s.logger.Printf("[INFO] serf: EventMemberReap (forced): %s %s", member.Name, member.Member.Addr)
1157+
1158+
// If the member is leaving or has left, it may also be in leftMembers, so remove it from there.
1159+
if member.Status == StatusLeaving || member.Status == StatusLeft {
1160+
s.leftMembers = removeOldMember(s.leftMembers, member.Name)
1161+
}
1162+
s.eraseNode(member)
1163+
1164+
}
1165+
11501166
// handleNodeJoinIntent is called when a node broadcasts a
11511167
// join message to set the lamport time of its join
11521168
func (s *Serf) handleNodeJoinIntent(joinMsg *messageJoin) bool {

serf/serf_test.go

Lines changed: 212 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/hashicorp/memberlist"
1717
"github.com/hashicorp/serf/coordinate"
1818
"github.com/hashicorp/serf/testutil"
19+
"github.com/hashicorp/serf/testutil/retry"
1920
)
2021

2122
func testConfig() *Config {
@@ -65,6 +66,27 @@ func testMember(t *testing.T, members []Member, name string, status MemberStatus
6566
panic(fmt.Sprintf("node not found: %s", name))
6667
}
6768

69+
// testMemberStatus is testMember but returns an error
70+
// instead of failing the test
71+
func testMemberStatus(members []Member, name string, status MemberStatus) error {
72+
for _, m := range members {
73+
if m.Name == name {
74+
if m.Status != status {
75+
return fmt.Errorf("bad state for %s: %d", name, m.Status)
76+
}
77+
return nil
78+
}
79+
}
80+
81+
if status == StatusNone {
82+
// We didn't expect to find it
83+
return nil
84+
}
85+
86+
return fmt.Errorf("node not found: %s", name)
87+
88+
}
89+
6890
func TestCreate_badProtocolVersion(t *testing.T) {
6991
cases := []struct {
7092
version uint8
@@ -500,7 +522,7 @@ func TestSerf_leaveRejoinDifferentRole(t *testing.T) {
500522
t.Fatalf("s1 members: %d", len(s1.Members()))
501523
}
502524

503-
var member *Member = nil
525+
var member *Member
504526
for _, m := range members {
505527
if m.Name == s3Config.NodeName {
506528
member = &m
@@ -517,6 +539,182 @@ func TestSerf_leaveRejoinDifferentRole(t *testing.T) {
517539
}
518540
}
519541

542+
func TestSerf_forceLeaveFailed(t *testing.T) {
543+
s1Config := testConfig()
544+
s2Config := testConfig()
545+
s3Config := testConfig()
546+
547+
s1, err := Create(s1Config)
548+
if err != nil {
549+
t.Fatalf("err: %s", err)
550+
}
551+
defer s1.Shutdown()
552+
553+
s2, err := Create(s2Config)
554+
if err != nil {
555+
t.Fatalf("err: %s", err)
556+
}
557+
558+
defer s2.Shutdown()
559+
560+
s3, err := Create(s3Config)
561+
if err != nil {
562+
t.Fatalf("err: %s", err)
563+
}
564+
565+
defer s3.Shutdown()
566+
567+
_, err = s1.Join([]string{s2Config.MemberlistConfig.BindAddr}, false)
568+
if err != nil {
569+
t.Fatalf("err: %s", err)
570+
}
571+
572+
_, err = s1.Join([]string{s3Config.MemberlistConfig.BindAddr}, false)
573+
if err != nil {
574+
t.Fatalf("err: %s", err)
575+
}
576+
577+
// Put s2 in the failed state
578+
s2.Shutdown()
579+
580+
retry.Run(t, func(r *retry.R) {
581+
if err := testMemberStatus(s1.Members(), s2Config.NodeName, StatusFailed); err != nil {
582+
r.Fatal(err)
583+
}
584+
})
585+
s1.forceLeave(s2.config.NodeName, true)
586+
587+
memberlen := len(s1.Members())
588+
if memberlen != 2 {
589+
t.Fatalf("wanted 2 alive members, got %v", s1.Members())
590+
}
591+
592+
}
593+
594+
func TestSerf_forceLeaveLeaving(t *testing.T) {
595+
s1Config := testConfig()
596+
s2Config := testConfig()
597+
s3Config := testConfig()
598+
599+
// Use a long tombstone timeout so s2 doesn't get reaped, and a 5 second
600+
// leave propagate delay so that we can observe the leaving state.
601+
s1Config.TombstoneTimeout = 1 * time.Hour
602+
s1Config.LeavePropagateDelay = 5 * time.Second
603+
604+
s2Config.TombstoneTimeout = 1 * time.Hour
605+
s2Config.LeavePropagateDelay = 5 * time.Second
606+
607+
s3Config.TombstoneTimeout = 1 * time.Hour
608+
s3Config.LeavePropagateDelay = 5 * time.Second
609+
610+
s1, err := Create(s1Config)
611+
if err != nil {
612+
t.Fatalf("err: %s", err)
613+
}
614+
615+
defer s1.Shutdown()
616+
617+
s2, err := Create(s2Config)
618+
if err != nil {
619+
t.Fatalf("err: %s", err)
620+
}
621+
defer s2.Shutdown()
622+
623+
s3, err := Create(s3Config)
624+
if err != nil {
625+
t.Fatalf("err: %s", err)
626+
}
627+
defer s3.Shutdown()
628+
629+
_, err = s1.Join([]string{s2Config.MemberlistConfig.BindAddr}, true)
630+
if err != nil {
631+
t.Fatalf("err: %s", err)
632+
}
633+
testutil.Yield()
634+
635+
_, err = s1.Join([]string{s3Config.MemberlistConfig.BindAddr}, true)
636+
if err != nil {
637+
t.Fatalf("err: %s", err)
638+
}
639+
testutil.Yield()
640+
641+
// Have s2 leave gracefully; the retry below waits for s1 to see it with status 3
642+
if err := s2.Leave(); err != nil {
643+
t.Fatal(err)
644+
}
645+
646+
retry.Run(t, func(r *retry.R) {
647+
if err := testMemberStatus(s1.Members(), s2Config.NodeName, 3); err != nil {
648+
r.Fatal(err)
649+
}
650+
})
651+
s1.forceLeave(s2.config.NodeName, true)
652+
653+
memberlen := len(s1.Members())
654+
if memberlen != 2 {
655+
t.Fatalf("wanted 2 alive members, got %v", s1.Members())
656+
}
657+
}
658+
659+
func TestSerf_forceLeaveLeft(t *testing.T) {
660+
s1Config := testConfig()
661+
s2Config := testConfig()
662+
s3Config := testConfig()
663+
664+
// Use a long tombstone timeout so s2 doesn't get reaped
665+
s1Config.TombstoneTimeout = 1 * time.Hour
666+
s2Config.TombstoneTimeout = 1 * time.Hour
667+
s3Config.TombstoneTimeout = 1 * time.Hour
668+
669+
s1, err := Create(s1Config)
670+
if err != nil {
671+
t.Fatalf("err: %s", err)
672+
}
673+
defer s1.Shutdown()
674+
675+
s2, err := Create(s2Config)
676+
if err != nil {
677+
t.Fatalf("err: %s", err)
678+
}
679+
defer s2.Shutdown()
680+
681+
s3, err := Create(s3Config)
682+
if err != nil {
683+
t.Fatalf("err: %s", err)
684+
}
685+
defer s3.Shutdown()
686+
687+
_, err = s1.Join([]string{s2Config.MemberlistConfig.BindAddr}, true)
688+
if err != nil {
689+
t.Fatalf("err: %s", err)
690+
}
691+
testutil.Yield()
692+
693+
_, err = s1.Join([]string{s3Config.MemberlistConfig.BindAddr}, true)
694+
if err != nil {
695+
t.Fatalf("err: %s", err)
696+
}
697+
testutil.Yield()
698+
699+
// Put s2 in the left state
700+
if err := s2.Leave(); err != nil {
701+
t.Fatal(err)
702+
}
703+
704+
retry.Run(t, func(r *retry.R) {
705+
if err := testMemberStatus(s1.Members(), s2Config.NodeName, StatusLeft); err != nil {
706+
r.Fatal(err)
707+
}
708+
})
709+
s1.forceLeave(s2.config.NodeName, true)
710+
711+
memberlen := len(s1.Members())
712+
if memberlen != 2 {
713+
t.Fatalf("wanted 2 alive members, got %v", s1.Members())
714+
}
715+
716+
}
717+
520718
func TestSerf_reconnect(t *testing.T) {
521719
eventCh := make(chan Event, 64)
522720
s1Config := testConfig()
@@ -672,7 +870,7 @@ func TestSerf_update(t *testing.T) {
672870

673871
// Add a tag to force an update event, and add a version downgrade as
674872
// well (that alone won't trigger an update).
675-
s2Config.ProtocolVersion -= 1
873+
s2Config.ProtocolVersion--
676874
s2Config.Tags["foo"] = "bar"
677875

678876
// We try for a little while to wait for s2 to fully shutdown since the
@@ -1475,31 +1673,31 @@ func TestSerf_SetTags(t *testing.T) {
14751673

14761674
// Verify the new tags
14771675
m1m := s1.Members()
1478-
m1m_tags := make(map[string]map[string]string)
1676+
m1mTags := make(map[string]map[string]string)
14791677
for _, m := range m1m {
1480-
m1m_tags[m.Name] = m.Tags
1678+
m1mTags[m.Name] = m.Tags
14811679
}
14821680

1483-
if m := m1m_tags[s1.config.NodeName]; m["port"] != "8000" {
1484-
t.Fatalf("bad: %v", m1m_tags)
1681+
if m := m1mTags[s1.config.NodeName]; m["port"] != "8000" {
1682+
t.Fatalf("bad: %v", m1mTags)
14851683
}
14861684

1487-
if m := m1m_tags[s2.config.NodeName]; m["datacenter"] != "east-aws" {
1488-
t.Fatalf("bad: %v", m1m_tags)
1685+
if m := m1mTags[s2.config.NodeName]; m["datacenter"] != "east-aws" {
1686+
t.Fatalf("bad: %v", m1mTags)
14891687
}
14901688

14911689
m2m := s2.Members()
1492-
m2m_tags := make(map[string]map[string]string)
1690+
m2mTags := make(map[string]map[string]string)
14931691
for _, m := range m2m {
1494-
m2m_tags[m.Name] = m.Tags
1692+
m2mTags[m.Name] = m.Tags
14951693
}
14961694

1497-
if m := m2m_tags[s1.config.NodeName]; m["port"] != "8000" {
1498-
t.Fatalf("bad: %v", m1m_tags)
1695+
if m := m2mTags[s1.config.NodeName]; m["port"] != "8000" {
1696+
t.Fatalf("bad: %v", m1mTags)
14991697
}
15001698

1501-
if m := m2m_tags[s2.config.NodeName]; m["datacenter"] != "east-aws" {
1502-
t.Fatalf("bad: %v", m1m_tags)
1699+
if m := m2mTags[s2.config.NodeName]; m["datacenter"] != "east-aws" {
1700+
t.Fatalf("bad: %v", m1mTags)
15031701
}
15041702
}
15051703

0 commit comments

Comments
 (0)