fix: watch channel stuck due to misuse of timer.Reset (#37433)

issue: #37166
Cause: timer.Reset was misused, which caused the dispatcher to fail to send
messages to the virtual channel buffer. The dispatcher then kept splitting
again and again, which held the dispatcher manager's lock and blocked
channel-watching progress.

This PR fixes the misuse of timer.Reset.

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
This commit is contained in:
wei liu 2024-11-07 14:34:24 +08:00 committed by GitHub
parent 86fd3200be
commit 00f6d0ec51
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 61 additions and 0 deletions

View File

@ -75,6 +75,12 @@ func (t *CreateIndexTask) Await(ctx context.Context) error {
if finished { if finished {
return nil return nil
} }
if !timer.Stop() {
select {
case <-timer.C:
default:
}
}
timer.Reset(t.interval) timer.Reset(t.interval)
case <-ctx.Done(): case <-ctx.Done():
return ctx.Err() return ctx.Err()

View File

@ -57,6 +57,12 @@ func (t *LoadTask) Await(ctx context.Context) error {
if loaded { if loaded {
return nil return nil
} }
if !timer.Stop() {
select {
case <-timer.C:
default:
}
}
timer.Reset(t.interval) timer.Reset(t.interval)
case <-ctx.Done(): case <-ctx.Done():
return ctx.Err() return ctx.Err()
@ -163,6 +169,12 @@ func (t *FlushTask) Await(ctx context.Context) error {
if flushed { if flushed {
return nil return nil
} }
if !timer.Stop() {
select {
case <-timer.C:
default:
}
}
timer.Reset(t.interval) timer.Reset(t.interval)
case <-ctx.Done(): case <-ctx.Done():
return ctx.Err() return ctx.Err()

View File

@ -402,6 +402,12 @@ func (r *opRunner) watchWithTimer(info *datapb.ChannelWatchInfo) *opState {
case <-tickler.GetProgressSig(): case <-tickler.GetProgressSig():
log.Info("Reset timer for tickler updated", zap.Int32("current progress", tickler.Progress())) log.Info("Reset timer for tickler updated", zap.Int32("current progress", tickler.Progress()))
if !timer.Stop() {
select {
case <-timer.C:
default:
}
}
timer.Reset(watchTimeout) timer.Reset(watchTimeout)
case <-successSig: case <-successSig:

View File

@ -73,6 +73,13 @@ func (t *target) send(pack *MsgPack) error {
if t.closed { if t.closed {
return nil return nil
} }
if !t.timer.Stop() {
select {
case <-t.timer.C:
default:
}
}
t.timer.Reset(t.maxLag) t.timer.Reset(t.maxLag)
select { select {
case <-t.cancelCh.CloseCh(): case <-t.cancelCh.CloseCh():

View File

@ -0,0 +1,30 @@
package msgdispatcher
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"go.uber.org/zap"
"github.com/milvus-io/milvus-proto/go-api/v2/msgpb"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/mq/msgstream"
"github.com/milvus-io/milvus/pkg/util/paramtable"
)
// TestSendTimeout creates a target, leaves it idle for the configured
// MaxTolerantLag, and then checks that ten consecutive sends all succeed.
func TestSendTimeout(t *testing.T) {
	target := newTarget("test1", &msgpb.MsgPosition{})

	// Leave the target idle before the first send.
	// NOTE(review): presumably this is long enough for the target's internal
	// timer to fire, so that send has to reset an already-expired timer —
	// confirm against newTarget.
	time.Sleep(paramtable.Get().MQCfg.MaxTolerantLag.GetAsDuration(time.Second))

	failed := 0
	for i := 0; i < 10; i++ {
		if err := target.send(&msgstream.MsgPack{}); err != nil {
			log.Error("send failed", zap.Int("idx", i), zap.Error(err))
			failed++
		}
	}

	// assert.Equal takes (expected, actual); keeping that order makes the
	// failure message report the number of failed sends as the actual value.
	assert.Equal(t, 0, failed)
}