// Source file: amazon-kinesis-client-go/batchconsumer/batcher/message_batcher.go

package batcher
import (
"fmt"
"sync"
"time"
"github.com/Clever/amazon-kinesis-client-go/kcl"
)
// Sync is the hook through which a writer synchronizes with the batcher.
// The writer decides how messages are written by implementing SendBatch, while
// the batcher decides when a batch is complete and keeps track of the messages
// that were written.
type Sync interface {
	// SendBatch is called by the batcher with the messages of a completed,
	// non-empty batch.
	SendBatch(batch [][]byte)
}
// Batcher interface
type Batcher interface {
// AddMesage to the batch
2017-08-02 19:45:23 +00:00
AddMessage(msg []byte, sequencePair kcl.SequencePair) error
// Flush all messages from the batch
Flush()
// SmallestSeqPair returns the smallest SequenceNumber and SubSequence number in
// the current batch
2017-08-02 19:45:23 +00:00
SmallestSequencePair() kcl.SequencePair
}
type msgPack struct {
msg []byte
2017-08-02 19:45:23 +00:00
sequencePair kcl.SequencePair
}
type batcher struct {
mux sync.Mutex
flushInterval time.Duration
flushCount int
flushSize int
2017-07-21 01:23:25 +00:00
// smallestSeq are used for checkpointing
2017-08-02 19:45:23 +00:00
smallestSeq kcl.SequencePair
sync Sync
msgChan chan<- msgPack
flushChan chan<- struct{}
}
// New creates a new Batcher and starts the goroutine that accumulates and
// flushes its batches.
//   - sync - synchronizes batcher with writer; it receives every flushed batch.
//   - flushInterval - how often accumulated messages should be flushed.
//   - flushCount - number of messages that trigger a flush.
//   - flushSize - size of batch that triggers a flush.
//
// No defaults are applied: flushInterval, flushCount and flushSize must all be
// positive, otherwise a nil Batcher and an error are returned. The error text
// is the same for zero and negative values.
func New(sync Sync, flushInterval time.Duration, flushCount int, flushSize int) (Batcher, error) {
	// Negative thresholds are as unusable as zero ones: a negative interval
	// makes the flush timer fire immediately every time it is armed, and a
	// negative count or size forces a flush around every single message.
	if flushSize <= 0 {
		return nil, fmt.Errorf("flush size must be non-zero")
	}
	if flushCount <= 0 {
		return nil, fmt.Errorf("flush count must be non-zero")
	}
	if flushInterval <= 0 {
		return nil, fmt.Errorf("flush interval must be non-zero")
	}

	// Both channels are unbuffered. The batcher keeps the send-only ends and
	// the goroutine gets the receive-only ends.
	msgChan := make(chan msgPack)
	flushChan := make(chan struct{})

	b := &batcher{
		flushCount:    flushCount,
		flushInterval: flushInterval,
		flushSize:     flushSize,
		sync:          sync,
		msgChan:       msgChan,
		flushChan:     flushChan,
	}

	go b.startBatcher(msgChan, flushChan)

	return b, nil
}
// SmallestSequencePair returns the smallest sequence pair recorded for the
// batch currently being accumulated. The value is copied while holding the
// mutex.
func (b *batcher) SmallestSequencePair() kcl.SequencePair {
	b.mux.Lock()
	smallest := b.smallestSeq
	b.mux.Unlock()

	return smallest
}
// SetFlushInterval replaces the flush interval with dur.
//
// The value is stored as given: it is not validated, and the field is written
// without holding the mutex.
func (b *batcher) SetFlushInterval(dur time.Duration) {
	b.flushInterval = dur
}
// SetFlushCount replaces the number of messages that triggers a flush.
//
// The value is stored as given: it is not validated, and the field is written
// without holding the mutex.
func (b *batcher) SetFlushCount(count int) {
	b.flushCount = count
}
// SetFlushSize replaces the batch size that triggers a flush.
//
// The value is stored as given: it is not validated, and the field is written
// without holding the mutex.
func (b *batcher) SetFlushSize(size int) {
	b.flushSize = size
}
// AddMessage hands msg and its sequence pair to the batching goroutine.
//
// A nil or zero-length msg is rejected with an error and nothing is sent.
// Otherwise the call sends on msgChan and returns nil once the send completes.
func (b *batcher) AddMessage(msg []byte, pair kcl.SequencePair) error {
	if len(msg) == 0 {
		return fmt.Errorf("Empty messages can't be sent")
	}

	pack := msgPack{msg: msg, sequencePair: pair}
	b.msgChan <- pack

	return nil
}
// updateSequenceNumbers is used to track the smallest sequenceNumber of any record in the batch.
// When flush() is called, the batcher sends the sequence number to the writer. When the writer
// checkpoints, it does so up to the latest message that was flushed successfully.
2017-08-02 19:45:23 +00:00
func (b *batcher) updateSequenceNumbers(pair kcl.SequencePair) {
b.mux.Lock()
defer b.mux.Unlock()
2017-07-21 01:23:25 +00:00
if b.smallestSeq.IsEmpty() || pair.IsLessThan(b.smallestSeq) {
b.smallestSeq = pair
}
}
// Flush asks the batching goroutine to flush the current batch by sending a
// signal on flushChan. It returns once the send completes; the batch itself is
// sent by the goroutine that receives the signal.
func (b *batcher) Flush() {
	var signal struct{}
	b.flushChan <- signal
}
// batchSize returns the combined length, in bytes, of all messages in batch.
// A nil or empty batch has size 0.
func (b *batcher) batchSize(batch [][]byte) int {
	var sum int
	for i := range batch {
		sum += len(batch[i])
	}
	return sum
}
func (b *batcher) flush(batch [][]byte) [][]byte {
if len(batch) > 0 {
b.sync.SendBatch(batch)
b.mux.Lock()
2017-08-02 19:45:23 +00:00
b.smallestSeq = kcl.SequencePair{}
2017-07-18 19:13:39 +00:00
b.mux.Unlock()
}
return [][]byte{}
}
// startBatcher is the batching loop run on its own goroutine by New. It holds
// the batch being accumulated and flushes it when any of these happens:
//   - flushInterval has elapsed since the first message of the batch was added,
//   - a signal arrives on flushChan,
//   - adding the next message would push the batch past flushSize (the batch is
//     flushed first and the message starts the next one),
//   - the batch has reached flushCount messages or flushSize bytes.
//
// The loop has no exit path; it runs for as long as the process does.
func (b *batcher) startBatcher(msgChan <-chan msgPack, flushChan <-chan struct{}) {
	batch := [][]byte{}

	// deadline is nil while the batch is empty, and receiving from a nil
	// channel blocks forever, so that select case is simply disabled. It is
	// armed once, when the first message of a batch arrives. Calling
	// time.After inside the select instead would restart the interval on every
	// message and flush request, so a steady stream of messages would never
	// get a time-based flush.
	var deadline <-chan time.Time

	send := func() {
		batch = b.flush(batch)
		deadline = nil
	}

	for {
		select {
		case <-deadline:
			send()
		case <-flushChan:
			send()
		case pack := <-msgChan:
			size := b.batchSize(batch)
			if b.flushSize < size+len(pack.msg) {
				// The message does not fit; flush what has accumulated so far.
				send()
				size = 0
			}

			if len(batch) == 0 {
				deadline = time.After(b.flushInterval)
			}
			batch = append(batch, pack.msg)
			size += len(pack.msg)
			b.updateSequenceNumbers(pack.sequencePair)

			if b.flushCount <= len(batch) || b.flushSize <= size {
				send()
			}
		}
	}
}