amazon-kinesis-client-go/batchconsumer/batcher/message_batcher.go

package batcher

import (
	"fmt"
	"sync"
	"time"

	"github.com/Clever/amazon-kinesis-client-go/kcl"
)

// Sync is used to allow a writer to synchronize with the batcher.
// The writer declares how to write messages (via its `SendBatch` method), while the batcher
// keeps track of messages written
type Sync interface {
	// SendBatch receives the messages accumulated since the previous flush. The batcher in
	// this file calls it from its background goroutine, and only when at least one message
	// is pending.
	SendBatch(batch [][]byte)
}

// Batcher collects individual messages so that they can be written in batches.
type Batcher interface {
	// AddMessage adds msg to the batch, together with the sequence pair of the record it
	// came from. The implementation in this file returns an error for an empty msg.
	AddMessage(msg []byte, sequencePair kcl.SequencePair) error
	// Flush all messages from the batch
	Flush()
	// SmallestSequencePair returns the smallest SequenceNumber and SubSequence number in
	// the current batch
	SmallestSequencePair() kcl.SequencePair
}

// msgPack bundles a message with the sequence pair passed to AddMessage so that both
// travel over msgChan as a single value.
type msgPack struct {
	msg          []byte
	sequencePair kcl.SequencePair
}

// batcher is the Batcher implementation returned by New. Messages are accumulated by a
// single background goroutine (startBatcher); the exported methods talk to it over
// msgChan and flushChan.
type batcher struct {
	// mux guards smallestSeq, which is written by the background goroutine and read by
	// callers of SmallestSequencePair.
	mux sync.Mutex

	// Flush thresholds: elapsed time, number of messages, and total bytes in the batch.
	// They are read by the background goroutine without holding mux.
	flushInterval time.Duration
	flushCount    int
	flushSize     int

	// smallestSeq is used for checkpointing. It is reset to the empty pair after each
	// batch is sent.
	smallestSeq kcl.SequencePair

	// sync receives flushed batches. msgChan and flushChan are the send-only ends of the
	// channels consumed by the background goroutine.
	sync      Sync
	msgChan   chan<- msgPack
	flushChan chan<- struct{}
}

// New creates a new Batcher and starts the background goroutine that accumulates
// messages and hands complete batches to sync.
//   - sync - synchronizes batcher with writer; receives each batch via SendBatch.
//   - flushInterval - how often accumulated messages should be flushed.
//   - flushCount - number of messages that trigger a flush.
//   - flushSize - size of batch, in bytes, that triggers a flush.
//
// There are no defaults: flushInterval, flushCount and flushSize must all be strictly
// positive, otherwise New returns a nil Batcher and an error. Negative values are
// rejected as well as zero, because a negative interval makes the timer fire immediately
// on every loop iteration and negative count/size thresholds are always exceeded.
//
// The goroutine started here loops forever; the returned Batcher offers no way to stop it.
func New(sync Sync, flushInterval time.Duration, flushCount int, flushSize int) (Batcher, error) {
	if flushSize <= 0 {
		return nil, fmt.Errorf("flush size must be positive")
	}
	if flushCount <= 0 {
		return nil, fmt.Errorf("flush count must be positive")
	}
	if flushInterval <= 0 {
		return nil, fmt.Errorf("flush interval must be positive")
	}

	// Both channels are unbuffered, so AddMessage and Flush block until the background
	// goroutine is ready to receive.
	msgChan := make(chan msgPack)
	flushChan := make(chan struct{})

	b := &batcher{
		flushCount:    flushCount,
		flushInterval: flushInterval,
		flushSize:     flushSize,
		sync:          sync,
		msgChan:       msgChan,
		flushChan:     flushChan,
	}

	// The goroutine gets the bidirectional channels as receive-only; the struct keeps
	// only the send-only ends.
	go b.startBatcher(msgChan, flushChan)

	return b, nil
}

// SmallestSequencePair returns the smallest sequence pair recorded for the batch that is
// currently accumulating. The value is copied while holding mux because the background
// goroutine updates it concurrently.
func (b *batcher) SmallestSequencePair() kcl.SequencePair {
	b.mux.Lock()
	smallest := b.smallestSeq
	b.mux.Unlock()

	return smallest
}

// SetFlushInterval replaces the time-based flush threshold with dur.
//
// It is not part of the Batcher interface, so it is reachable only through the concrete
// type. NOTE(review): the field is written without holding mux while the background
// goroutine reads it in startBatcher, and dur is not validated the way New validates
// its arguments.
func (b *batcher) SetFlushInterval(dur time.Duration) {
	b.flushInterval = dur
}

// SetFlushCount replaces the message-count flush threshold with count.
//
// It is not part of the Batcher interface, so it is reachable only through the concrete
// type. NOTE(review): the field is written without holding mux while the background
// goroutine reads it in startBatcher, and count is not validated the way New validates
// its arguments.
func (b *batcher) SetFlushCount(count int) {
	b.flushCount = count
}

// SetFlushSize replaces the byte-size flush threshold with size.
//
// It is not part of the Batcher interface, so it is reachable only through the concrete
// type. NOTE(review): the field is written without holding mux while the background
// goroutine reads it in startBatcher, and size is not validated the way New validates
// its arguments.
func (b *batcher) SetFlushSize(size int) {
	b.flushSize = size
}

// AddMessage hands msg and its sequence pair to the background goroutine for batching.
// An empty (or nil) msg is rejected with an error and nothing is queued. Otherwise the
// call blocks until the goroutine has received the message, then returns nil.
func (b *batcher) AddMessage(msg []byte, pair kcl.SequencePair) error {
	if len(msg) == 0 {
		return fmt.Errorf("Empty messages can't be sent")
	}

	pack := msgPack{msg: msg, sequencePair: pair}
	b.msgChan <- pack

	return nil
}

// updateSequenceNumbers keeps smallestSeq equal to the smallest sequence pair of any
// record in the current batch. The stored pair is replaced when it is still empty or
// when pair is less than it; otherwise it is left alone. The update happens under mux so
// that SmallestSequencePair can be called from other goroutines.
func (b *batcher) updateSequenceNumbers(pair kcl.SequencePair) {
	b.mux.Lock()
	defer b.mux.Unlock()

	// Nothing to do when a smallest pair is already recorded and pair does not beat it.
	if !b.smallestSeq.IsEmpty() && !pair.IsLessThan(b.smallestSeq) {
		return
	}

	b.smallestSeq = pair
}

// Flush asks the background goroutine to send whatever is currently batched. The call
// blocks until the goroutine has received the signal; the batch is sent by that goroutine
// afterwards, so the send may not have completed when Flush returns.
func (b *batcher) Flush() {
	var signal struct{}
	b.flushChan <- signal
}

// batchSize returns the combined length, in bytes, of every message in batch.
// A nil or empty batch has size 0.
func (b *batcher) batchSize(batch [][]byte) int {
	var size int
	for i := range batch {
		size += len(batch[i])
	}

	return size
}

// flush sends batch to the Sync and clears the tracked smallest sequence pair, then
// returns a fresh empty slice for the caller to keep accumulating into. An empty batch
// is not sent and leaves smallestSeq untouched.
func (b *batcher) flush(batch [][]byte) [][]byte {
	if len(batch) == 0 {
		return [][]byte{}
	}

	b.sync.SendBatch(batch)

	// The sent batch no longer counts towards the smallest pending sequence pair.
	b.mux.Lock()
	b.smallestSeq = kcl.SequencePair{}
	b.mux.Unlock()

	return [][]byte{}
}

// startBatcher is the background loop started by New. It owns the in-progress batch and
// flushes it when any of the following happens:
//   - flushInterval has elapsed since the previous flush,
//   - a signal arrives on flushChan (sent by Flush),
//   - adding the incoming message would push the batch past flushSize bytes (the pending
//     batch is flushed first, then the message starts a new one),
//   - after adding a message, the batch holds at least flushCount messages or at least
//     flushSize bytes.
//
// The interval timer is re-armed only after a flush. Creating it inside the select (one
// time.After per iteration) would restart the countdown on every incoming message, so a
// steady trickle of messages below the count/size thresholds would never be flushed by
// time. A change made through SetFlushInterval takes effect the next time the timer is
// re-armed.
//
// The loop never returns.
func (b *batcher) startBatcher(msgChan <-chan msgPack, flushChan <-chan struct{}) {
	batch := [][]byte{}
	// size is the running byte total of batch, maintained incrementally so the whole
	// batch does not have to be re-summed for every message.
	size := 0

	timeout := time.After(b.flushInterval)

	// send flushes the pending batch, resets the running size and starts a new interval.
	send := func() {
		batch = b.flush(batch)
		size = 0
		timeout = time.After(b.flushInterval)
	}

	for {
		select {
		case <-timeout:
			send()
		case <-flushChan:
			send()
		case pack := <-msgChan:
			// Make room first if this message would overflow the size limit.
			if b.flushSize < size+len(pack.msg) {
				send()
			}

			batch = append(batch, pack.msg)
			size += len(pack.msg)
			b.updateSequenceNumbers(pack.sequencePair)

			if b.flushCount <= len(batch) || b.flushSize <= size {
				send()
			}
		}
	}
}
Implemented new consumer interface as well as example consumer: 'batchconsumer' 2017-07-18 02:03:15 +00:00			`package batcher`

			`import (`
			`"fmt"`
			`"sync"`
			`"time"`
Created IsLessThan method to SequencePair to make code more readable 2017-07-18 19:52:26 +00:00
Moved SequencePair to kcl package 2017-08-02 19:45:23 +00:00			`"github.com/Clever/amazon-kinesis-client-go/kcl"`
			`)`
Created IsLessThan method to SequencePair to make code more readable 2017-07-18 19:52:26 +00:00
Implemented new consumer interface as well as example consumer: 'batchconsumer' 2017-07-18 02:03:15 +00:00			`// Sync is used to allow a writer to syncronize with the batcher.`
			// The writer declares how to write messages (via its `SendBatch` method), while the batcher
			`// keeps track of messages written`
			`type Sync interface {`
			`SendBatch(batch [][]byte)`
			`}`

			`// Batcher interface`
			`type Batcher interface {`
			`// AddMesage to the batch`
Moved SequencePair to kcl package 2017-08-02 19:45:23 +00:00			`AddMessage(msg []byte, sequencePair kcl.SequencePair) error`
Implemented new consumer interface as well as example consumer: 'batchconsumer' 2017-07-18 02:03:15 +00:00			`// Flush all messages from the batch`
			`Flush()`
			`// SmallestSeqPair returns the smallest SequenceNumber and SubSequence number in`
			`// the current batch`
Moved SequencePair to kcl package 2017-08-02 19:45:23 +00:00			`SmallestSequencePair() kcl.SequencePair`
Implemented new consumer interface as well as example consumer: 'batchconsumer' 2017-07-18 02:03:15 +00:00			`}`

			`type msgPack struct {`
			`msg []byte`
Moved SequencePair to kcl package 2017-08-02 19:45:23 +00:00			`sequencePair kcl.SequencePair`
Implemented new consumer interface as well as example consumer: 'batchconsumer' 2017-07-18 02:03:15 +00:00			`}`

			`type batcher struct {`
			`mux sync.Mutex`

			`flushInterval time.Duration`
			`flushCount int`
			`flushSize int`

Added IsEmpty to SequencePair 2017-07-21 01:23:25 +00:00			`// smallestSeq are used for checkpointing`
Moved SequencePair to kcl package 2017-08-02 19:45:23 +00:00			`smallestSeq kcl.SequencePair`
Implemented new consumer interface as well as example consumer: 'batchconsumer' 2017-07-18 02:03:15 +00:00
			`sync Sync`
			`msgChan chan<- msgPack`
			`flushChan chan<- struct{}`
			`}`

			`// New creates a new Batcher`
			`// - sync - synchronizes batcher with writer`
			`// - flushInterval - how often accumulated messages should be flushed (default 1 second).`
			`// - flushCount - number of messages that trigger a flush (default 10).`
			`// - flushSize - size of batch that triggers a flush (default 1024 * 1024 = 1 mb)`
Batcher now returns error if bad config is provided 2017-07-21 01:33:59 +00:00			`func New(sync Sync, flushInterval time.Duration, flushCount int, flushSize int) (Batcher, error) {`
			`if flushSize == 0 {`
			`return nil, fmt.Errorf("flush size must be non-zero")`
			`}`
			`if flushCount == 0 {`
			`return nil, fmt.Errorf("flush count must be non-zero")`
			`}`
			`if flushInterval == 0 {`
			`return nil, fmt.Errorf("flush interval must be non-zero")`
			`}`

			`msgChan := make(chan msgPack)`
Implemented new consumer interface as well as example consumer: 'batchconsumer' 2017-07-18 02:03:15 +00:00			`flushChan := make(chan struct{})`

			`b := &batcher{`
			`flushCount: flushCount,`
			`flushInterval: flushInterval,`
			`flushSize: flushSize,`
			`sync: sync,`
			`msgChan: msgChan,`
			`flushChan: flushChan,`
			`}`

			`go b.startBatcher(msgChan, flushChan)`

Batcher now returns error if bad config is provided 2017-07-21 01:33:59 +00:00			`return b, nil`
Implemented new consumer interface as well as example consumer: 'batchconsumer' 2017-07-18 02:03:15 +00:00			`}`

Moved SequencePair to kcl package 2017-08-02 19:45:23 +00:00			`func (b *batcher) SmallestSequencePair() kcl.SequencePair {`
Implemented new consumer interface as well as example consumer: 'batchconsumer' 2017-07-18 02:03:15 +00:00			`b.mux.Lock()`
			`defer b.mux.Unlock()`

			`return b.smallestSeq`
			`}`

			`func (b *batcher) SetFlushInterval(dur time.Duration) {`
			`b.flushInterval = dur`
			`}`

			`func (b *batcher) SetFlushCount(count int) {`
			`b.flushCount = count`
			`}`

			`func (b *batcher) SetFlushSize(size int) {`
			`b.flushSize = size`
			`}`

Moved SequencePair to kcl package 2017-08-02 19:45:23 +00:00			`func (b *batcher) AddMessage(msg []byte, pair kcl.SequencePair) error {`
Implemented new consumer interface as well as example consumer: 'batchconsumer' 2017-07-18 02:03:15 +00:00			`if len(msg) <= 0 {`
			`return fmt.Errorf("Empty messages can't be sent")`
			`}`

			`b.msgChan <- msgPack{msg, pair}`
			`return nil`
			`}`

			`// updateSequenceNumbers is used to track the smallest sequenceNumber of any record in the batch.`
			`// When flush() is called, the batcher sends the sequence number to the writer. When the writer`
			`// checkpoints, it does so up to the latest message that was flushed successfully.`
Moved SequencePair to kcl package 2017-08-02 19:45:23 +00:00			`func (b *batcher) updateSequenceNumbers(pair kcl.SequencePair) {`
Implemented new consumer interface as well as example consumer: 'batchconsumer' 2017-07-18 02:03:15 +00:00			`b.mux.Lock()`
			`defer b.mux.Unlock()`

Added IsEmpty to SequencePair 2017-07-21 01:23:25 +00:00			`if b.smallestSeq.IsEmpty() \|\| pair.IsLessThan(b.smallestSeq) {`
			`b.smallestSeq = pair`
Implemented new consumer interface as well as example consumer: 'batchconsumer' 2017-07-18 02:03:15 +00:00			`}`
			`}`

			`func (b *batcher) Flush() {`
			`b.flushChan <- struct{}{}`
			`}`

			`func (b *batcher) batchSize(batch [][]byte) int {`
			`total := 0`
			`for _, msg := range batch {`
			`total += len(msg)`
			`}`

			`return total`
			`}`

			`func (b *batcher) flush(batch [][]byte) [][]byte {`
			`if len(batch) > 0 {`
			`b.sync.SendBatch(batch)`
Added writter unit tests and fixed a slew of concurrency bugs 2017-07-21 01:35:54 +00:00
			`b.mux.Lock()`
Moved SequencePair to kcl package 2017-08-02 19:45:23 +00:00			`b.smallestSeq = kcl.SequencePair{}`
Fixed race condition 2017-07-18 19:13:39 +00:00			`b.mux.Unlock()`
Implemented new consumer interface as well as example consumer: 'batchconsumer' 2017-07-18 02:03:15 +00:00			`}`
			`return [][]byte{}`
			`}`

			`func (b *batcher) startBatcher(msgChan <-chan msgPack, flushChan <-chan struct{}) {`
			`batch := [][]byte{}`

			`for {`
			`select {`
			`case <-time.After(b.flushInterval):`
			`batch = b.flush(batch)`
			`case <-flushChan:`
			`batch = b.flush(batch)`
			`case pack := <-msgChan:`
			`size := b.batchSize(batch)`
			`if b.flushSize < size+len(pack.msg) {`
			`batch = b.flush(batch)`
			`}`

			`batch = append(batch, pack.msg)`
			`b.updateSequenceNumbers(pack.sequencePair)`

			`if b.flushCount <= len(batch) \|\| b.flushSize <= b.batchSize(batch) {`
			`batch = b.flush(batch)`
			`}`
			`}`
			`}`
			`}`