Refactor to fix deadlocks and race conditions.

2017-08-03 21:22:52 +00:00 · 2017-08-03 21:22:52 +00:00 · ba951ff0da
commit ba951ff0da
parent 29f68f77eb
4 changed files with 211 additions and 163 deletions
--- a/batchconsumer/writer.go
+++ b/batchconsumer/writer.go
@ -32,7 +32,7 @@ type batchedWriter struct {
 	checkpointMsg      chan kcl.SequencePair
 	checkpointShutdown chan struct{}
 	checkpointTag      chan string
-	lastProcessedPair  chan kcl.SequencePair
+	lastIgnoredPair    chan kcl.SequencePair
 	batchMsg           chan tagMsgPair
 	shutdown           chan struct{}
@ -58,11 +58,11 @@ func (b *batchedWriter) Initialize(shardID string, checkpointer kcl.Checkpointer
 	b.checkpointShutdown = make(chan struct{})
 	b.startCheckpointListener(checkpointer, b.checkpointMsg, b.checkpointShutdown)
-	b.checkpointTag = make(chan string)
+	b.checkpointTag = make(chan string, 100) // Buffered to workaround
 	b.batchMsg = make(chan tagMsgPair)
 	b.shutdown = make(chan struct{})
-	b.lastProcessedPair = make(chan kcl.SequencePair)
+	b.lastIgnoredPair = make(chan kcl.SequencePair)
-	b.startMessageHandler(b.batchMsg, b.checkpointTag, b.lastProcessedPair, b.shutdown)
+	b.startMessageHandler(b.batchMsg, b.checkpointTag, b.lastIgnoredPair, b.shutdown)
 	return nil
 }
@ -128,32 +128,48 @@ func (b *batchedWriter) createBatcher(tag string) batcher.Batcher {
 // startMessageHandler starts go-routines that route messages to batches.  It uses
 // go-routines to avoid race conditions.
 func (b *batchedWriter) startMessageHandler(
-	batchMsg <-chan tagMsgPair, checkpointTag <-chan string, lastPair <-chan kcl.SequencePair,
+	batchMsg <-chan tagMsgPair, checkpointTag <-chan string, lastIgnored <-chan kcl.SequencePair,
 	shutdown <-chan struct{},
 ) {
-	go func() {
+	getBatcher := make(chan string)
-		var lastProcessedPair kcl.SequencePair
+	rtnBatcher := make(chan batcher.Batcher)
-		batchers := map[string]batcher.Batcher{}
+	shutdownAdder := make(chan struct{})
 		areBatchersEmpty := true
 	go func() {
 		for {
 			select {
 			case tmp := <-batchMsg:
-				batcher, ok := batchers[tmp.tag]
+				getBatcher <- tmp.tag
-				if !ok {
+				batcher := <-rtnBatcher
 					batcher = b.createBatcher(tmp.tag)
 					batchers[tmp.tag] = batcher
 				}
 				err := batcher.AddMessage(tmp.msg, tmp.pair)
 				if err != nil {
 					b.log.ErrorD("add-message", kv.M{
 						"err": err.Error(), "msg": string(tmp.msg), "tag": tmp.tag,
 					})
 				}
 			case <-shutdownAdder:
 			}
 		}
 	}()
 	go func() {
 		var lastIgnoredPair kcl.SequencePair
 		batchers := map[string]batcher.Batcher{}
 		areBatchersEmpty := true
 		for {
 			select {
 			case tag := <-getBatcher:
 				batcher, ok := batchers[tag]
 				if !ok {
 					batcher = b.createBatcher(tag)
 					batchers[tag] = batcher
 				}
 				areBatchersEmpty = false
 				rtnBatcher <- batcher
 			case tag := <-checkpointTag:
-				smallest := lastProcessedPair
+				smallest := lastIgnoredPair
 				isAllEmpty := true
 				for name, batch := range batchers {
@ -166,7 +182,8 @@ func (b *batchedWriter) startMessageHandler(
 						continue
 					}
-					if pair.IsLessThan(smallest) {
+					// Check for empty because it's possible that no messages have been ignored
 					if smallest.IsEmpty() || pair.IsLessThan(smallest) {
 						smallest = pair
 					}
@ -177,17 +194,18 @@ func (b *batchedWriter) startMessageHandler(
 					b.checkpointMsg <- smallest
 				}
 				areBatchersEmpty = isAllEmpty
-			case pair := <-lastPair:
+			case pair := <-lastIgnored:
-				if areBatchersEmpty {
+				if areBatchersEmpty && !pair.IsEmpty() {
 					b.checkpointMsg <- pair
 				}
-				lastProcessedPair = pair
+				lastIgnoredPair = pair
 			case <-shutdown:
 				for _, batch := range batchers {
 					batch.Flush()
 				}
-				b.checkpointMsg <- lastProcessedPair
+				b.checkpointMsg <- b.lastProcessedSeq
 				b.checkpointShutdown <- struct{}{}
 				areBatchersEmpty = true
 			}
 		}
@ -234,6 +252,7 @@ func (b *batchedWriter) ProcessRecords(records []kcl.Record) error {
 		if err != nil {
 			return err
 		}
 		wasPairIgnored := true
 		for _, rawmsg := range messages {
 			msg, tags, err := b.sender.ProcessMessage(rawmsg)
@ -260,11 +279,14 @@ func (b *batchedWriter) ProcessRecords(records []kcl.Record) error {
 				// sequence number among all the batches (let's call it A).  We then checkpoint at
 				// the A-1 sequence number.
 				b.batchMsg <- tagMsgPair{tag, msg, prevPair}
 				wasPairIgnored = false
 			}
 		}
 		prevPair = pair
-		b.lastProcessedPair <- pair
+		if wasPairIgnored {
 			b.lastIgnoredPair <- pair
 		}
 	}
 	b.lastProcessedSeq = pair
--- a/batchconsumer/writer_test.go
+++ b/batchconsumer/writer_test.go
@ -86,30 +86,28 @@ type mockCheckpointer struct {
 	shutdown          chan struct{}
 }
-func NewMockCheckpointer(maxSeq string, timeout time.Duration) *mockCheckpointer {
+func NewMockCheckpointer(timeout time.Duration) *mockCheckpointer {
 	mcp := &mockCheckpointer{
 		checkpoint: make(chan string),
 		done:       make(chan struct{}, 1),
 		timeout:    make(chan struct{}, 1),
 		shutdown:   make(chan struct{}),
 	}
-	mcp.startWaiter(maxSeq, timeout)
+	mcp.startWaiter(timeout)
 	return mcp
 }
-func (m *mockCheckpointer) startWaiter(maxSeq string, timeout time.Duration) {
+func (m *mockCheckpointer) startWaiter(timeout time.Duration) {
 	go func() {
 		for {
 			select {
 			case seq := <-m.checkpoint:
 				m.recievedSequences = append(m.recievedSequences, seq)
 				if seq == maxSeq {
 					m.done <- struct{}{}
 				}
 			case <-time.NewTimer(timeout).C:
 				m.timeout <- struct{}{}
 			case <-m.shutdown:
 				m.done <- struct{}{}
 				return
 			}
 		}
@ -126,15 +124,10 @@ func (m *mockCheckpointer) wait() error {
 // Shutdown sends a single empty value on m.shutdown.  The send blocks until the
 // value is received.
 func (m *mockCheckpointer) Shutdown() {
 	var stop struct{}
 	m.shutdown <- stop
 }
-func (m *mockCheckpointer) Checkpoint(sequenceNumber *string, subSequenceNumber *int) error {
+func (m *mockCheckpointer) Checkpoint(pair kcl.SequencePair, retry int) error {
-	m.checkpoint <- *sequenceNumber
+	m.checkpoint <- pair.Sequence.String()
 	return nil
 }
 // CheckpointWithRetry forwards to Checkpoint exactly once; retryCount is not used
 // by the mock.
 func (m *mockCheckpointer) CheckpointWithRetry(
 	sequenceNumber *string, subSequenceNumber *int, retryCount int,
 ) error {
 	err := m.Checkpoint(sequenceNumber, subSequenceNumber)
 	return err
 }
 func encode(str string) string {
 	return base64.StdEncoding.EncodeToString([]byte(str))
@ -148,7 +141,7 @@ func TestProcessRecordsIgnoredMessages(t *testing.T) {
 		BatchInterval:  10 * time.Millisecond,
 		CheckpointFreq: 20 * time.Millisecond,
 	})
-	mockcheckpointer := NewMockCheckpointer("4", 5*time.Second)
+	mockcheckpointer := NewMockCheckpointer(5 * time.Second)
 	wrt := NewBatchedWriter(mockconfig, ignoringSender{}, mocklog)
 	wrt.Initialize("test-shard", mockcheckpointer)
@ -161,8 +154,13 @@ func TestProcessRecordsIgnoredMessages(t *testing.T) {
 	})
 	assert.NoError(err)
 	err = wrt.Shutdown("TERMINATE")
 	assert.NoError(err)
 	err = mockcheckpointer.wait()
 	assert.NoError(err)
 	assert.Contains(mockcheckpointer.recievedSequences, "4")
 }
 func TestProcessRecordsMutliBatchBasic(t *testing.T) {
@ -173,7 +171,7 @@ func TestProcessRecordsMutliBatchBasic(t *testing.T) {
 		BatchInterval:  100 * time.Millisecond,
 		CheckpointFreq: 200 * time.Millisecond,
 	})
-	mockcheckpointer := NewMockCheckpointer("8", 5*time.Second)
+	mockcheckpointer := NewMockCheckpointer(5 * time.Second)
 	mocksender := NewMsgAsTagSender()
 	wrt := NewBatchedWriter(mockconfig, mocksender, mocklog)
@ -233,7 +231,7 @@ func TestProcessRecordsMutliBatchWithIgnores(t *testing.T) {
 		BatchInterval:  100 * time.Millisecond,
 		CheckpointFreq: 200 * time.Millisecond,
 	})
-	mockcheckpointer := NewMockCheckpointer("26", 5*time.Second)
+	mockcheckpointer := NewMockCheckpointer(5 * time.Second)
 	mocksender := NewMsgAsTagSender()
 	wrt := NewBatchedWriter(mockconfig, mocksender, mocklog)
@ -312,7 +310,7 @@ func TestStaggeredCheckpionting(t *testing.T) {
 		BatchInterval:  100 * time.Millisecond,
 		CheckpointFreq: 200 * time.Nanosecond,
 	})
-	mockcheckpointer := NewMockCheckpointer("9", 5*time.Second)
+	mockcheckpointer := NewMockCheckpointer(5 * time.Second)
 	mocksender := NewMsgAsTagSender()
 	wrt := NewBatchedWriter(mockconfig, mocksender, mocklog)
@ -352,6 +350,7 @@ func TestStaggeredCheckpionting(t *testing.T) {
 	assert.NotContains(mockcheckpointer.recievedSequences, "6")
 	assert.NotContains(mockcheckpointer.recievedSequences, "7")
 	assert.NotContains(mockcheckpointer.recievedSequences, "8")
 	assert.Contains(mockcheckpointer.recievedSequences, "9")
 	assert.Contains(mocksender.batches, "tag1")
 	assert.Equal(2, len(mocksender.batches["tag1"]))    // One batch
@ -365,8 +364,10 @@ func TestStaggeredCheckpionting(t *testing.T) {
 	assert.Equal(2, len(mocksender.batches["tag3"][0])) // with three items
 	assert.Equal("tag3", string(mocksender.batches["tag3"][0][0]))
 	assert.Equal("tag3", string(mocksender.batches["tag3"][0][1]))
 	assert.Equal(2, len(mocksender.batches["tag3"][1]))
 	assert.Equal("tag3", string(mocksender.batches["tag3"][1][0]))
 	assert.Equal("tag3", string(mocksender.batches["tag3"][1][1]))
 	assert.Equal(2, len(mocksender.batches["tag3"][2]))
 	assert.Equal("tag3", string(mocksender.batches["tag3"][2][0]))
 	assert.Equal("tag3", string(mocksender.batches["tag3"][2][1]))
 }
--- a/cmd/consumer/main.go
+++ b/cmd/consumer/main.go
@ -13,8 +13,7 @@ type sampleRecordProcessor struct {
 	checkpointer      kcl.Checkpointer
 	checkpointRetries int
 	checkpointFreq    time.Duration
-	largestSeq        *big.Int
+	largestPair       kcl.SequencePair
 	largestSubSeq     int
 	lastCheckpoint    time.Time
 }
@ -31,9 +30,8 @@ func (srp *sampleRecordProcessor) Initialize(shardID string, checkpointer kcl.Ch
 	return nil
 }
-func (srp *sampleRecordProcessor) shouldUpdateSequence(sequenceNumber *big.Int, subSequenceNumber int) bool {
+func (srp *sampleRecordProcessor) shouldUpdateSequence(pair kcl.SequencePair) bool {
-	return srp.largestSeq == nil || sequenceNumber.Cmp(srp.largestSeq) == 1 ||
+	return srp.largestPair.IsLessThan(pair)
 		(sequenceNumber.Cmp(srp.largestSeq) == 0 && subSequenceNumber > srp.largestSubSeq)
 }
 func (srp *sampleRecordProcessor) ProcessRecords(records []kcl.Record) error {
@ -43,14 +41,13 @@ func (srp *sampleRecordProcessor) ProcessRecords(records []kcl.Record) error {
 			fmt.Fprintf(os.Stderr, "could not parse sequence number '%s'\n", record.SequenceNumber)
 			continue
 		}
-		if srp.shouldUpdateSequence(seqNumber, record.SubSequenceNumber) {
+		pair := kcl.SequencePair{seqNumber, record.SubSequenceNumber}
-			srp.largestSeq = seqNumber
+		if srp.shouldUpdateSequence(pair) {
-			srp.largestSubSeq = record.SubSequenceNumber
+			srp.largestPair = pair
 		}
 	}
 	if time.Now().Sub(srp.lastCheckpoint) > srp.checkpointFreq {
-		largestSeq := srp.largestSeq.String()
+		srp.checkpointer.Checkpoint(srp.largestPair, srp.checkpointRetries)
 		srp.checkpointer.CheckpointWithRetry(&largestSeq, &srp.largestSubSeq, srp.checkpointRetries)
 		srp.lastCheckpoint = time.Now()
 	}
 	return nil
--- a/kcl/kcl.go
+++ b/kcl/kcl.go
@ -2,12 +2,10 @@ package kcl
 import (
 	"bufio"
 	"bytes"
 	"encoding/json"
 	"fmt"
 	"io"
 	"os"
 	"sync"
 	"time"
 )
@ -18,8 +16,7 @@ type RecordProcessor interface {
 }
 type Checkpointer interface {
-	Checkpoint(sequenceNumber *string, subSequenceNumber *int) error
+	Checkpoint(pair SequencePair, retryCount int) error
 	CheckpointWithRetry(sequenceNumber *string, subSequenceNumber *int, retryCount int) error
 	Shutdown()
 }
@ -31,93 +28,6 @@ func (ce CheckpointError) Error() string {
 	return ce.e
 }
 // checkpointer exchanges checkpoint actions with the peer on the other side of
 // its ioHandler.
 type checkpointer struct {
 	// mux is held by Checkpoint for the whole write-then-read exchange.
 	mux sync.Mutex
 	// ioHandler is used to write actions, read reply lines and decode them.
 	ioHandler ioHandler
 }
 // getAction reads one line from the ioHandler and decodes it into an action.
 // A read or decode failure is returned with a nil action.
 func (c *checkpointer) getAction() (interface{}, error) {
 	raw, readErr := c.ioHandler.readLine()
 	if readErr != nil {
 		return nil, readErr
 	}
 	parsed, parseErr := c.ioHandler.loadAction(raw.String())
 	if parseErr != nil {
 		return nil, parseErr
 	}
 	return parsed, nil
 }
 // Checkpoint writes a "checkpoint" action carrying the given sequence numbers and
 // then reads and decodes the reply.  The whole exchange runs while holding c.mux.
 //
 // It returns the write, read or decode error if one occurs, an error if the reply
 // is not an ActionCheckpoint, and a CheckpointError when the reply carries a
 // non-empty Error field.
 func (c *checkpointer) Checkpoint(sequenceNumber *string, subSequenceNumber *int) error {
 	c.mux.Lock()
 	defer c.mux.Unlock()
 	// If the request could not be written there is no reply to wait for, so bail
 	// out instead of blocking on readLine.
 	if err := c.ioHandler.writeAction(ActionCheckpoint{
 		Action:            "checkpoint",
 		SequenceNumber:    sequenceNumber,
 		SubSequenceNumber: subSequenceNumber,
 	}); err != nil {
 		return err
 	}
 	line, err := c.ioHandler.readLine()
 	if err != nil {
 		return err
 	}
 	actionI, err := c.ioHandler.loadAction(line.String())
 	if err != nil {
 		return err
 	}
 	action, ok := actionI.(ActionCheckpoint)
 	if !ok {
 		return fmt.Errorf("expected checkpoint response, got '%s'", line.String())
 	}
 	if action.Error != nil && *action.Error != "" {
 		return CheckpointError{
 			e: *action.Error,
 		}
 	}
 	return nil
 }
 // CheckpointWithRetry tries to save a checkpoint up to `retryCount` + 1 times,
 // sleeping 5 seconds between attempts.  A negative `retryCount` is treated as 0 so
 // that at least one attempt is always made.
 //
 // It returns nil as soon as one attempt succeeds.  A "ShutdownException" reply
 // aborts immediately with an error; any other failure is retried until the attempts
 // are exhausted, at which point an error is returned.
 func (c *checkpointer) CheckpointWithRetry(
 	sequenceNumber *string, subSequenceNumber *int, retryCount int,
 ) error {
 	if retryCount < 0 {
 		// Without this the loop below would not run and success would be reported
 		// without any checkpoint having been attempted.
 		retryCount = 0
 	}
 	sleepDuration := 5 * time.Second
 	for n := 0; n <= retryCount; n++ {
 		err := c.Checkpoint(sequenceNumber, subSequenceNumber)
 		if err == nil {
 			return nil
 		}
 		if cperr, ok := err.(CheckpointError); ok {
 			switch cperr.Error() {
 			case "ShutdownException":
 				return fmt.Errorf("Encountered shutdown exception, skipping checkpoint")
 			case "ThrottlingException":
 				fmt.Fprintf(os.Stderr, "Was throttled while checkpointing, will attempt again in %s\n", sleepDuration)
 			case "InvalidStateException":
 				fmt.Fprintf(os.Stderr, "MultiLangDaemon reported an invalid state while checkpointing\n")
 			default:
 				fmt.Fprintf(os.Stderr, "Encountered an error while checkpointing: %s\n", err)
 			}
 		}
 		if n == retryCount {
 			// n is zero-based, so n+1 attempts have been made at this point.
 			return fmt.Errorf("Failed to checkpoint after %d attempts, giving up.", n+1)
 		}
 		time.Sleep(sleepDuration)
 	}
 	// Not reachable: the loop always returns on its final iteration.
 	return nil
 }
 // Shutdown issues a checkpoint with nil sequence numbers, allowing up to 5
 // retries.  The outcome is discarded.
 func (c *checkpointer) Shutdown() {
 	const retries = 5
 	_ = c.CheckpointWithRetry(nil, nil, retries)
 }
 type ioHandler struct {
 	inputFile  io.Reader
 	outputFile io.Writer
@ -134,13 +44,13 @@ func (i ioHandler) writeError(message string) {
 	fmt.Fprintf(i.errorFile, "%s\n", message)
 }
-func (i ioHandler) readLine() (*bytes.Buffer, error) {
+func (i ioHandler) readLine() (string, error) {
 	bio := bufio.NewReader(i.inputFile)
 	line, err := bio.ReadString('\n')
 	if err != nil {
-		return nil, err
+		return "", err
 	}
-	return bytes.NewBufferString(line), nil
+	return line, nil
 }
 type ActionInitialize struct {
@ -197,6 +107,8 @@ func (i ioHandler) loadAction(line string) (interface{}, error) {
 			return nil, err
 		}
 		return actionProcessRecords, nil
 	case "shutdownRequested":
 		fallthrough
 	case "shutdown":
 		var actionShutdown ActionShutdown
 		if err := json.Unmarshal(lineBytes, &actionShutdown); err != nil {
@ -223,25 +135,37 @@ func (i ioHandler) writeAction(action interface{}) error {
 	return nil
 }
-func New(inputFile io.Reader, outputFile, errorFile io.Writer, recordProcessor RecordProcessor) *KCLProcess {
+func New(
 	inputFile io.Reader, outputFile, errorFile io.Writer, recordProcessor RecordProcessor,
 ) *KCLProcess {
 	i := ioHandler{
 		inputFile:  inputFile,
 		outputFile: outputFile,
 		errorFile:  errorFile,
 	}
 	return &KCLProcess{
-		ioHandler: i,
+		ioHandler:       i,
 		checkpointer: &checkpointer{
 			ioHandler: i,
 		},
 		recordProcessor: recordProcessor,
 		next:   make(chan struct{}),
 		out:    make(chan string),
 		outErr: make(chan error),
 		checkpoint:    make(chan SequencePair),
 		checkpointErr: make(chan error),
 	}
 }
 // KCLProcess reads actions line by line through its ioHandler and dispatches them
 // to a RecordProcessor.  The channel fields connect the main loop to the goroutine
 // started by startLineProcessor.
 type KCLProcess struct {
 	// ioHandler performs the line reads and action writes.
 	ioHandler       ioHandler
 	// checkpointer is populated by New.
 	checkpointer    Checkpointer
 	// recordProcessor receives the Initialize, ProcessRecords and Shutdown calls.
 	recordProcessor RecordProcessor
 	// next is signalled by processNextLine to request that one more line be read.
 	next   chan struct{}
 	// out delivers a successfully read line.
 	out    chan string
 	// outErr delivers the error from a failed line read.
 	outErr chan error
 	// checkpoint carries the pair to be checkpointed to the line-processor goroutine.
 	checkpoint    chan SequencePair
 	// checkpointErr carries the result of that checkpoint (nil on success) back.
 	checkpointErr chan error
 }
 func (kclp *KCLProcess) reportDone(responseFor string) error {
@ -257,13 +181,13 @@ func (kclp *KCLProcess) reportDone(responseFor string) error {
 func (kclp *KCLProcess) performAction(a interface{}) (string, error) {
 	switch action := a.(type) {
 	case ActionInitialize:
-		return action.Action, kclp.recordProcessor.Initialize(action.ShardID, kclp.checkpointer)
+		return action.Action, kclp.recordProcessor.Initialize(action.ShardID, kclp)
 	case ActionProcessRecords:
 		return action.Action, kclp.recordProcessor.ProcessRecords(action.Records)
 	case ActionShutdown:
 		return action.Action, kclp.recordProcessor.Shutdown(action.Reason)
 	default:
-		return "", fmt.Errorf("unknown action to dispatch: %s", action)
+		return "", fmt.Errorf("unknown action to dispatch: %+#v", action)
 	}
 }
@ -280,20 +204,124 @@ func (kclp *KCLProcess) handleLine(line string) error {
 	return kclp.reportDone(responseFor)
 }
-func (kclp *KCLProcess) Run() {
+func (kclp *KCLProcess) Checkpoint(pair SequencePair, retryCount int) error {
-	for {
+	sleepDuration := 5 * time.Second
-		line, err := kclp.ioHandler.readLine()
+
-		if err != nil {
+	for n := 0; n <= retryCount; n++ {
-			kclp.ioHandler.writeError("Read line error: " + err.Error())
+		kclp.checkpoint <- pair
-			return
+		err := <-kclp.checkpointErr
-		} else if line == nil {
+		if err == nil {
-			kclp.ioHandler.writeError("Empty read line recieved")
+			return nil
 			return
 		}
-		err = kclp.handleLine(line.String())
+		if cperr, ok := err.(CheckpointError); ok {
-		if err != nil {
+			switch cperr.Error() {
-			kclp.ioHandler.writeError("Handle line error: " + err.Error())
+			case "ShutdownException":
 				return fmt.Errorf("Encountered shutdown exception, skipping checkpoint")
 			case "ThrottlingException":
 				fmt.Fprintf(os.Stderr, "Checkpointing throttling, pause for %s\n", sleepDuration)
 			case "InvalidStateException":
 				fmt.Fprintf(os.Stderr, "MultiLangDaemon invalid state while checkpointing\n")
 			default:
 				fmt.Fprintf(os.Stderr, "Encountered an error while checkpointing: %s", err)
 			}
 		}
 		if n == retryCount {
 			return fmt.Errorf("Failed to checkpoint after %d attempts, giving up.", retryCount)
 		}
 		time.Sleep(sleepDuration)
 	}
 	return nil
 }
 // Shutdown checkpoints an empty SequencePair, allowing up to 5 retries.  The
 // outcome is discarded.
 func (kclp *KCLProcess) Shutdown() {
 	var empty SequencePair
 	_ = kclp.Checkpoint(empty, 5)
 }
 // processCheckpoint writes a "checkpoint" action for pair and then reads and
 // decodes the reply.  An empty pair is sent with nil sequence numbers.
 //
 // It returns the write, read or decode error if one occurs, an error if the reply
 // is not an ActionCheckpoint, and a CheckpointError when the reply carries a
 // non-empty Error field.
 func (kclp *KCLProcess) processCheckpoint(pair SequencePair) error {
 	var seq *string
 	var subSeq *int
 	if !pair.IsEmpty() { // an empty pair is a signal to shutdown
 		tmp := pair.Sequence.String()
 		seq = &tmp
 		subSeq = &pair.SubSequence
 	}
 	// If the request could not be written there is no reply to wait for, so bail
 	// out instead of blocking on readLine.
 	if err := kclp.ioHandler.writeAction(ActionCheckpoint{
 		Action:            "checkpoint",
 		SequenceNumber:    seq,
 		SubSequenceNumber: subSeq,
 	}); err != nil {
 		return err
 	}
 	line, err := kclp.ioHandler.readLine()
 	if err != nil {
 		return err
 	}
 	actionI, err := kclp.ioHandler.loadAction(line)
 	if err != nil {
 		return err
 	}
 	action, ok := actionI.(ActionCheckpoint)
 	if !ok {
 		return fmt.Errorf("expected checkpoint response, got '%s'", line)
 	}
 	if action.Error != nil && *action.Error != "" {
 		return CheckpointError{e: *action.Error}
 	}
 	return nil
 }
 // startLineProcessor launches a goroutine that loops forever serving two kinds of
 // request from a single select:
 //   - a signal on next: read one line and deliver it on out, or the read error on
 //     outErr;
 //   - a pair on checkpoint: run processCheckpoint and deliver its result on
 //     checkpointErr.
 func (kclp *KCLProcess) startLineProcessor(
 	next chan struct{}, out chan string, outErr chan error,
 	checkpoint chan SequencePair, checkpointErr chan error,
 ) {
 	serve := func() {
 		for {
 			select {
 			case <-next:
 				text, readErr := kclp.ioHandler.readLine()
 				if readErr != nil {
 					outErr <- readErr
 					continue
 				}
 				out <- text
 			case req := <-checkpoint:
 				checkpointErr <- kclp.processCheckpoint(req)
 			}
 		}
 	}
 	go serve()
 }
 // processNextLine requests one line via kclp.next and waits for the outcome.  A
 // read error is returned as is, an empty line is reported as an error, and any
 // other line is passed to handleLine whose result is returned.
 func (kclp *KCLProcess) processNextLine() error {
 	kclp.next <- struct{}{} // ask for the next line
 	select {
 	case readErr := <-kclp.outErr:
 		return readErr
 	case text := <-kclp.out:
 		if text == "" {
 			return fmt.Errorf("Empty read line recieved")
 		}
 		return kclp.handleLine(text)
 	}
 }
 func (kclp *KCLProcess) Run() {
 	kclp.startLineProcessor(kclp.next, kclp.out, kclp.outErr, kclp.checkpoint, kclp.checkpointErr)
 	for {
 		err := kclp.processNextLine()
 		if err == io.EOF {
 			kclp.ioHandler.writeError("IO stream closed")
 			return
 		} else if err != nil {
 			kclp.ioHandler.writeError(fmt.Sprintf("ERR Handle line: %+#v", err))
 			return
 		}
 	}