KCL: processing stuck after Kinesis shard splitting

Kinesis processing gets stuck after a shard split. The reason is
that the app does not perform the mandatory checkpoint.

The KCL documentation states:
// When the value of {@link ShutdownInput#getShutdownReason()} is
// {@link com.amazonaws.services.kinesis.clientlibrary.lib.worker.ShutdownReason#TERMINATE} it is required that you
// checkpoint. Failure to do so will result in an IllegalArgumentException, and the KCL no longer making progress.
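
For reference, a minimal sketch of the required handling (the
recordProcessor type and receiver are illustrative; ShutdownInput,
TERMINATE and the Checkpointer call are the identifiers used in the
test change at the bottom of this commit):

	func (p *recordProcessor) Shutdown(input *kc.ShutdownInput) {
		// On TERMINATE (e.g. after a shard split) the KCL requires a final
		// checkpoint; passing nil checkpoints the closed shard at SHARD_END.
		if input.ShutdownReason == kc.TERMINATE {
			input.Checkpointer.Checkpoint(nil)
		}
	}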

Also, fix shard leasing so that one host cannot take more shards
than its configuration allows.
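
The lease change, roughly (a simplified sketch of the worker's
eventLoop; the counting of already-owned shards is assumed, and error
handling and checkpoint fetching are omitted; see the worker hunks
below for the actual code):

	// count the leases this worker currently holds
	counter := 0
	for _, shard := range w.shardStatus {
		if shard.getLeaseOwner() == w.workerID {
			counter++
		}
	}

	// max number of leases has not been reached yet
	if counter < w.kclConfig.MaxLeasesForWorker {
		for _, shard := range w.shardStatus {
			// already owner of the shard
			if shard.getLeaseOwner() == w.workerID {
				continue
			}
			// ... acquire the lease and start a shard consumer ...
			// exit from the loop and do not grab more shards for now
			break
		}
	}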

Jira CNA-1701

Change-Id: Icbdacaf347c7a67b5793647ad05ff93cca629741
Tao Jiang 2018-08-06 20:49:15 -07:00
parent 48fd4dd51c
commit e2a945d824
4 changed files with 20 additions and 10 deletions

@@ -246,7 +246,7 @@ func (checkpointer *DynamoCheckpoint) doesTableExist() bool {
 		TableName: aws.String(checkpointer.TableName),
 	}
 	_, err := checkpointer.svc.DescribeTable(input)
-	return (err == nil)
+	return err == nil
 }

 func (checkpointer *DynamoCheckpoint) saveItem(item map[string]*dynamodb.AttributeValue) error {

@@ -47,7 +47,7 @@ func (rc *RecordProcessorCheckpointer) Checkpoint(sequenceNumber *string) error
 	rc.shard.mux.Lock()
 	// checkpoint the last sequence of a closed shard
-	if rc.shard.EndingSequenceNumber == aws.StringValue(sequenceNumber) {
+	if sequenceNumber == nil {
 		rc.shard.Checkpoint = SHARD_END
 	} else {
 		rc.shard.Checkpoint = aws.StringValue(sequenceNumber)

@@ -146,7 +146,7 @@ func (w *Worker) initialize() error {
 	err := w.metricsConfig.Init(w.kclConfig.ApplicationName, w.streamName, w.workerID)
 	if err != nil {
-		log.Errorf("Failed to start monitoring service: %s", err)
+		log.Errorf("Failed to start monitoring service: %+v", err)
 	}

 	w.mService = w.metricsConfig.GetMonitoringService()
@@ -195,9 +195,8 @@ func (w *Worker) eventLoop() {
 	for {
 		err := w.syncShard()
 		if err != nil {
-			log.Errorf("Error getting Kinesis shards: %v", err)
-			// Back-off?
-			time.Sleep(500 * time.Millisecond)
+			log.Errorf("Error getting Kinesis shards: %+v", err)
+			time.Sleep(time.Duration(w.kclConfig.ShardSyncIntervalMillis) * time.Millisecond)
 		}

 		log.Infof("Found %d shards", len(w.shardStatus))
@@ -210,17 +209,17 @@ func (w *Worker) eventLoop() {
 			}
 		}

-		// max number of lease has not been reached
+		// max number of lease has not been reached yet
 		if counter < w.kclConfig.MaxLeasesForWorker {
 			for _, shard := range w.shardStatus {
-				// We already own this shard so carry on
+				// already owner of the shard
 				if shard.getLeaseOwner() == w.workerID {
 					continue
 				}

 				err := w.checkpointer.FetchCheckpoint(shard)
 				if err != nil {
-					// checkpoint may not existed yet if not an error condition.
+					// checkpoint may not existed yet is not an error condition.
 					if err != ErrSequenceIDNotFound {
 						log.Error(err)
 						// move on to next shard
@@ -249,6 +248,8 @@ func (w *Worker) eventLoop() {
 				sc := w.newShardConsumer(shard)
 				go sc.getRecords(shard)
 				w.waitGroup.Add(1)
+				// exit from for loop and not to grab more shard for now.
+				break
 			}
 		}
@@ -272,16 +273,18 @@ func (w *Worker) getShardIDs(startShardID string, shardInfo map[string]bool) err
 	args := &kinesis.DescribeStreamInput{
 		StreamName: aws.String(w.streamName),
 	}
 	if startShardID != "" {
 		args.ExclusiveStartShardId = aws.String(startShardID)
 	}
 	streamDesc, err := w.kc.DescribeStream(args)
 	if err != nil {
 		return err
 	}

 	if *streamDesc.StreamDescription.StreamStatus != "ACTIVE" {
-		return errors.New("Stream not active")
+		return errors.New("stream not active")
 	}

 	var lastShardID string

@@ -154,4 +154,11 @@ func (dd *dumpRecordProcessor) ProcessRecords(input *kc.ProcessRecordsInput) {
 func (dd *dumpRecordProcessor) Shutdown(input *kc.ShutdownInput) {
 	dd.t.Logf("Shutdown Reason: %v", aws.StringValue(kc.ShutdownReasonMessage(input.ShutdownReason)))
+
+	// When the value of {@link ShutdownInput#getShutdownReason()} is
+	// {@link com.amazonaws.services.kinesis.clientlibrary.lib.worker.ShutdownReason#TERMINATE} it is required that you
+	// checkpoint. Failure to do so will result in an IllegalArgumentException, and the KCL no longer making progress.
+	if input.ShutdownReason == kc.TERMINATE {
+		input.Checkpointer.Checkpoint(nil)
+	}
 }