KCL: Add support for handling shard split
Add support for handling child/parent shards. When processing a child shard, the consumer has to wait until the parent shard has been fully processed before processing the child itself.

Change-Id: I8bbf104c22ae93409d856be9c6829988c1b2d7eb
parent c05bfb7ac8
commit 869a8e4275
4 changed files with 81 additions and 12 deletions
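To make the new behavior concrete before the diff, here is a minimal, self-contained sketch of the wait-on-parent pattern this change introduces. The names checkpointStore, waitOnParent, and fakeStore are invented for this sketch and are not part of the library; the actual implementation is waitOnParentShard together with the SHARD_END checkpoint value shown in the hunks below.

package main

import (
	"errors"
	"fmt"
	"time"
)

// SHARD_END marks a shard whose records have all been processed,
// mirroring the sentinel checkpoint value introduced by this commit.
const SHARD_END = "SHARD_END"

// checkpointStore is a stand-in for the checkpointer: it only needs to
// report the latest checkpoint recorded for a shard ID.
type checkpointStore interface {
	Checkpoint(shardID string) (string, error)
}

// waitOnParent polls the parent shard's checkpoint until it reads
// SHARD_END, i.e. until the parent has been fully processed.
func waitOnParent(store checkpointStore, parentShardID string, pollInterval time.Duration) error {
	if parentShardID == "" {
		return nil // not a child shard, nothing to wait for
	}
	for {
		cp, err := store.Checkpoint(parentShardID)
		if err != nil {
			return err
		}
		if cp == SHARD_END {
			return nil // parent finished; safe to consume the child
		}
		time.Sleep(pollInterval)
	}
}

// fakeStore simulates a parent shard that finishes after a few polls.
type fakeStore struct{ polls int }

func (f *fakeStore) Checkpoint(shardID string) (string, error) {
	if shardID == "" {
		return "", errors.New("missing shard id")
	}
	f.polls++
	if f.polls >= 3 {
		return SHARD_END, nil
	}
	return "49590338271490256608559692538361571095921575989136588898", nil
}

func main() {
	store := &fakeStore{}
	if err := waitOnParent(store, "shardId-000000000000", 10*time.Millisecond); err != nil {
		fmt.Println("error waiting for parent:", err)
		return
	}
	fmt.Println("parent finished, child shard can now be consumed")
}

Polling the parent's checkpoint keeps the child consumer idle until record ordering across the split is preserved, at the cost of one checkpoint read per poll interval.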
@@ -19,6 +19,10 @@ const (
 	LEASE_OWNER_KEY = "AssignedTo"
 	LEASE_TIMEOUT_KEY = "LeaseTimeout"
 	CHECKPOINT_SEQUENCE_NUMBER_KEY = "Checkpoint"
+	PARENT_SHARD_ID_KEY = "ParentShardId"
+
+	// We've completely processed all records in this shard.
+	SHARD_END = "SHARD_END"
 
 	// ErrLeaseNotAquired is returned when we failed to get a lock on the shard
 	ErrLeaseNotAquired = "Lease is already held by another node"
@@ -124,6 +128,10 @@ func (checkpointer *DynamoCheckpoint) GetLease(shard *shardStatus, newAssignTo s
 		},
 	}
 
+	if len(shard.ParentShardId) > 0 {
+		marshalledCheckpoint[PARENT_SHARD_ID_KEY] = &dynamodb.AttributeValue{S: &shard.ParentShardId}
+	}
+
 	if shard.Checkpoint != "" {
 		marshalledCheckpoint[CHECKPOINT_SEQUENCE_NUMBER_KEY] = &dynamodb.AttributeValue{
 			S: &shard.Checkpoint,
@@ -165,6 +173,11 @@ func (checkpointer *DynamoCheckpoint) CheckpointSequence(shard *shardStatus) err
 			S: &leaseTimeout,
 		},
 	}
 
+	if len(shard.ParentShardId) > 0 {
+		marshalledCheckpoint[PARENT_SHARD_ID_KEY] = &dynamodb.AttributeValue{S: &shard.ParentShardId}
+	}
+
 	return checkpointer.saveItem(marshalledCheckpoint)
 }
@@ -45,7 +45,14 @@ func (pc *PreparedCheckpointer) Checkpoint() error {
 
 func (rc *RecordProcessorCheckpointer) Checkpoint(sequenceNumber *string) error {
 	rc.shard.mux.Lock()
-	rc.shard.Checkpoint = aws.StringValue(sequenceNumber)
+
+	// checkpoint the last sequence of a closed shard
+	if rc.shard.EndingSequenceNumber == aws.StringValue(sequenceNumber) {
+		rc.shard.Checkpoint = SHARD_END
+	} else {
+		rc.shard.Checkpoint = aws.StringValue(sequenceNumber)
+	}
+
 	rc.shard.mux.Unlock()
 	return rc.checkpoint.CheckpointSequence(rc.shard)
 }
@@ -96,6 +96,12 @@ func (sc *ShardConsumer) getShardIterator(shard *shardStatus) (*string, error) {
 func (sc *ShardConsumer) getRecords(shard *shardStatus) error {
 	defer sc.waitGroup.Done()
 
+	// If this is a child shard, wait until the parent shard has finished.
+	if err := sc.waitOnParentShard(shard); err != nil {
+		log.Errorf("Error in waiting for parent shard: %v to finish. Error: %+v", shard.ParentShardId, err)
+		return err
+	}
+
 	shardIterator, err := sc.getShardIterator(shard)
 	if err != nil {
 		log.Errorf("Unable to get shard iterator for %s: %v", shard.ID, err)
@@ -208,3 +214,28 @@ func (sc *ShardConsumer) getRecords(shard *shardStatus) error {
 		}
 	}
 }
+
+// Need to wait until the parent shard finished
+func (sc *ShardConsumer) waitOnParentShard(shard *shardStatus) error {
+	if len(shard.ParentShardId) == 0 {
+		return nil
+	}
+
+	pshard := &shardStatus{
+		ID:  shard.ParentShardId,
+		mux: &sync.Mutex{},
+	}
+
+	for {
+		if err := sc.checkpointer.FetchCheckpoint(pshard); err != nil {
+			return err
+		}
+
+		// Parent shard is finished.
+		if pshard.Checkpoint == SHARD_END {
+			return nil
+		}
+
+		time.Sleep(time.Duration(sc.kclConfig.ParentShardPollIntervalMillis) * time.Millisecond)
+	}
+}
@@ -22,11 +22,16 @@ import (
 )
 
 type shardStatus struct {
 	ID string
+	ParentShardId string
 	Checkpoint string
 	AssignedTo string
 	mux *sync.Mutex
 	LeaseTimeout time.Time
+	// Shard Range
+	StartingSequenceNumber string
+	// child shard doesn't have end sequence number
+	EndingSequenceNumber string
 }
 
 func (ss *shardStatus) getLeaseOwner() string {
@@ -214,19 +219,29 @@ func (w *Worker) eventLoop() {
 
 		err := w.checkpointer.FetchCheckpoint(shard)
 		if err != nil {
+			// The checkpoint may not exist yet; that is not an error condition.
 			if err != ErrSequenceIDNotFound {
-				log.Fatal(err)
+				log.Error(err)
+				// move on to next shard
+				continue
 			}
 		}
 
+		// The shard is closed and we have processed all records
+		if shard.Checkpoint == SHARD_END {
+			continue
+		}
+
 		err = w.checkpointer.GetLease(shard, w.workerID)
 		if err != nil {
-			if err.Error() == ErrLeaseNotAquired {
-				continue
+			// cannot get lease on the shard
+			if err.Error() != ErrLeaseNotAquired {
+				log.Error(err)
 			}
-			log.Fatal(err)
+			continue
 		}
 
+		// log metrics on got lease
 		w.mService.LeaseGained(shard.ID)
 
 		log.Infof("Start Shard Consumer for shard: %v", shard.ID)
@@ -272,8 +287,11 @@ func (w *Worker) getShardIDs(startShardID string) error {
 		if _, ok := w.shardStatus[*s.ShardId]; !ok {
 			log.Debugf("Found shard with id %s", *s.ShardId)
 			w.shardStatus[*s.ShardId] = &shardStatus{
 				ID: *s.ShardId,
-				mux: &sync.Mutex{},
+				ParentShardId: aws.StringValue(s.ParentShardId),
+				mux: &sync.Mutex{},
+				StartingSequenceNumber: aws.StringValue(s.SequenceNumberRange.StartingSequenceNumber),
+				EndingSequenceNumber: aws.StringValue(s.SequenceNumberRange.EndingSequenceNumber),
 			}
 		}
 		lastShardID = *s.ShardId