fix: add hard cap maxRetries for getRecord errors

Signed-off-by: Shiva Pentakota <spentakota@vmware.com>
Shiva Pentakota 2023-01-24 11:59:32 -08:00
parent 66006caf89
commit b5515931d1
3 changed files with 30 additions and 0 deletions


@@ -136,6 +136,9 @@ const (
// DefaultLeaseSyncingIntervalMillis Number of milliseconds to wait before syncing with lease table (dynamodDB)
DefaultLeaseSyncingIntervalMillis = 60000
// DefaultMaxRetryCount The default maximum number of retries in case of error
DefaultMaxRetryCount = 5
)
type (
@@ -283,6 +286,9 @@ type (
// LeaseSyncingTimeInterval The number of milliseconds to wait before syncing with lease table (dynamoDB)
LeaseSyncingTimeIntervalMillis int
// MaxRetryCount The maximum number of retries in case of error
MaxRetryCount int
}
)


@@ -102,6 +102,7 @@ func NewKinesisClientLibConfigWithCredentials(applicationName, streamName, regio
LeaseStealingIntervalMillis: DefaultLeaseStealingIntervalMillis,
LeaseStealingClaimTimeoutMillis: DefaultLeaseStealingClaimTimeoutMillis,
LeaseSyncingTimeIntervalMillis: DefaultLeaseSyncingIntervalMillis,
MaxRetryCount: DefaultMaxRetryCount,
Logger: logger.GetDefaultLogger(),
}
}
@@ -211,6 +212,13 @@ func (c *KinesisClientLibConfiguration) WithLogger(logger logger.Logger) *Kinesi
return c
}
// WithMaxRetryCount sets the max retry count in case of error.
func (c *KinesisClientLibConfiguration) WithMaxRetryCount(maxRetryCount int) *KinesisClientLibConfiguration {
checkIsValuePositive("maxRetryCount", maxRetryCount)
c.MaxRetryCount = maxRetryCount
return c
}
// WithMonitoringService sets the monitoring service to use to publish metrics.
func (c *KinesisClientLibConfiguration) WithMonitoringService(mService metrics.MonitoringService) *KinesisClientLibConfiguration {
// Nil case is handled downward (at worker creation) so no need to do it here.
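A quick usage sketch of the new option. It assumes this repository is vmware-go-kcl-v2 and that the library's existing NewKinesisClientLibConfig constructor and import path are as shown (neither appears in this diff); the application, stream, region, and worker names are placeholders. Callers that never call WithMaxRetryCount pick up DefaultMaxRetryCount (5) from the constructor change above.

```go
package main

import (
	"fmt"

	// Assumed import path for the config package; not shown in this diff.
	cfg "github.com/vmware/vmware-go-kcl-v2/clientlibrary/config"
)

func main() {
	// Build a configuration the usual way, then raise the new getRecords
	// retry cap above the default of 5 using the setter added by this commit.
	kclConfig := cfg.NewKinesisClientLibConfig("my-app", "my-stream", "us-west-2", "worker-1").
		WithMaxRetryCount(10)

	fmt.Println("max getRecords retries:", kclConfig.MaxRetryCount) // 10
}
```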


@@ -157,6 +157,14 @@ func (sc *PollingShardConsumer) getRecords() error {
var throughputExceededErr *types.ProvisionedThroughputExceededException
var kmsThrottlingErr *types.KMSThrottlingException
if errors.As(err, &throughputExceededErr) || err == localTPSExceededError {
retriedErrors++
if retriedErrors > sc.kclConfig.MaxRetryCount {
log.Errorf("message", "reached max retry count getting records from shard",
"shardId", sc.shard.ID,
"retryCount", retriedErrors,
"error", err)
return err
}
// If there is insufficient provisioned throughput on the stream,
// subsequent calls made within the next 1 second throw ProvisionedThroughputExceededException.
// ref: https://docs.aws.amazon.com/streams/latest/dev/service-sizes-and-limits.html
@@ -166,6 +174,14 @@ func (sc *PollingShardConsumer) getRecords() error {
if errors.As(err, &kmsThrottlingErr) {
log.Errorf("Error getting records from shard %v: %+v", sc.shard.ID, err)
retriedErrors++
// Compare with > (not >=) so the final allowed retry still runs before giving up
if retriedErrors > sc.kclConfig.MaxRetryCount {
log.Errorf("message", "reached max retry count getting records from shard",
"shardId", sc.shard.ID,
"retryCount", retriedErrors,
"error", err)
return err
}
// exponential backoff
// https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Programming.Errors.html#Programming.Errors.RetryAndBackoff
time.Sleep(time.Duration(math.Exp2(float64(retriedErrors))*100) * time.Millisecond)
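For a sense of what the new cap means in wall-clock time on the KMSThrottlingException path: the diff sleeps 2^n * 100 ms after the n-th retried error and returns the error once the count exceeds MaxRetryCount. The self-contained sketch below just evaluates that formula for the default cap of 5; it uses no library code.

```go
package main

import (
	"fmt"
	"math"
	"time"
)

func main() {
	const maxRetryCount = 5 // DefaultMaxRetryCount introduced by this commit
	var total time.Duration
	for retriedErrors := 1; retriedErrors <= maxRetryCount; retriedErrors++ {
		// Same backoff formula as the diff: 2^n * 100ms
		sleep := time.Duration(math.Exp2(float64(retriedErrors))*100) * time.Millisecond
		total += sleep
		fmt.Printf("retry %d: sleep %v\n", retriedErrors, sleep) // 200ms, 400ms, 800ms, 1.6s, 3.2s
	}
	fmt.Printf("total backoff before the error is returned: %v\n", total) // 6.2s
}
```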