diff --git a/amazon-kinesis-client/src/main/java/software/amazon/kinesis/coordinator/PeriodicShardSyncManager.java b/amazon-kinesis-client/src/main/java/software/amazon/kinesis/coordinator/PeriodicShardSyncManager.java index c84547e2..fdc359bd 100644 --- a/amazon-kinesis-client/src/main/java/software/amazon/kinesis/coordinator/PeriodicShardSyncManager.java +++ b/amazon-kinesis-client/src/main/java/software/amazon/kinesis/coordinator/PeriodicShardSyncManager.java @@ -23,6 +23,7 @@ import lombok.Value; import lombok.experimental.Accessors; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.Validate; +import software.amazon.awssdk.services.cloudwatch.model.StandardUnit; import software.amazon.awssdk.services.kinesis.model.Shard; import software.amazon.awssdk.utils.CollectionUtils; import software.amazon.kinesis.common.HashKeyRangeForLease; @@ -38,6 +39,11 @@ import software.amazon.kinesis.leases.exceptions.DependencyException; import software.amazon.kinesis.leases.exceptions.InvalidStateException; import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException; import software.amazon.kinesis.lifecycle.TaskResult; +import software.amazon.kinesis.metrics.MetricsFactory; +import software.amazon.kinesis.metrics.MetricsLevel; +import software.amazon.kinesis.metrics.MetricsScope; +import software.amazon.kinesis.metrics.MetricsUtil; +import software.amazon.kinesis.metrics.NullMetricsScope; import java.io.Serializable; import java.math.BigInteger; @@ -67,13 +73,11 @@ import static software.amazon.kinesis.common.HashKeyRangeForLease.fromHashKeyRan @Slf4j class PeriodicShardSyncManager { private static final long INITIAL_DELAY = 60 * 1000L; - private static final long PERIODIC_SHARD_SYNC_INTERVAL_MILLIS = 2 * 60 * 1000L; @VisibleForTesting static final BigInteger MIN_HASH_KEY = BigInteger.ZERO; @VisibleForTesting static final BigInteger MAX_HASH_KEY = new BigInteger("2").pow(128).subtract(BigInteger.ONE); - @VisibleForTesting - static final int CONSECUTIVE_HOLES_FOR_TRIGGERING_RECOVERY = 3; + static final String PERODIC_SHARD_SYNC_MANAGER = "PeriodicShardSyncManager"; private Map hashRangeHoleTrackerMap = new HashMap<>(); private final String workerId; @@ -83,19 +87,29 @@ class PeriodicShardSyncManager { private final Function shardSyncTaskManagerProvider; private final ScheduledExecutorService shardSyncThreadPool; private final boolean isMultiStreamingMode; + private final MetricsFactory metricsFactory; + private final long leasesRecoveryAuditorExecutionFrequencyMillis; + private final int leasesRecoveryAuditorInconsistencyConfidenceThreshold; private boolean isRunning; PeriodicShardSyncManager(String workerId, LeaderDecider leaderDecider, LeaseRefresher leaseRefresher, Map currentStreamConfigMap, - Function shardSyncTaskManagerProvider, boolean isMultiStreamingMode) { + Function shardSyncTaskManagerProvider, boolean isMultiStreamingMode, + MetricsFactory metricsFactory, + long leasesRecoveryAuditorExecutionFrequencyMillis, + int leasesRecoveryAuditorInconsistencyConfidenceThreshold) { this(workerId, leaderDecider, leaseRefresher, currentStreamConfigMap, shardSyncTaskManagerProvider, - Executors.newSingleThreadScheduledExecutor(), isMultiStreamingMode); + Executors.newSingleThreadScheduledExecutor(), isMultiStreamingMode, metricsFactory, + leasesRecoveryAuditorExecutionFrequencyMillis, leasesRecoveryAuditorInconsistencyConfidenceThreshold); } PeriodicShardSyncManager(String workerId, LeaderDecider leaderDecider, LeaseRefresher leaseRefresher, Map currentStreamConfigMap, Function shardSyncTaskManagerProvider, - ScheduledExecutorService shardSyncThreadPool, boolean isMultiStreamingMode) { + ScheduledExecutorService shardSyncThreadPool, boolean isMultiStreamingMode, + MetricsFactory metricsFactory, + long leasesRecoveryAuditorExecutionFrequencyMillis, + int leasesRecoveryAuditorInconsistencyConfidenceThreshold) { Validate.notBlank(workerId, "WorkerID is required to initialize PeriodicShardSyncManager."); Validate.notNull(leaderDecider, "LeaderDecider is required to initialize PeriodicShardSyncManager."); this.workerId = workerId; @@ -105,6 +119,9 @@ class PeriodicShardSyncManager { this.shardSyncTaskManagerProvider = shardSyncTaskManagerProvider; this.shardSyncThreadPool = shardSyncThreadPool; this.isMultiStreamingMode = isMultiStreamingMode; + this.metricsFactory = metricsFactory; + this.leasesRecoveryAuditorExecutionFrequencyMillis = leasesRecoveryAuditorExecutionFrequencyMillis; + this.leasesRecoveryAuditorInconsistencyConfidenceThreshold = leasesRecoveryAuditorInconsistencyConfidenceThreshold; } public synchronized TaskResult start() { @@ -116,7 +133,7 @@ class PeriodicShardSyncManager { log.error("Error during runShardSync.", t); } }; - shardSyncThreadPool.scheduleWithFixedDelay(periodicShardSyncer, INITIAL_DELAY, PERIODIC_SHARD_SYNC_INTERVAL_MILLIS, + shardSyncThreadPool.scheduleWithFixedDelay(periodicShardSyncer, INITIAL_DELAY, leasesRecoveryAuditorExecutionFrequencyMillis, TimeUnit.MILLISECONDS); isRunning = true; @@ -157,6 +174,13 @@ class PeriodicShardSyncManager { private void runShardSync() { if (leaderDecider.isLeader(workerId)) { log.info(String.format("WorkerId %s is leader, running the periodic shard sync task", workerId)); + + final MetricsScope scope = MetricsUtil.createMetricsWithOperation(metricsFactory, PERODIC_SHARD_SYNC_MANAGER); + int numStreamsWithPartialLeases = 0; + int numStreamsToSync = 0; + boolean isRunSuccess = false; + final long runStartMillis = System.currentTimeMillis(); + try { // Construct the stream to leases map to be used in the lease sync final Map> streamToLeasesMap = getStreamToLeasesMap( @@ -166,6 +190,10 @@ class PeriodicShardSyncManager { for (Map.Entry streamConfigEntry : currentStreamConfigMap.entrySet()) { final ShardSyncResponse shardSyncResponse = checkForShardSync(streamConfigEntry.getKey(), streamToLeasesMap.get(streamConfigEntry.getKey())); + + numStreamsWithPartialLeases += shardSyncResponse.isHoleDetected() ? 1 : 0; + numStreamsToSync += shardSyncResponse.shouldDoShardSync ? 1 : 0; + if (shardSyncResponse.shouldDoShardSync()) { log.info("Periodic shard syncer initiating shard sync for {} due to the reason - {} ", streamConfigEntry.getKey(), shardSyncResponse.reasonForDecision()); @@ -181,8 +209,14 @@ class PeriodicShardSyncManager { shardSyncResponse.reasonForDecision()); } } + isRunSuccess = true; } catch (Exception e) { log.error("Caught exception while running periodic shard syncer.", e); + } finally { + scope.addData("NumStreamsWithPartialLeases", numStreamsWithPartialLeases, StandardUnit.COUNT, MetricsLevel.SUMMARY); + scope.addData("NumStreamsToSync", numStreamsToSync, StandardUnit.COUNT, MetricsLevel.SUMMARY); + MetricsUtil.addSuccessAndLatency(scope, isRunSuccess, runStartMillis, MetricsLevel.SUMMARY); + scope.end(); } } else { log.debug("WorkerId {} is not a leader, not running the shard sync task", workerId); @@ -214,7 +248,7 @@ class PeriodicShardSyncManager { if (CollectionUtils.isNullOrEmpty(leases)) { // If the leases is null or empty then we need to do shard sync log.info("No leases found for {}. Will be triggering shard sync", streamIdentifier); - return new ShardSyncResponse(true, "No leases found for " + streamIdentifier); + return new ShardSyncResponse(true, false,"No leases found for " + streamIdentifier); } // Check if there are any holes in the leases and return the first hole if present. Optional hashRangeHoleOpt = hasHoleInLeases(streamIdentifier, leases); @@ -227,15 +261,15 @@ class PeriodicShardSyncManager { .computeIfAbsent(streamIdentifier, s -> new HashRangeHoleTracker()); final boolean hasHoleWithHighConfidence = hashRangeHoleTracker .hasHighConfidenceOfHoleWith(hashRangeHoleOpt.get()); - return new ShardSyncResponse(hasHoleWithHighConfidence, + return new ShardSyncResponse(hasHoleWithHighConfidence, true, "Detected same hole for " + hashRangeHoleTracker.getNumConsecutiveHoles() + " times. Shard sync will be initiated when threshold reaches " - + CONSECUTIVE_HOLES_FOR_TRIGGERING_RECOVERY); + + leasesRecoveryAuditorInconsistencyConfidenceThreshold); } else { // If hole is not present, clear any previous tracking for this stream and return false; hashRangeHoleTrackerMap.remove(streamIdentifier); - return new ShardSyncResponse(false, "Hash Ranges are complete for " + streamIdentifier); + return new ShardSyncResponse(false, false, "Hash Ranges are complete for " + streamIdentifier); } } @@ -244,6 +278,7 @@ class PeriodicShardSyncManager { @VisibleForTesting static class ShardSyncResponse { private final boolean shouldDoShardSync; + private final boolean isHoleDetected; private final String reasonForDecision; } @@ -365,7 +400,7 @@ class PeriodicShardSyncManager { private final HashKeyRangeForLease hashRangeAtEndOfPossibleHole; } - private static class HashRangeHoleTracker { + private class HashRangeHoleTracker { private HashRangeHole hashRangeHole; @Getter private Integer numConsecutiveHoles; @@ -377,7 +412,7 @@ class PeriodicShardSyncManager { this.hashRangeHole = hashRangeHole; this.numConsecutiveHoles = 1; } - return numConsecutiveHoles >= CONSECUTIVE_HOLES_FOR_TRIGGERING_RECOVERY; + return numConsecutiveHoles >= leasesRecoveryAuditorInconsistencyConfidenceThreshold; } } diff --git a/amazon-kinesis-client/src/main/java/software/amazon/kinesis/coordinator/Scheduler.java b/amazon-kinesis-client/src/main/java/software/amazon/kinesis/coordinator/Scheduler.java index 3e74e23b..e36f0582 100644 --- a/amazon-kinesis-client/src/main/java/software/amazon/kinesis/coordinator/Scheduler.java +++ b/amazon-kinesis-client/src/main/java/software/amazon/kinesis/coordinator/Scheduler.java @@ -290,7 +290,9 @@ public class Scheduler implements Runnable { this.schedulerInitializationBackoffTimeMillis = this.coordinatorConfig.schedulerInitializationBackoffTimeMillis(); this.leaderElectedPeriodicShardSyncManager = new PeriodicShardSyncManager( leaseManagementConfig.workerIdentifier(), leaderDecider, leaseRefresher, currentStreamConfigMap, - shardSyncTaskManagerProvider, isMultiStreamMode); + shardSyncTaskManagerProvider, isMultiStreamMode, metricsFactory, + leaseManagementConfig.leasesRecoveryAuditorExecutionFrequencyMillis(), + leaseManagementConfig.leasesRecoveryAuditorInconsistencyConfidenceThreshold()); this.leaseCleanupManager = this.leaseManagementConfig.leaseManagementFactory(leaseSerializer, isMultiStreamMode) .createLeaseCleanupManager(metricsFactory); } diff --git a/amazon-kinesis-client/src/main/java/software/amazon/kinesis/leases/LeaseManagementConfig.java b/amazon-kinesis-client/src/main/java/software/amazon/kinesis/leases/LeaseManagementConfig.java index 82c02060..473db5bb 100644 --- a/amazon-kinesis-client/src/main/java/software/amazon/kinesis/leases/LeaseManagementConfig.java +++ b/amazon-kinesis-client/src/main/java/software/amazon/kinesis/leases/LeaseManagementConfig.java @@ -52,6 +52,9 @@ public class LeaseManagementConfig { public static final long DEFAULT_LEASE_CLEANUP_INTERVAL_MILLIS = Duration.ofMinutes(1).toMillis(); public static final long DEFAULT_COMPLETED_LEASE_CLEANUP_INTERVAL_MILLIS = Duration.ofMinutes(5).toMillis(); public static final long DEFAULT_GARBAGE_LEASE_CLEANUP_INTERVAL_MILLIS = Duration.ofMinutes(30).toMillis(); + public static final long DEFAULT_PERIODIC_SHARD_SYNC_INTERVAL_MILLIS = 2 * 60 * 1000L; + public static final int DEFAULT_CONSECUTIVE_HOLES_FOR_TRIGGERING_LEASE_RECOVERY = 3; + public static final LeaseCleanupConfig DEFAULT_LEASE_CLEANUP_CONFIG = LeaseCleanupConfig.builder() .leaseCleanupIntervalMillis(DEFAULT_LEASE_CLEANUP_INTERVAL_MILLIS) @@ -195,6 +198,20 @@ public class LeaseManagementConfig { private BillingMode billingMode = BillingMode.PROVISIONED; + /** + * Frequency (in millis) of the auditor job to scan for partial leases in the lease table. + * If the auditor detects any hole in the leases for a stream, then it would trigger shard sync based on + * {@link #leasesRecoveryAuditorInconsistencyConfidenceThreshold} + */ + private long leasesRecoveryAuditorExecutionFrequencyMillis = DEFAULT_PERIODIC_SHARD_SYNC_INTERVAL_MILLIS; + + /** + * Confidence threshold for the periodic auditor job to determine if leases for a stream in the lease table + * is inconsistent. If the auditor finds same set of inconsistencies consecutively for a stream for this many times, + * then it would trigger a shard sync. + */ + private int leasesRecoveryAuditorInconsistencyConfidenceThreshold = DEFAULT_CONSECUTIVE_HOLES_FOR_TRIGGERING_LEASE_RECOVERY; + /** * The initial position for getting records from Kinesis streams. * diff --git a/amazon-kinesis-client/src/test/java/software/amazon/kinesis/coordinator/PeriodicShardSyncManagerTest.java b/amazon-kinesis-client/src/test/java/software/amazon/kinesis/coordinator/PeriodicShardSyncManagerTest.java index dfba2791..a2047a6b 100644 --- a/amazon-kinesis-client/src/test/java/software/amazon/kinesis/coordinator/PeriodicShardSyncManagerTest.java +++ b/amazon-kinesis-client/src/test/java/software/amazon/kinesis/coordinator/PeriodicShardSyncManagerTest.java @@ -34,6 +34,7 @@ import software.amazon.kinesis.leases.LeaseRefresher; import software.amazon.kinesis.leases.MultiStreamLease; import software.amazon.kinesis.leases.ShardDetector; import software.amazon.kinesis.leases.ShardSyncTaskManager; +import software.amazon.kinesis.metrics.NullMetricsFactory; import software.amazon.kinesis.retrieval.kpl.ExtendedSequenceNumber; import java.math.BigInteger; @@ -49,9 +50,9 @@ import static org.mockito.Matchers.any; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; import static software.amazon.kinesis.common.HashKeyRangeForLease.deserialize; -import static software.amazon.kinesis.coordinator.PeriodicShardSyncManager.CONSECUTIVE_HOLES_FOR_TRIGGERING_RECOVERY; import static software.amazon.kinesis.coordinator.PeriodicShardSyncManager.MAX_HASH_KEY; import static software.amazon.kinesis.coordinator.PeriodicShardSyncManager.MIN_HASH_KEY; +import static software.amazon.kinesis.leases.LeaseManagementConfig.DEFAULT_CONSECUTIVE_HOLES_FOR_TRIGGERING_LEASE_RECOVERY; @RunWith(MockitoJUnitRunner.class) @@ -72,7 +73,7 @@ public class PeriodicShardSyncManagerTest { public void setup() { streamIdentifier = StreamIdentifier.multiStreamInstance("123:stream:456"); periodicShardSyncManager = new PeriodicShardSyncManager("worker", leaderDecider, leaseRefresher, currentStreamConfigMap, - shardSyncTaskManagerProvider, true); + shardSyncTaskManagerProvider, true, new NullMetricsFactory(), 2 * 60 * 1000, 3); } @Test @@ -173,7 +174,7 @@ public class PeriodicShardSyncManagerTest { lease.checkpoint(ExtendedSequenceNumber.TRIM_HORIZON); return lease; }).collect(Collectors.toList()); - IntStream.range(1, CONSECUTIVE_HOLES_FOR_TRIGGERING_RECOVERY).forEach(i -> Assert + IntStream.range(1, DEFAULT_CONSECUTIVE_HOLES_FOR_TRIGGERING_LEASE_RECOVERY).forEach(i -> Assert .assertFalse(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases).shouldDoShardSync())); } @@ -191,7 +192,7 @@ public class PeriodicShardSyncManagerTest { lease.checkpoint(ExtendedSequenceNumber.TRIM_HORIZON); return lease; }).collect(Collectors.toList()); - IntStream.range(1, CONSECUTIVE_HOLES_FOR_TRIGGERING_RECOVERY).forEach(i -> Assert + IntStream.range(1, DEFAULT_CONSECUTIVE_HOLES_FOR_TRIGGERING_LEASE_RECOVERY).forEach(i -> Assert .assertFalse(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases).shouldDoShardSync())); Assert.assertTrue(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases).shouldDoShardSync()); } @@ -214,7 +215,7 @@ public class PeriodicShardSyncManagerTest { } return lease; }).collect(Collectors.toList()); - IntStream.range(1, CONSECUTIVE_HOLES_FOR_TRIGGERING_RECOVERY).forEach(i -> Assert + IntStream.range(1, DEFAULT_CONSECUTIVE_HOLES_FOR_TRIGGERING_LEASE_RECOVERY).forEach(i -> Assert .assertFalse(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases).shouldDoShardSync())); Assert.assertTrue(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases).shouldDoShardSync()); } @@ -233,7 +234,7 @@ public class PeriodicShardSyncManagerTest { lease.checkpoint(ExtendedSequenceNumber.SHARD_END); return lease; }).collect(Collectors.toList()); - IntStream.range(1, CONSECUTIVE_HOLES_FOR_TRIGGERING_RECOVERY).forEach(i -> Assert + IntStream.range(1, DEFAULT_CONSECUTIVE_HOLES_FOR_TRIGGERING_LEASE_RECOVERY).forEach(i -> Assert .assertFalse(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases).shouldDoShardSync())); Assert.assertTrue(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases).shouldDoShardSync()); } @@ -252,7 +253,7 @@ public class PeriodicShardSyncManagerTest { lease.checkpoint(ExtendedSequenceNumber.TRIM_HORIZON); return lease; }).collect(Collectors.toList()); - IntStream.range(1, CONSECUTIVE_HOLES_FOR_TRIGGERING_RECOVERY).forEach(i -> Assert + IntStream.range(1, DEFAULT_CONSECUTIVE_HOLES_FOR_TRIGGERING_LEASE_RECOVERY).forEach(i -> Assert .assertFalse(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases).shouldDoShardSync())); List multiStreamLeases2 = new ArrayList() {{ add(deserialize(MIN_HASH_KEY.toString(), "1")); @@ -267,7 +268,7 @@ public class PeriodicShardSyncManagerTest { return lease; }).collect(Collectors.toList()); // Resetting the holes - IntStream.range(1, CONSECUTIVE_HOLES_FOR_TRIGGERING_RECOVERY).forEach(i -> Assert + IntStream.range(1, DEFAULT_CONSECUTIVE_HOLES_FOR_TRIGGERING_LEASE_RECOVERY).forEach(i -> Assert .assertFalse(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases2).shouldDoShardSync())); Assert.assertTrue(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases2).shouldDoShardSync()); } @@ -286,7 +287,7 @@ public class PeriodicShardSyncManagerTest { lease.checkpoint(ExtendedSequenceNumber.TRIM_HORIZON); return lease; }).collect(Collectors.toList()); - IntStream.range(1, CONSECUTIVE_HOLES_FOR_TRIGGERING_RECOVERY).forEach(i -> Assert + IntStream.range(1, DEFAULT_CONSECUTIVE_HOLES_FOR_TRIGGERING_LEASE_RECOVERY).forEach(i -> Assert .assertFalse(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases).shouldDoShardSync())); List multiStreamLeases2 = new ArrayList() {{ add(deserialize(MIN_HASH_KEY.toString(), "1")); @@ -301,10 +302,10 @@ public class PeriodicShardSyncManagerTest { return lease; }).collect(Collectors.toList()); // Resetting the holes - IntStream.range(1, CONSECUTIVE_HOLES_FOR_TRIGGERING_RECOVERY).forEach(i -> Assert + IntStream.range(1, DEFAULT_CONSECUTIVE_HOLES_FOR_TRIGGERING_LEASE_RECOVERY).forEach(i -> Assert .assertFalse(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases2).shouldDoShardSync())); // Resetting the holes - IntStream.range(1, CONSECUTIVE_HOLES_FOR_TRIGGERING_RECOVERY).forEach(i -> Assert + IntStream.range(1, DEFAULT_CONSECUTIVE_HOLES_FOR_TRIGGERING_LEASE_RECOVERY).forEach(i -> Assert .assertFalse(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases).shouldDoShardSync())); Assert.assertTrue(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases).shouldDoShardSync()); } @@ -347,7 +348,7 @@ public class PeriodicShardSyncManagerTest { }).collect(Collectors.toList()); // Assert that shard sync should never trigger - IntStream.range(1, CONSECUTIVE_HOLES_FOR_TRIGGERING_RECOVERY).forEach(i -> Assert + IntStream.range(1, DEFAULT_CONSECUTIVE_HOLES_FOR_TRIGGERING_LEASE_RECOVERY).forEach(i -> Assert .assertFalse(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases).shouldDoShardSync())); Assert.assertFalse(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases).shouldDoShardSync()); @@ -395,7 +396,7 @@ public class PeriodicShardSyncManagerTest { }).collect(Collectors.toList()); // Assert that shard sync should never trigger - IntStream.range(1, CONSECUTIVE_HOLES_FOR_TRIGGERING_RECOVERY).forEach(i -> Assert + IntStream.range(1, DEFAULT_CONSECUTIVE_HOLES_FOR_TRIGGERING_LEASE_RECOVERY).forEach(i -> Assert .assertFalse(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases).shouldDoShardSync())); Assert.assertTrue(periodicShardSyncManager.checkForShardSync(streamIdentifier, multiStreamLeases).shouldDoShardSync());