Handle Possible Race Conditions, and Comments
Handle some possible race conditions during the shutdown process. Added more comments clarifying how shutdown works, and the race conditions it can face.
This commit is contained in:
parent
33655bdedb
commit
524717538e
5 changed files with 114 additions and 37 deletions
|
|
@ -15,8 +15,6 @@
|
||||||
package com.amazonaws.services.kinesis.clientlibrary.lib.worker;
|
package com.amazonaws.services.kinesis.clientlibrary.lib.worker;
|
||||||
|
|
||||||
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
import java.util.concurrent.RejectedExecutionException;
|
import java.util.concurrent.RejectedExecutionException;
|
||||||
|
|
@ -39,12 +37,8 @@ import com.google.common.annotations.VisibleForTesting;
|
||||||
*/
|
*/
|
||||||
class ShardConsumer {
|
class ShardConsumer {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
private static final Log LOG = LogFactory.getLog(ShardConsumer.class);
|
private static final Log LOG = LogFactory.getLog(ShardConsumer.class);
|
||||||
|
|
||||||
private static final Set<ConsumerStates.ShardConsumerState> EMPTY_DISALLOWED_SET = Collections.emptySet();
|
|
||||||
|
|
||||||
private final StreamConfig streamConfig;
|
private final StreamConfig streamConfig;
|
||||||
private final IRecordProcessor recordProcessor;
|
private final IRecordProcessor recordProcessor;
|
||||||
private final RecordProcessorCheckpointer recordProcessorCheckpointer;
|
private final RecordProcessorCheckpointer recordProcessorCheckpointer;
|
||||||
|
|
|
||||||
|
|
@ -47,6 +47,10 @@ class ShardConsumerShutdownNotification implements ShutdownNotification {
|
||||||
if (notificationComplete) {
|
if (notificationComplete) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
//
|
||||||
|
// Once the notification has been completed, the lease needs to dropped to allow the worker to complete
|
||||||
|
// shutdown of the record processor.
|
||||||
|
//
|
||||||
leaseCoordinator.dropLease(lease);
|
leaseCoordinator.dropLease(lease);
|
||||||
notificationCompleteLatch.countDown();
|
notificationCompleteLatch.countDown();
|
||||||
notificationComplete = true;
|
notificationComplete = true;
|
||||||
|
|
|
||||||
|
|
@ -20,8 +20,6 @@ class ShutdownFuture implements Future<Void> {
|
||||||
private final CountDownLatch notificationCompleteLatch;
|
private final CountDownLatch notificationCompleteLatch;
|
||||||
private final Worker worker;
|
private final Worker worker;
|
||||||
|
|
||||||
private boolean workerShutdownCalled = false;
|
|
||||||
|
|
||||||
ShutdownFuture(CountDownLatch shutdownCompleteLatch, CountDownLatch notificationCompleteLatch, Worker worker) {
|
ShutdownFuture(CountDownLatch shutdownCompleteLatch, CountDownLatch notificationCompleteLatch, Worker worker) {
|
||||||
this.shutdownCompleteLatch = shutdownCompleteLatch;
|
this.shutdownCompleteLatch = shutdownCompleteLatch;
|
||||||
this.notificationCompleteLatch = notificationCompleteLatch;
|
this.notificationCompleteLatch = notificationCompleteLatch;
|
||||||
|
|
@ -48,57 +46,91 @@ class ShutdownFuture implements Future<Void> {
|
||||||
}
|
}
|
||||||
|
|
||||||
private long outstandingRecordProcessors(long timeout, TimeUnit unit)
|
private long outstandingRecordProcessors(long timeout, TimeUnit unit)
|
||||||
throws InterruptedException, ExecutionException {
|
throws InterruptedException, ExecutionException, TimeoutException {
|
||||||
|
|
||||||
|
long startNanos = System.nanoTime();
|
||||||
|
|
||||||
//
|
//
|
||||||
// Awaiting for all ShardConsumer/RecordProcessors to be notified that a shutdown has been requested.
|
// Awaiting for all ShardConsumer/RecordProcessors to be notified that a shutdown has been requested.
|
||||||
|
// There is the possibility of a race condition where a lease is terminated after the shutdown request
|
||||||
|
// notification is started, but before the ShardConsumer is sent the notification. In this case the
|
||||||
|
// ShardConsumer would start the lease loss shutdown, and may never call the notification methods.
|
||||||
//
|
//
|
||||||
if (!notificationCompleteLatch.await(timeout, unit)) {
|
if (!notificationCompleteLatch.await(timeout, unit)) {
|
||||||
long awaitingNotification = notificationCompleteLatch.getCount();
|
long awaitingNotification = notificationCompleteLatch.getCount();
|
||||||
log.info("Awaiting " + awaitingNotification + " record processors to complete initial shutdown");
|
log.info("Awaiting " + awaitingNotification + " record processors to complete shutdown notification");
|
||||||
long awaitingFinalShutdown = shutdownCompleteLatch.getCount();
|
long awaitingFinalShutdown = shutdownCompleteLatch.getCount();
|
||||||
if (awaitingFinalShutdown != 0) {
|
if (awaitingFinalShutdown != 0) {
|
||||||
return awaitingFinalShutdown;
|
//
|
||||||
|
// The number of record processors awaiting final shutdown should be a superset of those awaiting
|
||||||
|
// notification
|
||||||
|
//
|
||||||
|
return checkWorkerShutdownMiss(awaitingFinalShutdown);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
long remaining = remainingTimeout(timeout, unit, startNanos);
|
||||||
|
throwTimeoutMessageIfExceeded(remaining, "Notification hasn't completed within timeout time.");
|
||||||
|
|
||||||
//
|
//
|
||||||
// Once all record processors have been notified of the shutdown it is safe to allow the worker to
|
// Once all record processors have been notified of the shutdown it is safe to allow the worker to
|
||||||
// start its shutdown behavior. Once shutdown starts it will stop renewer, and drop any remaining leases.
|
// start its shutdown behavior. Once shutdown starts it will stop renewer, and drop any remaining leases.
|
||||||
//
|
//
|
||||||
if (!workerShutdownCalled) {
|
worker.shutdown();
|
||||||
//
|
remaining = remainingTimeout(timeout, unit, startNanos);
|
||||||
// Unfortunately Worker#shutdown() doesn't appear to be idempotent.
|
throwTimeoutMessageIfExceeded(remaining, "Shutdown hasn't completed within timeout time.");
|
||||||
//
|
|
||||||
worker.shutdown();
|
|
||||||
}
|
|
||||||
//
|
//
|
||||||
// Want to wait for all the remaining ShardConsumers/RecordProcessor's to complete their final shutdown
|
// Want to wait for all the remaining ShardConsumers/RecordProcessor's to complete their final shutdown
|
||||||
// processing. This should really be a no-op since as part of the notification completion the lease for
|
// processing. This should really be a no-op since as part of the notification completion the lease for
|
||||||
// ShardConsumer is terminated.
|
// ShardConsumer is terminated.
|
||||||
//
|
//
|
||||||
if (!shutdownCompleteLatch.await(timeout, unit)) {
|
if (!shutdownCompleteLatch.await(remaining, TimeUnit.NANOSECONDS)) {
|
||||||
long outstanding = shutdownCompleteLatch.getCount();
|
long outstanding = shutdownCompleteLatch.getCount();
|
||||||
log.info("Awaiting " + outstanding + " record processors to complete final shutdown");
|
log.info("Awaiting " + outstanding + " record processors to complete final shutdown");
|
||||||
if (isWorkerShutdownComplete()) {
|
|
||||||
if (outstanding != 0) {
|
return checkWorkerShutdownMiss(outstanding);
|
||||||
log.warn("Shutdown completed, but shutdownCompleteLatch still had outstanding " + outstanding
|
|
||||||
+ " with a current value of " + shutdownCompleteLatch.getCount()
|
|
||||||
+ ". shutdownComplete: " + worker.isShutdownComplete() + " -- Consumer Map: "
|
|
||||||
+ worker.getShardInfoShardConsumerMap().size());
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
return outstanding;
|
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private long remainingTimeout(long timeout, TimeUnit unit, long startNanos) {
|
||||||
|
long checkNanos = System.nanoTime() - startNanos;
|
||||||
|
return unit.toNanos(timeout) - checkNanos;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void throwTimeoutMessageIfExceeded(long remainingNanos, String message) throws TimeoutException {
|
||||||
|
if (remainingNanos <= 0) {
|
||||||
|
throw new TimeoutException(message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private long checkWorkerShutdownMiss(long outstanding) {
|
||||||
|
if (isWorkerShutdownComplete()) {
|
||||||
|
if (outstanding != 0) {
|
||||||
|
log.warn("Shutdown completed, but shutdownCompleteLatch still had outstanding " + outstanding
|
||||||
|
+ " with a current value of " + shutdownCompleteLatch.getCount() + ". shutdownComplete: "
|
||||||
|
+ worker.isShutdownComplete() + " -- Consumer Map: "
|
||||||
|
+ worker.getShardInfoShardConsumerMap().size());
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return outstanding;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Void get() throws InterruptedException, ExecutionException {
|
public Void get() throws InterruptedException, ExecutionException {
|
||||||
long outstanding;
|
boolean complete = false;
|
||||||
do {
|
do {
|
||||||
outstanding = outstandingRecordProcessors(1, TimeUnit.SECONDS);
|
try {
|
||||||
log.info("Awaiting " + outstanding + " consumer(s) to finish shutdown.");
|
long outstanding = outstandingRecordProcessors(1, TimeUnit.SECONDS);
|
||||||
} while(outstanding != 0);
|
complete = outstanding == 0;
|
||||||
|
log.info("Awaiting " + outstanding + " consumer(s) to finish shutdown.");
|
||||||
|
} catch (TimeoutException te) {
|
||||||
|
log.info("Timeout while waiting for completion: " + te.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
|
} while(!complete);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,6 @@ import java.util.concurrent.ThreadFactory;
|
||||||
import java.util.concurrent.ThreadPoolExecutor;
|
import java.util.concurrent.ThreadPoolExecutor;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
import com.amazonaws.services.kinesis.clientlibrary.interfaces.v2.IShutdownNotificationAware;
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
|
|
@ -43,6 +42,7 @@ import com.amazonaws.services.kinesis.AmazonKinesisClient;
|
||||||
import com.amazonaws.services.kinesis.clientlibrary.interfaces.ICheckpoint;
|
import com.amazonaws.services.kinesis.clientlibrary.interfaces.ICheckpoint;
|
||||||
import com.amazonaws.services.kinesis.clientlibrary.interfaces.v2.IRecordProcessor;
|
import com.amazonaws.services.kinesis.clientlibrary.interfaces.v2.IRecordProcessor;
|
||||||
import com.amazonaws.services.kinesis.clientlibrary.interfaces.v2.IRecordProcessorFactory;
|
import com.amazonaws.services.kinesis.clientlibrary.interfaces.v2.IRecordProcessorFactory;
|
||||||
|
import com.amazonaws.services.kinesis.clientlibrary.interfaces.v2.IShutdownNotificationAware;
|
||||||
import com.amazonaws.services.kinesis.clientlibrary.proxies.KinesisProxyFactory;
|
import com.amazonaws.services.kinesis.clientlibrary.proxies.KinesisProxyFactory;
|
||||||
import com.amazonaws.services.kinesis.leases.exceptions.LeasingException;
|
import com.amazonaws.services.kinesis.leases.exceptions.LeasingException;
|
||||||
import com.amazonaws.services.kinesis.leases.impl.KinesisClientLease;
|
import com.amazonaws.services.kinesis.leases.impl.KinesisClientLease;
|
||||||
|
|
@ -490,9 +490,11 @@ public class Worker implements Runnable {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Requests shutdown of the worker, notifying record processors, that implement
|
* Requests shutdown of the worker, notifying record processors, that implement {@link IShutdownNotificationAware},
|
||||||
* {@link IShutdownNotificationAware}, of the impending shutdown.
|
* of the impending shutdown. This gives the record processor a final chance to checkpoint.
|
||||||
* This gives the record processor a final chance to checkpoint.
|
*
|
||||||
|
* <b>It's possible that a record processor won't be notified before being shut down. This can occur if the lease is
|
||||||
|
* lost after requesting shutdown, but before the notification is dispatched.</b>
|
||||||
*
|
*
|
||||||
* <h2>Requested Shutdown Process</h2> When a shutdown process is requested it operates slightly differently to
|
* <h2>Requested Shutdown Process</h2> When a shutdown process is requested it operates slightly differently to
|
||||||
* allow the record processors a chance to checkpoint a final time.
|
* allow the record processors a chance to checkpoint a final time.
|
||||||
|
|
@ -567,6 +569,10 @@ public class Worker implements Runnable {
|
||||||
* </ol>
|
* </ol>
|
||||||
*/
|
*/
|
||||||
public void shutdown() {
|
public void shutdown() {
|
||||||
|
if (shutdown) {
|
||||||
|
LOG.warn("Shutdown requested a second time.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
LOG.info("Worker shutdown requested.");
|
LOG.info("Worker shutdown requested.");
|
||||||
|
|
||||||
// Set shutdown flag, so Worker.run can start shutdown process.
|
// Set shutdown flag, so Worker.run can start shutdown process.
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ package com.amazonaws.services.kinesis.clientlibrary.lib.worker;
|
||||||
import static org.hamcrest.MatcherAssert.assertThat;
|
import static org.hamcrest.MatcherAssert.assertThat;
|
||||||
import static org.mockito.Matchers.any;
|
import static org.mockito.Matchers.any;
|
||||||
import static org.mockito.Matchers.anyLong;
|
import static org.mockito.Matchers.anyLong;
|
||||||
|
import static org.mockito.Mockito.mock;
|
||||||
import static org.mockito.Mockito.never;
|
import static org.mockito.Mockito.never;
|
||||||
import static org.mockito.Mockito.times;
|
import static org.mockito.Mockito.times;
|
||||||
import static org.mockito.Mockito.verify;
|
import static org.mockito.Mockito.verify;
|
||||||
|
|
@ -16,7 +17,10 @@ import java.util.concurrent.TimeoutException;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.junit.runner.RunWith;
|
import org.junit.runner.RunWith;
|
||||||
import org.mockito.Mock;
|
import org.mockito.Mock;
|
||||||
|
import org.mockito.invocation.InvocationOnMock;
|
||||||
import org.mockito.runners.MockitoJUnitRunner;
|
import org.mockito.runners.MockitoJUnitRunner;
|
||||||
|
import org.mockito.stubbing.Answer;
|
||||||
|
import org.mockito.stubbing.OngoingStubbing;
|
||||||
|
|
||||||
@RunWith(MockitoJUnitRunner.class)
|
@RunWith(MockitoJUnitRunner.class)
|
||||||
public class ShutdownFutureTest {
|
public class ShutdownFutureTest {
|
||||||
|
|
@ -51,6 +55,10 @@ public class ShutdownFutureTest {
|
||||||
mockNotificationComplete(false, true);
|
mockNotificationComplete(false, true);
|
||||||
mockShutdownComplete(true);
|
mockShutdownComplete(true);
|
||||||
|
|
||||||
|
when(worker.getShardInfoShardConsumerMap()).thenReturn(shardInfoConsumerMap);
|
||||||
|
when(shardInfoConsumerMap.isEmpty()).thenReturn(false);
|
||||||
|
when(worker.isShutdownComplete()).thenReturn(false);
|
||||||
|
|
||||||
when(notificationCompleteLatch.getCount()).thenReturn(1L);
|
when(notificationCompleteLatch.getCount()).thenReturn(1L);
|
||||||
when(shutdownCompleteLatch.getCount()).thenReturn(1L);
|
when(shutdownCompleteLatch.getCount()).thenReturn(1L);
|
||||||
|
|
||||||
|
|
@ -135,8 +143,41 @@ public class ShutdownFutureTest {
|
||||||
|
|
||||||
verify(shardInfoConsumerMap).isEmpty();
|
verify(shardInfoConsumerMap).isEmpty();
|
||||||
verify(shardInfoConsumerMap).size();
|
verify(shardInfoConsumerMap).size();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testNotificationNotCompleteButShardConsumerEmpty() throws Exception {
|
||||||
|
ShutdownFuture future = create();
|
||||||
|
mockNotificationComplete(false);
|
||||||
|
mockShutdownComplete(false);
|
||||||
|
|
||||||
|
mockOutstanding(notificationCompleteLatch, 1L);
|
||||||
|
mockOutstanding(shutdownCompleteLatch, 1L);
|
||||||
|
|
||||||
|
when(worker.isShutdownComplete()).thenReturn(false);
|
||||||
|
mockShardInfoConsumerMap(0);
|
||||||
|
|
||||||
|
awaitFuture(future);
|
||||||
|
verify(worker, never()).shutdown();
|
||||||
|
verifyLatchAwait(notificationCompleteLatch);
|
||||||
|
verify(shutdownCompleteLatch, never()).await();
|
||||||
|
|
||||||
|
verify(worker, times(2)).isShutdownComplete();
|
||||||
|
verify(worker, times(2)).getShardInfoShardConsumerMap();
|
||||||
|
|
||||||
|
verify(shardInfoConsumerMap).isEmpty();
|
||||||
|
verify(shardInfoConsumerMap).size();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(expected = TimeoutException.class)
|
||||||
|
public void testTimeExceededException() throws Exception {
|
||||||
|
ShutdownFuture future = create();
|
||||||
|
mockNotificationComplete(false);
|
||||||
|
mockOutstanding(notificationCompleteLatch, 1L);
|
||||||
|
when(worker.isShutdownComplete()).thenReturn(false);
|
||||||
|
mockShardInfoConsumerMap(1);
|
||||||
|
|
||||||
|
future.get(1, TimeUnit.NANOSECONDS);
|
||||||
}
|
}
|
||||||
|
|
||||||
private ShutdownFuture create() {
|
private ShutdownFuture create() {
|
||||||
|
|
@ -172,7 +213,7 @@ public class ShutdownFutureTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void awaitFuture(ShutdownFuture future) throws Exception {
|
private void awaitFuture(ShutdownFuture future) throws Exception {
|
||||||
future.get(1, TimeUnit.MILLISECONDS);
|
future.get(1, TimeUnit.SECONDS);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void mockNotificationComplete(Boolean initial, Boolean... states) throws Exception {
|
private void mockNotificationComplete(Boolean initial, Boolean... states) throws Exception {
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue