Fix to prevent data loss and stuck shards in the event of failed record delivery in polling readers (#603)

* Fix to prevent data loss and stuck shards in the event of failed record delivery.

* Review comment fixes

* Access modifier fixes
ashwing 2019-09-03 09:20:34 -07:00 committed by Micah Jaffe
parent 85d31c91f1
commit f6dec3e579
4 changed files with 309 additions and 59 deletions
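In short: the publisher's semaphore-based delivery lock is replaced with per-batch acknowledgements. Each prefetched batch now carries a BatchUniqueIdentifier, the head of the prefetch queue is evicted only once the subscriber acks that exact identifier, and a delivery whose ack never arrives leaves the batch in place so consumption can resume (the new test exercises this via restartFrom). A minimal standalone sketch of that ack-gated eviction idea follows; the class and method names are illustrative only, not KCL API.

import java.util.UUID;
import java.util.concurrent.ConcurrentLinkedQueue;

// Illustrative sketch only: ack-gated eviction of prefetched batches (not KCL source).
public class AckGatedQueueSketch {
    // Each batch carries a unique id so a delivery ack can be matched to it.
    static final class Batch {
        final String id = UUID.randomUUID().toString();
        final String payload;
        Batch(String payload) { this.payload = payload; }
    }

    private final ConcurrentLinkedQueue<Batch> queue = new ConcurrentLinkedQueue<>();

    synchronized void offer(Batch batch) { queue.offer(batch); }

    // Deliver the head without removing it; eviction waits for the matching ack.
    synchronized Batch peekForDelivery() { return queue.peek(); }

    // Evict only when the ack matches the current head; stale or unexpected acks are ignored.
    synchronized void ack(String ackedId) {
        Batch head = queue.peek();
        if (head != null && head.id.equals(ackedId)) {
            queue.poll(); // the consumer confirmed receipt, so the batch can be dropped
        }
    }

    public static void main(String[] args) {
        AckGatedQueueSketch sketch = new AckGatedQueueSketch();
        sketch.offer(new Batch("records-1"));
        Batch delivered = sketch.peekForDelivery();
        // A lost delivery simply never acks; the batch stays queued for redelivery.
        sketch.ack(delivered.id);
        System.out.println("queue empty after ack: " + (sketch.peekForDelivery() == null));
    }
}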

View file

@@ -18,14 +18,16 @@ package software.amazon.kinesis.retrieval.polling;
import java.time.Duration;
import java.time.Instant;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.reactivestreams.Subscriber;
import org.reactivestreams.Subscription;
@@ -48,6 +50,7 @@ import software.amazon.kinesis.metrics.MetricsLevel;
import software.amazon.kinesis.metrics.MetricsScope;
import software.amazon.kinesis.metrics.MetricsUtil;
import software.amazon.kinesis.metrics.ThreadSafeMetricsDelegatingFactory;
import software.amazon.kinesis.retrieval.BatchUniqueIdentifier;
import software.amazon.kinesis.retrieval.GetRecordsRetrievalStrategy;
import software.amazon.kinesis.retrieval.KinesisClientRecord;
import software.amazon.kinesis.retrieval.RecordsDeliveryAck;
@@ -65,6 +68,12 @@ import static software.amazon.kinesis.common.DiagnosticUtils.takeDelayedDelivery
* i.e. the byte size of the records stored in the cache, and maxRecordsCount, i.e. the max number of records that should
* be present in the cache across multiple GetRecordsResult objects. If no data is available in the cache, the call from
* the record processor is blocked until records are retrieved from Kinesis.
*
* Three threads contend to drain events to the Subscriber (the ShardConsumer in KCL): the publisher, the
* demand-notifier and the ack-notifier. The publisher/demand-notifier threads gain control of the drain only when
* no event in the prefetch queue is waiting for an ack. Otherwise, the ack-notifier thread drains the next event
* upon receipt of an ack.
*
*/
@Slf4j
@KinesisClientInternalApi
@@ -97,8 +106,11 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
private final ReentrantReadWriteLock resetLock = new ReentrantReadWriteLock();
private boolean wasReset = false;
private final Semaphore eventDeliveryLock = new Semaphore(1);
private Instant eventDeliveryLockAcquireTime = Instant.EPOCH;
private Instant lastEventDeliveryTime = Instant.EPOCH;
// This flag controls which thread drains the next event from the prefetch queue.
// When set to false, the publisher and demand-notifier threads have control.
// When set to true, the ack-notifier thread has control.
private AtomicBoolean shouldDrainEventOnlyOnAck = new AtomicBoolean(false);
/**
* Constructor for the PrefetchRecordsPublisher. This cache prefetches records from Kinesis and stores them in a
@@ -151,13 +163,13 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
dataFetcher.initialize(extendedSequenceNumber, initialPositionInStreamExtended);
if (!started) {
log.info("Starting prefetching thread.");
log.info("{} : Starting prefetching thread.", shardId);
executorService.execute(defaultGetRecordsCacheDaemon);
}
started = true;
}
RecordsRetrieved getNextResult() {
private void throwOnIllegalState() {
if (executorService.isShutdown()) {
throw new IllegalStateException("Shutdown has been called on the cache, can't accept new requests.");
}
@@ -165,16 +177,26 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
if (!started) {
throw new IllegalStateException("Cache has not been initialized, make sure to call start.");
}
PrefetchRecordsRetrieved result = null;
try {
result = getRecordsResultQueue.take().prepareForPublish();
}
private RecordsRetrieved peekNextResult() {
throwOnIllegalState();
final PrefetchRecordsRetrieved result = getRecordsResultQueue.peek();
return result == null ? result : result.prepareForPublish();
}
@VisibleForTesting
RecordsRetrieved pollNextResultAndUpdatePrefetchCounters() {
throwOnIllegalState();
final PrefetchRecordsRetrieved result = getRecordsResultQueue.poll();
if (result != null) {
prefetchCounters.removed(result.processRecordsInput);
requestedResponses.decrementAndGet();
} catch (InterruptedException e) {
log.error("Interrupted while getting records from the cache", e);
} else {
log.info(
"{}: No record batch found while evicting from the prefetch queue. This indicates the prefetch buffer"
+ "was reset.", shardId);
}
return result;
}
@@ -195,6 +217,12 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
resetLock.writeLock().lock();
try {
getRecordsResultQueue.clear();
// Give the drain control to publisher/demand-notifier thread.
log.debug("{} : Publisher thread takes over the draining control. Queue Size : {}, Demand : {}", shardId,
getRecordsResultQueue.size(), requestedResponses.get());
shouldDrainEventOnlyOnAck.set(false);
prefetchCounters.reset();
highestSequenceNumber = prefetchRecordsRetrieved.lastBatchSequenceNumber();
@@ -213,7 +241,7 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
@Override
public void request(long n) {
requestedResponses.addAndGet(n);
drainQueueForRequests();
drainQueueForRequestsIfAllowed();
}
@Override
@@ -224,12 +252,35 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
}
@Override
public void notify(RecordsDeliveryAck ack) {
eventDeliveryLock.release();
public synchronized void notify(RecordsDeliveryAck recordsDeliveryAck) {
final RecordsRetrieved recordsToCheck = peekNextResult();
// Verify if the ack matches the head of the queue and evict it.
if (recordsToCheck != null && recordsToCheck.batchUniqueIdentifier()
.equals(recordsDeliveryAck.batchUniqueIdentifier())) {
pollNextResultAndUpdatePrefetchCounters();
// Upon evicting, check whether the queue is empty. If so, give the drain control back to the publisher thread.
if (getRecordsResultQueue.isEmpty()) {
log.debug("{} : Publisher thread takes over the draining control. Queue Size : {}, Demand : {}",
shardId, getRecordsResultQueue.size(), requestedResponses.get());
shouldDrainEventOnlyOnAck.set(false);
} else {
// Else attempt to drain the queue.
drainQueueForRequests();
}
} else {
// Log and ignore any other ack received. As long as an ack is received for the head of the queue,
// we are good. Any stale or future ack can safely be ignored, though the latter should never occur.
final BatchUniqueIdentifier peekedBatchUniqueIdentifier =
recordsToCheck == null ? null : recordsToCheck.batchUniqueIdentifier();
log.info("{} : Received a stale notification with id {} instead of expected id {} at {}. Will ignore.",
shardId, recordsDeliveryAck.batchUniqueIdentifier(), peekedBatchUniqueIdentifier, Instant.now());
}
// Take action based on the time spent by the event in the queue.
takeDelayedDeliveryActionIfRequired(shardId, eventDeliveryLockAcquireTime, log);
takeDelayedDeliveryActionIfRequired(shardId, lastEventDeliveryTime, log);
}
// Note: do not make this method synchronized, as notify() would then not be able to evict any entry from the queue.
private void addArrivedRecordsInput(PrefetchRecordsRetrieved recordsRetrieved) throws InterruptedException {
wasReset = false;
while (!getRecordsResultQueue.offer(recordsRetrieved, idleMillisBetweenCalls, TimeUnit.MILLISECONDS)) {
@@ -248,11 +299,39 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
prefetchCounters.added(recordsRetrieved.processRecordsInput);
}
/**
* Method that will be called by the 'publisher thread' and the 'demand-notifier thread'
* to drain events when the 'ack-notifier thread' does not have control.
*/
private synchronized void drainQueueForRequestsIfAllowed() {
if (!shouldDrainEventOnlyOnAck.get()) {
drainQueueForRequests();
}
}
/**
* Method to drain the queue based on the demand and the availability of events in the queue.
*/
private synchronized void drainQueueForRequests() {
while (requestedResponses.get() > 0 && !getRecordsResultQueue.isEmpty()) {
eventDeliveryLock.acquireUninterruptibly();
eventDeliveryLockAcquireTime = Instant.now();
subscriber.onNext(getNextResult());
final RecordsRetrieved recordsToDeliver = peekNextResult();
// If there is an event available to drain and if there is at least one demand,
// then schedule it for delivery
if (requestedResponses.get() > 0 && recordsToDeliver != null) {
lastEventDeliveryTime = Instant.now();
subscriber.onNext(recordsToDeliver);
if (!shouldDrainEventOnlyOnAck.get()) {
log.debug("{} : Notifier thread takes over the draining control. Queue Size : {}, Demand : {}", shardId,
getRecordsResultQueue.size(), requestedResponses.get());
shouldDrainEventOnlyOnAck.set(true);
}
} else {
// Since we haven't scheduled the event delivery, give the drain control back to publisher/demand-notifier
// thread.
if (shouldDrainEventOnlyOnAck.get()) {
log.debug("{} : Publisher thread takes over the draining control. Queue Size : {}, Demand : {}",
shardId, getRecordsResultQueue.size(), requestedResponses.get());
shouldDrainEventOnlyOnAck.set(false);
}
}
}
@@ -263,12 +342,26 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
final ProcessRecordsInput processRecordsInput;
final String lastBatchSequenceNumber;
final String shardIterator;
final BatchUniqueIdentifier batchUniqueIdentifier;
PrefetchRecordsRetrieved prepareForPublish() {
return new PrefetchRecordsRetrieved(processRecordsInput.toBuilder().cacheExitTime(Instant.now()).build(),
lastBatchSequenceNumber, shardIterator);
lastBatchSequenceNumber, shardIterator, batchUniqueIdentifier);
}
@Override
public BatchUniqueIdentifier batchUniqueIdentifier() {
return batchUniqueIdentifier;
}
/**
* Generates the batch unique identifier for PrefetchRecordsRetrieved; the flow identifier is left empty.
* @return BatchUniqueIdentifier
*/
public static BatchUniqueIdentifier generateBatchUniqueIdentifier() {
return new BatchUniqueIdentifier(UUID.randomUUID().toString(),
StringUtils.EMPTY);
}
}
private String calculateHighestSequenceNumber(ProcessRecordsInput processRecordsInput) {
@@ -291,7 +384,7 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
public void run() {
while (!isShutdown) {
if (Thread.currentThread().isInterrupted()) {
log.warn("Prefetch thread was interrupted.");
log.warn("{} : Prefetch thread was interrupted.", shardId);
break;
}
@@ -299,7 +392,7 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
try {
makeRetrievalAttempt();
} catch(PositionResetException pre) {
log.debug("Position was reset while attempting to add item to queue.");
log.debug("{} : Position was reset while attempting to add item to queue.", shardId);
} finally {
resetLock.readLock().unlock();
}
@@ -328,30 +421,31 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
highestSequenceNumber = calculateHighestSequenceNumber(processRecordsInput);
PrefetchRecordsRetrieved recordsRetrieved = new PrefetchRecordsRetrieved(processRecordsInput,
highestSequenceNumber, getRecordsResult.nextShardIterator());
highestSequenceNumber, getRecordsResult.nextShardIterator(),
PrefetchRecordsRetrieved.generateBatchUniqueIdentifier());
highestSequenceNumber = recordsRetrieved.lastBatchSequenceNumber;
addArrivedRecordsInput(recordsRetrieved);
drainQueueForRequests();
drainQueueForRequestsIfAllowed();
} catch (PositionResetException pse) {
throw pse;
} catch (RetryableRetrievalException rre) {
log.info("Timeout occurred while waiting for response from Kinesis. Will retry the request.");
log.info("{} : Timeout occurred while waiting for response from Kinesis. Will retry the request.", shardId);
} catch (InterruptedException e) {
log.info("Thread was interrupted, indicating shutdown was called on the cache.");
log.info("{} : Thread was interrupted, indicating shutdown was called on the cache.", shardId);
} catch (ExpiredIteratorException e) {
log.info("ShardId {}: records threw ExpiredIteratorException - restarting"
log.info("{} : records threw ExpiredIteratorException - restarting"
+ " after greatest seqNum passed to customer", shardId, e);
scope.addData(EXPIRED_ITERATOR_METRIC, 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
dataFetcher.restartIterator();
} catch (SdkException e) {
log.error("Exception thrown while fetching records from Kinesis", e);
log.error("{} : Exception thrown while fetching records from Kinesis", shardId, e);
} catch (Throwable e) {
log.error("Unexpected exception was thrown. This could probably be an issue or a bug." +
log.error("{} : Unexpected exception was thrown. This could probably be an issue or a bug." +
" Please search for the exception/error online to check what is going on. If the " +
"issue persists or is a recurring problem, feel free to open an issue on, " +
"https://github.com/awslabs/amazon-kinesis-client.", e);
"https://github.com/awslabs/amazon-kinesis-client.", shardId, e);
} finally {
MetricsUtil.endScope(scope);
}
@@ -362,8 +456,8 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
try {
prefetchCounters.waitForConsumer();
} catch (InterruptedException ie) {
log.info("Thread was interrupted while waiting for the consumer. " +
"Shutdown has probably been started");
log.info("{} : Thread was interrupted while waiting for the consumer. " +
"Shutdown has probably been started", shardId);
}
}
}
@@ -410,14 +504,14 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
public synchronized void waitForConsumer() throws InterruptedException {
if (!shouldGetNewRecords()) {
log.debug("Queue is full waiting for consumer for {} ms", idleMillisBetweenCalls);
log.debug("{} : Queue is full waiting for consumer for {} ms", shardId, idleMillisBetweenCalls);
this.wait(idleMillisBetweenCalls);
}
}
public synchronized boolean shouldGetNewRecords() {
if (log.isDebugEnabled()) {
log.debug("Current Prefetch Counter States: {}", this.toString());
log.debug("{} : Current Prefetch Counter States: {}", shardId, this.toString());
}
return size < maxRecordsCount && byteSize < maxByteSize;
}

View file

@@ -27,12 +27,11 @@ import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import static software.amazon.kinesis.utils.BlockingUtils.blockUntilRecordsAvailable;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
@@ -123,13 +122,15 @@ public class PrefetchRecordsPublisherIntegrationTest {
getRecordsCache.start(extendedSequenceNumber, initialPosition);
sleep(IDLE_MILLIS_BETWEEN_CALLS);
ProcessRecordsInput processRecordsInput1 = getRecordsCache.getNextResult().processRecordsInput();
ProcessRecordsInput processRecordsInput1 = blockUntilRecordsAvailable(getRecordsCache::pollNextResultAndUpdatePrefetchCounters, 1000L)
.processRecordsInput();
assertTrue(processRecordsInput1.records().isEmpty());
assertEquals(processRecordsInput1.millisBehindLatest(), new Long(1000));
assertNotNull(processRecordsInput1.cacheEntryTime());
ProcessRecordsInput processRecordsInput2 = getRecordsCache.getNextResult().processRecordsInput();
ProcessRecordsInput processRecordsInput2 = blockUntilRecordsAvailable(getRecordsCache::pollNextResultAndUpdatePrefetchCounters, 1000L)
.processRecordsInput();
assertNotEquals(processRecordsInput1, processRecordsInput2);
}
@@ -141,8 +142,10 @@ public class PrefetchRecordsPublisherIntegrationTest {
assertEquals(getRecordsCache.getRecordsResultQueue.size(), MAX_SIZE);
ProcessRecordsInput processRecordsInput1 = getRecordsCache.getNextResult().processRecordsInput();
ProcessRecordsInput processRecordsInput2 = getRecordsCache.getNextResult().processRecordsInput();
ProcessRecordsInput processRecordsInput1 = blockUntilRecordsAvailable(getRecordsCache::pollNextResultAndUpdatePrefetchCounters, 1000L)
.processRecordsInput();
ProcessRecordsInput processRecordsInput2 = blockUntilRecordsAvailable(getRecordsCache::pollNextResultAndUpdatePrefetchCounters, 1000L)
.processRecordsInput();
assertNotEquals(processRecordsInput1, processRecordsInput2);
}
@@ -181,9 +184,9 @@ public class PrefetchRecordsPublisherIntegrationTest {
sleep(IDLE_MILLIS_BETWEEN_CALLS);
ProcessRecordsInput p1 = getRecordsCache.getNextResult().processRecordsInput();
ProcessRecordsInput p1 = getRecordsCache.pollNextResultAndUpdatePrefetchCounters().processRecordsInput();
ProcessRecordsInput p2 = recordsPublisher2.getNextResult().processRecordsInput();
ProcessRecordsInput p2 = recordsPublisher2.pollNextResultAndUpdatePrefetchCounters().processRecordsInput();
assertNotEquals(p1, p2);
assertTrue(p1.records().isEmpty());
@@ -209,7 +212,8 @@ public class PrefetchRecordsPublisherIntegrationTest {
getRecordsCache.start(extendedSequenceNumber, initialPosition);
sleep(IDLE_MILLIS_BETWEEN_CALLS);
ProcessRecordsInput processRecordsInput = getRecordsCache.getNextResult().processRecordsInput();
ProcessRecordsInput processRecordsInput = blockUntilRecordsAvailable(getRecordsCache::pollNextResultAndUpdatePrefetchCounters, 1000L)
.processRecordsInput();
assertNotNull(processRecordsInput);
assertTrue(processRecordsInput.records().isEmpty());

View file

@@ -33,6 +33,7 @@ import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import static software.amazon.kinesis.utils.BlockingUtils.blockUntilRecordsAvailable;
import static software.amazon.kinesis.utils.ProcessRecordsInputMatcher.eqProcessRecordsInput;
import java.time.Duration;
@@ -44,12 +45,15 @@ import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import io.reactivex.plugins.RxJavaPlugins;
import org.apache.commons.lang3.StringUtils;
import org.junit.After;
import org.junit.Before;
@@ -76,6 +80,7 @@ import software.amazon.kinesis.lifecycle.events.ProcessRecordsInput;
import software.amazon.kinesis.metrics.NullMetricsFactory;
import software.amazon.kinesis.retrieval.GetRecordsRetrievalStrategy;
import software.amazon.kinesis.retrieval.KinesisClientRecord;
import software.amazon.kinesis.retrieval.RecordsPublisher;
import software.amazon.kinesis.retrieval.RecordsRetrieved;
import software.amazon.kinesis.retrieval.RetryableRetrievalException;
import software.amazon.kinesis.retrieval.kpl.ExtendedSequenceNumber;
@@ -143,7 +148,8 @@ public class PrefetchRecordsPublisherTest {
.map(KinesisClientRecord::fromRecord).collect(Collectors.toList());
getRecordsCache.start(sequenceNumber, initialPosition);
ProcessRecordsInput result = getRecordsCache.getNextResult().processRecordsInput();
ProcessRecordsInput result = blockUntilRecordsAvailable(getRecordsCache::pollNextResultAndUpdatePrefetchCounters, 1000L)
.processRecordsInput();
assertEquals(expectedRecords, result.records());
@@ -183,7 +189,7 @@ public class PrefetchRecordsPublisherTest {
// TODO: fix this verification
// verify(getRecordsRetrievalStrategy, times(callRate)).getRecords(MAX_RECORDS_PER_CALL);
// assertEquals(spyQueue.size(), callRate);
assertTrue(callRate < MAX_SIZE);
assertTrue("Call Rate is "+callRate,callRate < MAX_SIZE);
}
@Test
@@ -212,7 +218,7 @@ public class PrefetchRecordsPublisherTest {
.map(KinesisClientRecord::fromRecord).collect(Collectors.toList());
getRecordsCache.start(sequenceNumber, initialPosition);
ProcessRecordsInput processRecordsInput = getRecordsCache.getNextResult().processRecordsInput();
ProcessRecordsInput processRecordsInput = getRecordsCache.pollNextResultAndUpdatePrefetchCounters().processRecordsInput();
verify(executorService).execute(any());
assertEquals(expectedRecords, processRecordsInput.records());
@@ -221,7 +227,7 @@ public class PrefetchRecordsPublisherTest {
sleep(2000);
ProcessRecordsInput processRecordsInput2 = getRecordsCache.getNextResult().processRecordsInput();
ProcessRecordsInput processRecordsInput2 = getRecordsCache.pollNextResultAndUpdatePrefetchCounters().processRecordsInput();
assertNotEquals(processRecordsInput, processRecordsInput2);
assertEquals(expectedRecords, processRecordsInput2.records());
assertNotEquals(processRecordsInput2.timeSpentInCache(), Duration.ZERO);
@@ -232,13 +238,13 @@ public class PrefetchRecordsPublisherTest {
@Test(expected = IllegalStateException.class)
public void testGetNextRecordsWithoutStarting() {
verify(executorService, times(0)).execute(any());
getRecordsCache.getNextResult();
getRecordsCache.pollNextResultAndUpdatePrefetchCounters();
}
@Test(expected = IllegalStateException.class)
public void testCallAfterShutdown() {
when(executorService.isShutdown()).thenReturn(true);
getRecordsCache.getNextResult();
getRecordsCache.pollNextResultAndUpdatePrefetchCounters();
}
@Test
@@ -251,7 +257,7 @@ public class PrefetchRecordsPublisherTest {
doNothing().when(dataFetcher).restartIterator();
getRecordsCache.getNextResult();
blockUntilRecordsAvailable(() -> getRecordsCache.pollNextResultAndUpdatePrefetchCounters(), 1000L);
sleep(1000);
@@ -266,11 +272,11 @@ public class PrefetchRecordsPublisherTest {
getRecordsCache.start(sequenceNumber, initialPosition);
RecordsRetrieved records = getRecordsCache.getNextResult();
RecordsRetrieved records = blockUntilRecordsAvailable(getRecordsCache::pollNextResultAndUpdatePrefetchCounters, 1000);
assertThat(records.processRecordsInput().millisBehindLatest(), equalTo(response.millisBehindLatest()));
}
@Test(timeout = 1000L)
@Test(timeout = 20000L)
public void testNoDeadlockOnFullQueue() {
//
// Fixes https://github.com/awslabs/amazon-kinesis-client/issues/448
@@ -284,6 +290,8 @@ public class PrefetchRecordsPublisherTest {
.build();
when(getRecordsRetrievalStrategy.getRecords(anyInt())).thenReturn(response);
RxJavaPlugins.setErrorHandler(e -> e.printStackTrace());
getRecordsCache.start(sequenceNumber, initialPosition);
//
@@ -296,7 +304,7 @@ public class PrefetchRecordsPublisherTest {
log.info("Queue is currently at {} starting subscriber", getRecordsCache.getRecordsResultQueue.size());
AtomicInteger receivedItems = new AtomicInteger(0);
final int expectedItems = MAX_SIZE * 3;
final int expectedItems = MAX_SIZE * 1000;
Object lock = new Object();
@@ -351,6 +359,85 @@ public class PrefetchRecordsPublisherTest {
assertThat(receivedItems.get(), equalTo(expectedItems));
}
@Test(timeout = 20000L)
public void testNoDeadlockOnFullQueueAndLossOfNotification() {
//
// Fixes https://github.com/awslabs/amazon-kinesis-client/issues/602
//
// This test verifies that data consumption does not get stuck in the case of a failed event delivery
// to the subscriber.
GetRecordsResponse response = GetRecordsResponse.builder().records(
Record.builder().data(SdkBytes.fromByteArray(new byte[] { 1, 2, 3 })).sequenceNumber("123").build())
.build();
when(getRecordsRetrievalStrategy.getRecords(anyInt())).thenReturn(response);
getRecordsCache.start(sequenceNumber, initialPosition);
//
// Wait for the queue to fill up, and the publisher to block on adding items to the queue.
//
log.info("Waiting for queue to fill up");
while (getRecordsCache.getRecordsResultQueue.size() < MAX_SIZE) {
Thread.yield();
}
log.info("Queue is currently at {} starting subscriber", getRecordsCache.getRecordsResultQueue.size());
AtomicInteger receivedItems = new AtomicInteger(0);
final int expectedItems = MAX_SIZE * 100;
Object lock = new Object();
Subscriber<RecordsRetrieved> delegateSubscriber = new Subscriber<RecordsRetrieved>() {
Subscription sub;
@Override
public void onSubscribe(Subscription s) {
sub = s;
s.request(1);
}
@Override
public void onNext(RecordsRetrieved recordsRetrieved) {
receivedItems.incrementAndGet();
if (receivedItems.get() >= expectedItems) {
synchronized (lock) {
log.info("Notifying waiters");
lock.notifyAll();
}
sub.cancel();
} else {
sub.request(1);
}
}
@Override
public void onError(Throwable t) {
log.error("Caught error", t);
throw new RuntimeException(t);
}
@Override
public void onComplete() {
fail("onComplete not expected in this test");
}
};
Subscriber<RecordsRetrieved> subscriber = new LossyNotificationSubscriber(delegateSubscriber, getRecordsCache);
synchronized (lock) {
log.info("Awaiting notification");
Flowable.fromPublisher(getRecordsCache).subscribeOn(Schedulers.computation())
.observeOn(Schedulers.computation(), true, 8).subscribe(subscriber);
try {
lock.wait();
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
verify(getRecordsRetrievalStrategy, atLeast(expectedItems)).getRecords(anyInt());
assertThat(receivedItems.get(), equalTo(expectedItems));
}
@Test
public void testResetClearsRemainingData() {
List<GetRecordsResponse> responses = Stream.iterate(0, i -> i + 1).limit(10).map(i -> {
@@ -372,14 +459,14 @@ public class PrefetchRecordsPublisherTest {
getRecordsCache.start(sequenceNumber, initialPosition);
RecordsRetrieved lastProcessed = getRecordsCache.getNextResult();
RecordsRetrieved expected = getRecordsCache.getNextResult();
RecordsRetrieved lastProcessed = blockUntilRecordsAvailable(getRecordsCache::pollNextResultAndUpdatePrefetchCounters, 1000);
RecordsRetrieved expected = blockUntilRecordsAvailable(getRecordsCache::pollNextResultAndUpdatePrefetchCounters, 1000);
//
// Skip some of the records in the cache
//
getRecordsCache.getNextResult();
getRecordsCache.getNextResult();
blockUntilRecordsAvailable(getRecordsCache::pollNextResultAndUpdatePrefetchCounters, 1000);
blockUntilRecordsAvailable(getRecordsCache::pollNextResultAndUpdatePrefetchCounters, 1000);
verify(getRecordsRetrievalStrategy, atLeast(2)).getRecords(anyInt());
@@ -388,7 +475,7 @@ public class PrefetchRecordsPublisherTest {
}
getRecordsCache.restartFrom(lastProcessed);
RecordsRetrieved postRestart = getRecordsCache.getNextResult();
RecordsRetrieved postRestart = blockUntilRecordsAvailable(getRecordsCache::pollNextResultAndUpdatePrefetchCounters, 1000);
assertThat(postRestart.processRecordsInput(), eqProcessRecordsInput(expected.processRecordsInput()));
verify(dataFetcher).resetIterator(eq(responses.get(0).nextShardIterator()),
@@ -432,6 +519,33 @@ public class PrefetchRecordsPublisherTest {
}
}
private static class LossyNotificationSubscriber extends ShardConsumerNotifyingSubscriber {
private static final int LOSS_EVERY_NTH_RECORD = 100;
private static int recordCounter = 0;
private static final ScheduledExecutorService consumerHealthChecker = Executors.newScheduledThreadPool(1);
public LossyNotificationSubscriber(Subscriber<RecordsRetrieved> delegate, RecordsPublisher recordsPublisher) {
super(delegate, recordsPublisher);
}
@Override
public void onNext(RecordsRetrieved recordsRetrieved) {
log.info("Subscriber received onNext");
if (!(recordCounter % LOSS_EVERY_NTH_RECORD == LOSS_EVERY_NTH_RECORD - 1)) {
getRecordsPublisher().notify(getRecordsDeliveryAck(recordsRetrieved));
getDelegateSubscriber().onNext(recordsRetrieved);
} else {
log.info("Record Loss Triggered");
consumerHealthChecker.schedule(() -> {
getRecordsPublisher().restartFrom(recordsRetrieved);
Flowable.fromPublisher(getRecordsPublisher()).subscribeOn(Schedulers.computation())
.observeOn(Schedulers.computation(), true, 8).subscribe(this);
}, 1000, TimeUnit.MILLISECONDS);
}
recordCounter++;
}
}
@After
public void shutdown() {
getRecordsCache.shutdown();

View file

@@ -0,0 +1,38 @@
/*
* Copyright 2019 Amazon.com, Inc. or its affiliates.
* Licensed under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package software.amazon.kinesis.utils;
import java.util.function.Supplier;
public class BlockingUtils {
public static <Records> Records blockUntilRecordsAvailable(Supplier<Records> recordsSupplier, long timeoutMillis) {
Records recordsRetrieved;
while ((recordsRetrieved = recordsSupplier.get()) == null && timeoutMillis > 0) {
try {
Thread.sleep(100);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
timeoutMillis -= 100;
}
if (recordsRetrieved != null) {
return recordsRetrieved;
} else {
throw new RuntimeException("No records found");
}
}
}
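For reference, the helper above polls the supplier roughly every 100 ms until it yields a non-null result or the timeout budget is exhausted, which is how the updated tests wait on pollNextResultAndUpdatePrefetchCounters. A minimal usage example follows; the counter-backed supplier is a stand-in for a real prefetch queue.

import java.util.concurrent.atomic.AtomicInteger;
import software.amazon.kinesis.utils.BlockingUtils;

public class BlockingUtilsExample {
    public static void main(String[] args) {
        // Stand-in supplier that stays empty for the first two polls,
        // mimicking a prefetch queue that has not filled up yet.
        AtomicInteger attempts = new AtomicInteger();
        String records = BlockingUtils.blockUntilRecordsAvailable(
                () -> attempts.incrementAndGet() < 3 ? null : "records", 1000L);
        System.out.println(records); // prints "records" after roughly 200 ms of polling
    }
}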