KCLv3 merge
This commit is contained in:
parent
a159fa31fb
commit
a754364d29
175 changed files with 18424 additions and 2349 deletions
|
|
@ -21,7 +21,7 @@
|
|||
<parent>
|
||||
<artifactId>amazon-kinesis-client-pom</artifactId>
|
||||
<groupId>software.amazon.kinesis</groupId>
|
||||
<version>2.6.1-SNAPSHOT</version>
|
||||
<version>3.0.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
@ -72,7 +72,7 @@
|
|||
<dependency>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
<version>1.18.24</version>
|
||||
<version>1.18.28</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@
|
|||
<parent>
|
||||
<groupId>software.amazon.kinesis</groupId>
|
||||
<artifactId>amazon-kinesis-client-pom</artifactId>
|
||||
<version>2.6.1-SNAPSHOT</version>
|
||||
<version>3.0.0</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>amazon-kinesis-client</artifactId>
|
||||
|
|
@ -68,6 +68,18 @@
|
|||
<artifactId>dynamodb</artifactId>
|
||||
<version>${awssdk.version}</version>
|
||||
</dependency>
|
||||
<!-- https://mvnrepository.com/artifact/software.amazon.awssdk/dynamodb-enhanced -->
|
||||
<dependency>
|
||||
<groupId>software.amazon.awssdk</groupId>
|
||||
<artifactId>dynamodb-enhanced</artifactId>
|
||||
<version>${awssdk.version}</version>
|
||||
</dependency>
|
||||
<!-- https://mvnrepository.com/artifact/com.amazonaws/dynamodb-lock-client -->
|
||||
<dependency>
|
||||
<groupId>com.amazonaws</groupId>
|
||||
<artifactId>dynamodb-lock-client</artifactId>
|
||||
<version>1.3.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>software.amazon.awssdk</groupId>
|
||||
<artifactId>cloudwatch</artifactId>
|
||||
|
|
@ -103,11 +115,23 @@
|
|||
<artifactId>commons-lang3</artifactId>
|
||||
<version>3.14.0</version>
|
||||
</dependency>
|
||||
<!-- https://mvnrepository.com/artifact/commons-collections/commons-collections -->
|
||||
<dependency>
|
||||
<groupId>commons-collections</groupId>
|
||||
<artifactId>commons-collections</artifactId>
|
||||
<version>3.2.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
<version>${slf4j.version}</version>
|
||||
</dependency>
|
||||
<!-- https://mvnrepository.com/artifact/org.jetbrains/annotations -->
|
||||
<dependency>
|
||||
<groupId>org.jetbrains</groupId>
|
||||
<artifactId>annotations</artifactId>
|
||||
<version>26.0.1</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>io.reactivex.rxjava3</groupId>
|
||||
|
|
@ -123,35 +147,47 @@
|
|||
</dependency>
|
||||
|
||||
<!-- Test -->
|
||||
<!-- TODO: Migrate all tests to Junit5 -->
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter-api</artifactId>
|
||||
<version>5.11.3</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.13.2</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<!-- https://mvnrepository.com/artifact/org.junit.jupiter/junit-jupiter-params -->
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-all</artifactId>
|
||||
<version>1.10.19</version>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter-params</artifactId>
|
||||
<version>5.11.3</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<!-- Using older version to be compatible with Java 8 -->
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-junit-jupiter</artifactId>
|
||||
<version>3.12.4</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.hamcrest</groupId>
|
||||
<artifactId>hamcrest-all</artifactId>
|
||||
<version>1.3</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
|
||||
<!--<dependency>-->
|
||||
<!--<groupId>com.amazonaws</groupId>-->
|
||||
<!--<artifactId>DynamoDBLocal</artifactId>-->
|
||||
<!--<version>1.11.86</version>-->
|
||||
<!--<scope>test</scope>-->
|
||||
<!--</dependency>-->
|
||||
|
||||
<!-- Using older version to be compatible with Java 8 -->
|
||||
<!-- https://mvnrepository.com/artifact/com.amazonaws/DynamoDBLocal -->
|
||||
<dependency>
|
||||
<groupId>com.amazonaws</groupId>
|
||||
<artifactId>DynamoDBLocal</artifactId>
|
||||
<version>1.25.0</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>ch.qos.logback</groupId>
|
||||
<artifactId>logback-classic</artifactId>
|
||||
|
|
|
|||
|
|
@ -256,7 +256,8 @@ public class ConfigsBuilder {
|
|||
* @return LeaseManagementConfig
|
||||
*/
|
||||
public LeaseManagementConfig leaseManagementConfig() {
|
||||
return new LeaseManagementConfig(tableName(), dynamoDBClient(), kinesisClient(), workerIdentifier());
|
||||
return new LeaseManagementConfig(
|
||||
tableName(), applicationName(), dynamoDBClient(), kinesisClient(), workerIdentifier());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -0,0 +1,57 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.common;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.Accessors;
|
||||
import software.amazon.awssdk.services.dynamodb.model.BillingMode;
|
||||
|
||||
/**
|
||||
* Configurations of a DDB table created by KCL for its internal operations.
|
||||
*/
|
||||
@Data
|
||||
@Accessors(fluent = true)
|
||||
@NoArgsConstructor
|
||||
public class DdbTableConfig {
|
||||
|
||||
protected DdbTableConfig(final String applicationName, final String tableSuffix) {
|
||||
this.tableName = applicationName + "-" + tableSuffix;
|
||||
}
|
||||
|
||||
/**
|
||||
* name to use for the DDB table. If null, it will default to
|
||||
* applicationName-tableSuffix. If multiple KCL applications
|
||||
* run in the same account, a unique tableName must be provided.
|
||||
*/
|
||||
private String tableName;
|
||||
|
||||
/**
|
||||
* Billing mode used to create the DDB table.
|
||||
*/
|
||||
private BillingMode billingMode = BillingMode.PAY_PER_REQUEST;
|
||||
|
||||
/**
|
||||
* read capacity to provision during DDB table creation,
|
||||
* if billing mode is PROVISIONED.
|
||||
*/
|
||||
private long readCapacity;
|
||||
|
||||
/**
|
||||
* write capacity to provision during DDB table creation,
|
||||
* if billing mode is PROVISIONED.
|
||||
*/
|
||||
private long writeCapacity;
|
||||
}
|
||||
|
|
@ -15,10 +15,13 @@
|
|||
package software.amazon.kinesis.common;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.CompletionException;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
public class FutureUtils {
|
||||
|
||||
|
|
@ -31,4 +34,15 @@ public class FutureUtils {
|
|||
throw te;
|
||||
}
|
||||
}
|
||||
|
||||
public static <T> T unwrappingFuture(final Supplier<CompletableFuture<T>> supplier) {
|
||||
try {
|
||||
return supplier.get().join();
|
||||
} catch (CompletionException e) {
|
||||
if (e.getCause() instanceof RuntimeException) {
|
||||
throw (RuntimeException) e.getCause();
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright 2019 Amazon.com, Inc. or its affiliates.
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
|
|
@ -12,18 +12,16 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.common;
|
||||
|
||||
package software.amazon.kinesis.leases.dynamodb;
|
||||
public class StackTraceUtils {
|
||||
public static String getPrintableStackTrace(final StackTraceElement[] stackTrace) {
|
||||
final StringBuilder stackTraceString = new StringBuilder();
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
|
||||
* This class is just a holder for initial lease table IOPs units. This class will be removed in a future release.
|
||||
*/
|
||||
@Deprecated
|
||||
@NoArgsConstructor(access = AccessLevel.PRIVATE)
|
||||
public class TableConstants {
|
||||
public static final long DEFAULT_INITIAL_LEASE_TABLE_READ_CAPACITY = 10L;
|
||||
public static final long DEFAULT_INITIAL_LEASE_TABLE_WRITE_CAPACITY = 10L;
|
||||
for (final StackTraceElement traceElement : stackTrace) {
|
||||
stackTraceString.append("\tat ").append(traceElement).append("\n");
|
||||
}
|
||||
|
||||
return stackTraceString.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -18,6 +18,7 @@ package software.amazon.kinesis.coordinator;
|
|||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.experimental.Accessors;
|
||||
import software.amazon.kinesis.common.DdbTableConfig;
|
||||
import software.amazon.kinesis.leases.NoOpShardPrioritization;
|
||||
import software.amazon.kinesis.leases.ShardPrioritization;
|
||||
|
||||
|
|
@ -27,6 +28,14 @@ import software.amazon.kinesis.leases.ShardPrioritization;
|
|||
@Data
|
||||
@Accessors(fluent = true)
|
||||
public class CoordinatorConfig {
|
||||
|
||||
private static final int PERIODIC_SHARD_SYNC_MAX_WORKERS_DEFAULT = 1;
|
||||
|
||||
/**
 * Creates a CoordinatorConfig for the given application, initializing the
 * CoordinatorState table configuration whose table name defaults to
 * {@code applicationName-CoordinatorState}.
 *
 * @param applicationName name of the KCL application
 */
public CoordinatorConfig(final String applicationName) {
    this.applicationName = applicationName;
    this.coordinatorStateConfig = new CoordinatorStateTableConfig(applicationName);
}
|
||||
|
||||
/**
|
||||
* Application name used by checkpointer to checkpoint.
|
||||
*
|
||||
|
|
@ -96,4 +105,53 @@ public class CoordinatorConfig {
|
|||
* <p>Default value: 1000L</p>
|
||||
*/
|
||||
private long schedulerInitializationBackoffTimeMillis = 1000L;
|
||||
|
||||
/**
 * Version the KCL needs to operate in. For more details check the KCLv3 migration
 * documentation. Used to control whether KCL runs in a 2.x-compatible mode or
 * with the new (incompatible) 3.x algorithms.
 */
public enum ClientVersionConfig {
    /**
     * For an application that was operating with previous KCLv2.x, during
     * upgrade to KCLv3.x, a migration process is needed due to the incompatible
     * changes between the 2 versions. During the migration process, application
     * must use ClientVersion=CLIENT_VERSION_COMPATIBLE_WITH_2x so that it runs in
     * a compatible mode until all workers in the cluster have upgraded to the version
     * running 3.x version (which is determined based on workers emitting WorkerMetricStats)
     * Once all known workers are in 3.x mode, the library auto toggles to 3.x mode;
     * but prior to that it runs in a mode compatible with 2.x workers.
     * This version also allows rolling back to the compatible mode from the
     * auto-toggled 3.x mode.
     */
    CLIENT_VERSION_CONFIG_COMPATIBLE_WITH_2x,
    /**
     * A new application operating with KCLv3.x will use this value. Also, an application
     * that has successfully upgraded to 3.x version and no longer needs the ability
     * for a rollback to a 2.x compatible version, will use this value. In this version,
     * KCL will operate with new algorithms introduced in 3.x which is not compatible
     * with prior versions. And once in this version, rollback to 2.x is not supported.
     */
    CLIENT_VERSION_CONFIG_3x,
}
|
||||
|
||||
/**
|
||||
* Client version KCL must operate in, by default it operates in 3.x version which is not
|
||||
* compatible with prior versions.
|
||||
*/
|
||||
private ClientVersionConfig clientVersionConfig = ClientVersionConfig.CLIENT_VERSION_CONFIG_3x;
|
||||
|
||||
/**
 * Table configuration for the CoordinatorState DDB table; the table name
 * defaults to {@code applicationName-CoordinatorState} via DdbTableConfig.
 */
public static class CoordinatorStateTableConfig extends DdbTableConfig {
    // Private: instances are only created by CoordinatorConfig's constructor.
    private CoordinatorStateTableConfig(final String applicationName) {
        super(applicationName, "CoordinatorState");
    }
}
|
||||
|
||||
/**
|
||||
* Configuration to control how the CoordinatorState DDB table is created, such as table name,
|
||||
* billing mode, provisioned capacity. If no table name is specified, the table name will
|
||||
* default to applicationName-CoordinatorState. If no billing mode is chosen, default is
|
||||
* On-Demand.
|
||||
*/
|
||||
@NonNull
|
||||
private final CoordinatorStateTableConfig coordinatorStateConfig;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,52 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.coordinator;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.services.dynamodb.model.AttributeValue;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
|
||||
/**
 * DataModel for CoordinatorState, this data model is used to store various state information required
 * for coordination across the KCL worker fleet. Therefore, the model follows a flexible schema.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor(access = AccessLevel.PRIVATE)
@Slf4j
@KinesisClientInternalApi
public class CoordinatorState {
    // Name of the hash-key attribute in the CoordinatorState DDB table.
    public static final String COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME = "key";

    /**
     * Key value for the item in the CoordinatorState table used for leader
     * election among the KCL workers. The attributes relevant to this item
     * is dictated by the DDB Lock client implementation that is used to
     * provide mutual exclusion.
     */
    public static final String LEADER_HASH_KEY = "Leader";

    // Hash-key value identifying this state item.
    private String key;

    // Schema-less attributes of the item, keyed by DDB attribute name
    // (the hash-key attribute itself is held in {@link #key}, not here).
    private Map<String, AttributeValue> attributes;
}
|
||||
|
|
@ -0,0 +1,417 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.coordinator;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
import com.amazonaws.services.dynamodbv2.AmazonDynamoDBLockClientOptions;
|
||||
import com.amazonaws.services.dynamodbv2.AmazonDynamoDBLockClientOptions.AmazonDynamoDBLockClientOptionsBuilder;
|
||||
import lombok.NonNull;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.collections4.MapUtils;
|
||||
import software.amazon.awssdk.core.waiters.WaiterResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.DynamoDbAsyncClient;
|
||||
import software.amazon.awssdk.services.dynamodb.DynamoDbClient;
|
||||
import software.amazon.awssdk.services.dynamodb.model.AttributeAction;
|
||||
import software.amazon.awssdk.services.dynamodb.model.AttributeDefinition;
|
||||
import software.amazon.awssdk.services.dynamodb.model.AttributeValue;
|
||||
import software.amazon.awssdk.services.dynamodb.model.AttributeValueUpdate;
|
||||
import software.amazon.awssdk.services.dynamodb.model.BillingMode;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ConditionalCheckFailedException;
|
||||
import software.amazon.awssdk.services.dynamodb.model.CreateTableRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.CreateTableResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.model.DescribeTableRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.DescribeTableResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.model.DynamoDbException;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ExpectedAttributeValue;
|
||||
import software.amazon.awssdk.services.dynamodb.model.GetItemRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.GetItemResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.model.KeySchemaElement;
|
||||
import software.amazon.awssdk.services.dynamodb.model.KeyType;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ProvisionedThroughput;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ProvisionedThroughputExceededException;
|
||||
import software.amazon.awssdk.services.dynamodb.model.PutItemRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ResourceNotFoundException;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ScalarAttributeType;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ScanRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ScanResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.model.TableDescription;
|
||||
import software.amazon.awssdk.services.dynamodb.model.TableStatus;
|
||||
import software.amazon.awssdk.services.dynamodb.model.UpdateItemRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.waiters.DynamoDbAsyncWaiter;
|
||||
import software.amazon.awssdk.utils.CollectionUtils;
|
||||
import software.amazon.kinesis.common.FutureUtils;
|
||||
import software.amazon.kinesis.coordinator.CoordinatorConfig.CoordinatorStateTableConfig;
|
||||
import software.amazon.kinesis.coordinator.migration.MigrationState;
|
||||
import software.amazon.kinesis.leases.DynamoUtils;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||
|
||||
import static software.amazon.kinesis.common.FutureUtils.unwrappingFuture;
|
||||
import static software.amazon.kinesis.coordinator.CoordinatorState.COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME;
|
||||
|
||||
/**
|
||||
* Data Access Object to abstract accessing {@link CoordinatorState} from
|
||||
* the CoordinatorState DDB table.
|
||||
*/
|
||||
@Slf4j
|
||||
public class CoordinatorStateDAO {
|
||||
private final DynamoDbAsyncClient dynamoDbAsyncClient;
|
||||
private final DynamoDbClient dynamoDbSyncClient;
|
||||
|
||||
private final CoordinatorStateTableConfig config;
|
||||
|
||||
/**
 * Creates a DAO over the CoordinatorState table.
 *
 * @param dynamoDbAsyncClient async DDB client used for all table operations
 * @param config table name / billing configuration for the CoordinatorState table
 */
public CoordinatorStateDAO(
        final DynamoDbAsyncClient dynamoDbAsyncClient, final CoordinatorStateTableConfig config) {
    this.dynamoDbAsyncClient = dynamoDbAsyncClient;
    this.config = config;
    // Must run after dynamoDbAsyncClient is assigned: the sync client is an
    // adapter that delegates to the async client field.
    this.dynamoDbSyncClient = createDelegateClient();
}
|
||||
|
||||
/**
 * Ensures the CoordinatorState table exists (creating it if needed) before
 * the DAO is used.
 *
 * @throws DependencyException if table creation/wait fails
 */
public void initialize() throws DependencyException {
    createTableIfNotExists();
}
|
||||
|
||||
/**
 * Wraps the async client in a synchronous adapter, for consumers that
 * require a {@link DynamoDbClient} (e.g. the DDB lock client).
 */
private DynamoDbClient createDelegateClient() {
    return new DynamoDbAsyncToSyncClientAdapter(dynamoDbAsyncClient);
}
|
||||
|
||||
/**
 * Returns a DDB lock client options builder pre-configured with the
 * CoordinatorState table name, its hash-key attribute name, and the
 * synchronous adapter client.
 */
public AmazonDynamoDBLockClientOptionsBuilder getDDBLockClientOptionsBuilder() {
    return AmazonDynamoDBLockClientOptions.builder(dynamoDbSyncClient, config.tableName())
            .withPartitionKeyName(COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME);
}
|
||||
|
||||
/**
 * List all the {@link CoordinatorState} from the DDB table synchronously
 *
 * @throws DependencyException if DynamoDB scan fails in an unexpected way
 * @throws InvalidStateException if ddb table does not exist
 * @throws ProvisionedThroughputException if DynamoDB scan fails due to lack of capacity
 *
 * @return list of state
 */
public List<CoordinatorState> listCoordinatorState()
        throws ProvisionedThroughputException, DependencyException, InvalidStateException {
    log.debug("Listing coordinatorState");

    final ScanRequest request =
            ScanRequest.builder().tableName(config.tableName()).build();

    try {
        ScanResponse response = FutureUtils.unwrappingFuture(() -> dynamoDbAsyncClient.scan(request));
        final List<CoordinatorState> stateList = new ArrayList<>();
        // Paginate: keep scanning from lastEvaluatedKey until DDB returns no
        // continuation key.
        while (Objects.nonNull(response)) {
            log.debug("Scan response {}", response);

            response.items().stream().map(this::fromDynamoRecord).forEach(stateList::add);
            if (!CollectionUtils.isNullOrEmpty(response.lastEvaluatedKey())) {
                final ScanRequest continuationRequest = request.toBuilder()
                        .exclusiveStartKey(response.lastEvaluatedKey())
                        .build();
                log.debug("Scan request {}", continuationRequest);
                response = FutureUtils.unwrappingFuture(() -> dynamoDbAsyncClient.scan(continuationRequest));
            } else {
                log.debug("Scan finished");
                response = null; // terminates the pagination loop
            }
        }
        return stateList;
    } catch (final ProvisionedThroughputExceededException e) {
        log.warn(
                "Provisioned throughput on {} has exceeded. It is recommended to increase the IOPs"
                        + " on the table.",
                config.tableName());
        throw new ProvisionedThroughputException(e);
    } catch (final ResourceNotFoundException e) {
        throw new InvalidStateException(
                String.format("Cannot list coordinatorState, because table %s does not exist", config.tableName()));
    } catch (final DynamoDbException e) {
        // Catch-all for other DDB failures; callers treat this as retryable.
        throw new DependencyException(e);
    }
}
|
||||
|
||||
/**
 * Create a new {@link CoordinatorState} if it does not exist.
 * @param state the state to create
 * @return true if state was created, false if it already exists
 *
 * @throws DependencyException if DynamoDB put fails in an unexpected way
 * @throws InvalidStateException if lease table does not exist
 * @throws ProvisionedThroughputException if DynamoDB put fails due to lack of capacity
 */
public boolean createCoordinatorStateIfNotExists(final CoordinatorState state)
        throws DependencyException, InvalidStateException, ProvisionedThroughputException {
    log.debug("Creating coordinatorState {}", state);

    // Conditional put: only succeeds when no item with this hash key exists.
    final PutItemRequest request = PutItemRequest.builder()
            .tableName(config.tableName())
            .item(toDynamoRecord(state))
            .expected(getDynamoNonExistentExpectation())
            .build();

    try {
        FutureUtils.unwrappingFuture(() -> dynamoDbAsyncClient.putItem(request));
    } catch (final ConditionalCheckFailedException e) {
        // Item already present; this is an expected outcome, not an error.
        log.info("Not creating coordinator state because the key already exists");
        return false;
    } catch (final ProvisionedThroughputExceededException e) {
        log.warn(
                "Provisioned throughput on {} has exceeded. It is recommended to increase the IOPs"
                        + " on the table.",
                config.tableName());
        throw new ProvisionedThroughputException(e);
    } catch (final ResourceNotFoundException e) {
        throw new InvalidStateException(String.format(
                "Cannot create coordinatorState %s, because table %s does not exist", state, config.tableName()));
    } catch (final DynamoDbException e) {
        throw new DependencyException(e);
    }

    log.info("Created CoordinatorState: {}", state);
    return true;
}
|
||||
|
||||
/**
 * @param key Get the CoordinatorState for this key
 *
 * @throws InvalidStateException if ddb table does not exist
 * @throws ProvisionedThroughputException if DynamoDB get fails due to lack of capacity
 * @throws DependencyException if DynamoDB get fails in an unexpected way
 *
 * @return state for the specified key, or null if one doesn't exist
 */
public CoordinatorState getCoordinatorState(@NonNull final String key)
        throws DependencyException, InvalidStateException, ProvisionedThroughputException {
    log.debug("Getting coordinatorState with key {}", key);

    // Strongly consistent read: coordination decisions must not act on stale state.
    final GetItemRequest request = GetItemRequest.builder()
            .tableName(config.tableName())
            .key(getCoordinatorStateKey(key))
            .consistentRead(true)
            .build();

    try {
        final GetItemResponse result = FutureUtils.unwrappingFuture(() -> dynamoDbAsyncClient.getItem(request));

        final Map<String, AttributeValue> dynamoRecord = result.item();
        if (CollectionUtils.isNullOrEmpty(dynamoRecord)) {
            log.debug("No coordinatorState found with key {}, returning null.", key);
            return null;
        }
        return fromDynamoRecord(dynamoRecord);
    } catch (final ProvisionedThroughputExceededException e) {
        log.warn(
                "Provisioned throughput on {} has exceeded. It is recommended to increase the IOPs"
                        + " on the table.",
                config.tableName());
        throw new ProvisionedThroughputException(e);
    } catch (final ResourceNotFoundException e) {
        throw new InvalidStateException(String.format(
                "Cannot get coordinatorState for key %s, because table %s does not exist",
                key, config.tableName()));
    } catch (final DynamoDbException e) {
        throw new DependencyException(e);
    }
}
|
||||
|
||||
/**
 * Update fields of the given coordinator state in DynamoDB. Conditional on the provided expectation.
 *
 * @return true if update succeeded, false otherwise when expectations are not met
 *
 * @throws InvalidStateException if table does not exist
 * @throws ProvisionedThroughputException if DynamoDB update fails due to lack of capacity
 * @throws DependencyException if DynamoDB update fails in an unexpected way
 */
public boolean updateCoordinatorStateWithExpectation(
        @NonNull final CoordinatorState state, final Map<String, ExpectedAttributeValue> expectations)
        throws DependencyException, InvalidStateException, ProvisionedThroughputException {
    // Always require the item to already exist; caller-provided expectations
    // are layered on top (and may override per-attribute conditions).
    final Map<String, ExpectedAttributeValue> expectationMap = getDynamoExistentExpectation(state.getKey());
    expectationMap.putAll(MapUtils.emptyIfNull(expectations));

    final Map<String, AttributeValueUpdate> updateMap = getDynamoCoordinatorStateUpdate(state);

    final UpdateItemRequest request = UpdateItemRequest.builder()
            .tableName(config.tableName())
            .key(getCoordinatorStateKey(state.getKey()))
            .expected(expectationMap)
            .attributeUpdates(updateMap)
            .build();

    try {
        FutureUtils.unwrappingFuture(() -> dynamoDbAsyncClient.updateItem(request));
    } catch (final ConditionalCheckFailedException e) {
        // Expectations not met — reported to the caller as false, not an exception.
        log.debug("CoordinatorState update {} failed because conditions were not met", state);
        return false;
    } catch (final ProvisionedThroughputExceededException e) {
        log.warn(
                "Provisioned throughput on {} has exceeded. It is recommended to increase the IOPs"
                        + " on the table.",
                config.tableName());
        throw new ProvisionedThroughputException(e);
    } catch (final ResourceNotFoundException e) {
        throw new InvalidStateException(String.format(
                "Cannot update coordinatorState for key %s, because table %s does not exist",
                state.getKey(), config.tableName()));
    } catch (final DynamoDbException e) {
        throw new DependencyException(e);
    }

    log.info("Coordinator state updated {}", state);
    return true;
}
|
||||
|
||||
/**
 * Creates the CoordinatorState table if it does not already exist, then
 * blocks until the table is ACTIVE (up to a 10-minute waiter timeout).
 *
 * @throws DependencyException if creation fails or the wait times out
 */
private void createTableIfNotExists() throws DependencyException {
    TableDescription tableDescription = getTableDescription();
    if (tableDescription == null) {
        final CreateTableResponse response = unwrappingFuture(() -> dynamoDbAsyncClient.createTable(getRequest()));
        tableDescription = response.tableDescription();
        log.info("DDB Table: {} created", config.tableName());
    } else {
        log.info("Skipping DDB table {} creation as it already exists", config.tableName());
    }

    if (tableDescription.tableStatus() != TableStatus.ACTIVE) {
        log.info("Waiting for DDB Table: {} to become active", config.tableName());
        try (final DynamoDbAsyncWaiter waiter = dynamoDbAsyncClient.waiter()) {
            final WaiterResponse<DescribeTableResponse> response =
                    unwrappingFuture(() -> waiter.waitUntilTableExists(
                            r -> r.tableName(config.tableName()), o -> o.waitTimeout(Duration.ofMinutes(10))));
            // An empty matched response means the waiter gave up; surface that
            // (with the waiter's last exception as cause, when present).
            response.matched()
                    .response()
                    .orElseThrow(() -> new DependencyException(new IllegalStateException(
                            "Creating CoordinatorState table timed out",
                            response.matched().exception().orElse(null))));
        }
    }
}
|
||||
|
||||
private CreateTableRequest getRequest() {
|
||||
final CreateTableRequest.Builder requestBuilder = CreateTableRequest.builder()
|
||||
.tableName(config.tableName())
|
||||
.keySchema(KeySchemaElement.builder()
|
||||
.attributeName(COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME)
|
||||
.keyType(KeyType.HASH)
|
||||
.build())
|
||||
.attributeDefinitions(AttributeDefinition.builder()
|
||||
.attributeName(COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME)
|
||||
.attributeType(ScalarAttributeType.S)
|
||||
.build());
|
||||
|
||||
switch (config.billingMode()) {
|
||||
case PAY_PER_REQUEST:
|
||||
requestBuilder.billingMode(BillingMode.PAY_PER_REQUEST);
|
||||
break;
|
||||
case PROVISIONED:
|
||||
requestBuilder.billingMode(BillingMode.PROVISIONED);
|
||||
|
||||
final ProvisionedThroughput throughput = ProvisionedThroughput.builder()
|
||||
.readCapacityUnits(config.readCapacity())
|
||||
.writeCapacityUnits(config.writeCapacity())
|
||||
.build();
|
||||
requestBuilder.provisionedThroughput(throughput);
|
||||
break;
|
||||
}
|
||||
return requestBuilder.build();
|
||||
}
|
||||
|
||||
private Map<String, AttributeValue> getCoordinatorStateKey(@NonNull final String key) {
|
||||
return Collections.singletonMap(
|
||||
COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME, DynamoUtils.createAttributeValue(key));
|
||||
}
|
||||
|
||||
    /**
     * Converts a raw DDB item into a {@link CoordinatorState}. If the key and
     * attributes deserialize into a MigrationState, that richer subtype is
     * returned instead of the generic CoordinatorState.
     */
    private CoordinatorState fromDynamoRecord(final Map<String, AttributeValue> dynamoRecord) {
        // Copy the item so the hash key can be removed, leaving only payload attributes.
        final HashMap<String, AttributeValue> attributes = new HashMap<>(dynamoRecord);
        final String keyValue =
                DynamoUtils.safeGetString(attributes.remove(COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME));

        // deserialize yields null when this item does not represent a MigrationState.
        final MigrationState migrationState = MigrationState.deserialize(keyValue, attributes);
        if (migrationState != null) {
            log.debug("Retrieved MigrationState {}", migrationState);
            return migrationState;
        }

        final CoordinatorState c =
                CoordinatorState.builder().key(keyValue).attributes(attributes).build();
        log.debug("Retrieved coordinatorState {}", c);

        return c;
    }
|
||||
|
||||
private Map<String, AttributeValue> toDynamoRecord(final CoordinatorState state) {
|
||||
final Map<String, AttributeValue> result = new HashMap<>();
|
||||
result.put(COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME, DynamoUtils.createAttributeValue(state.getKey()));
|
||||
if (state instanceof MigrationState) {
|
||||
result.putAll(((MigrationState) state).serialize());
|
||||
}
|
||||
if (!CollectionUtils.isNullOrEmpty(state.getAttributes())) {
|
||||
result.putAll(state.getAttributes());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private Map<String, ExpectedAttributeValue> getDynamoNonExistentExpectation() {
|
||||
final Map<String, ExpectedAttributeValue> result = new HashMap<>();
|
||||
|
||||
final ExpectedAttributeValue expectedAV =
|
||||
ExpectedAttributeValue.builder().exists(false).build();
|
||||
result.put(COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME, expectedAV);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private Map<String, ExpectedAttributeValue> getDynamoExistentExpectation(final String keyValue) {
|
||||
final Map<String, ExpectedAttributeValue> result = new HashMap<>();
|
||||
|
||||
final ExpectedAttributeValue expectedAV = ExpectedAttributeValue.builder()
|
||||
.value(AttributeValue.fromS(keyValue))
|
||||
.build();
|
||||
result.put(COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME, expectedAV);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private Map<String, AttributeValueUpdate> getDynamoCoordinatorStateUpdate(final CoordinatorState state) {
|
||||
final HashMap<String, AttributeValueUpdate> updates = new HashMap<>();
|
||||
if (state instanceof MigrationState) {
|
||||
updates.putAll(((MigrationState) state).getDynamoUpdate());
|
||||
}
|
||||
state.getAttributes()
|
||||
.forEach((attribute, value) -> updates.put(
|
||||
attribute,
|
||||
AttributeValueUpdate.builder()
|
||||
.value(value)
|
||||
.action(AttributeAction.PUT)
|
||||
.build()));
|
||||
return updates;
|
||||
}
|
||||
|
||||
    /**
     * Describes the coordinator state table.
     *
     * @return the table's description, or {@code null} if the table does not exist
     */
    private TableDescription getTableDescription() {
        try {
            final DescribeTableResponse response = unwrappingFuture(() -> dynamoDbAsyncClient.describeTable(
                    DescribeTableRequest.builder().tableName(config.tableName()).build()));
            return response.table();
        } catch (final ResourceNotFoundException e) {
            // Absence of the table is expected before creation; signalled as null.
            return null;
        }
    }
|
||||
}
|
||||
|
|
@ -28,12 +28,17 @@ import java.util.function.BooleanSupplier;
|
|||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||
import software.amazon.awssdk.utils.CollectionUtils;
|
||||
import software.amazon.kinesis.leases.Lease;
|
||||
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||
import software.amazon.kinesis.metrics.MetricsScope;
|
||||
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||
|
||||
/**
|
||||
* An implementation of the {@code LeaderDecider} to elect leader(s) based on workerId.
|
||||
|
|
@ -46,7 +51,7 @@ import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
|||
* This ensures redundancy for shard-sync during host failures.
|
||||
*/
|
||||
@Slf4j
|
||||
class DeterministicShuffleShardSyncLeaderDecider implements LeaderDecider {
|
||||
public class DeterministicShuffleShardSyncLeaderDecider implements LeaderDecider {
|
||||
// Fixed seed so that the shuffle order is preserved across workers
|
||||
static final int DETERMINISTIC_SHUFFLE_SEED = 1947;
|
||||
|
||||
|
|
@ -59,6 +64,7 @@ class DeterministicShuffleShardSyncLeaderDecider implements LeaderDecider {
|
|||
private final LeaseRefresher leaseRefresher;
|
||||
private final int numPeriodicShardSyncWorkers;
|
||||
private final ScheduledExecutorService leaderElectionThreadPool;
|
||||
private final MetricsFactory metricsFactory;
|
||||
|
||||
private volatile Set<String> leaders;
|
||||
|
||||
|
|
@ -67,11 +73,17 @@ class DeterministicShuffleShardSyncLeaderDecider implements LeaderDecider {
|
|||
* @param leaderElectionThreadPool Thread-pool to be used for leaderElection.
|
||||
* @param numPeriodicShardSyncWorkers Number of leaders that will be elected to perform periodic shard syncs.
|
||||
*/
|
||||
DeterministicShuffleShardSyncLeaderDecider(
|
||||
public DeterministicShuffleShardSyncLeaderDecider(
|
||||
LeaseRefresher leaseRefresher,
|
||||
ScheduledExecutorService leaderElectionThreadPool,
|
||||
int numPeriodicShardSyncWorkers) {
|
||||
this(leaseRefresher, leaderElectionThreadPool, numPeriodicShardSyncWorkers, new ReentrantReadWriteLock());
|
||||
int numPeriodicShardSyncWorkers,
|
||||
MetricsFactory metricsFactory) {
|
||||
this(
|
||||
leaseRefresher,
|
||||
leaderElectionThreadPool,
|
||||
numPeriodicShardSyncWorkers,
|
||||
new ReentrantReadWriteLock(),
|
||||
metricsFactory);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -84,11 +96,13 @@ class DeterministicShuffleShardSyncLeaderDecider implements LeaderDecider {
|
|||
LeaseRefresher leaseRefresher,
|
||||
ScheduledExecutorService leaderElectionThreadPool,
|
||||
int numPeriodicShardSyncWorkers,
|
||||
ReadWriteLock readWriteLock) {
|
||||
ReadWriteLock readWriteLock,
|
||||
MetricsFactory metricsFactory) {
|
||||
this.leaseRefresher = leaseRefresher;
|
||||
this.leaderElectionThreadPool = leaderElectionThreadPool;
|
||||
this.numPeriodicShardSyncWorkers = numPeriodicShardSyncWorkers;
|
||||
this.readWriteLock = readWriteLock;
|
||||
this.metricsFactory = metricsFactory;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -146,8 +160,13 @@ class DeterministicShuffleShardSyncLeaderDecider implements LeaderDecider {
|
|||
ELECTION_SCHEDULING_INTERVAL_MILLIS,
|
||||
TimeUnit.MILLISECONDS);
|
||||
}
|
||||
|
||||
return executeConditionCheckWithReadLock(() -> isWorkerLeaderForShardSync(workerId));
|
||||
final boolean response = executeConditionCheckWithReadLock(() -> isWorkerLeaderForShardSync(workerId));
|
||||
final MetricsScope metricsScope =
|
||||
MetricsUtil.createMetricsWithOperation(metricsFactory, METRIC_OPERATION_LEADER_DECIDER);
|
||||
metricsScope.addData(
|
||||
METRIC_OPERATION_LEADER_DECIDER_IS_LEADER, response ? 1 : 0, StandardUnit.COUNT, MetricsLevel.DETAILED);
|
||||
MetricsUtil.endScope(metricsScope);
|
||||
return response;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -0,0 +1,403 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.coordinator;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.ScheduledFuture;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.Accessors;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.coordinator.MigrationAdaptiveLeaseAssignmentModeProvider.LeaseAssignmentMode;
|
||||
import software.amazon.kinesis.coordinator.assignment.LeaseAssignmentManager;
|
||||
import software.amazon.kinesis.coordinator.migration.ClientVersion;
|
||||
import software.amazon.kinesis.leader.DynamoDBLockBasedLeaderDecider;
|
||||
import software.amazon.kinesis.leader.MigrationAdaptiveLeaderDecider;
|
||||
import software.amazon.kinesis.leases.LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig;
|
||||
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
import software.amazon.kinesis.worker.metricstats.WorkerMetricStatsDAO;
|
||||
import software.amazon.kinesis.worker.metricstats.WorkerMetricStatsManager;
|
||||
import software.amazon.kinesis.worker.metricstats.WorkerMetricStatsReporter;
|
||||
|
||||
import static software.amazon.kinesis.coordinator.MigrationAdaptiveLeaseAssignmentModeProvider.LeaseAssignmentMode.DEFAULT_LEASE_COUNT_BASED_ASSIGNMENT;
|
||||
import static software.amazon.kinesis.coordinator.MigrationAdaptiveLeaseAssignmentModeProvider.LeaseAssignmentMode.WORKER_UTILIZATION_AWARE_ASSIGNMENT;
|
||||
import static software.amazon.kinesis.coordinator.assignment.LeaseAssignmentManager.DEFAULT_NO_OF_SKIP_STAT_FOR_DEAD_WORKER_THRESHOLD;
|
||||
|
||||
/**
|
||||
* This class is responsible for initializing the KCL components that supports
|
||||
* seamless upgrade from v2.x to v3.x.
|
||||
* During specific versions, it also dynamically switches the functionality
|
||||
* to be either vanilla 3.x or 2.x compatible.
|
||||
*
|
||||
* It is responsible for creating:
|
||||
* 1. LeaderDecider
|
||||
* 2. LAM
|
||||
* 3. WorkerMetricStatsReporter
|
||||
*
|
||||
* It manages initializing the following components at initialization time
|
||||
* 1. workerMetricsDAO and workerMetricsManager
|
||||
* 2. leaderDecider
|
||||
* 3. MigrationAdaptiveLeaseAssignmentModeProvider
|
||||
*
|
||||
* It updates the following components dynamically:
|
||||
* 1. starts/stops LAM
|
||||
* 2. starts/stops WorkerMetricStatsReporter
|
||||
* 3. updates LeaseAssignmentMode to either DEFAULT_LEASE_COUNT_BASED_ASSIGNMENT or WORKER_UTILIZATION_AWARE_ASSIGNMENT
|
||||
* 4. creates GSI (deletion is done by KclMigrationTool)
|
||||
* 5. creates WorkerMetricStats table (deletion is done by KclMigrationTool)
|
||||
* 6. updates LeaderDecider to either DeterministicShuffleShardSyncLeaderDecider or DynamoDBLockBasedLeaderDecider
|
||||
*/
|
||||
@Slf4j
|
||||
@KinesisClientInternalApi
|
||||
@ThreadSafe
|
||||
@Accessors(fluent = true)
|
||||
public final class DynamicMigrationComponentsInitializer {
|
||||
private static final long SCHEDULER_SHUTDOWN_TIMEOUT_SECONDS = 60L;
|
||||
|
||||
@Getter
|
||||
private final MetricsFactory metricsFactory;
|
||||
|
||||
@Getter
|
||||
private final LeaseRefresher leaseRefresher;
|
||||
|
||||
private final CoordinatorStateDAO coordinatorStateDAO;
|
||||
private final ScheduledExecutorService workerMetricsThreadPool;
|
||||
|
||||
@Getter
|
||||
private final WorkerMetricStatsDAO workerMetricsDAO;
|
||||
|
||||
private final WorkerMetricStatsManager workerMetricsManager;
|
||||
private final ScheduledExecutorService lamThreadPool;
|
||||
private final BiFunction<ScheduledExecutorService, LeaderDecider, LeaseAssignmentManager> lamCreator;
|
||||
private final Supplier<MigrationAdaptiveLeaderDecider> adaptiveLeaderDeciderCreator;
|
||||
private final Supplier<DeterministicShuffleShardSyncLeaderDecider> deterministicLeaderDeciderCreator;
|
||||
private final Supplier<DynamoDBLockBasedLeaderDecider> ddbLockBasedLeaderDeciderCreator;
|
||||
|
||||
@Getter
|
||||
private final String workerIdentifier;
|
||||
|
||||
private final WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig;
|
||||
|
||||
@Getter
|
||||
private final long workerMetricsExpirySeconds;
|
||||
|
||||
private final MigrationAdaptiveLeaseAssignmentModeProvider leaseModeChangeConsumer;
|
||||
|
||||
@Getter
|
||||
private LeaderDecider leaderDecider;
|
||||
|
||||
private LeaseAssignmentManager leaseAssignmentManager;
|
||||
private ScheduledFuture<?> workerMetricsReporterFuture;
|
||||
private LeaseAssignmentMode currentAssignmentMode;
|
||||
private boolean dualMode;
|
||||
private boolean initialized;
|
||||
|
||||
@Builder(access = AccessLevel.PACKAGE)
|
||||
DynamicMigrationComponentsInitializer(
|
||||
final MetricsFactory metricsFactory,
|
||||
final LeaseRefresher leaseRefresher,
|
||||
final CoordinatorStateDAO coordinatorStateDAO,
|
||||
final ScheduledExecutorService workerMetricsThreadPool,
|
||||
final WorkerMetricStatsDAO workerMetricsDAO,
|
||||
final WorkerMetricStatsManager workerMetricsManager,
|
||||
final ScheduledExecutorService lamThreadPool,
|
||||
final BiFunction<ScheduledExecutorService, LeaderDecider, LeaseAssignmentManager> lamCreator,
|
||||
final Supplier<MigrationAdaptiveLeaderDecider> adaptiveLeaderDeciderCreator,
|
||||
final Supplier<DeterministicShuffleShardSyncLeaderDecider> deterministicLeaderDeciderCreator,
|
||||
final Supplier<DynamoDBLockBasedLeaderDecider> ddbLockBasedLeaderDeciderCreator,
|
||||
final String workerIdentifier,
|
||||
final WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig,
|
||||
final MigrationAdaptiveLeaseAssignmentModeProvider leaseAssignmentModeProvider) {
|
||||
this.metricsFactory = metricsFactory;
|
||||
this.leaseRefresher = leaseRefresher;
|
||||
this.coordinatorStateDAO = coordinatorStateDAO;
|
||||
this.workerIdentifier = workerIdentifier;
|
||||
this.workerUtilizationAwareAssignmentConfig = workerUtilizationAwareAssignmentConfig;
|
||||
this.workerMetricsExpirySeconds = Duration.ofMillis(DEFAULT_NO_OF_SKIP_STAT_FOR_DEAD_WORKER_THRESHOLD
|
||||
* workerUtilizationAwareAssignmentConfig.workerMetricsReporterFreqInMillis())
|
||||
.getSeconds();
|
||||
this.workerMetricsManager = workerMetricsManager;
|
||||
this.workerMetricsDAO = workerMetricsDAO;
|
||||
this.workerMetricsThreadPool = workerMetricsThreadPool;
|
||||
this.lamThreadPool = lamThreadPool;
|
||||
this.lamCreator = lamCreator;
|
||||
this.adaptiveLeaderDeciderCreator = adaptiveLeaderDeciderCreator;
|
||||
this.deterministicLeaderDeciderCreator = deterministicLeaderDeciderCreator;
|
||||
this.ddbLockBasedLeaderDeciderCreator = ddbLockBasedLeaderDeciderCreator;
|
||||
this.leaseModeChangeConsumer = leaseAssignmentModeProvider;
|
||||
}
|
||||
|
||||
public void initialize(final ClientVersion migrationStateMachineStartingClientVersion) throws DependencyException {
|
||||
if (initialized) {
|
||||
log.info("Already initialized, nothing to do");
|
||||
return;
|
||||
}
|
||||
|
||||
// always collect metrics so that when we flip to start reporting we will have accurate historical data.
|
||||
log.info("Start collection of WorkerMetricStats");
|
||||
workerMetricsManager.startManager();
|
||||
if (migrationStateMachineStartingClientVersion == ClientVersion.CLIENT_VERSION_3x) {
|
||||
initializeComponentsFor3x();
|
||||
} else {
|
||||
initializeComponentsForMigration(migrationStateMachineStartingClientVersion);
|
||||
}
|
||||
log.info("Initialized dual mode {} current assignment mode {}", dualMode, currentAssignmentMode);
|
||||
|
||||
log.info("Creating LAM");
|
||||
leaseAssignmentManager = lamCreator.apply(lamThreadPool, leaderDecider);
|
||||
log.info("Initializing {}", leaseModeChangeConsumer.getClass().getSimpleName());
|
||||
leaseModeChangeConsumer.initialize(dualMode, currentAssignmentMode);
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
private void initializeComponentsFor3x() {
|
||||
log.info("Initializing for 3x functionality");
|
||||
dualMode = false;
|
||||
currentAssignmentMode = WORKER_UTILIZATION_AWARE_ASSIGNMENT;
|
||||
log.info("Initializing dualMode {} assignmentMode {}", dualMode, currentAssignmentMode);
|
||||
leaderDecider = ddbLockBasedLeaderDeciderCreator.get();
|
||||
log.info("Initializing {}", leaderDecider.getClass().getSimpleName());
|
||||
leaderDecider.initialize();
|
||||
}
|
||||
|
||||
private void initializeComponentsForMigration(final ClientVersion migrationStateMachineStartingClientVersion) {
|
||||
log.info("Initializing for migration to 3x");
|
||||
dualMode = true;
|
||||
final LeaderDecider initialLeaderDecider;
|
||||
if (migrationStateMachineStartingClientVersion == ClientVersion.CLIENT_VERSION_3x_WITH_ROLLBACK) {
|
||||
currentAssignmentMode = WORKER_UTILIZATION_AWARE_ASSIGNMENT;
|
||||
initialLeaderDecider = ddbLockBasedLeaderDeciderCreator.get();
|
||||
} else {
|
||||
currentAssignmentMode = DEFAULT_LEASE_COUNT_BASED_ASSIGNMENT;
|
||||
initialLeaderDecider = deterministicLeaderDeciderCreator.get();
|
||||
}
|
||||
log.info("Initializing dualMode {} assignmentMode {}", dualMode, currentAssignmentMode);
|
||||
|
||||
final MigrationAdaptiveLeaderDecider adaptiveLeaderDecider = adaptiveLeaderDeciderCreator.get();
|
||||
log.info(
|
||||
"Initializing MigrationAdaptiveLeaderDecider with {}",
|
||||
initialLeaderDecider.getClass().getSimpleName());
|
||||
adaptiveLeaderDecider.updateLeaderDecider(initialLeaderDecider);
|
||||
this.leaderDecider = adaptiveLeaderDecider;
|
||||
}
|
||||
|
||||
void shutdown() {
|
||||
log.info("Shutting down components");
|
||||
if (initialized) {
|
||||
log.info("Stopping LAM, LeaderDecider, workerMetrics reporting and collection");
|
||||
leaseAssignmentManager.stop();
|
||||
// leader decider is shut down later when scheduler is doing a final shutdown
|
||||
// since scheduler still accesses the leader decider while shutting down
|
||||
stopWorkerMetricsReporter();
|
||||
workerMetricsManager.stopManager();
|
||||
}
|
||||
|
||||
// lam does not manage lifecycle of its threadpool to easily stop/start dynamically.
|
||||
// once migration code is obsolete (i.e. all 3x functionality is the baseline and no
|
||||
// migration is needed), it can be moved inside lam
|
||||
log.info("Shutting down lamThreadPool and workerMetrics reporter thread pool");
|
||||
lamThreadPool.shutdown();
|
||||
workerMetricsThreadPool.shutdown();
|
||||
try {
|
||||
if (!lamThreadPool.awaitTermination(SCHEDULER_SHUTDOWN_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
|
||||
lamThreadPool.shutdownNow();
|
||||
}
|
||||
} catch (final InterruptedException e) {
|
||||
log.warn("Interrupted while waiting for shutdown of LeaseAssignmentManager ThreadPool", e);
|
||||
lamThreadPool.shutdownNow();
|
||||
}
|
||||
|
||||
try {
|
||||
if (!workerMetricsThreadPool.awaitTermination(SCHEDULER_SHUTDOWN_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
|
||||
workerMetricsThreadPool.shutdownNow();
|
||||
}
|
||||
} catch (final InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
log.warn("Interrupted while waiting for shutdown of WorkerMetricStatsManager ThreadPool", e);
|
||||
workerMetricsThreadPool.shutdownNow();
|
||||
}
|
||||
}
|
||||
|
||||
private void startWorkerMetricsReporting() throws DependencyException {
|
||||
if (workerMetricsReporterFuture != null) {
|
||||
log.info("Worker metrics reporting is already running...");
|
||||
return;
|
||||
}
|
||||
log.info("Initializing WorkerMetricStats");
|
||||
this.workerMetricsDAO.initialize();
|
||||
log.info("Starting worker metrics reporter");
|
||||
// Start with a delay for workerStatsManager to capture some values and start reporting.
|
||||
workerMetricsReporterFuture = workerMetricsThreadPool.scheduleAtFixedRate(
|
||||
new WorkerMetricStatsReporter(metricsFactory, workerIdentifier, workerMetricsManager, workerMetricsDAO),
|
||||
workerUtilizationAwareAssignmentConfig.inMemoryWorkerMetricsCaptureFrequencyMillis() * 2L,
|
||||
workerUtilizationAwareAssignmentConfig.workerMetricsReporterFreqInMillis(),
|
||||
TimeUnit.MILLISECONDS);
|
||||
}
|
||||
|
||||
private void stopWorkerMetricsReporter() {
|
||||
log.info("Stopping worker metrics reporter");
|
||||
if (workerMetricsReporterFuture != null) {
|
||||
workerMetricsReporterFuture.cancel(false);
|
||||
workerMetricsReporterFuture = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create LeaseOwnerToLeaseKey GSI for the lease table
|
||||
* @param blockingWait whether to wait for the GSI creation or not, if false, the gsi creation will be initiated
|
||||
* but this call will not block for its creation
|
||||
* @throws DependencyException If DDB fails unexpectedly when creating the GSI
|
||||
*/
|
||||
private void createGsi(final boolean blockingWait) throws DependencyException {
|
||||
log.info("Creating Lease table GSI if it does not exist");
|
||||
// KCLv3.0 always starts with GSI available
|
||||
leaseRefresher.createLeaseOwnerToLeaseKeyIndexIfNotExists();
|
||||
|
||||
if (blockingWait) {
|
||||
log.info("Waiting for Lease table GSI creation");
|
||||
final long secondsBetweenPolls = 10L;
|
||||
final long timeoutSeconds = 600L;
|
||||
final boolean isIndexActive =
|
||||
leaseRefresher.waitUntilLeaseOwnerToLeaseKeyIndexExists(secondsBetweenPolls, timeoutSeconds);
|
||||
|
||||
if (!isIndexActive) {
|
||||
throw new DependencyException(
|
||||
new IllegalStateException("Creating LeaseOwnerToLeaseKeyIndex on Lease table timed out"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize KCL with components and configuration to support upgrade from 2x. This can happen
|
||||
* at KCL Worker startup when MigrationStateMachine starts in ClientVersion.CLIENT_VERSION_UPGRADE_FROM_2x.
|
||||
* Or Dynamically during roll-forward from ClientVersion.CLIENT_VERSION_2x.
|
||||
*/
|
||||
public synchronized void initializeClientVersionForUpgradeFrom2x(final ClientVersion fromClientVersion)
|
||||
throws DependencyException {
|
||||
log.info("Initializing KCL components for upgrade from 2x from {}", fromClientVersion);
|
||||
|
||||
createGsi(false);
|
||||
startWorkerMetricsReporting();
|
||||
// LAM is not started until the dynamic flip to 3xWithRollback
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize KCL with components and configuration to run vanilla 3x functionality. This can happen
|
||||
* at KCL Worker startup when MigrationStateMachine starts in ClientVersion.CLIENT_VERSION_3x, or dynamically
|
||||
* during a new deployment when existing worker are in ClientVersion.CLIENT_VERSION_3x_WITH_ROLLBACK
|
||||
*/
|
||||
public synchronized void initializeClientVersionFor3x(final ClientVersion fromClientVersion)
|
||||
throws DependencyException {
|
||||
log.info("Initializing KCL components for 3x from {}", fromClientVersion);
|
||||
|
||||
log.info("Initializing LeaseAssignmentManager, DDB-lock-based leader decider, WorkerMetricStats manager"
|
||||
+ " and creating the Lease table GSI if it does not exist");
|
||||
if (fromClientVersion == ClientVersion.CLIENT_VERSION_INIT) {
|
||||
// gsi may already exist and be active for migrated application.
|
||||
createGsi(true);
|
||||
startWorkerMetricsReporting();
|
||||
log.info("Starting LAM");
|
||||
leaseAssignmentManager.start();
|
||||
}
|
||||
// nothing to do when transitioning from CLIENT_VERSION_3x_WITH_ROLLBACK.
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize KCL with components and configuration to run 2x compatible functionality
|
||||
* while allowing roll-forward. This can happen at KCL Worker startup when MigrationStateMachine
|
||||
* starts in ClientVersion.CLIENT_VERSION_2x (after a rollback)
|
||||
* Or Dynamically during rollback from CLIENT_VERSION_UPGRADE_FROM_2x or CLIENT_VERSION_3x_WITH_ROLLBACK.
|
||||
*/
|
||||
public synchronized void initializeClientVersionFor2x(final ClientVersion fromClientVersion) {
|
||||
log.info("Initializing KCL components for rollback to 2x from {}", fromClientVersion);
|
||||
|
||||
if (fromClientVersion != ClientVersion.CLIENT_VERSION_INIT) {
|
||||
// dynamic rollback
|
||||
stopWorkerMetricsReporter();
|
||||
// Migration Tool will delete the lease table LeaseOwner GSI
|
||||
// and WorkerMetricStats table
|
||||
}
|
||||
|
||||
if (fromClientVersion == ClientVersion.CLIENT_VERSION_3x_WITH_ROLLBACK) {
|
||||
// we are rolling back after flip
|
||||
currentAssignmentMode = DEFAULT_LEASE_COUNT_BASED_ASSIGNMENT;
|
||||
notifyLeaseAssignmentModeChange();
|
||||
log.info("Stopping LAM");
|
||||
leaseAssignmentManager.stop();
|
||||
final LeaderDecider leaderDecider = deterministicLeaderDeciderCreator.get();
|
||||
if (this.leaderDecider instanceof MigrationAdaptiveLeaderDecider) {
|
||||
log.info(
|
||||
"Updating LeaderDecider to {}", leaderDecider.getClass().getSimpleName());
|
||||
((MigrationAdaptiveLeaderDecider) this.leaderDecider).updateLeaderDecider(leaderDecider);
|
||||
} else {
|
||||
throw new IllegalStateException(String.format("Unexpected leader decider %s", this.leaderDecider));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize KCL with components and configuration to run vanilla 3x functionality
|
||||
* while allowing roll-back to 2x functionality. This can happen at KCL Worker startup
|
||||
* when MigrationStateMachine starts in ClientVersion.CLIENT_VERSION_3x_WITH_ROLLBACK (after the flip)
|
||||
* Or Dynamically during flip from CLIENT_VERSION_UPGRADE_FROM_2x.
|
||||
*/
|
||||
public synchronized void initializeClientVersionFor3xWithRollback(final ClientVersion fromClientVersion)
|
||||
throws DependencyException {
|
||||
log.info("Initializing KCL components for 3x with rollback from {}", fromClientVersion);
|
||||
|
||||
if (fromClientVersion == ClientVersion.CLIENT_VERSION_UPGRADE_FROM_2x) {
|
||||
// dynamic flip
|
||||
currentAssignmentMode = WORKER_UTILIZATION_AWARE_ASSIGNMENT;
|
||||
notifyLeaseAssignmentModeChange();
|
||||
final LeaderDecider leaderDecider = ddbLockBasedLeaderDeciderCreator.get();
|
||||
log.info("Updating LeaderDecider to {}", leaderDecider.getClass().getSimpleName());
|
||||
((MigrationAdaptiveLeaderDecider) this.leaderDecider).updateLeaderDecider(leaderDecider);
|
||||
} else {
|
||||
startWorkerMetricsReporting();
|
||||
}
|
||||
|
||||
log.info("Starting LAM");
|
||||
leaseAssignmentManager.start();
|
||||
}
|
||||
|
||||
/**
|
||||
* Synchronously invoke the consumer to change the lease assignment mode.
|
||||
*/
|
||||
private void notifyLeaseAssignmentModeChange() {
|
||||
if (dualMode) {
|
||||
log.info("Notifying {} of {}", leaseModeChangeConsumer, currentAssignmentMode);
|
||||
if (Objects.nonNull(leaseModeChangeConsumer)) {
|
||||
try {
|
||||
leaseModeChangeConsumer.updateLeaseAssignmentMode(currentAssignmentMode);
|
||||
} catch (final Exception e) {
|
||||
log.warn("LeaseAssignmentMode change consumer threw exception", e);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw new IllegalStateException("Unexpected assignment mode change");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,144 @@
|
|||
package software.amazon.kinesis.coordinator;
|
||||
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.CompletionException;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
import software.amazon.awssdk.services.dynamodb.DynamoDbAsyncClient;
|
||||
import software.amazon.awssdk.services.dynamodb.DynamoDbClient;
|
||||
import software.amazon.awssdk.services.dynamodb.model.BatchGetItemRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.BatchGetItemResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.model.BatchWriteItemRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.BatchWriteItemResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.model.CreateTableRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.CreateTableResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.model.DeleteItemRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.DeleteItemResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.model.DeleteTableRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.DeleteTableResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.model.DescribeTableRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.DescribeTableResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.model.GetItemRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.GetItemResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.model.PutItemRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.PutItemResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.model.QueryRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.QueryResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ScanRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ScanResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.model.UpdateItemRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.UpdateItemResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.paginators.BatchGetItemIterable;
|
||||
import software.amazon.awssdk.services.dynamodb.paginators.QueryIterable;
|
||||
import software.amazon.awssdk.services.dynamodb.paginators.ScanIterable;
|
||||
|
||||
/**
|
||||
* DDB Lock client depends on DynamoDbClient and KCL only has DynamoDbAsyncClient configured.
|
||||
* This wrapper delegates APIs from sync client to async client internally so that it can
|
||||
* be used with the DDB Lock client.
|
||||
*/
|
||||
public class DynamoDbAsyncToSyncClientAdapter implements DynamoDbClient {
|
||||
private final DynamoDbAsyncClient asyncClient;
|
||||
|
||||
    /**
     * @param asyncClient the async client that every synchronous call is delegated to
     */
    public DynamoDbAsyncToSyncClientAdapter(final DynamoDbAsyncClient asyncClient) {
        this.asyncClient = asyncClient;
    }
|
||||
|
||||
    /** Returns the delegate client's service name. */
    @Override
    public String serviceName() {
        return asyncClient.serviceName();
    }
|
||||
|
||||
    /** Closes the underlying async client. */
    @Override
    public void close() {
        asyncClient.close();
    }
|
||||
|
||||
    /**
     * Runs the supplied async call and blocks for its result. join() wraps
     * failures in CompletionException, so the cause is unwrapped and rethrown —
     * callers see the original exception, as a sync client would throw it.
     */
    private <T> T handleException(final Supplier<CompletableFuture<T>> task) {
        try {
            return task.get().join();
        } catch (final CompletionException e) {
            rethrow(e.getCause());
            // Unreachable: rethrow always throws, but the compiler cannot prove it.
            return null;
        }
    }
|
||||
|
||||
    /** Synchronously delegates to {@link DynamoDbAsyncClient#createTable}. */
    @Override
    public CreateTableResponse createTable(final CreateTableRequest request) {
        return handleException(() -> asyncClient.createTable(request));
    }
|
||||
|
||||
    /** Synchronously delegates to {@link DynamoDbAsyncClient#describeTable}. */
    @Override
    public DescribeTableResponse describeTable(final DescribeTableRequest request) {
        return handleException(() -> asyncClient.describeTable(request));
    }
|
||||
|
||||
    /** Synchronously delegates to {@link DynamoDbAsyncClient#deleteTable}. */
    @Override
    public DeleteTableResponse deleteTable(final DeleteTableRequest request) {
        return handleException(() -> asyncClient.deleteTable(request));
    }
|
||||
|
||||
    /** Synchronously delegates to {@link DynamoDbAsyncClient#deleteItem}. */
    @Override
    public DeleteItemResponse deleteItem(final DeleteItemRequest request) {
        return handleException(() -> asyncClient.deleteItem(request));
    }
|
||||
|
||||
@Override
|
||||
public GetItemResponse getItem(final GetItemRequest request) {
|
||||
return handleException(() -> asyncClient.getItem(request));
|
||||
}
|
||||
|
||||
@Override
|
||||
public PutItemResponse putItem(final PutItemRequest request) {
|
||||
return handleException(() -> asyncClient.putItem(request));
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateItemResponse updateItem(final UpdateItemRequest request) {
|
||||
return handleException(() -> asyncClient.updateItem(request));
|
||||
}
|
||||
|
||||
@Override
|
||||
public QueryResponse query(final QueryRequest request) {
|
||||
return handleException(() -> asyncClient.query(request));
|
||||
}
|
||||
|
||||
@Override
|
||||
public ScanResponse scan(final ScanRequest request) {
|
||||
return handleException(() -> asyncClient.scan(request));
|
||||
}
|
||||
|
||||
@Override
|
||||
public QueryIterable queryPaginator(final QueryRequest request) {
|
||||
return new QueryIterable(this, request);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ScanIterable scanPaginator(final ScanRequest request) {
|
||||
return new ScanIterable(this, request);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BatchGetItemResponse batchGetItem(final BatchGetItemRequest request) {
|
||||
return handleException(() -> asyncClient.batchGetItem(request));
|
||||
}
|
||||
|
||||
@Override
|
||||
public BatchWriteItemResponse batchWriteItem(final BatchWriteItemRequest request) {
|
||||
return handleException(() -> asyncClient.batchWriteItem(request));
|
||||
}
|
||||
|
||||
@Override
|
||||
public BatchGetItemIterable batchGetItemPaginator(final BatchGetItemRequest request) {
|
||||
return new BatchGetItemIterable(this, request);
|
||||
}
|
||||
|
||||
private static void rethrow(final Throwable e) {
|
||||
castAndThrow(e);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private static <T extends Throwable> void castAndThrow(final Throwable e) throws T {
|
||||
throw (T) e;
|
||||
}
|
||||
}
|
||||
|
|
@ -21,6 +21,8 @@ package software.amazon.kinesis.coordinator;
|
|||
* worker is one of the leaders designated to execute shard-sync and then acts accordingly.
|
||||
*/
|
||||
public interface LeaderDecider {
|
||||
String METRIC_OPERATION_LEADER_DECIDER = "LeaderDecider";
|
||||
String METRIC_OPERATION_LEADER_DECIDER_IS_LEADER = METRIC_OPERATION_LEADER_DECIDER + ":IsLeader";
|
||||
|
||||
/**
|
||||
* Method invoked to check the given workerId corresponds to one of the workers
|
||||
|
|
@ -36,4 +38,32 @@ public interface LeaderDecider {
|
|||
* being used in the LeaderDecider implementation.
|
||||
*/
|
||||
void shutdown();
|
||||
|
||||
/**
|
||||
* Performs initialization tasks for decider if any.
|
||||
*/
|
||||
default void initialize() {
|
||||
// No-op by default
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns if any ACTIVE leader exists that is elected by the current implementation.
|
||||
* Note: Some implementation (like DeterministicShuffleShardSyncLeaderDecider) will always have a leader and will
|
||||
* return true always.
|
||||
*/
|
||||
default boolean isAnyLeaderElected() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* If the current worker is the leader, then releases the leadership else does nothing.
|
||||
* This might not be relevant for some implementations, for e.g. DeterministicShuffleShardSyncLeaderDecider does
|
||||
* not have mechanism to release leadership.
|
||||
*
|
||||
* Current worker if leader releases leadership, it's possible that the current worker assume leadership sometime
|
||||
* later again in future elections.
|
||||
*/
|
||||
default void releaseLeadershipIfHeld() {
|
||||
// No-op by default
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,126 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.coordinator;
|
||||
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
|
||||
/**
|
||||
* Provides the lease assignment mode KCL must operate in during migration
|
||||
* from 2.x to 3.x.
|
||||
* KCL v2.x lease assignment is based on distributed-worker-stealing algorithm
|
||||
* which balances lease count across workers.
|
||||
* KCL v3.x lease assignment is based on a centralized-lease-assignment algorithm
|
||||
* which balances resource utilization metrics(e.g. CPU utilization) across workers.
|
||||
*
|
||||
* For a new application starting in KCL v3.x, there is no migration needed,
|
||||
* so KCL will initialize with the lease assignment mode accordingly, and it will
|
||||
* not change dynamically.
|
||||
*
|
||||
* During upgrade from 2.x to 3.x, KCL library needs an ability to
|
||||
* start in v2.x assignment mode but dynamically change to v3.x assignment.
|
||||
* In this case, both 2.x and 3.x lease assignment will be running but one
|
||||
* of them will be a no-op based on the mode.
|
||||
*
|
||||
* The methods and internal state is guarded for concurrent access to allow
|
||||
* both lease assignment algorithms to access the state concurrently while
|
||||
* it could be dynamically updated.
|
||||
*/
|
||||
@KinesisClientInternalApi
|
||||
@Slf4j
|
||||
@ThreadSafe
|
||||
@NoArgsConstructor
|
||||
public final class MigrationAdaptiveLeaseAssignmentModeProvider {
|
||||
|
||||
public enum LeaseAssignmentMode {
|
||||
/**
|
||||
* This is the 2.x assignment mode.
|
||||
* This mode assigns leases based on the number of leases.
|
||||
* This mode involves each worker independently determining how many leases to pick or how many leases to steal
|
||||
* from other workers.
|
||||
*/
|
||||
DEFAULT_LEASE_COUNT_BASED_ASSIGNMENT,
|
||||
|
||||
/**
|
||||
* This is the 3.x assigment mode.
|
||||
* This mode uses each worker's resource utilization to perform lease assignment.
|
||||
* Assignment is done by a single worker (elected leader), which looks at WorkerMetricStats for each worker to
|
||||
* determine lease assignment.
|
||||
*
|
||||
* This mode primarily does
|
||||
* 1. Starts WorkerMetricStatsManager on the worker which starts publishing WorkerMetricStats
|
||||
* 2. Starts the LeaseDiscoverer
|
||||
* 3. Creates if not already available the LeaseOwnerToLeaseKey GSI on the lease table and validate that is
|
||||
* ACTIVE.
|
||||
*/
|
||||
WORKER_UTILIZATION_AWARE_ASSIGNMENT;
|
||||
}
|
||||
|
||||
private LeaseAssignmentMode currentMode;
|
||||
private boolean initialized = false;
|
||||
private boolean dynamicModeChangeSupportNeeded;
|
||||
|
||||
/**
|
||||
* Specify whether both lease assignment algorithms should be initialized to
|
||||
* support dynamically changing lease mode.
|
||||
* @return true if lease assignment mode can change dynamically
|
||||
* false otherwise.
|
||||
*/
|
||||
public synchronized boolean dynamicModeChangeSupportNeeded() {
|
||||
return dynamicModeChangeSupportNeeded;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provide the current lease assignment mode in which KCL should perform lease assignment
|
||||
* @return the current lease assignment mode
|
||||
*/
|
||||
public synchronized LeaseAssignmentMode getLeaseAssignmentMode() {
|
||||
if (!initialized) {
|
||||
throw new IllegalStateException("AssignmentMode is not initialized");
|
||||
}
|
||||
return currentMode;
|
||||
}
|
||||
|
||||
synchronized void initialize(final boolean dynamicModeChangeSupportNeeded, final LeaseAssignmentMode mode) {
|
||||
if (!initialized) {
|
||||
log.info("Initializing dynamicModeChangeSupportNeeded {} mode {}", dynamicModeChangeSupportNeeded, mode);
|
||||
this.dynamicModeChangeSupportNeeded = dynamicModeChangeSupportNeeded;
|
||||
this.currentMode = mode;
|
||||
this.initialized = true;
|
||||
return;
|
||||
}
|
||||
log.info(
|
||||
"Already initialized dynamicModeChangeSupportNeeded {} mode {}. Ignoring new values {}, {}",
|
||||
this.dynamicModeChangeSupportNeeded,
|
||||
this.currentMode,
|
||||
dynamicModeChangeSupportNeeded,
|
||||
mode);
|
||||
}
|
||||
|
||||
synchronized void updateLeaseAssignmentMode(final LeaseAssignmentMode mode) {
|
||||
if (!initialized) {
|
||||
throw new IllegalStateException("Cannot change mode before initializing");
|
||||
}
|
||||
if (dynamicModeChangeSupportNeeded) {
|
||||
log.info("Changing Lease assignment mode from {} to {}", currentMode, mode);
|
||||
this.currentMode = mode;
|
||||
return;
|
||||
}
|
||||
throw new IllegalStateException(String.format(
|
||||
"Lease assignment mode already initialized to %s cannot" + " change to %s", this.currentMode, mode));
|
||||
}
|
||||
}
|
||||
|
|
@ -87,7 +87,7 @@ class PeriodicShardSyncManager {
|
|||
private final Map<StreamIdentifier, HashRangeHoleTracker> hashRangeHoleTrackerMap = new HashMap<>();
|
||||
|
||||
private final String workerId;
|
||||
private final LeaderDecider leaderDecider;
|
||||
private LeaderDecider leaderDecider;
|
||||
private final LeaseRefresher leaseRefresher;
|
||||
private final Map<StreamIdentifier, StreamConfig> currentStreamConfigMap;
|
||||
private final Function<StreamConfig, ShardSyncTaskManager> shardSyncTaskManagerProvider;
|
||||
|
|
@ -105,7 +105,6 @@ class PeriodicShardSyncManager {
|
|||
|
||||
PeriodicShardSyncManager(
|
||||
String workerId,
|
||||
LeaderDecider leaderDecider,
|
||||
LeaseRefresher leaseRefresher,
|
||||
Map<StreamIdentifier, StreamConfig> currentStreamConfigMap,
|
||||
Function<StreamConfig, ShardSyncTaskManager> shardSyncTaskManagerProvider,
|
||||
|
|
@ -117,7 +116,6 @@ class PeriodicShardSyncManager {
|
|||
AtomicBoolean leaderSynced) {
|
||||
this(
|
||||
workerId,
|
||||
leaderDecider,
|
||||
leaseRefresher,
|
||||
currentStreamConfigMap,
|
||||
shardSyncTaskManagerProvider,
|
||||
|
|
@ -132,7 +130,6 @@ class PeriodicShardSyncManager {
|
|||
|
||||
PeriodicShardSyncManager(
|
||||
String workerId,
|
||||
LeaderDecider leaderDecider,
|
||||
LeaseRefresher leaseRefresher,
|
||||
Map<StreamIdentifier, StreamConfig> currentStreamConfigMap,
|
||||
Function<StreamConfig, ShardSyncTaskManager> shardSyncTaskManagerProvider,
|
||||
|
|
@ -144,9 +141,7 @@ class PeriodicShardSyncManager {
|
|||
int leasesRecoveryAuditorInconsistencyConfidenceThreshold,
|
||||
AtomicBoolean leaderSynced) {
|
||||
Validate.notBlank(workerId, "WorkerID is required to initialize PeriodicShardSyncManager.");
|
||||
Validate.notNull(leaderDecider, "LeaderDecider is required to initialize PeriodicShardSyncManager.");
|
||||
this.workerId = workerId;
|
||||
this.leaderDecider = leaderDecider;
|
||||
this.leaseRefresher = leaseRefresher;
|
||||
this.currentStreamConfigMap = currentStreamConfigMap;
|
||||
this.shardSyncTaskManagerProvider = shardSyncTaskManagerProvider;
|
||||
|
|
@ -160,7 +155,9 @@ class PeriodicShardSyncManager {
|
|||
this.leaderSynced = leaderSynced;
|
||||
}
|
||||
|
||||
public synchronized TaskResult start() {
|
||||
public synchronized TaskResult start(final LeaderDecider leaderDecider) {
|
||||
Validate.notNull(leaderDecider, "LeaderDecider is required to start PeriodicShardSyncManager.");
|
||||
this.leaderDecider = leaderDecider;
|
||||
if (!isRunning) {
|
||||
final Runnable periodicShardSyncer = () -> {
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ import java.util.Iterator;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
|
|
@ -44,6 +45,7 @@ import java.util.stream.Collectors;
|
|||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Stopwatch;
|
||||
import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
||||
import io.reactivex.rxjava3.plugins.RxJavaPlugins;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
|
|
@ -55,15 +57,23 @@ import lombok.extern.slf4j.Slf4j;
|
|||
import software.amazon.awssdk.arns.Arn;
|
||||
import software.amazon.awssdk.regions.Region;
|
||||
import software.amazon.awssdk.utils.Validate;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.checkpoint.CheckpointConfig;
|
||||
import software.amazon.kinesis.checkpoint.ShardRecordProcessorCheckpointer;
|
||||
import software.amazon.kinesis.common.StreamConfig;
|
||||
import software.amazon.kinesis.common.StreamIdentifier;
|
||||
import software.amazon.kinesis.coordinator.assignment.LeaseAssignmentManager;
|
||||
import software.amazon.kinesis.coordinator.migration.MigrationStateMachine;
|
||||
import software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl;
|
||||
import software.amazon.kinesis.leader.DynamoDBLockBasedLeaderDecider;
|
||||
import software.amazon.kinesis.leader.MigrationAdaptiveLeaderDecider;
|
||||
import software.amazon.kinesis.leases.HierarchicalShardSyncer;
|
||||
import software.amazon.kinesis.leases.Lease;
|
||||
import software.amazon.kinesis.leases.LeaseCleanupManager;
|
||||
import software.amazon.kinesis.leases.LeaseCoordinator;
|
||||
import software.amazon.kinesis.leases.LeaseManagementConfig;
|
||||
import software.amazon.kinesis.leases.LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig;
|
||||
import software.amazon.kinesis.leases.LeaseManagementFactory;
|
||||
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||
import software.amazon.kinesis.leases.LeaseSerializer;
|
||||
import software.amazon.kinesis.leases.MultiStreamLease;
|
||||
|
|
@ -98,6 +108,9 @@ import software.amazon.kinesis.retrieval.AggregatorUtil;
|
|||
import software.amazon.kinesis.retrieval.RecordsPublisher;
|
||||
import software.amazon.kinesis.retrieval.RetrievalConfig;
|
||||
import software.amazon.kinesis.schemaregistry.SchemaRegistryDecoder;
|
||||
import software.amazon.kinesis.worker.WorkerMetricsSelector;
|
||||
import software.amazon.kinesis.worker.metricstats.WorkerMetricStatsDAO;
|
||||
import software.amazon.kinesis.worker.metricstats.WorkerMetricStatsManager;
|
||||
|
||||
import static software.amazon.kinesis.common.ArnUtil.constructStreamArn;
|
||||
import static software.amazon.kinesis.processor.FormerStreamsLeasesDeletionStrategy.StreamsLeasesDeletionType;
|
||||
|
|
@ -106,12 +119,14 @@ import static software.amazon.kinesis.processor.FormerStreamsLeasesDeletionStrat
|
|||
/**
|
||||
*
|
||||
*/
|
||||
@Getter
|
||||
@Getter(AccessLevel.PRIVATE)
|
||||
@Accessors(fluent = true)
|
||||
@Slf4j
|
||||
@KinesisClientInternalApi
|
||||
public class Scheduler implements Runnable {
|
||||
|
||||
private static final int PERIODIC_SHARD_SYNC_MAX_WORKERS_DEFAULT = 1;
|
||||
|
||||
private static final long LEASE_TABLE_CHECK_FREQUENCY_MILLIS = 3 * 1000L;
|
||||
private static final long MIN_WAIT_TIME_FOR_LEASE_TABLE_CHECK_MILLIS = 1000L;
|
||||
private static final long MAX_WAIT_TIME_FOR_LEASE_TABLE_CHECK_MILLIS = 30 * 1000L;
|
||||
|
|
@ -133,7 +148,9 @@ public class Scheduler implements Runnable {
|
|||
private final ProcessorConfig processorConfig;
|
||||
private final RetrievalConfig retrievalConfig;
|
||||
|
||||
@Getter(AccessLevel.PACKAGE)
|
||||
private final String applicationName;
|
||||
|
||||
private final int maxInitializationAttempts;
|
||||
private final Checkpointer checkpoint;
|
||||
private final long shardConsumerDispatchPollIntervalMillis;
|
||||
|
|
@ -156,7 +173,10 @@ public class Scheduler implements Runnable {
|
|||
private final long failoverTimeMillis;
|
||||
private final long taskBackoffTimeMillis;
|
||||
private final boolean isMultiStreamMode;
|
||||
|
||||
@Getter(AccessLevel.PACKAGE)
|
||||
private final Map<StreamIdentifier, StreamConfig> currentStreamConfigMap = new StreamConfigMap();
|
||||
|
||||
private final StreamTracker streamTracker;
|
||||
private final FormerStreamsLeasesDeletionStrategy formerStreamsLeasesDeletionStrategy;
|
||||
private final long listShardsBackoffTimeMillis;
|
||||
|
|
@ -167,19 +187,30 @@ public class Scheduler implements Runnable {
|
|||
private final AggregatorUtil aggregatorUtil;
|
||||
private final Function<StreamConfig, HierarchicalShardSyncer> hierarchicalShardSyncerProvider;
|
||||
private final long schedulerInitializationBackoffTimeMillis;
|
||||
private final LeaderDecider leaderDecider;
|
||||
private LeaderDecider leaderDecider;
|
||||
|
||||
@Getter(AccessLevel.PACKAGE)
|
||||
private final Map<StreamIdentifier, Instant> staleStreamDeletionMap = new HashMap<>();
|
||||
|
||||
private final LeaseCleanupManager leaseCleanupManager;
|
||||
private final SchemaRegistryDecoder schemaRegistryDecoder;
|
||||
|
||||
@Getter(AccessLevel.PACKAGE)
|
||||
private final DeletedStreamListProvider deletedStreamListProvider;
|
||||
|
||||
private final MigrationStateMachine migrationStateMachine;
|
||||
private final DynamicMigrationComponentsInitializer migrationComponentsInitializer;
|
||||
private final MigrationAdaptiveLeaseAssignmentModeProvider leaseAssignmentModeProvider;
|
||||
|
||||
// Holds consumers for shards the worker is currently tracking. Key is shard
|
||||
// info, value is ShardConsumer.
|
||||
@Getter(AccessLevel.PACKAGE)
|
||||
private final ConcurrentMap<ShardInfo, ShardConsumer> shardInfoShardConsumerMap = new ConcurrentHashMap<>();
|
||||
|
||||
private volatile boolean shutdown;
|
||||
private volatile long shutdownStartTimeMillis;
|
||||
|
||||
@Getter(AccessLevel.PACKAGE)
|
||||
private volatile boolean shutdownComplete = false;
|
||||
|
||||
private final Object lock = new Object();
|
||||
|
|
@ -187,8 +218,6 @@ public class Scheduler implements Runnable {
|
|||
private final Stopwatch streamSyncWatch = Stopwatch.createUnstarted();
|
||||
|
||||
private boolean leasesSyncedOnAppInit = false;
|
||||
|
||||
@Getter(AccessLevel.NONE)
|
||||
private final AtomicBoolean leaderSynced = new AtomicBoolean(false);
|
||||
|
||||
/**
|
||||
|
|
@ -200,7 +229,6 @@ public class Scheduler implements Runnable {
|
|||
* CountDownLatch used by the GracefulShutdownCoordinator. Reaching zero means that
|
||||
* the scheduler's finalShutdown() call has completed.
|
||||
*/
|
||||
@Getter(AccessLevel.NONE)
|
||||
private final CountDownLatch finalShutdownLatch = new CountDownLatch(1);
|
||||
|
||||
@VisibleForTesting
|
||||
|
|
@ -259,11 +287,32 @@ public class Scheduler implements Runnable {
|
|||
// Determine leaseSerializer based on availability of MultiStreamTracker.
|
||||
final LeaseSerializer leaseSerializer =
|
||||
isMultiStreamMode ? new DynamoDBMultiStreamLeaseSerializer() : new DynamoDBLeaseSerializer();
|
||||
this.leaseCoordinator = this.leaseManagementConfig
|
||||
.leaseManagementFactory(leaseSerializer, isMultiStreamMode)
|
||||
.createLeaseCoordinator(this.metricsFactory);
|
||||
|
||||
final LeaseManagementFactory leaseManagementFactory =
|
||||
this.leaseManagementConfig.leaseManagementFactory(leaseSerializer, isMultiStreamMode);
|
||||
this.leaseCoordinator =
|
||||
leaseManagementFactory.createLeaseCoordinator(this.metricsFactory, shardInfoShardConsumerMap);
|
||||
this.leaseRefresher = this.leaseCoordinator.leaseRefresher();
|
||||
|
||||
final CoordinatorStateDAO coordinatorStateDAO = new CoordinatorStateDAO(
|
||||
leaseManagementConfig.dynamoDBClient(), coordinatorConfig().coordinatorStateConfig());
|
||||
this.leaseAssignmentModeProvider = new MigrationAdaptiveLeaseAssignmentModeProvider();
|
||||
this.migrationComponentsInitializer = createDynamicMigrationComponentsInitializer(coordinatorStateDAO);
|
||||
this.migrationStateMachine = new MigrationStateMachineImpl(
|
||||
metricsFactory,
|
||||
System::currentTimeMillis,
|
||||
coordinatorStateDAO,
|
||||
Executors.newScheduledThreadPool(
|
||||
2,
|
||||
new ThreadFactoryBuilder()
|
||||
.setNameFormat("MigrationStateMachine-%04d")
|
||||
.build()),
|
||||
coordinatorConfig.clientVersionConfig(),
|
||||
new Random(),
|
||||
this.migrationComponentsInitializer,
|
||||
leaseManagementConfig.workerIdentifier(),
|
||||
Duration.ofMinutes(10).getSeconds());
|
||||
|
||||
//
|
||||
// TODO: Figure out what to do with lease manage <=> checkpoint relationship
|
||||
//
|
||||
|
|
@ -280,9 +329,8 @@ public class Scheduler implements Runnable {
|
|||
this.diagnosticEventFactory = diagnosticEventFactory;
|
||||
this.diagnosticEventHandler = new DiagnosticEventLogger();
|
||||
this.deletedStreamListProvider = new DeletedStreamListProvider();
|
||||
this.shardSyncTaskManagerProvider = streamConfig -> this.leaseManagementConfig
|
||||
.leaseManagementFactory(leaseSerializer, isMultiStreamMode)
|
||||
.createShardSyncTaskManager(this.metricsFactory, streamConfig, this.deletedStreamListProvider);
|
||||
this.shardSyncTaskManagerProvider = streamConfig -> leaseManagementFactory.createShardSyncTaskManager(
|
||||
this.metricsFactory, streamConfig, this.deletedStreamListProvider);
|
||||
this.shardPrioritization = this.coordinatorConfig.shardPrioritization();
|
||||
this.cleanupLeasesUponShardCompletion = this.leaseManagementConfig.cleanupLeasesUponShardCompletion();
|
||||
this.skipShardSyncAtWorkerInitializationIfLeasesExist =
|
||||
|
|
@ -299,8 +347,6 @@ public class Scheduler implements Runnable {
|
|||
this.workerStateChangeListener =
|
||||
this.coordinatorConfig.coordinatorFactory().createWorkerStateChangeListener();
|
||||
}
|
||||
this.leaderDecider = new DeterministicShuffleShardSyncLeaderDecider(
|
||||
leaseRefresher, Executors.newSingleThreadScheduledExecutor(), PERIODIC_SHARD_SYNC_MAX_WORKERS_DEFAULT);
|
||||
this.failoverTimeMillis = this.leaseManagementConfig.failoverTimeMillis();
|
||||
this.taskBackoffTimeMillis = this.lifecycleConfig.taskBackoffTimeMillis();
|
||||
this.listShardsBackoffTimeMillis = this.retrievalConfig.listShardsBackoffTimeInMillis();
|
||||
|
|
@ -315,7 +361,6 @@ public class Scheduler implements Runnable {
|
|||
this.coordinatorConfig.schedulerInitializationBackoffTimeMillis();
|
||||
this.leaderElectedPeriodicShardSyncManager = new PeriodicShardSyncManager(
|
||||
leaseManagementConfig.workerIdentifier(),
|
||||
leaderDecider,
|
||||
leaseRefresher,
|
||||
currentStreamConfigMap,
|
||||
shardSyncTaskManagerProvider,
|
||||
|
|
@ -325,14 +370,69 @@ public class Scheduler implements Runnable {
|
|||
leaseManagementConfig.leasesRecoveryAuditorExecutionFrequencyMillis(),
|
||||
leaseManagementConfig.leasesRecoveryAuditorInconsistencyConfidenceThreshold(),
|
||||
leaderSynced);
|
||||
this.leaseCleanupManager = this.leaseManagementConfig
|
||||
.leaseManagementFactory(leaseSerializer, isMultiStreamMode)
|
||||
.createLeaseCleanupManager(metricsFactory);
|
||||
this.leaseCleanupManager = leaseManagementFactory.createLeaseCleanupManager(metricsFactory);
|
||||
this.schemaRegistryDecoder = this.retrievalConfig.glueSchemaRegistryDeserializer() == null
|
||||
? null
|
||||
: new SchemaRegistryDecoder(this.retrievalConfig.glueSchemaRegistryDeserializer());
|
||||
}
|
||||
|
||||
    /**
     * Builds the {@link DynamicMigrationComponentsInitializer} that wires together the KCLv3
     * migration components: worker-metrics collection/publication, the deferred
     * LeaseAssignmentManager, and the candidate leader-decider implementations.
     *
     * <p>Depends on LeaseCoordinator and LeaseRefresher to be created first.
     *
     * @param coordinatorStateDAO DAO over the coordinator state table, shared with the
     *        DDB-lock-based leader decider created below
     * @return a fully wired (but not yet started) migration components initializer
     */
    private DynamicMigrationComponentsInitializer createDynamicMigrationComponentsInitializer(
            final CoordinatorStateDAO coordinatorStateDAO) {
        // Auto-select default WorkerMetricStats when the application configured none
        // (and did not disable worker metrics).
        selectWorkerMetricsIfAvailable(leaseManagementConfig.workerUtilizationAwareAssignmentConfig());

        // In-memory capture/aggregation of this worker's metrics at the configured frequency.
        final WorkerMetricStatsManager workerMetricsManager = new WorkerMetricStatsManager(
                leaseManagementConfig.workerUtilizationAwareAssignmentConfig().noOfPersistedMetricsPerWorkerMetrics(),
                leaseManagementConfig.workerUtilizationAwareAssignmentConfig().workerMetricList(),
                metricsFactory,
                leaseManagementConfig
                        .workerUtilizationAwareAssignmentConfig()
                        .inMemoryWorkerMetricsCaptureFrequencyMillis());

        // Persists aggregated worker metrics to the worker-metrics DDB table at the
        // configured reporting frequency.
        final WorkerMetricStatsDAO workerMetricsDAO = new WorkerMetricStatsDAO(
                leaseManagementConfig.dynamoDBClient(),
                leaseManagementConfig.workerUtilizationAwareAssignmentConfig().workerMetricsTableConfig(),
                leaseManagementConfig.workerUtilizationAwareAssignmentConfig().workerMetricsReporterFreqInMillis());

        return DynamicMigrationComponentsInitializer.builder()
                .metricsFactory(metricsFactory)
                .leaseRefresher(leaseRefresher)
                .coordinatorStateDAO(coordinatorStateDAO)
                // Single-threaded scheduler dedicated to worker-metrics reporting.
                .workerMetricsThreadPool(Executors.newScheduledThreadPool(
                        1,
                        new ThreadFactoryBuilder()
                                .setNameFormat("worker-metrics-reporter")
                                .build()))
                .workerMetricsDAO(workerMetricsDAO)
                .workerMetricsManager(workerMetricsManager)
                // Single-threaded scheduler for the LeaseAssignmentManager (LAM).
                .lamThreadPool(Executors.newScheduledThreadPool(
                        1,
                        new ThreadFactoryBuilder().setNameFormat("lam-thread").build()))
                // LAM is constructed lazily via this factory; the initializer decides when
                // (presumably on switch to v3 assignment — confirm in the initializer).
                .lamCreator((lamThreadPool, leaderDecider) -> new LeaseAssignmentManager(
                        leaseRefresher,
                        workerMetricsDAO,
                        leaderDecider,
                        leaseManagementConfig.workerUtilizationAwareAssignmentConfig(),
                        leaseCoordinator.workerIdentifier(),
                        leaseManagementConfig.failoverTimeMillis(),
                        metricsFactory,
                        lamThreadPool,
                        System::nanoTime,
                        leaseManagementConfig.maxLeasesForWorker(),
                        leaseManagementConfig.gracefulLeaseHandoffConfig()))
                // Three leader-decider factories; the initializer selects among them.
                .adaptiveLeaderDeciderCreator(() -> new MigrationAdaptiveLeaderDecider(metricsFactory))
                .deterministicLeaderDeciderCreator(() -> new DeterministicShuffleShardSyncLeaderDecider(
                        leaseRefresher, Executors.newSingleThreadScheduledExecutor(), 1, metricsFactory))
                .ddbLockBasedLeaderDeciderCreator(() -> DynamoDBLockBasedLeaderDecider.create(
                        coordinatorStateDAO, leaseCoordinator.workerIdentifier(), metricsFactory))
                .workerIdentifier(leaseCoordinator.workerIdentifier())
                .workerUtilizationAwareAssignmentConfig(leaseManagementConfig.workerUtilizationAwareAssignmentConfig())
                .leaseAssignmentModeProvider(leaseAssignmentModeProvider)
                .build();
    }
|
||||
|
||||
/**
|
||||
* Start consuming data from the stream, and pass it to the application record processors.
|
||||
*/
|
||||
|
|
@ -342,13 +442,19 @@ public class Scheduler implements Runnable {
|
|||
return;
|
||||
}
|
||||
|
||||
final MetricsScope metricsScope =
|
||||
MetricsUtil.createMetricsWithOperation(metricsFactory, "Scheduler:Initialize");
|
||||
boolean success = false;
|
||||
try {
|
||||
initialize();
|
||||
success = true;
|
||||
log.info("Initialization complete. Starting worker loop.");
|
||||
} catch (RuntimeException e) {
|
||||
log.error("Unable to initialize after {} attempts. Shutting down.", maxInitializationAttempts, e);
|
||||
workerStateChangeListener.onAllInitializationAttemptsFailed(e);
|
||||
shutdown();
|
||||
} finally {
|
||||
MetricsUtil.addSuccess(metricsScope, "Initialize", success, MetricsLevel.SUMMARY);
|
||||
}
|
||||
while (!shouldShutdown()) {
|
||||
runProcessLoop();
|
||||
|
|
@ -363,14 +469,13 @@ public class Scheduler implements Runnable {
|
|||
synchronized (lock) {
|
||||
registerErrorHandlerForUndeliverableAsyncTaskExceptions();
|
||||
workerStateChangeListener.onWorkerStateChange(WorkerStateChangeListener.WorkerState.INITIALIZING);
|
||||
|
||||
boolean isDone = false;
|
||||
Exception lastException = null;
|
||||
|
||||
for (int i = 0; (!isDone) && (i < maxInitializationAttempts); i++) {
|
||||
try {
|
||||
log.info("Initializing LeaseCoordinator attempt {}", (i + 1));
|
||||
leaseCoordinator.initialize();
|
||||
|
||||
if (!skipShardSyncAtWorkerInitializationIfLeasesExist || leaseRefresher.isLeaseTableEmpty()) {
|
||||
if (shouldInitiateLeaseSync()) {
|
||||
log.info(
|
||||
|
|
@ -382,21 +487,29 @@ public class Scheduler implements Runnable {
|
|||
log.info("Skipping shard sync per configuration setting (and lease table is not empty)");
|
||||
}
|
||||
|
||||
// Initialize the state machine after lease table has been initialized
|
||||
// Migration state machine creates and waits for GSI if necessary,
|
||||
// it must be initialized before starting leaseCoordinator, which runs LeaseDiscoverer
|
||||
// and that requires GSI to be present and active. (migrationStateMachine.initialize is idempotent)
|
||||
migrationStateMachine.initialize();
|
||||
leaderDecider = migrationComponentsInitializer.leaderDecider();
|
||||
|
||||
leaseCleanupManager.start();
|
||||
|
||||
// If we reach this point, then we either skipped the lease sync or did not have any exception
|
||||
// for any of the shard sync in the previous attempt.
|
||||
|
||||
if (!leaseCoordinator.isRunning()) {
|
||||
log.info("Starting LeaseCoordinator");
|
||||
leaseCoordinator.start();
|
||||
leaseCoordinator.start(leaseAssignmentModeProvider);
|
||||
} else {
|
||||
log.info("LeaseCoordinator is already running. No need to start it.");
|
||||
}
|
||||
log.info("Scheduling periodicShardSync");
|
||||
leaderElectedPeriodicShardSyncManager.start();
|
||||
leaderElectedPeriodicShardSyncManager.start(leaderDecider);
|
||||
streamSyncWatch.start();
|
||||
isDone = true;
|
||||
} catch (Exception e) {
|
||||
} catch (final Exception e) {
|
||||
log.error("Caught exception when initializing LeaseCoordinator", e);
|
||||
lastException = e;
|
||||
}
|
||||
|
|
@ -863,7 +976,7 @@ public class Scheduler implements Runnable {
|
|||
leaseCoordinator, lease, notificationCompleteLatch, shutdownCompleteLatch);
|
||||
ShardInfo shardInfo = DynamoDBLeaseCoordinator.convertLeaseToAssignment(lease);
|
||||
ShardConsumer consumer = shardInfoShardConsumerMap.get(shardInfo);
|
||||
if (consumer != null) {
|
||||
if (consumer != null && !consumer.isShutdown()) {
|
||||
consumer.gracefulShutdown(shutdownNotification);
|
||||
} else {
|
||||
//
|
||||
|
|
@ -912,6 +1025,8 @@ public class Scheduler implements Runnable {
|
|||
shutdown = true;
|
||||
shutdownStartTimeMillis = System.currentTimeMillis();
|
||||
|
||||
migrationStateMachine.shutdown();
|
||||
migrationComponentsInitializer.shutdown();
|
||||
// Stop lease coordinator, so leases are not renewed or stolen from other workers.
|
||||
// Lost leases will force Worker to begin shutdown process for all shard consumers in
|
||||
// Worker.run().
|
||||
|
|
@ -1228,4 +1343,23 @@ public class Scheduler implements Runnable {
|
|||
public Future<Void> requestShutdown() {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* If WorkerMetricStats list is empty and the disable flag is false, select WorkerMetricStats automatically.
|
||||
*/
|
||||
private void selectWorkerMetricsIfAvailable(
|
||||
final WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig) {
|
||||
try {
|
||||
if (workerUtilizationAwareAssignmentConfig.workerMetricList().isEmpty()
|
||||
&& !workerUtilizationAwareAssignmentConfig.disableWorkerMetrics()) {
|
||||
workerUtilizationAwareAssignmentConfig.workerMetricList(
|
||||
WorkerMetricsSelector.create().getDefaultWorkerMetrics());
|
||||
}
|
||||
} catch (final Exception e) {
|
||||
log.warn(
|
||||
"Exception encountered during WorkerMetricStats selection. If this is persistent please try setting the "
|
||||
+ "WorkerMetricStats explicitly.",
|
||||
e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,21 @@
|
|||
package software.amazon.kinesis.coordinator.assignment;

import java.util.List;

import software.amazon.kinesis.leases.Lease;

/**
 * Strategy interface for assigning Kinesis leases to workers. Implementations decide
 * both how unowned or expired leases are handed out and how ongoing load is rebalanced
 * across the fleet.
 */
public interface LeaseAssignmentDecider {

    /**
     * Assigns expiredOrUnAssignedLeases to the available workers.
     *
     * @param expiredOrUnAssignedLeases leases that are currently unowned, or whose owner has expired
     */
    void assignExpiredOrUnassignedLeases(final List<Lease> expiredOrUnAssignedLeases);

    /**
     * Balances the leases between workers in the fleet.
     * Implementation can choose to balance leases based on lease count or throughput or to bring the variance in
     * resource utilization to a minimum.
     * Check documentation on implementation class to see how it balances the leases.
     */
    void balanceWorkerVariance();
}
|
||||
|
|
@ -0,0 +1,719 @@
|
|||
package software.amazon.kinesis.coordinator.assignment;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.CompletionException;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.coordinator.LeaderDecider;
|
||||
import software.amazon.kinesis.leases.Lease;
|
||||
import software.amazon.kinesis.leases.LeaseManagementConfig;
|
||||
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||
import software.amazon.kinesis.metrics.MetricsScope;
|
||||
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||
import software.amazon.kinesis.metrics.NullMetricsScope;
|
||||
import software.amazon.kinesis.worker.metricstats.WorkerMetricStats;
|
||||
import software.amazon.kinesis.worker.metricstats.WorkerMetricStatsDAO;
|
||||
|
||||
import static java.util.Objects.isNull;
|
||||
import static java.util.Objects.nonNull;
|
||||
|
||||
/**
|
||||
* Performs the LeaseAssignment for the application. This starts by loading the leases and workerMetrics from the
|
||||
* storage and then starts by assignment (in-memory) of expired and/or unassigned leases after which it tries to perform
|
||||
* balancing of load among the workers by re-assign leases.
|
||||
* In the end, performs actual assignment by writing to storage.
|
||||
*/
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@KinesisClientInternalApi
|
||||
public final class LeaseAssignmentManager {
|
||||
|
||||
/**
 * Default number of continuous failure execution after which leadership is released.
 */
private static final int DEFAULT_FAILURE_COUNT_TO_SWITCH_LEADER = 3;

/**
 * Default multiplier for LAM frequency with respect to leaseDurationMillis (lease failover millis).
 * If leaseDurationMillis is 10000 millis, default LAM frequency is 20000 millis.
 */
private static final int DEFAULT_LEASE_ASSIGNMENT_MANAGER_FREQ_MULTIPLIER = 2;

/**
 * Default parallelism factor for scaling lease table.
 */
private static final int DEFAULT_LEASE_TABLE_SCAN_PARALLELISM_FACTOR = 10;

// Metric emitted when this worker force-releases leadership after repeated assignment failures.
private static final String FORCE_LEADER_RELEASE_METRIC_NAME = "ForceLeaderRelease";

/**
 * Default retry attempt for loading leases and workers before giving up.
 */
private static final int DDB_LOAD_RETRY_ATTEMPT = 1;

/**
 * Internal threadpool used to parallely perform assignment operation by calling storage.
 */
private static final ExecutorService LEASE_ASSIGNMENT_CALL_THREAD_POOL =
        Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());

private static final String METRICS_LEASE_ASSIGNMENT_MANAGER = "LeaseAssignmentManager";
private static final String METRICS_INCOMPLETE_EXPIRED_LEASES_ASSIGNMENT =
        "LeaseAssignmentManager.IncompleteExpiredLeasesAssignment";
// Number of reporting intervals a worker may skip before being treated as dead.
public static final int DEFAULT_NO_OF_SKIP_STAT_FOR_DEAD_WORKER_THRESHOLD = 2;

// Collaborators injected via the Lombok-generated constructor (@RequiredArgsConstructor).
private final LeaseRefresher leaseRefresher;
private final WorkerMetricStatsDAO workerMetricsDAO;
private final LeaderDecider leaderDecider;
private final LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig config;
private final String currentWorkerId;
private final Long leaseDurationMillis;
private final MetricsFactory metricsFactory;
private final ScheduledExecutorService executorService;
// Monotonic nano-time source; injected for testability (see getNanoTimeMillis()).
private final Supplier<Long> nanoTimeProvider;
private final int maxLeasesForWorker;
private final LeaseManagementConfig.GracefulLeaseHandoffConfig gracefulLeaseHandoffConfig;
// True once this worker has observed itself as leader during the current leadership stint;
// reset when leadership is lost so state can be cleaned on the next takeover.
private boolean tookOverLeadershipInThisRun = false;
// Snapshot of leases from the previous LAM run, keyed by leaseKey; used to detect
// leaseCounter changes and to carry over graceful-shutdown timeouts.
private final Map<String, Lease> prevRunLeasesState = new HashMap<>();

// Handle to the scheduled periodic assignment task; null when LAM is stopped.
private Future<?> managerFuture;

private int noOfContinuousFailedAttempts = 0;
// Run counter used to throttle variance balancing to every varianceBalancingFrequency-th run.
private int lamRunCounter = 0;
||||
|
||||
public synchronized void start() {
|
||||
if (isNull(managerFuture)) {
|
||||
// LAM can be dynamically started/stopped and restarted during MigrationStateMachine execution
|
||||
// so reset the flag to refresh the state before processing during a restart of LAM.
|
||||
tookOverLeadershipInThisRun = false;
|
||||
managerFuture = executorService.scheduleWithFixedDelay(
|
||||
this::performAssignment,
|
||||
0L,
|
||||
leaseDurationMillis * DEFAULT_LEASE_ASSIGNMENT_MANAGER_FREQ_MULTIPLIER,
|
||||
TimeUnit.MILLISECONDS);
|
||||
log.info("Started LeaseAssignmentManager");
|
||||
return;
|
||||
}
|
||||
log.info("LeaseAssignmentManager already running...");
|
||||
}
|
||||
|
||||
public synchronized void stop() {
|
||||
if (nonNull(managerFuture)) {
|
||||
log.info("Completed shutdown of LeaseAssignmentManager");
|
||||
managerFuture.cancel(true);
|
||||
managerFuture = null;
|
||||
return;
|
||||
}
|
||||
log.info("LeaseAssignmentManager is not running...");
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates the MetricsScope for given {@param operation} by calling metricsFactory and falls back to
|
||||
* NullMetricsScope if failed to create MetricsScope.
|
||||
* @param operation Operation name for MetricsScope
|
||||
* @return instance of MetricsScope
|
||||
*/
|
||||
private MetricsScope createMetricsScope(final String operation) {
|
||||
try {
|
||||
return MetricsUtil.createMetricsWithOperation(metricsFactory, operation);
|
||||
} catch (final Exception e) {
|
||||
log.error("Failed to create metrics scope defaulting to no metrics.", e);
|
||||
return new NullMetricsScope();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Single LAM iteration, run on the scheduled executor. On the leader it: loads leases and
 * WorkerMetricStats, assigns expired/unassigned leases, optionally rebalances by variance,
 * writes assignments back to storage, and cleans stale WorkerMetricStats entries.
 * Non-leaders return immediately. After DEFAULT_FAILURE_COUNT_TO_SWITCH_LEADER consecutive
 * failures the worker force-releases leadership.
 */
private void performAssignment() {

    final MetricsScope metricsScope = createMetricsScope(METRICS_LEASE_ASSIGNMENT_MANAGER);
    final long startTime = System.currentTimeMillis();
    boolean success = false;

    try {

        // If the current worker is not leader, then do nothing as assignment is executed on leader.
        if (!leaderDecider.isLeader(currentWorkerId)) {
            log.info("Current worker {} is not a leader, ignore", currentWorkerId);
            this.tookOverLeadershipInThisRun = false;
            success = true;
            return;
        }

        if (!this.tookOverLeadershipInThisRun) {
            // This means that there was leader change, perform cleanup of state as this is leader switch.
            this.tookOverLeadershipInThisRun = true;
            this.lamRunCounter = 0;
            prepareAfterLeaderSwitch();
        }
        log.info("Current worker {} is a leader, performing assignment", currentWorkerId);

        // Fresh in-memory snapshot of leases + worker metrics for this iteration.
        final InMemoryStorageView inMemoryStorageView = new InMemoryStorageView();

        final long loadStartTime = System.currentTimeMillis();
        inMemoryStorageView.loadInMemoryStorageView(metricsScope);
        MetricsUtil.addLatency(metricsScope, "LeaseAndWorkerMetricsLoad", loadStartTime, MetricsLevel.DETAILED);

        publishLeaseAndWorkerCountMetrics(metricsScope, inMemoryStorageView);
        final LeaseAssignmentDecider leaseAssignmentDecider = new VarianceBasedLeaseAssignmentDecider(
                inMemoryStorageView,
                config.dampeningPercentage(),
                config.reBalanceThresholdPercentage(),
                config.allowThroughputOvershoot());

        updateLeasesLastCounterIncrementNanosAndLeaseShutdownTimeout(
                inMemoryStorageView.getLeaseList(), inMemoryStorageView.getLeaseTableScanTime());

        // This does not include the leases from the worker that has expired (based on WorkerMetricStats's
        // lastUpdateTime)
        // but the lease is not expired (based on the leaseCounter on lease).
        // If a worker has died, the lease will be expired and assigned in next iteration.
        final List<Lease> expiredOrUnAssignedLeases = inMemoryStorageView.getLeaseList().stream()
                .filter(lease -> lease.isExpired(
                        TimeUnit.MILLISECONDS.toNanos(leaseDurationMillis),
                        inMemoryStorageView.getLeaseTableScanTime()))
                // marking them for direct reassignment.
                .map(l -> l.isExpiredOrUnassigned(true))
                .collect(Collectors.toList());

        log.info("Total expiredOrUnassignedLeases count : {}", expiredOrUnAssignedLeases.size());
        metricsScope.addData(
                "ExpiredLeases", expiredOrUnAssignedLeases.size(), StandardUnit.COUNT, MetricsLevel.SUMMARY);

        final long expiredAndUnassignedLeaseAssignmentStartTime = System.currentTimeMillis();
        // NOTE: the decider mutates this list, removing leases it managed to place.
        leaseAssignmentDecider.assignExpiredOrUnassignedLeases(expiredOrUnAssignedLeases);
        MetricsUtil.addLatency(
                metricsScope,
                "AssignExpiredOrUnassignedLeases",
                expiredAndUnassignedLeaseAssignmentStartTime,
                MetricsLevel.DETAILED);

        if (!expiredOrUnAssignedLeases.isEmpty()) {
            // When expiredOrUnAssignedLeases is not empty, that means
            // that we were not able to assign all expired or unassigned leases and hit the maxThroughput
            // per worker for all workers.
            log.warn("Not able to assign all expiredOrUnAssignedLeases");
            metricsScope.addData(
                    "LeaseSpillover", expiredOrUnAssignedLeases.size(), StandardUnit.COUNT, MetricsLevel.SUMMARY);
        }

        // Variance balancing is throttled to every varianceBalancingFrequency-th run.
        if (shouldRunVarianceBalancing()) {
            final long balanceWorkerVarianceStartTime = System.currentTimeMillis();
            final int totalNewAssignmentBeforeWorkerVarianceBalancing =
                    inMemoryStorageView.leaseToNewAssignedWorkerMap.size();
            leaseAssignmentDecider.balanceWorkerVariance();
            MetricsUtil.addLatency(
                    metricsScope, "BalanceWorkerVariance", balanceWorkerVarianceStartTime, MetricsLevel.DETAILED);
            metricsScope.addData(
                    "NumOfLeasesReassignment",
                    inMemoryStorageView.leaseToNewAssignedWorkerMap.size()
                            - totalNewAssignmentBeforeWorkerVarianceBalancing,
                    StandardUnit.COUNT,
                    MetricsLevel.SUMMARY);
        }

        if (inMemoryStorageView.leaseToNewAssignedWorkerMap.isEmpty()) {
            log.info("No new lease assignment performed in this iteration");
        }

        // Persist the in-memory decisions to storage, then housekeeping.
        parallelyAssignLeases(inMemoryStorageView, metricsScope);
        printPerWorkerLeases(inMemoryStorageView);
        deleteStaleWorkerMetricsEntries(inMemoryStorageView, metricsScope);
        success = true;
        noOfContinuousFailedAttempts = 0;
    } catch (final Exception e) {
        log.error("LeaseAssignmentManager failed to perform lease assignment.", e);
        noOfContinuousFailedAttempts++;
        if (noOfContinuousFailedAttempts >= DEFAULT_FAILURE_COUNT_TO_SWITCH_LEADER) {
            log.error(
                    "Failed to perform assignment {} times in a row, releasing leadership from worker : {}",
                    DEFAULT_FAILURE_COUNT_TO_SWITCH_LEADER,
                    currentWorkerId);
            MetricsUtil.addCount(metricsScope, FORCE_LEADER_RELEASE_METRIC_NAME, 1, MetricsLevel.SUMMARY);
            leaderDecider.releaseLeadershipIfHeld();
        }
    } finally {
        MetricsUtil.addSuccessAndLatency(metricsScope, success, startTime, MetricsLevel.SUMMARY);
        MetricsUtil.endScope(metricsScope);
    }
}
|
||||
|
||||
private boolean shouldRunVarianceBalancing() {
|
||||
final boolean response = this.lamRunCounter == 0;
|
||||
/*
|
||||
To avoid lamRunCounter grow large, keep it within [0,varianceBalancingFrequency).
|
||||
If varianceBalancingFrequency is 5 lamRunCounter value will be within 0 to 4 and method return true when
|
||||
lamRunCounter is 0.
|
||||
*/
|
||||
this.lamRunCounter = (this.lamRunCounter + 1) % config.varianceBalancingFrequency();
|
||||
return response;
|
||||
}
|
||||
|
||||
/**
 * Deletes the WorkerMetricStats entries which are stale (not updated since long time, ref
 * {@link LeaseAssignmentManager#isWorkerMetricsEntryStale} for the condition to evaluate staleness).
 * Deletions are fanned out on the shared assignment thread pool and awaited before returning;
 * latency is always recorded, even on failure.
 */
private void deleteStaleWorkerMetricsEntries(
        final InMemoryStorageView inMemoryStorageView, final MetricsScope metricsScope) {
    final long startTime = System.currentTimeMillis();
    try {
        final List<WorkerMetricStats> staleWorkerMetricsList = inMemoryStorageView.getWorkerMetricsList().stream()
                .filter(this::isWorkerMetricsEntryStale)
                .collect(Collectors.toList());
        MetricsUtil.addCount(
                metricsScope, "TotalStaleWorkerMetricsEntry", staleWorkerMetricsList.size(), MetricsLevel.DETAILED);
        log.info("Number of stale workerMetrics entries : {}", staleWorkerMetricsList.size());
        log.info("Stale workerMetrics list : {}", staleWorkerMetricsList);

        // One delete call per stale entry, executed in parallel.
        final List<CompletableFuture<Boolean>> completableFutures = staleWorkerMetricsList.stream()
                .map(workerMetrics -> CompletableFuture.supplyAsync(
                        () -> workerMetricsDAO.deleteMetrics(workerMetrics), LEASE_ASSIGNMENT_CALL_THREAD_POOL))
                .collect(Collectors.toList());

        // Block until every delete has completed (or one has failed).
        CompletableFuture.allOf(completableFutures.toArray(new CompletableFuture[0]))
                .join();
    } finally {
        MetricsUtil.addLatency(metricsScope, "StaleWorkerMetricsCleanup", startTime, MetricsLevel.DETAILED);
    }
}
|
||||
|
||||
/**
 * WorkerMetricStats entry is considered stale if the lastUpdateTime of the workerMetrics is
 * older than the configured {@code staleWorkerMetricsEntryCleanupDuration}.
 * (Doc fix: the previous javadoc described a workerMetricsStalenessThreshold *
 * workerMetricsReporterFreqInMillis formula, which is not what this code compares against.)
 */
private boolean isWorkerMetricsEntryStale(final WorkerMetricStats workerMetrics) {
    // lastUpdateTime is interpreted as epoch seconds (Instant.ofEpochSecond).
    return Duration.between(Instant.ofEpochSecond(workerMetrics.getLastUpdateTime()), Instant.now())
            .toMillis()
            > config.staleWorkerMetricsEntryCleanupDuration().toMillis();
}
|
||||
|
||||
private void printPerWorkerLeases(final InMemoryStorageView storageView) {
|
||||
storageView.getActiveWorkerIdSet().forEach(activeWorkerId -> {
|
||||
log.info(
|
||||
"Worker : {} and total leases : {} and totalThroughput : {}",
|
||||
activeWorkerId,
|
||||
Optional.ofNullable(storageView.getWorkerToLeasesMap().get(activeWorkerId))
|
||||
.orElse(Collections.EMPTY_SET)
|
||||
.size(),
|
||||
storageView.getWorkerToTotalAssignedThroughputMap().get(activeWorkerId));
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Writes the new assignments captured in the in-memory view back to storage, one storage call
 * per lease, fanned out on the shared assignment thread pool and awaited before returning.
 * Leases still heartbeating through a pending graceful-shutdown checkpoint are skipped this
 * round. Failure/latency metrics are recorded even when a call throws.
 */
private void parallelyAssignLeases(final InMemoryStorageView inMemoryStorageView, final MetricsScope metricsScope) {
    final AtomicInteger failedAssignmentCounter = new AtomicInteger(0);
    final long startTime = System.currentTimeMillis();
    boolean success = false;
    try {
        CompletableFuture.allOf(inMemoryStorageView.getLeaseToNewAssignedWorkerMap().entrySet().stream()
                // ignore leases that are heartbeating and pending graceful shutdown checkpoint.
                .filter(entry -> !entry.getKey().blockedOnPendingCheckpoint(getNanoTimeMillis()))
                .map(entry -> CompletableFuture.supplyAsync(
                        () -> {
                            try {
                                final Lease lease = entry.getKey();
                                // Graceful handoff (when enabled and the lease qualifies)
                                // asks the current owner to checkpoint before letting go;
                                // otherwise assign directly.
                                if (gracefulLeaseHandoffConfig.isGracefulLeaseHandoffEnabled()
                                        && lease.isEligibleForGracefulShutdown()) {
                                    return handleGracefulLeaseHandoff(
                                            lease, entry.getValue(), failedAssignmentCounter);
                                } else {
                                    return handleRegularLeaseAssignment(
                                            lease, entry.getValue(), failedAssignmentCounter);
                                }
                            } catch (Exception e) {
                                // Propagate checked exceptions through the CompletableFuture.
                                throw new CompletionException(e);
                            }
                        },
                        LEASE_ASSIGNMENT_CALL_THREAD_POOL))
                .toArray(CompletableFuture[]::new))
                .join();
        success = true;
    } finally {
        MetricsUtil.addCount(
                metricsScope, "FailedAssignmentCount", failedAssignmentCounter.get(), MetricsLevel.DETAILED);
        MetricsUtil.addSuccessAndLatency(
                metricsScope, "ParallelyAssignLeases", success, startTime, MetricsLevel.DETAILED);
    }
}
|
||||
|
||||
private boolean handleGracefulLeaseHandoff(Lease lease, String newOwner, AtomicInteger failedAssignmentCounter)
|
||||
throws ProvisionedThroughputException, InvalidStateException, DependencyException {
|
||||
final boolean response = leaseRefresher.initiateGracefulLeaseHandoff(lease, newOwner);
|
||||
if (response) {
|
||||
// new handoff assignment. add the timeout.
|
||||
lease.checkpointOwnerTimeoutTimestampMillis(getCheckpointOwnerTimeoutTimestampMillis());
|
||||
} else {
|
||||
failedAssignmentCounter.incrementAndGet();
|
||||
}
|
||||
return response;
|
||||
}
|
||||
|
||||
private boolean handleRegularLeaseAssignment(Lease lease, String newOwner, AtomicInteger failedAssignmentCounter)
|
||||
throws ProvisionedThroughputException, InvalidStateException, DependencyException {
|
||||
final boolean response = leaseRefresher.assignLease(lease, newOwner);
|
||||
if (response) {
|
||||
// Successful assignment updates the leaseCounter, update the nanoTime for counter update.
|
||||
lease.lastCounterIncrementNanos(nanoTimeProvider.get());
|
||||
} else {
|
||||
failedAssignmentCounter.incrementAndGet();
|
||||
}
|
||||
return response;
|
||||
}
|
||||
|
||||
private void publishLeaseAndWorkerCountMetrics(
|
||||
final MetricsScope metricsScope, final InMemoryStorageView inMemoryStorageView) {
|
||||
// Names of the metrics are kept in sync with what is published in LeaseTaker.
|
||||
metricsScope.addData(
|
||||
"TotalLeases", inMemoryStorageView.leaseList.size(), StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||
metricsScope.addData(
|
||||
"NumWorkers", inMemoryStorageView.activeWorkerMetrics.size(), StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||
}
|
||||
|
||||
// Method updates all new leases with currentTime if the counter is updated since last run else keeps whatever
// was prev and update the prevRunLeasesState
/**
 * Reconciles the freshly scanned leases against the previous run's snapshot: refreshes
 * lastCounterIncrementNanos only when the leaseCounter actually advanced, carries over (or
 * initializes) graceful-shutdown timeouts, then replaces the snapshot with the current list.
 *
 * @param leaseList leases from the current lease-table scan
 * @param scanTime  nano-time at which the scan completed
 */
private void updateLeasesLastCounterIncrementNanosAndLeaseShutdownTimeout(
        final List<Lease> leaseList, final Long scanTime) {
    for (final Lease lease : leaseList) {
        final Lease prevLease = prevRunLeasesState.get(lease.leaseKey());

        // make sure lease shutdown timeouts are tracked.
        if (lease.shutdownRequested()) {
            // previous and current leases might have same next and checkpoint owners but there is no
            // guarantee that the latest shutdown is the same shutdown in the previous lease for example
            // some other leaders change the lease states while this worker waiting for it's LAM run.
            // This is the best effort to prevent marking the incorrect timeout.
            if (isNull(prevLease) || !prevLease.shutdownRequested() || !isSameOwners(lease, prevLease)) {
                // Add new value if previous is null, previous lease is not shutdown pending or the owners
                // don't match
                lease.checkpointOwnerTimeoutTimestampMillis(getCheckpointOwnerTimeoutTimestampMillis());
            } else {
                // Same in-flight shutdown as last run: keep the original deadline.
                lease.checkpointOwnerTimeoutTimestampMillis(prevLease.checkpointOwnerTimeoutTimestampMillis());
            }
        }

        if (isNull(prevLease)) {
            lease.lastCounterIncrementNanos(
                    isNull(lease.actualOwner())
                            // This is an unassigned lease, mark as 0L that puts this in first in assignment order
                            ? 0L
                            : scanTime);
        } else {
            // Counter advanced => owner is alive; otherwise keep the older timestamp so the
            // lease will eventually be considered expired.
            lease.lastCounterIncrementNanos(
                    lease.leaseCounter() > prevLease.leaseCounter()
                            ? scanTime
                            : prevLease.lastCounterIncrementNanos());
        }
    }
    prevRunLeasesState.clear();
    prevRunLeasesState.putAll(leaseList.stream().collect(Collectors.toMap(Lease::leaseKey, Function.identity())));
}
|
||||
|
||||
private void prepareAfterLeaderSwitch() {
|
||||
prevRunLeasesState.clear();
|
||||
noOfContinuousFailedAttempts = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* In memory view of the leases and workerMetrics.
|
||||
* This class supports queries (e.g., leases assigned to worker or total throughout assigned to worker).
|
||||
*/
|
||||
@Getter
|
||||
class InMemoryStorageView {
|
||||
|
||||
// This is in-memory view of the workerToLeaseMapping, this is updated in-memory before actual
// changes to storage.
private final Map<String, Set<Lease>> workerToLeasesMap = new HashMap<>();
/**
 * This is computed initially after the loading leases and then updated when the
 * {@link InMemoryStorageView#performLeaseAssignment} is called.
 */
private final Map<String, Double> workerToTotalAssignedThroughputMap = new HashMap<>();
/**
 * Captures the new assignment done during the lifecycle of single run.
 */
private final Map<Lease, String> leaseToNewAssignedWorkerMap = new HashMap<>();

/**
 * List of all leases in the application.
 */
private List<Lease> leaseList;
/**
 * List of workers which are active (i.e., updated metric stats before the threshold ref)
 * {@link this#computeWorkerExpiryThresholdInSecond})
 */
private List<WorkerMetricStats> activeWorkerMetrics;
/**
 * List of all workerMetrics entries from storage.
 */
private List<WorkerMetricStats> workerMetricsList;
/**
 * List of active workers ids.
 */
private Set<String> activeWorkerIdSet;
/**
 * Wall time in nanoseconds when the lease table scan was completed.
 */
private long leaseTableScanTime = 0L;
/**
 * Average throughput for all workers.
 */
private double targetAverageThroughput;
|
||||
|
||||
/**
|
||||
* Update {@ref inMemoryWorkerToLeasesMapping} with the change in ownership and update newLeaseAssignmentMap
|
||||
*
|
||||
* @param lease lease changing assignment
|
||||
* @param newOwner new owner of the lease
|
||||
*/
|
||||
public void performLeaseAssignment(final Lease lease, final String newOwner) {
|
||||
final String existingOwner = lease.actualOwner();
|
||||
workerToLeasesMap.get(existingOwner).remove(lease);
|
||||
workerToLeasesMap
|
||||
.computeIfAbsent(newOwner, owner -> new HashSet<>())
|
||||
.add(lease);
|
||||
updateWorkerThroughput(newOwner, lease.throughputKBps());
|
||||
// Remove the same lease throughput from oldOwner
|
||||
updateWorkerThroughput(existingOwner, -lease.throughputKBps());
|
||||
leaseToNewAssignedWorkerMap.put(lease, newOwner);
|
||||
}
|
||||
|
||||
/**
 * Scans the LeaseTable and WorkerMetricStats in parallel and load the data and populate datastructures used
 * in lease assignment: the worker-to-leases map, per-worker throughput totals, the
 * active-worker set, and the fleet-wide target average throughput.
 */
public void loadInMemoryStorageView(final MetricsScope metricsScope) throws Exception {
    // Kick off both scans concurrently; join below.
    final CompletableFuture<Map.Entry<List<Lease>, List<String>>> leaseListFuture = loadLeaseListAsync();

    final CompletableFuture<List<WorkerMetricStats>> workerMetricsFuture = loadWorkerMetricStats();

    final List<WorkerMetricStats> workerMetricsFromStorage = workerMetricsFuture.join();

    final List<String> listOfWorkerIdOfInvalidWorkerMetricsEntry = workerMetricsFromStorage.stream()
            .filter(workerMetrics -> !workerMetrics.isValidWorkerMetric())
            .map(WorkerMetricStats::getWorkerId)
            .collect(Collectors.toList());
    log.warn("List of workerIds with invalid entries : {}", listOfWorkerIdOfInvalidWorkerMetricsEntry);
    if (!listOfWorkerIdOfInvalidWorkerMetricsEntry.isEmpty()) {
        metricsScope.addData(
                "NumWorkersWithInvalidEntry",
                listOfWorkerIdOfInvalidWorkerMetricsEntry.size(),
                StandardUnit.COUNT,
                MetricsLevel.SUMMARY);
    }

    // Valid entries are considered further, for validity of entry refer WorkerMetricStats#isValidWorkerMetrics
    this.workerMetricsList = workerMetricsFromStorage.stream()
            .filter(WorkerMetricStats::isValidWorkerMetric)
            .collect(Collectors.toList());

    log.info("Total WorkerMetricStats available : {}", workerMetricsList.size());
    final long workerExpiryThreshold = computeWorkerExpiryThresholdInSecond();

    final long countOfWorkersWithFailingWorkerMetric = workerMetricsList.stream()
            .filter(WorkerMetricStats::isAnyWorkerMetricFailing)
            .count();
    if (countOfWorkersWithFailingWorkerMetric != 0) {
        metricsScope.addData(
                "NumWorkersWithFailingWorkerMetric",
                countOfWorkersWithFailingWorkerMetric,
                StandardUnit.COUNT,
                MetricsLevel.SUMMARY);
    }

    // Lease scan result: key = deserialized leases, value = keys that failed to deserialize.
    final Map.Entry<List<Lease>, List<String>> leaseListResponse = leaseListFuture.join();
    this.leaseList = leaseListResponse.getKey();
    log.warn("Leases that failed deserialization : {}", leaseListResponse.getValue());
    if (!leaseListResponse.getValue().isEmpty()) {
        MetricsUtil.addCount(
                metricsScope,
                "LeaseDeserializationFailureCount",
                leaseListResponse.getValue().size(),
                MetricsLevel.SUMMARY);
    }
    this.leaseTableScanTime = nanoTimeProvider.get();
    log.info("Total Leases available : {}", leaseList.size());

    final double averageLeaseThroughput = leaseList.stream()
            .filter(lease -> nonNull(lease.throughputKBps()))
            .mapToDouble(Lease::throughputKBps)
            .average()
            // If none of the leases has any value, that means its app
            // startup time and thus assigns 0 in that case to start with.
            .orElse(0D);
    /*
     * If a workerMetrics has a metric (i.e. has -1 value in last index which denotes failure),
     * skip it from activeWorkerMetrics and no new action on it will be done
     * (new assignment etc.) until the metric has non -1 value in last index. This is to avoid performing action
     * with the stale data on worker.
     */
    this.activeWorkerMetrics = workerMetricsList.stream()
            .filter(workerMetrics -> workerMetrics.getLastUpdateTime() >= workerExpiryThreshold
                    && !workerMetrics.isAnyWorkerMetricFailing())
            .collect(Collectors.toList());
    log.info("activeWorkerMetrics : {}", activeWorkerMetrics.size());
    // Evenly spread total fleet throughput across active workers (min 1 to avoid /0).
    targetAverageThroughput =
            averageLeaseThroughput * leaseList.size() / Math.max(1, activeWorkerMetrics.size());
    leaseList.forEach(lease -> {
        if (isNull(lease.throughputKBps())) {
            // If the lease is unassigned, it will not have any throughput value, use average throughput
            // as good enough value to start with.
            lease.throughputKBps(averageLeaseThroughput);
        }
        workerToLeasesMap
                .computeIfAbsent(lease.actualOwner(), workerId -> new HashSet<>())
                .add(lease);
        updateWorkerThroughput(lease.actualOwner(), lease.throughputKBps());
    });

    this.activeWorkerIdSet = new HashSet<>();
    // Calculate initial ratio
    this.activeWorkerMetrics.forEach(workerMetrics -> {
        activeWorkerIdSet.add(workerMetrics.getWorkerId());
        workerMetrics.setEmaAlpha(config.workerMetricsEMAAlpha());
        if (workerMetrics.isUsingDefaultWorkerMetric()) {
            setOperatingRangeAndWorkerMetricsDataForDefaultWorker(
                    workerMetrics,
                    getTotalAssignedThroughput(workerMetrics.getWorkerId()) / targetAverageThroughput);
        }
    });
}
|
||||
|
||||
private void updateWorkerThroughput(final String workerId, final double leaseThroughput) {
|
||||
double value = workerToTotalAssignedThroughputMap.computeIfAbsent(workerId, worker -> (double) 0L);
|
||||
workerToTotalAssignedThroughputMap.put(workerId, value + leaseThroughput);
|
||||
}
|
||||
|
||||
private void setOperatingRangeAndWorkerMetricsDataForDefaultWorker(
|
||||
final WorkerMetricStats workerMetrics, final Double ratio) {
|
||||
// for workers with default WorkerMetricStats, the operating range ceiling of 100 represents the
|
||||
// target throughput. This way, with either heterogeneous or homogeneous fleets
|
||||
// of explicit WorkerMetricStats and default WorkerMetricStats applications, load will be evenly
|
||||
// distributed.
|
||||
log.info(
|
||||
"Worker [{}] is using default WorkerMetricStats, setting initial utilization ratio to [{}].",
|
||||
workerMetrics.getWorkerId(),
|
||||
ratio);
|
||||
workerMetrics.setOperatingRange(ImmutableMap.of("T", ImmutableList.of(100L)));
|
||||
workerMetrics.setMetricStats(ImmutableMap.of("T", ImmutableList.of(ratio * 100, ratio * 100)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the value threshold in seconds for a worker to be considered as active.
|
||||
* If a worker has not updated the WorkerMetricStats entry within this threshold, the worker is not considered
|
||||
* as active.
|
||||
*
|
||||
* @return wall time in seconds
|
||||
*/
|
||||
private long computeWorkerExpiryThresholdInSecond() {
|
||||
final long timeInSeconds = Duration.ofMillis(System.currentTimeMillis()
|
||||
- DEFAULT_NO_OF_SKIP_STAT_FOR_DEAD_WORKER_THRESHOLD
|
||||
* config.workerMetricsReporterFreqInMillis())
|
||||
.getSeconds();
|
||||
log.info("WorkerMetricStats expiry time in seconds : {}", timeInSeconds);
|
||||
return timeInSeconds;
|
||||
}
|
||||
|
||||
/**
 * Looks at inMemoryWorkerToLeasesMapping for lease assignment and figures out if there is room considering
 * any new assignment that would have happened.
 * NOTE(review): despite the "LessThan" name, the comparison is inclusive ({@code <=}), so a
 * worker exactly at maxThroughputPerHostKBps is still reported as having room — confirm intended.
 */
public boolean isWorkerTotalThroughputLessThanMaxThroughput(final String workerId) {
    return getTotalAssignedThroughput(workerId) <= config.maxThroughputPerHostKBps();
}
|
||||
|
||||
/**
|
||||
* Looks at inMemoryWorkerToLeasesMapping for lease assignment of a worker and returns true if the worker has
|
||||
* no leases assigned or less than maxNumberOfLeasesPerHost else false.
|
||||
*/
|
||||
public boolean isWorkerAssignedLeasesLessThanMaxLeases(final String workerId) {
|
||||
final Set<Lease> assignedLeases = workerToLeasesMap.get(workerId);
|
||||
if (CollectionUtils.isEmpty(assignedLeases)) {
|
||||
// There are no leases assigned to the worker, that means its less than maxNumberOfLeasesPerHost.
|
||||
return true;
|
||||
} else {
|
||||
return assignedLeases.size() < maxLeasesForWorker;
|
||||
}
|
||||
}
|
||||
|
||||
public Double getTotalAssignedThroughput(final String workerId) {
|
||||
return workerToTotalAssignedThroughputMap.getOrDefault(workerId, 0D);
|
||||
}
|
||||
|
||||
private CompletableFuture<List<WorkerMetricStats>> loadWorkerMetricStats() {
|
||||
return CompletableFuture.supplyAsync(() -> loadWithRetry(workerMetricsDAO::getAllWorkerMetricStats));
|
||||
}
|
||||
|
||||
private CompletableFuture<Map.Entry<List<Lease>, List<String>>> loadLeaseListAsync() {
|
||||
return CompletableFuture.supplyAsync(() -> loadWithRetry(() -> leaseRefresher.listLeasesParallely(
|
||||
LEASE_ASSIGNMENT_CALL_THREAD_POOL, DEFAULT_LEASE_TABLE_SCAN_PARALLELISM_FACTOR)));
|
||||
}
|
||||
|
||||
private <T> T loadWithRetry(final Callable<T> loadFunction) {
|
||||
int retryAttempt = 0;
|
||||
while (true) {
|
||||
try {
|
||||
return loadFunction.call();
|
||||
} catch (final Exception e) {
|
||||
if (retryAttempt < DDB_LOAD_RETRY_ATTEMPT) {
|
||||
log.warn(
|
||||
"Failed to load : {}, retrying",
|
||||
loadFunction.getClass().getName(),
|
||||
e);
|
||||
retryAttempt++;
|
||||
} else {
|
||||
throw new CompletionException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private long getCheckpointOwnerTimeoutTimestampMillis() {
|
||||
// this is a future timestamp in millis that the graceful lease handoff shutdown can be considered
|
||||
// expired. LeaseDurationMillis is used here to account for how long it might take for the
|
||||
// lease owner to receive the shutdown signal before executing shutdown.
|
||||
return getNanoTimeMillis()
|
||||
+ gracefulLeaseHandoffConfig.gracefulLeaseHandoffTimeoutMillis()
|
||||
+ leaseDurationMillis;
|
||||
}
|
||||
|
||||
private long getNanoTimeMillis() {
|
||||
// this is not a wall clock time. But if we stick with using this time provider for calculating the elapsed
|
||||
// time it should be okay to use in checkpoint expiration calculation.
|
||||
return TimeUnit.NANOSECONDS.toMillis(nanoTimeProvider.get());
|
||||
}
|
||||
|
||||
private static boolean isSameOwners(Lease currentLease, Lease previousLease) {
|
||||
return Objects.equals(currentLease.leaseOwner(), previousLease.leaseOwner())
|
||||
&& Objects.equals(currentLease.checkpointOwner(), previousLease.checkpointOwner());
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,348 @@
|
|||
package software.amazon.kinesis.coordinator.assignment;
|
||||
|
||||
import java.util.AbstractMap.SimpleEntry;
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Queue;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.leases.Lease;
|
||||
import software.amazon.kinesis.worker.metricstats.WorkerMetricStats;
|
||||
|
||||
import static java.util.Objects.isNull;
|
||||
import static java.util.Objects.nonNull;
|
||||
|
||||
/**
 * VarianceBasedLeaseAssignmentDecider
 * This implementation of LeaseAssignmentDecider performs lease assignment by considering the
 * WorkerMetricStats values of workers with respect to the fleet-level average of that
 * WorkerMetricStats.
 * Rebalanced leases are assigned to workers which have the maximum capacity, in terms of
 * throughput, to reach the fleet-level average across the WorkerMetricStats value. In case of
 * multiple WorkerMetricStats, the capacity to reach the fleet-level average is determined by the
 * outlier WorkerMetricStats.
 * To minimize the variance, the algorithm picks the fleet-level average of the WorkerMetricStats
 * for workers as a pivot point and uses it to determine workers to take leases from and then
 * assign to other workers.
 * The threshold for considering a worker for re-balance is configurable via
 * {@code reBalanceThreshold}. During reassignments the {@code dampeningPercentageValue} is used
 * to achieve critical dampening.
 */
@Slf4j
@KinesisClientInternalApi
public final class VarianceBasedLeaseAssignmentDecider implements LeaseAssignmentDecider {
    // Shared view of current lease assignments and worker metrics; mutated as leases are assigned.
    private final LeaseAssignmentManager.InMemoryStorageView inMemoryStorageView;
    // Percentage (0-100) of the computed load delta actually taken per re-balance pass.
    private final int dampeningPercentageValue;
    // Percentage band (+/-) around the fleet average within which no re-balance is triggered.
    private final int reBalanceThreshold;
    // When true, take at least the minimum-throughput lease even if it overshoots the target.
    private final boolean allowThroughputOvershoot;
    // WorkerMetricStats name -> fleet-level average of that metric across active workers.
    private final Map<String, Double> workerMetricsToFleetLevelAverageMap = new HashMap<>();
    // Workers ordered so the one with the most spare capacity to reach the average is polled first.
    private final PriorityQueue<WorkerMetricStats> assignableWorkerSortedByAvailableCapacity;
    // Approximate leases-per-worker target: totalLeases / activeWorkers, floor 1.
    private int targetLeasePerWorker;

    public VarianceBasedLeaseAssignmentDecider(
            final LeaseAssignmentManager.InMemoryStorageView inMemoryStorageView,
            final int dampeningPercentageValue,
            final int reBalanceThreshold,
            final boolean allowThroughputOvershoot) {
        this.inMemoryStorageView = inMemoryStorageView;
        this.dampeningPercentageValue = dampeningPercentageValue;
        this.reBalanceThreshold = reBalanceThreshold;
        this.allowThroughputOvershoot = allowThroughputOvershoot;
        // Populates workerMetricsToFleetLevelAverageMap and targetLeasePerWorker; must run before
        // the comparator below is used.
        initialize();
        // Highest percentage-to-reach-average first (reversed natural ordering).
        final Comparator<WorkerMetricStats> comparator = Comparator.comparingDouble(
                workerMetrics -> workerMetrics.computePercentageToReachAverage(workerMetricsToFleetLevelAverageMap));
        this.assignableWorkerSortedByAvailableCapacity = new PriorityQueue<>(comparator.reversed());
        this.assignableWorkerSortedByAvailableCapacity.addAll(
                getAvailableWorkersForAssignment(inMemoryStorageView.getActiveWorkerMetrics()));
    }

    /**
     * Computes the fleet-level average for every WorkerMetricStats name seen across active
     * workers, and the target number of leases per worker.
     */
    private void initialize() {
        final Map<String, Double> workerMetricsNameToAverage = inMemoryStorageView.getActiveWorkerMetrics().stream()
                .flatMap(workerMetrics -> workerMetrics.getMetricStats().keySet().stream()
                        .map(workerMetricsName ->
                                new SimpleEntry<>(workerMetricsName, workerMetrics.getMetricStat(workerMetricsName))))
                .collect(Collectors.groupingBy(
                        SimpleEntry::getKey, HashMap::new, Collectors.averagingDouble(SimpleEntry::getValue)));

        workerMetricsToFleetLevelAverageMap.putAll(workerMetricsNameToAverage);

        // Guard against division by zero when there are no active workers.
        final int totalWorkers =
                Math.max(inMemoryStorageView.getActiveWorkerMetrics().size(), 1);
        this.targetLeasePerWorker = Math.max(inMemoryStorageView.getLeaseList().size() / totalWorkers, 1);
    }

    /**
     * Filters the given workers down to those with throughput and lease-count headroom.
     */
    private List<WorkerMetricStats> getAvailableWorkersForAssignment(final List<WorkerMetricStats> workerMetricsList) {
        // Workers with WorkerMetricStats running hot are also available for assignment as the goal is to balance
        // utilization
        // always (e.g., if all workers have hot WorkerMetricStats, balance the variance between them too)
        return workerMetricsList.stream()
                .filter(workerMetrics -> inMemoryStorageView.isWorkerTotalThroughputLessThanMaxThroughput(
                                workerMetrics.getWorkerId())
                        && inMemoryStorageView.isWorkerAssignedLeasesLessThanMaxLeases(workerMetrics.getWorkerId()))
                .collect(Collectors.toList());
    }

    /**
     * Assigns each expired or unassigned lease to the worker with the most available capacity.
     * Successfully assigned leases are removed from the caller's list in place, so on return
     * {@code expiredOrUnAssignedLeases} contains only the leases that could not be assigned.
     */
    @Override
    public void assignExpiredOrUnassignedLeases(final List<Lease> expiredOrUnAssignedLeases) {
        // Sort the expiredOrUnAssignedLeases using lastCounterIncrementNanos such that leases expired first are
        // picked first.
        // Unassigned leases have lastCounterIncrementNanos as zero and thus assigned first.
        Collections.sort(expiredOrUnAssignedLeases, Comparator.comparing(Lease::lastCounterIncrementNanos));
        final Set<Lease> assignedLeases = new HashSet<>();
        for (final Lease lease : expiredOrUnAssignedLeases) {
            // assignLease re-adds the polled worker to the queue if it still has capacity.
            final WorkerMetricStats workerToAssignLease = assignableWorkerSortedByAvailableCapacity.poll();
            if (nonNull(workerToAssignLease)) {
                assignLease(lease, workerToAssignLease);
                assignedLeases.add(lease);
            } else {
                // Queue exhausted: no worker has headroom, remaining leases stay unassigned.
                log.info("No worker available to assign lease {}", lease.leaseKey());
                break;
            }
        }
        expiredOrUnAssignedLeases.removeAll(assignedLeases);
    }

    /**
     * Determines, for a single WorkerMetricStats name, which workers leases should be taken from.
     * Returns an empty list when no worker is outside the +/- reBalanceThreshold band around the
     * fleet average (and none is above its operating range), i.e. when no re-balance is needed.
     *
     * @param currentWorkerMetrics workers that report this WorkerMetricStats
     * @param workerMetricsName    name of the WorkerMetricStats being evaluated
     * @param workerMetricsValueAvg fleet-level average for this WorkerMetricStats
     */
    private List<WorkerMetricStats> getWorkersToTakeLeasesFromIfRequired(
            final List<WorkerMetricStats> currentWorkerMetrics,
            final String workerMetricsName,
            final double workerMetricsValueAvg) {
        final List<WorkerMetricStats> workerIdsAboveAverage = new ArrayList<>();

        final double upperLimit = workerMetricsValueAvg * (1.0D + (double) reBalanceThreshold / 100);
        final double lowerLimit = workerMetricsValueAvg * (1.0D - (double) reBalanceThreshold / 100);

        WorkerMetricStats mostLoadedWorker = null;

        log.info("Range for re-balance upper threshold {} and lower threshold {}", upperLimit, lowerLimit);

        boolean shouldTriggerReBalance = false;
        for (final WorkerMetricStats workerMetrics : currentWorkerMetrics) {
            final double currentWorkerMetricsValue = workerMetrics.getMetricStat(workerMetricsName);
            final boolean isCurrentWorkerMetricsAboveOperatingRange =
                    workerMetrics.isWorkerMetricAboveOperatingRange(workerMetricsName);
            /*
            If there is any worker, whose WorkerMetricStats value is between +/- reBalanceThreshold % of workerMetricsValueAvg or if
            worker's WorkerMetricStats value is above operating range trigger re-balance
            */
            if (currentWorkerMetricsValue > upperLimit
                    || currentWorkerMetricsValue < lowerLimit
                    || isCurrentWorkerMetricsAboveOperatingRange) {
                shouldTriggerReBalance = true;
            }
            // Perform re-balance on the worker if its above upperLimit or if current WorkerMetricStats is above
            // operating range.
            if (currentWorkerMetricsValue >= upperLimit || isCurrentWorkerMetricsAboveOperatingRange) {
                workerIdsAboveAverage.add(workerMetrics);
            }
            // Track the single most loaded worker as a fallback donor.
            if (mostLoadedWorker == null
                    || mostLoadedWorker.getMetricStat(workerMetricsName) < currentWorkerMetricsValue) {
                mostLoadedWorker = workerMetrics;
            }
        }

        /*
        If workerIdsAboveAverage is empty that means there is no worker with WorkerMetricStats value above upperLimit so pick
        the worker with higher CPU. This can happen when there is worker with WorkerMetricStats value below lowerLimit but
        all other workers are within upperLimit.
        */
        if (workerIdsAboveAverage.isEmpty()) {
            workerIdsAboveAverage.add(mostLoadedWorker);
        }

        return shouldTriggerReBalance ? workerIdsAboveAverage : Collections.emptyList();
    }

    /**
     * Performs the balancing of the throughput assigned to workers based on the WorkerMetricsValues of worker with respect
     * to fleet level average.
     * Each WorkerMetricStats is treated independently to determine workers for re-balance computed (computed based on
     * reBalanceThreshold) are determined.
     * The magnitude of throughput to take is determined by how much worker is away from the average of that WorkerMetricStats
     * across fleet and in case of multiple WorkerMetricStats, the one with maximum magnitude of throughput is considered.
     */
    @Override
    public void balanceWorkerVariance() {
        final List<WorkerMetricStats> activeWorkerMetrics = inMemoryStorageView.getActiveWorkerMetrics();

        log.info("WorkerMetricStats to corresponding fleet level average : {}", workerMetricsToFleetLevelAverageMap);
        log.info("Active WorkerMetricStats : {}", activeWorkerMetrics);

        // workerId -> throughput (KBps) to take away, for the largest-outlier metric only.
        final Map<String, Double> workerIdToThroughputToTakeMap = new HashMap<>();
        String largestOutlierWorkerMetricsName = "";
        double maxThroughputTake = -1.0D;

        for (final Map.Entry<String, Double> workerMetricsToFleetLevelAverageEntry :
                workerMetricsToFleetLevelAverageMap.entrySet()) {
            final String workerMetricsName = workerMetricsToFleetLevelAverageEntry.getKey();

            // Filter workers that does not have current WorkerMetricStats. This is possible if application is adding a
            // new WorkerMetricStats and currently in phase of deployment.
            final List<WorkerMetricStats> currentWorkerMetrics = activeWorkerMetrics.stream()
                    .filter(workerMetrics -> workerMetrics.containsMetricStat(workerMetricsName))
                    .collect(Collectors.toList());

            final double fleetAverageForWorkerMetrics = workerMetricsToFleetLevelAverageEntry.getValue();

            final List<WorkerMetricStats> workerToTakeLeasesFrom = getWorkersToTakeLeasesFromIfRequired(
                    currentWorkerMetrics, workerMetricsName, fleetAverageForWorkerMetrics);

            final Map<String, Double> workerIdToThroughputToTakeForCurrentWorkerMetrics = new HashMap<>();
            double totalThroughputToTakeForCurrentWorkerMetrics = 0D;
            for (final WorkerMetricStats workerToTakeLease : workerToTakeLeasesFrom) {
                final double workerMetricsValueForWorker = workerToTakeLease.getMetricStat(workerMetricsName);
                // Load to take based on the difference compared to the fleet level average
                final double loadPercentageToTake =
                        (workerMetricsValueForWorker - fleetAverageForWorkerMetrics) / workerMetricsValueForWorker;
                // Dampen the load based on dampeningPercentageValue
                final double dampenedLoadPercentageToTake =
                        loadPercentageToTake * ((double) dampeningPercentageValue / 100);
                final double throughputToTake =
                        inMemoryStorageView.getTotalAssignedThroughput(workerToTakeLease.getWorkerId())
                                * dampenedLoadPercentageToTake;
                log.info(
                        "For worker : {} taking throughput : {} after dampening based on WorkerMetricStats : {}",
                        workerToTakeLease.getWorkerId(),
                        throughputToTake,
                        workerMetricsName);
                totalThroughputToTakeForCurrentWorkerMetrics += throughputToTake;
                workerIdToThroughputToTakeForCurrentWorkerMetrics.put(
                        workerToTakeLease.getWorkerId(), throughputToTake);
            }

            /*
            If totalThroughputToTakeForCurrentWorkerMetrics is more than maxThroughputTake that means this WorkerMetricStats is more
            outlier so consider this for reBalancing
            */
            if (maxThroughputTake < totalThroughputToTakeForCurrentWorkerMetrics) {
                largestOutlierWorkerMetricsName = workerMetricsName;
                workerIdToThroughputToTakeMap.clear();
                workerIdToThroughputToTakeMap.putAll(workerIdToThroughputToTakeForCurrentWorkerMetrics);
                maxThroughputTake = totalThroughputToTakeForCurrentWorkerMetrics;
            }
        }

        log.info(
                "Largest outlier WorkerMetricStats is : {} and total of {} throughput will be rebalanced",
                largestOutlierWorkerMetricsName,
                maxThroughputTake);
        log.info("Workers to throughput taken from them is : {}", workerIdToThroughputToTakeMap);

        final List<Map.Entry<String, Double>> sortedWorkerIdToThroughputToTakeEntries =
                new ArrayList<>(workerIdToThroughputToTakeMap.entrySet());
        // sort entries by values.
        Collections.sort(sortedWorkerIdToThroughputToTakeEntries, (e1, e2) -> e2.getValue()
                .compareTo(e1.getValue()));

        // Take from the most overloaded donors first, re-assigning their leases to the workers
        // with the most available capacity.
        for (final Map.Entry<String, Double> workerIdToThroughputToTakeEntry :
                sortedWorkerIdToThroughputToTakeEntries) {
            final String workerId = workerIdToThroughputToTakeEntry.getKey();

            final double throughputToTake = workerIdToThroughputToTakeEntry.getValue();

            final Queue<Lease> leasesToTake = getLeasesToTake(workerId, throughputToTake);

            log.info(
                    "Leases taken from worker : {} are : {}",
                    workerId,
                    leasesToTake.stream().map(Lease::leaseKey).collect(Collectors.toSet()));

            for (final Lease lease : leasesToTake) {
                final WorkerMetricStats workerToAssign = assignableWorkerSortedByAvailableCapacity.poll();
                // NOTE(review): when the polled worker would exceed average utilization/operating
                // range, the loop breaks without re-adding it to the queue, so it is not offered
                // later (possibly smaller) leases in this pass — confirm this is intentional.
                if (nonNull(workerToAssign)
                        && workerToAssign.willAnyMetricStatsGoAboveAverageUtilizationOrOperatingRange(
                                workerMetricsToFleetLevelAverageMap,
                                inMemoryStorageView.getTargetAverageThroughput(),
                                lease.throughputKBps(),
                                targetLeasePerWorker)) {
                    log.info("No worker to assign anymore in this iteration due to hitting average values");
                    break;
                }
                if (nonNull(workerToAssign)) {
                    assignLease(lease, workerToAssign);
                }
            }
        }

        printWorkerToUtilizationLog(inMemoryStorageView.getActiveWorkerMetrics());
    }

    /**
     * Picks the leases to take away from the given worker so that roughly {@code throughputToTake}
     * KBps of load is freed. Returns an empty queue when the worker holds no leases.
     */
    private Queue<Lease> getLeasesToTake(final String workerId, final double throughputToTake) {
        final Set<Lease> existingLeases =
                inMemoryStorageView.getWorkerToLeasesMap().get(workerId);

        if (isNull(existingLeases) || existingLeases.isEmpty()) {
            return new ArrayDeque<>();
        }

        if (inMemoryStorageView.getTotalAssignedThroughput(workerId) == 0D) {
            // This is the case where throughput of this worker is zero and have 1 or more leases assigned.
            // Its not possible to determine leases to take based on throughput so simply take 1 lease and move on.
            return new ArrayDeque<>(new ArrayList<>(existingLeases).subList(0, 1));
        }

        return getLeasesCombiningToThroughput(workerId, throughputToTake);
    }

    /**
     * Records the assignment of {@code lease} to the given worker in the in-memory view, after
     * extrapolating the worker's metric stats for the added throughput. Workers that still have
     * headroom after the assignment are returned to the assignable queue.
     */
    private void assignLease(final Lease lease, final WorkerMetricStats workerMetrics) {
        if (nonNull(lease.actualOwner()) && lease.actualOwner().equals(workerMetrics.getWorkerId())) {
            // if a new owner and current owner are same then no assignment to do
            // put back the worker as well as no assignment is done
            assignableWorkerSortedByAvailableCapacity.add(workerMetrics);
            return;
        }
        workerMetrics.extrapolateMetricStatValuesForAddedThroughput(
                workerMetricsToFleetLevelAverageMap,
                inMemoryStorageView.getTargetAverageThroughput(),
                lease.throughputKBps(),
                targetLeasePerWorker);
        log.info("Assigning lease : {} to worker : {}", lease.leaseKey(), workerMetrics.getWorkerId());
        inMemoryStorageView.performLeaseAssignment(lease, workerMetrics.getWorkerId());
        // Re-queue the worker only while it still has both throughput and lease-count headroom.
        if (inMemoryStorageView.isWorkerTotalThroughputLessThanMaxThroughput(workerMetrics.getWorkerId())
                && inMemoryStorageView.isWorkerAssignedLeasesLessThanMaxLeases(workerMetrics.getWorkerId())) {
            assignableWorkerSortedByAvailableCapacity.add(workerMetrics);
        }
    }

    /** Logs the per-worker averaged WorkerMetricStats data, for post-rebalance visibility. */
    private void printWorkerToUtilizationLog(final List<WorkerMetricStats> activeWorkerMetrics) {
        activeWorkerMetrics.forEach(workerMetrics -> log.info(
                "WorkerId : {} and average WorkerMetricStats data : {}",
                workerMetrics.getWorkerId(),
                workerMetrics.getMetricStatsMap()));
    }

    /**
     * Greedily selects a random subset of the worker's leases whose combined throughput stays
     * under {@code throughputToGet}. When nothing fits and {@code allowThroughputOvershoot} is
     * set, the single smallest-throughput lease is taken instead.
     */
    private Queue<Lease> getLeasesCombiningToThroughput(final String workerId, final double throughputToGet) {
        final List<Lease> assignedLeases =
                new ArrayList<>(inMemoryStorageView.getWorkerToLeasesMap().get(workerId));
        if (assignedLeases.isEmpty()) {
            // This is possible if the worker is having high utilization but does not have any leases assigned to it
            return new ArrayDeque<>();
        }
        // Shuffle leases to randomize what leases gets picked.
        Collections.shuffle(assignedLeases);
        final Queue<Lease> response = new ArrayDeque<>();
        double remainingThroughputToGet = throughputToGet;
        for (final Lease lease : assignedLeases) {
            // if adding this lease makes throughout to take go below zero avoid taking this lease.
            if (remainingThroughputToGet - lease.throughputKBps() <= 0) {
                continue;
            }
            remainingThroughputToGet -= lease.throughputKBps();
            response.add(lease);
        }

        // If allowThroughputOvershoot is set to true, take a minimum throughput lease
        if (allowThroughputOvershoot && response.isEmpty()) {
            assignedLeases.stream()
                    .min(Comparator.comparingDouble(Lease::throughputKBps))
                    .ifPresent(response::add);
        }
        return response;
    }
}
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.coordinator.migration;
|
||||
|
||||
/**
 * ClientVersion support during upgrade from KCLv2.x to KCLv3.x.
 *
 * This enum is persisted in storage, so any changes to it need to be backward compatible.
 * Reorganizing the values is not backward compatible; also, if versions are removed, the
 * corresponding enum value cannot be reused without backward-compatibility considerations.
 */
public enum ClientVersion {
    /**
     * A transient start-state version used only during initialization of the Migration State
     * Machine; never persisted as a steady state.
     */
    CLIENT_VERSION_INIT,
    /**
     * Used during the upgrade of an application from KCLv2.x to KCLv3.x. In this version,
     * KCL workers emit WorkerMetricStats while still running KCLv2.x algorithms for leader
     * election and lease assignment. KCL also monitors whether the worker fleet is ready to
     * switch to KCLv3.x functionality.
     */
    CLIENT_VERSION_UPGRADE_FROM_2x,
    /**
     * Used during rollback from CLIENT_VERSION_UPGRADE_FROM_2x or CLIENT_VERSION_3x_WITH_ROLLBACK,
     * which can only be initiated using a KCL migration tool, when the customer wants to revert to
     * KCLv2.x functionality. In this version, KCL does not emit WorkerMetricStats and runs KCLv2.x
     * algorithms for leader election and lease assignment. KCL also monitors for the roll-forward
     * scenario where the client version is updated back to CLIENT_VERSION_UPGRADE_FROM_2x using
     * the migration tool.
     */
    CLIENT_VERSION_2x,
    /**
     * When workers are operating in CLIENT_VERSION_UPGRADE_FROM_2x and the worker fleet is
     * determined to be KCLv3.x ready (the lease table GSI is active and worker metrics are being
     * emitted by all lease owners), the leader initiates the switch to KCLv3.x algorithms for
     * leader election and lease assignment by persisting this version in the
     * {@link MigrationState}, which lets all worker hosts flip to KCLv3.x functionality. In this
     * version, KCL also monitors for rollback — i.e. the customer updating the version to
     * CLIENT_VERSION_2x using the migration tool — so that it can instantly flip back to
     * CLIENT_VERSION_2x.
     */
    CLIENT_VERSION_3x_WITH_ROLLBACK,
    /**
     * A new application starting on KCLv3.x, or an upgraded application from KCLv2.x after the
     * upgrade has succeeded, uses this version to run all KCLv3.x algorithms by default with no
     * rollback monitoring.
     */
    CLIENT_VERSION_3x;
}
|
||||
|
|
@ -0,0 +1,161 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.coordinator.migration;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.ScheduledFuture;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||
import software.amazon.kinesis.coordinator.CoordinatorStateDAO;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||
import software.amazon.kinesis.metrics.MetricsScope;
|
||||
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||
|
||||
import static software.amazon.kinesis.coordinator.migration.MigrationState.MIGRATION_HASH_KEY;
|
||||
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.METRICS_OPERATION;
|
||||
|
||||
/**
 * Change monitor for MigrationState.clientVersion that notifies a callback when the persisted
 * value changes away from a given expected value. This monitor is used to detect rollback,
 * roll-forward, and upgrade-to-3.x scenarios; see {@link ClientVersion} for details.
 *
 * Since all KCL workers run this monitor, the poll interval carries a random jitter to stagger
 * the reads against DynamoDB.
 *
 * The class is thread-safe; the callback is invoked on a scheduler thread while holding the
 * monitor's lock (see {@link #cancel()} for the resulting deadlock caveat).
 */
@Slf4j
@RequiredArgsConstructor
@ThreadSafe
public class ClientVersionChangeMonitor implements Runnable {

    /**
     * Interface of a callback to invoke when monitor condition is true.
     */
    public interface ClientVersionChangeCallback {
        void accept(final MigrationState currentMigrationState) throws InvalidStateException, DependencyException;
    }

    // Base poll interval; actual interval adds up to JITTER_FACTOR of this as random jitter.
    private static final long MONITOR_INTERVAL_MILLIS = Duration.ofMinutes(1).toMillis();
    private static final double JITTER_FACTOR = 0.1;

    private final MetricsFactory metricsFactory;
    private final CoordinatorStateDAO coordinatorStateDAO;
    private final ScheduledExecutorService stateMachineThreadPool;
    // Invoked (under this monitor's lock) once the persisted client version differs from expectedVersion.
    private final ClientVersionChangeCallback callback;
    // The version we expect to see; any other persisted value triggers the callback.
    private final ClientVersion expectedVersion;
    private final Random random;
    // Effective poll interval (base + jitter); computed in startMonitor, read by run() for logging.
    private long monitorIntervalMillis;

    // Non-null while the monitor is scheduled; doubles as the "running" flag checked in run().
    private ScheduledFuture<?> scheduledFuture;

    /**
     * Schedules the monitor on the state-machine thread pool with a jittered fixed delay.
     * Idempotent: a second call while already scheduled is a no-op.
     */
    public synchronized void startMonitor() {
        if (scheduledFuture == null) {
            final long jitter = (long) (random.nextDouble() * MONITOR_INTERVAL_MILLIS * JITTER_FACTOR);
            monitorIntervalMillis = MONITOR_INTERVAL_MILLIS + jitter;
            log.info(
                    "Monitoring for MigrationState client version change from {} every {}ms",
                    expectedVersion,
                    monitorIntervalMillis);
            scheduledFuture = stateMachineThreadPool.scheduleWithFixedDelay(
                    this, monitorIntervalMillis, monitorIntervalMillis, TimeUnit.MILLISECONDS);
        }
    }

    @Override
    public String toString() {
        return new StringBuilder(getClass().getSimpleName())
                .append("[")
                .append(expectedVersion)
                .append("]")
                .toString();
    }

    /**
     * Cancel the monitor explicity before the condition is met, e.g. when the worker is going down.
     * Note on synchronization: callback of this monitor is invoked while holding the lock on this monitor object.
     * If cancel is called from within the same lock context that callback uses, then it can lead to
     * deadlock. Ensure synchronization context between callback the caller of cancel is not shared.
     */
    public synchronized void cancel() {
        // NOTE(review): scheduledFuture is not reset to null here, so a subsequent startMonitor()
        // will not reschedule — presumably monitors are single-use; confirm with callers.
        if (scheduledFuture != null) {
            log.info("Cancelling {}", this);
            scheduledFuture.cancel(false);
        } else {
            log.info("Monitor {} is not running", this);
        }
    }

    /**
     * Single poll: reads the MigrationState from storage and, if its client version no longer
     * matches {@code expectedVersion}, invokes the callback and cancels further polling.
     * Any exception (including one thrown by the callback) is logged and the poll is retried
     * on the next scheduled run.
     */
    @Override
    public synchronized void run() {
        try {
            if (scheduledFuture == null) {
                log.debug("Monitor has been cancelled, not running...");
                return;
            }

            final MigrationState migrationState =
                    (MigrationState) coordinatorStateDAO.getCoordinatorState(MIGRATION_HASH_KEY);
            if (migrationState != null) {
                if (migrationState.getClientVersion() != expectedVersion) {
                    log.info("MigrationState client version has changed {}, invoking monitor callback", migrationState);
                    callback.accept(migrationState);
                    log.info("Callback successful, monitoring cancelling itself.");
                    // stop further monitoring
                    scheduledFuture.cancel(false);
                    scheduledFuture = null;
                } else {
                    emitMetrics();
                    log.debug("No change detected {}", this);
                }
            }
        } catch (final Exception e) {
            log.warn(
                    "Exception occurred when monitoring for client version change from {}, will retry in {}",
                    expectedVersion,
                    monitorIntervalMillis,
                    e);
        }
    }

    /**
     * Emits a per-poll CloudWatch datum describing the worker's current migration posture.
     * Only versions that can legitimately be monitored are expected here; any other
     * expectedVersion indicates a programming error and throws (caught and logged by run()).
     */
    private void emitMetrics() {
        final MetricsScope scope = MetricsUtil.createMetricsWithOperation(metricsFactory, METRICS_OPERATION);
        try {
            switch (expectedVersion) {
                case CLIENT_VERSION_3x_WITH_ROLLBACK:
                    scope.addData("CurrentState:3xWorker", 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
                    break;
                case CLIENT_VERSION_2x:
                case CLIENT_VERSION_UPGRADE_FROM_2x:
                    scope.addData("CurrentState:2xCompatibleWorker", 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
                    break;
                default:
                    throw new IllegalStateException(String.format("Unexpected version %s", expectedVersion.name()));
            }
        } finally {
            MetricsUtil.endScope(scope);
        }
    }
}
|
||||
|
|
@ -0,0 +1,159 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.coordinator.migration;
|
||||
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.coordinator.CoordinatorStateDAO;
|
||||
import software.amazon.kinesis.coordinator.DynamicMigrationComponentsInitializer;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||
import software.amazon.kinesis.metrics.MetricsScope;
|
||||
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||
|
||||
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_2x;
|
||||
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_UPGRADE_FROM_2x;
|
||||
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.FAULT_METRIC;
|
||||
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.METRICS_OPERATION;
|
||||
|
||||
/**
|
||||
* State for CLIENT_VERSION_2x. In this state, the only allowed valid transition is
|
||||
* the roll-forward scenario which can only be performed using the KCL Migration tool.
|
||||
* So when the state machine enters this state, a monitor is started to detect the
|
||||
* roll-forward scenario.
|
||||
*/
|
||||
@KinesisClientInternalApi
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
@ThreadSafe
|
||||
public class MigrationClientVersion2xState implements MigrationClientVersionState {
|
||||
private final MigrationStateMachine stateMachine;
|
||||
private final CoordinatorStateDAO coordinatorStateDAO;
|
||||
private final ScheduledExecutorService stateMachineThreadPool;
|
||||
private final DynamicMigrationComponentsInitializer initializer;
|
||||
private final Random random;
|
||||
|
||||
private ClientVersionChangeMonitor rollForwardMonitor;
|
||||
private boolean entered = false;
|
||||
private boolean left = false;
|
||||
|
||||
@Override
|
||||
public ClientVersion clientVersion() {
|
||||
return CLIENT_VERSION_2x;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void enter(final ClientVersion fromClientVersion) {
|
||||
if (!entered) {
|
||||
log.info("Entering {} from {}", this, fromClientVersion);
|
||||
initializer.initializeClientVersionFor2x(fromClientVersion);
|
||||
|
||||
log.info("Starting roll-forward monitor");
|
||||
rollForwardMonitor = new ClientVersionChangeMonitor(
|
||||
initializer.metricsFactory(),
|
||||
coordinatorStateDAO,
|
||||
stateMachineThreadPool,
|
||||
this::onClientVersionChange,
|
||||
clientVersion(),
|
||||
random);
|
||||
rollForwardMonitor.startMonitor();
|
||||
entered = true;
|
||||
} else {
|
||||
log.info("Not entering {}", left ? "already exited state" : "already entered state");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void leave() {
|
||||
if (entered && !left) {
|
||||
log.info("Leaving {}", this);
|
||||
cancelRollForwardMonitor();
|
||||
left = false;
|
||||
} else {
|
||||
log.info("Cannot leave {}", entered ? "already exited state" : "because state is not active");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName();
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback handler to handle client version changes in MigrationState in DDB.
|
||||
* @param newState current MigrationState read from DDB where client version is not CLIENT_VERSION_2x
|
||||
* @throws InvalidStateException during transition to the next state based on the new ClientVersion
|
||||
* or if the new state in DDB is unexpected.
|
||||
*/
|
||||
private synchronized void onClientVersionChange(@NonNull final MigrationState newState)
|
||||
throws InvalidStateException, DependencyException {
|
||||
if (!entered || left) {
|
||||
log.warn("Received client version change notification on inactive state {}", this);
|
||||
return;
|
||||
}
|
||||
final MetricsScope scope =
|
||||
MetricsUtil.createMetricsWithOperation(initializer.metricsFactory(), METRICS_OPERATION);
|
||||
try {
|
||||
if (newState.getClientVersion() == CLIENT_VERSION_UPGRADE_FROM_2x) {
|
||||
log.info(
|
||||
"A roll-forward has been initiated for the application. Transition to {}",
|
||||
CLIENT_VERSION_UPGRADE_FROM_2x);
|
||||
// If this succeeds, the monitor will cancel itself.
|
||||
stateMachine.transitionTo(CLIENT_VERSION_UPGRADE_FROM_2x, newState);
|
||||
} else {
|
||||
// This should not happen, so throw an exception that allows the monitor to continue monitoring
|
||||
// changes, this allows KCL to operate in the current state and keep monitoring until a valid
|
||||
// state transition is possible.
|
||||
// However, there could be a split brain here, new workers will use DDB value as source of truth,
|
||||
// so we could also write back CLIENT_VERSION_2x to DDB to ensure all workers have consistent
|
||||
// behavior.
|
||||
// Ideally we don't expect modifications to DDB table out of the KCL migration tool scope,
|
||||
// so keeping it simple and not writing back to DDB, the error log below would help capture
|
||||
// any strange behavior if this happens.
|
||||
log.error(
|
||||
"Migration state has invalid client version {}. Transition from {} is not supported",
|
||||
newState,
|
||||
CLIENT_VERSION_2x);
|
||||
throw new InvalidStateException(String.format("Unexpected new state %s", newState));
|
||||
}
|
||||
} catch (final InvalidStateException | DependencyException e) {
|
||||
scope.addData(FAULT_METRIC, 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||
throw e;
|
||||
} finally {
|
||||
MetricsUtil.endScope(scope);
|
||||
}
|
||||
}
|
||||
|
||||
private void cancelRollForwardMonitor() {
|
||||
if (rollForwardMonitor != null) {
|
||||
final ClientVersionChangeMonitor localRollForwardMonitor = rollForwardMonitor;
|
||||
CompletableFuture.supplyAsync(() -> {
|
||||
log.info("Cancelling roll-forward monitor");
|
||||
localRollForwardMonitor.cancel();
|
||||
return null;
|
||||
});
|
||||
rollForwardMonitor = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.coordinator.migration;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.coordinator.DynamicMigrationComponentsInitializer;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
|
||||
/**
|
||||
* State for CLIENT_VERSION_3x which enables KCL to run 3.x algorithms on new KCLv3.x application
|
||||
* or successfully upgraded application which upgraded from v2.x. This is a terminal state of the
|
||||
* state machine and no rollbacks are supported in this state.
|
||||
*/
|
||||
@KinesisClientInternalApi
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
@ThreadSafe
|
||||
public class MigrationClientVersion3xState implements MigrationClientVersionState {
|
||||
private final MigrationStateMachine stateMachine;
|
||||
private final DynamicMigrationComponentsInitializer initializer;
|
||||
private boolean entered = false;
|
||||
private boolean left = false;
|
||||
|
||||
@Override
|
||||
public ClientVersion clientVersion() {
|
||||
return ClientVersion.CLIENT_VERSION_3x;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void enter(final ClientVersion fromClientVersion) throws DependencyException {
|
||||
if (!entered) {
|
||||
log.info("Entering {} from {}", this, fromClientVersion);
|
||||
initializer.initializeClientVersionFor3x(fromClientVersion);
|
||||
entered = true;
|
||||
} else {
|
||||
log.info("Not entering {}", left ? "already exited state" : "already entered state");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void leave() {
|
||||
if (entered && !left) {
|
||||
log.info("Leaving {}", this);
|
||||
entered = false;
|
||||
left = true;
|
||||
} else {
|
||||
log.info("Cannot leave {}", entered ? "already exited state" : "because state is not active");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,156 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.coordinator.migration;
|
||||
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.coordinator.CoordinatorStateDAO;
|
||||
import software.amazon.kinesis.coordinator.DynamicMigrationComponentsInitializer;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||
import software.amazon.kinesis.metrics.MetricsScope;
|
||||
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||
|
||||
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_2x;
|
||||
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_3x;
|
||||
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.FAULT_METRIC;
|
||||
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.METRICS_OPERATION;
|
||||
|
||||
/**
|
||||
* State for CLIENT_VERSION_3x_WITH_ROLLBACK which enables KCL to run its 3.x compliant algorithms
|
||||
* during the upgrade process after all KCL workers in the fleet are 3.x complaint. Since this
|
||||
* is an instant switch from CLIENT_VERSION_UPGRADE_FROM_2x, it also supports rollback if customers
|
||||
* see regression to allow for instant rollbacks as well. This would be achieved by customers
|
||||
* running a KCL migration tool to update MigrationState in DDB. So this state monitors for
|
||||
* rollback triggers and performs state transitions accordingly.
|
||||
*/
|
||||
@Slf4j
|
||||
@KinesisClientInternalApi
|
||||
@RequiredArgsConstructor
|
||||
@ThreadSafe
|
||||
public class MigrationClientVersion3xWithRollbackState implements MigrationClientVersionState {
|
||||
|
||||
private final MigrationStateMachine stateMachine;
|
||||
private final CoordinatorStateDAO coordinatorStateDAO;
|
||||
private final ScheduledExecutorService stateMachineThreadPool;
|
||||
private final DynamicMigrationComponentsInitializer initializer;
|
||||
private final Random random;
|
||||
|
||||
private ClientVersionChangeMonitor rollbackMonitor;
|
||||
private boolean entered;
|
||||
private boolean left;
|
||||
|
||||
@Override
|
||||
public ClientVersion clientVersion() {
|
||||
return ClientVersion.CLIENT_VERSION_3x_WITH_ROLLBACK;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void enter(final ClientVersion fromClientVersion) throws DependencyException {
|
||||
if (!entered) {
|
||||
log.info("Entering {} from {}", this, fromClientVersion);
|
||||
initializer.initializeClientVersionFor3xWithRollback(fromClientVersion);
|
||||
// we need to run the rollback monitor
|
||||
log.info("Starting rollback monitor");
|
||||
rollbackMonitor = new ClientVersionChangeMonitor(
|
||||
initializer.metricsFactory(),
|
||||
coordinatorStateDAO,
|
||||
stateMachineThreadPool,
|
||||
this::onClientVersionChange,
|
||||
clientVersion(),
|
||||
random);
|
||||
rollbackMonitor.startMonitor();
|
||||
entered = true;
|
||||
} else {
|
||||
log.info("Not entering {}", left ? "already exited state" : "already entered state");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void leave() {
|
||||
if (entered && !left) {
|
||||
log.info("Leaving {}", this);
|
||||
cancelRollbackMonitor();
|
||||
entered = false;
|
||||
left = true;
|
||||
} else {
|
||||
log.info("Cannot leave {}", entered ? "already exited state" : "because state is not active");
|
||||
}
|
||||
}
|
||||
|
||||
private synchronized void onClientVersionChange(final MigrationState newState)
|
||||
throws InvalidStateException, DependencyException {
|
||||
if (!entered || left) {
|
||||
log.warn("Received client version change notification on inactive state {}", this);
|
||||
return;
|
||||
}
|
||||
final MetricsScope scope =
|
||||
MetricsUtil.createMetricsWithOperation(initializer.metricsFactory(), METRICS_OPERATION);
|
||||
try {
|
||||
switch (newState.getClientVersion()) {
|
||||
case CLIENT_VERSION_2x:
|
||||
log.info("A rollback has been initiated for the application. Transition to {}", CLIENT_VERSION_2x);
|
||||
stateMachine.transitionTo(ClientVersion.CLIENT_VERSION_2x, newState);
|
||||
break;
|
||||
case CLIENT_VERSION_3x:
|
||||
log.info("Customer has switched to 3.x after successful upgrade, state machine will move to a"
|
||||
+ "terminal state and stop monitoring. Rollbacks will no longer be supported anymore");
|
||||
stateMachine.transitionTo(CLIENT_VERSION_3x, newState);
|
||||
// This worker will still be running the migrationAdaptive components in 3.x mode which will
|
||||
// no longer dynamically switch back to 2.x mode, however to directly run 3.x component without
|
||||
// adaption to migration (i.e. move to CLIENT_VERSION_3x state), it requires this worker to go
|
||||
// through the current deployment which initiated the switch to 3.x mode.
|
||||
break;
|
||||
default:
|
||||
// This should not happen, so throw an exception that allows the monitor to continue monitoring
|
||||
// changes, this allows KCL to operate in the current state and keep monitoring until a valid
|
||||
// state transition is possible.
|
||||
// However, there could be a split brain here, new workers will use DDB value as source of truth,
|
||||
// so we could also write back CLIENT_VERSION_3x_WITH_ROLLBACK to DDB to ensure all workers have
|
||||
// consistent behavior.
|
||||
// Ideally we don't expect modifications to DDB table out of the KCL migration tool scope,
|
||||
// so keeping it simple and not writing back to DDB, the error log below would help capture
|
||||
// any strange behavior if this happens.
|
||||
log.error("Migration state has invalid client version {}", newState);
|
||||
throw new InvalidStateException(String.format("Unexpected new state %s", newState));
|
||||
}
|
||||
} catch (final InvalidStateException | DependencyException e) {
|
||||
scope.addData(FAULT_METRIC, 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||
throw e;
|
||||
} finally {
|
||||
MetricsUtil.endScope(scope);
|
||||
}
|
||||
}
|
||||
|
||||
private void cancelRollbackMonitor() {
|
||||
if (rollbackMonitor != null) {
|
||||
final ClientVersionChangeMonitor localRollbackMonitor = rollbackMonitor;
|
||||
CompletableFuture.supplyAsync(() -> {
|
||||
log.info("Cancelling rollback monitor");
|
||||
localRollbackMonitor.cancel();
|
||||
return null;
|
||||
});
|
||||
rollbackMonitor = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,47 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.coordinator.migration;
|
||||
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
|
||||
/**
|
||||
* Interface of a state implementation for the MigrationStateMachine
|
||||
*/
|
||||
public interface MigrationClientVersionState {

    /**
     * The associated clientVersion this state corresponds to.
     * @return ClientVersion that this state implements the logic for.
     */
    ClientVersion clientVersion();

    /**
     * Enter the state and perform the business logic of being in this state,
     * which includes performing any monitoring that allows the next state
     * transition and also initializing the KCL based on the ClientVersion.
     * Implementations are expected to be idempotent with respect to re-entry.
     * @param fromClientVersion from previous state if any specific action must
     *                          be taken based on the state from which this state
     *                          is being entered from.
     * @throws DependencyException if DDB fails in unexpected ways for those states
     *                             that create the GSI
     */
    void enter(ClientVersion fromClientVersion) throws DependencyException;

    /**
     * Invoked after the transition to another state has occurred
     * to allow printing any helpful logs or performing cleanup
     * (e.g. stopping any monitors started in {@link #enter(ClientVersion)}).
     */
    void leave();
}
|
||||
|
|
@ -0,0 +1,263 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.coordinator.migration;
|
||||
|
||||
import java.util.AbstractMap.SimpleEntry;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.Callable;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ExpectedAttributeValue;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.coordinator.CoordinatorConfig.ClientVersionConfig;
|
||||
import software.amazon.kinesis.coordinator.CoordinatorState;
|
||||
import software.amazon.kinesis.coordinator.CoordinatorStateDAO;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||
|
||||
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_2x;
|
||||
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_3x;
|
||||
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_3x_WITH_ROLLBACK;
|
||||
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_UPGRADE_FROM_2x;
|
||||
import static software.amazon.kinesis.coordinator.migration.MigrationState.MIGRATION_HASH_KEY;
|
||||
|
||||
/**
|
||||
* Initializer to determine start state of the state machine which identifies the
|
||||
* state to initialize KCL when it is starting up. The initial state is determined based on the
|
||||
* customer configured {@link ClientVersionConfig} and the current {@link MigrationState} in DDB,
|
||||
* as follows
|
||||
* ClientVersionConfig | MigrationState (DDB) | initial client version
|
||||
* --------------------+---------------------------------+--------------------------------
|
||||
* COMPATIBLE_WITH_2x | Does not exist | CLIENT_VERSION_UPGRADE_FROM_2x
|
||||
* 3x | Does not exist | CLIENT_VERSION_3x
|
||||
* COMPATIBLE_WITH_2x | CLIENT_VERSION_3x_WITH_ROLLBACK | CLIENT_VERSION_3x_WITH_ROLLBACK
|
||||
* 3x | CLIENT_VERSION_3x_WITH_ROLLBACK | CLIENT_VERSION_3x
|
||||
* any | CLIENT_VERSION_2x | CLIENT_VERSION_2x
|
||||
* any | CLIENT_VERSION_UPGRADE_FROM_2x | CLIENT_VERSION_UPGRADE_FROM_2x
|
||||
* any | CLIENT_VERSION_3x | CLIENT_VERSION_3x
|
||||
*/
|
||||
@KinesisClientInternalApi
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
@ThreadSafe
|
||||
public class MigrationClientVersionStateInitializer {
|
||||
private static final int MAX_INITIALIZATION_RETRY = 10;
|
||||
private static final long INITIALIZATION_RETRY_DELAY_MILLIS = 1000L;
|
||||
/**
|
||||
* A jitter factor of 10% to stagger the retries.
|
||||
*/
|
||||
private static final double JITTER_FACTOR = 0.1;
|
||||
|
||||
private final Callable<Long> timeProvider;
|
||||
private final CoordinatorStateDAO coordinatorStateDAO;
|
||||
private final ClientVersionConfig clientVersionConfig;
|
||||
private final Random random;
|
||||
private final String workerIdentifier;
|
||||
|
||||
public SimpleEntry<ClientVersion, MigrationState> getInitialState() throws DependencyException {
|
||||
log.info("Initializing migration state machine starting state, configured version {}", clientVersionConfig);
|
||||
|
||||
try {
|
||||
MigrationState migrationState = getMigrationStateFromDynamo();
|
||||
int retryCount = 0;
|
||||
while (retryCount++ < MAX_INITIALIZATION_RETRY) {
|
||||
final ClientVersion initialClientVersion = getClientVersionForInitialization(migrationState);
|
||||
if (migrationState.getClientVersion() != initialClientVersion) {
|
||||
// If update fails, the value represents current state in dynamo
|
||||
migrationState = updateMigrationStateInDynamo(migrationState, initialClientVersion);
|
||||
if (migrationState.getClientVersion() == initialClientVersion) {
|
||||
// update succeeded. Transition to the state
|
||||
return new SimpleEntry<>(initialClientVersion, migrationState);
|
||||
}
|
||||
final long delay = getInitializationRetryDelay();
|
||||
log.warn(
|
||||
"Failed to update migration state with {}, retry after delay {}",
|
||||
initialClientVersion,
|
||||
delay);
|
||||
safeSleep(delay);
|
||||
} else {
|
||||
return new SimpleEntry<>(initialClientVersion, migrationState);
|
||||
}
|
||||
}
|
||||
} catch (final InvalidStateException e) {
|
||||
log.error("Unable to initialize state machine", e);
|
||||
}
|
||||
throw new DependencyException(
|
||||
new RuntimeException("Unable to determine initial state for migration state machine"));
|
||||
}
|
||||
|
||||
public ClientVersion getClientVersionForInitialization(final MigrationState migrationState) {
|
||||
final ClientVersion nextClientVersion;
|
||||
switch (migrationState.getClientVersion()) {
|
||||
case CLIENT_VERSION_INIT:
|
||||
// There is no state in DDB, set state to config version and transition to configured version.
|
||||
nextClientVersion = getNextClientVersionBasedOnConfigVersion();
|
||||
log.info("Application is starting in {}", nextClientVersion);
|
||||
break;
|
||||
case CLIENT_VERSION_3x_WITH_ROLLBACK:
|
||||
if (clientVersionConfig == ClientVersionConfig.CLIENT_VERSION_CONFIG_3x) {
|
||||
// upgrade successful, allow transition to 3x.
|
||||
log.info("Application has successfully upgraded, transitioning to {}", CLIENT_VERSION_3x);
|
||||
nextClientVersion = CLIENT_VERSION_3x;
|
||||
break;
|
||||
}
|
||||
log.info("Initialize with {}", CLIENT_VERSION_3x_WITH_ROLLBACK);
|
||||
nextClientVersion = migrationState.getClientVersion();
|
||||
break;
|
||||
case CLIENT_VERSION_2x:
|
||||
log.info("Application has rolled-back, initialize with {}", CLIENT_VERSION_2x);
|
||||
nextClientVersion = migrationState.getClientVersion();
|
||||
break;
|
||||
case CLIENT_VERSION_UPGRADE_FROM_2x:
|
||||
log.info("Application is upgrading, initialize with {}", CLIENT_VERSION_UPGRADE_FROM_2x);
|
||||
nextClientVersion = migrationState.getClientVersion();
|
||||
break;
|
||||
case CLIENT_VERSION_3x:
|
||||
log.info("Initialize with {}", CLIENT_VERSION_3x);
|
||||
nextClientVersion = migrationState.getClientVersion();
|
||||
break;
|
||||
default:
|
||||
throw new IllegalStateException(String.format("Unknown version in DDB %s", migrationState));
|
||||
}
|
||||
return nextClientVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the migration state's client version in dynamo conditional on the current client version
|
||||
* in dynamo. So that if another worker updates the value first, the update fails. If the update fails,
|
||||
* the method will read the latest value and return so that initialization can be retried.
|
||||
* If the value does not exist in dynamo, it will creat it.
|
||||
*/
|
||||
private MigrationState updateMigrationStateInDynamo(
|
||||
final MigrationState migrationState, final ClientVersion nextClientVersion) throws InvalidStateException {
|
||||
try {
|
||||
if (migrationState.getClientVersion() == ClientVersion.CLIENT_VERSION_INIT) {
|
||||
migrationState.update(nextClientVersion, workerIdentifier);
|
||||
log.info("Creating {}", migrationState);
|
||||
final boolean created = coordinatorStateDAO.createCoordinatorStateIfNotExists(migrationState);
|
||||
if (!created) {
|
||||
log.debug("Create {} did not succeed", migrationState);
|
||||
return getMigrationStateFromDynamo();
|
||||
}
|
||||
} else {
|
||||
log.info("Updating {} with {}", migrationState, nextClientVersion);
|
||||
final Map<String, ExpectedAttributeValue> expectations =
|
||||
migrationState.getDynamoClientVersionExpectation();
|
||||
migrationState.update(nextClientVersion, workerIdentifier);
|
||||
final boolean updated =
|
||||
coordinatorStateDAO.updateCoordinatorStateWithExpectation(migrationState, expectations);
|
||||
if (!updated) {
|
||||
log.debug("Update {} did not succeed", migrationState);
|
||||
return getMigrationStateFromDynamo();
|
||||
}
|
||||
}
|
||||
return migrationState;
|
||||
} catch (final ProvisionedThroughputException | DependencyException e) {
|
||||
log.debug(
|
||||
"Failed to update migration state {} with {}, return previous value to trigger a retry",
|
||||
migrationState,
|
||||
nextClientVersion,
|
||||
e);
|
||||
return migrationState;
|
||||
}
|
||||
}
|
||||
|
||||
private ClientVersion getNextClientVersionBasedOnConfigVersion() {
|
||||
switch (clientVersionConfig) {
|
||||
case CLIENT_VERSION_CONFIG_COMPATIBLE_WITH_2x:
|
||||
return CLIENT_VERSION_UPGRADE_FROM_2x;
|
||||
case CLIENT_VERSION_CONFIG_3x:
|
||||
return CLIENT_VERSION_3x;
|
||||
}
|
||||
throw new IllegalStateException(String.format("Unknown configured Client version %s", clientVersionConfig));
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the current {@link MigrationState} from DDB with retries.
|
||||
* @return current Migration state from DDB, if none exists, an initial Migration State with CLIENT_VERSION_INIT
|
||||
* will be returned
|
||||
* @throws InvalidStateException, this occurs when dynamo table does not exist in which retrying is not useful.
|
||||
*/
|
||||
private MigrationState getMigrationStateFromDynamo() throws InvalidStateException {
|
||||
return executeCallableWithRetryAndJitter(
|
||||
() -> {
|
||||
final CoordinatorState state = coordinatorStateDAO.getCoordinatorState(MIGRATION_HASH_KEY);
|
||||
if (state == null) {
|
||||
log.info("No Migration state available in DDB");
|
||||
return new MigrationState(MIGRATION_HASH_KEY, workerIdentifier);
|
||||
}
|
||||
if (state instanceof MigrationState) {
|
||||
log.info("Current migration state in DDB {}", state);
|
||||
return (MigrationState) state;
|
||||
}
|
||||
throw new InvalidStateException(
|
||||
String.format("Unexpected state found not confirming to MigrationState schema %s", state));
|
||||
},
|
||||
"get MigrationState from DDB");
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper method to retry a given callable upto MAX_INITIALIZATION_RETRY times for all retryable exceptions.
|
||||
* It considers InvalidStateException as non-retryable exception. During retry, it will compute a delay
|
||||
* with jitter before retrying.
|
||||
* @param callable callable to invoke either until it succeeds or max retry attempts exceed.
|
||||
* @param description a meaningful description to log exceptions
|
||||
* @return the value returned by the callable
|
||||
* @param <T> Return type of the callable
|
||||
* @throws InvalidStateException If the callable throws InvalidStateException, it will not be retried and will
|
||||
* be thrown back.
|
||||
*/
|
||||
private <T> T executeCallableWithRetryAndJitter(final Callable<T> callable, final String description)
|
||||
throws InvalidStateException {
|
||||
int retryCount = 0;
|
||||
while (retryCount++ < MAX_INITIALIZATION_RETRY) {
|
||||
try {
|
||||
return callable.call();
|
||||
} catch (final Exception e) {
|
||||
if (e instanceof InvalidStateException) {
|
||||
// throw the non-retryable exception
|
||||
throw (InvalidStateException) e;
|
||||
}
|
||||
final long delay = getInitializationRetryDelay();
|
||||
log.warn("Failed to {}, retry after delay {}", description, delay, e);
|
||||
|
||||
safeSleep(delay);
|
||||
}
|
||||
}
|
||||
throw new RuntimeException(
|
||||
String.format("Failed to %s after %d retries, giving up", description, MAX_INITIALIZATION_RETRY));
|
||||
}
|
||||
|
||||
private void safeSleep(final long delay) {
|
||||
try {
|
||||
Thread.sleep(delay);
|
||||
} catch (final InterruptedException ie) {
|
||||
log.debug("Interrupted sleep during state machine initialization retry");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a delay with jitter that is factor of the interval.
|
||||
* @return delay with jitter
|
||||
*/
|
||||
private long getInitializationRetryDelay() {
|
||||
final long jitter = (long) (random.nextDouble() * JITTER_FACTOR * INITIALIZATION_RETRY_DELAY_MILLIS);
|
||||
return INITIALIZATION_RETRY_DELAY_MILLIS + jitter;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,241 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.coordinator.migration;
|
||||
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.coordinator.CoordinatorStateDAO;
|
||||
import software.amazon.kinesis.coordinator.DynamicMigrationComponentsInitializer;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||
import software.amazon.kinesis.metrics.MetricsScope;
|
||||
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||
|
||||
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_2x;
|
||||
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_3x_WITH_ROLLBACK;
|
||||
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.FAULT_METRIC;
|
||||
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.METRICS_OPERATION;
|
||||
|
||||
/**
|
||||
* State for CLIENT_VERSION_UPGRADE_FROM_2x. When state machine enters this state,
|
||||
* KCL is initialized to operate in dual mode for Lease assignment and Leader decider algorithms
|
||||
* which initially start in 2.x compatible mode and when all the KCL workers are 3.x compliant,
|
||||
* it dynamically switches to the 3.x algorithms. It also monitors for rollback
|
||||
* initiated from customer via the KCL migration tool and instantly switches back to the 2.x
|
||||
* complaint algorithms.
|
||||
* The allowed state transitions are to CLIENT_VERSION_3x_WITH_ROLLBACK when KCL workers are
|
||||
* 3.x complaint, and to CLIENT_VERSION_2x when customer has initiated a rollback.
|
||||
* Only the leader KCL worker performs migration ready monitor and notifies all workers (including
|
||||
* itself) via a MigrationState update. When all worker's monitor notice the MigrationState change
|
||||
* (including itself), it will transition to CLIENT_VERSION_3x_WITH_ROLLBACK.
|
||||
*/
|
||||
@KinesisClientInternalApi
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
@ThreadSafe
|
||||
public class MigrationClientVersionUpgradeFrom2xState implements MigrationClientVersionState {
|
||||
private final MigrationStateMachine stateMachine;
|
||||
private final Callable<Long> timeProvider;
|
||||
private final CoordinatorStateDAO coordinatorStateDAO;
|
||||
private final ScheduledExecutorService stateMachineThreadPool;
|
||||
private final DynamicMigrationComponentsInitializer initializer;
|
||||
private final Random random;
|
||||
private final MigrationState currentMigrationState;
|
||||
private final long flipTo3XStabilizerTimeInSeconds;
|
||||
|
||||
private MigrationReadyMonitor migrationMonitor;
|
||||
private ClientVersionChangeMonitor clientVersionChangeMonitor;
|
||||
private boolean entered = false;
|
||||
private boolean left = false;
|
||||
|
||||
@Override
|
||||
public ClientVersion clientVersion() {
|
||||
return ClientVersion.CLIENT_VERSION_UPGRADE_FROM_2x;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void enter(final ClientVersion fromClientVersion) throws DependencyException {
|
||||
if (!entered) {
|
||||
log.info("Entering state {} from {}", this, fromClientVersion);
|
||||
initializer.initializeClientVersionForUpgradeFrom2x(fromClientVersion);
|
||||
|
||||
log.info("Starting migration ready monitor to monitor 3.x compliance of the KCL workers");
|
||||
migrationMonitor = new MigrationReadyMonitor(
|
||||
initializer.metricsFactory(),
|
||||
timeProvider,
|
||||
initializer.leaderDecider(),
|
||||
initializer.workerIdentifier(),
|
||||
initializer.workerMetricsDAO(),
|
||||
initializer.workerMetricsExpirySeconds(),
|
||||
initializer.leaseRefresher(),
|
||||
stateMachineThreadPool,
|
||||
this::onMigrationReady,
|
||||
flipTo3XStabilizerTimeInSeconds);
|
||||
migrationMonitor.startMonitor();
|
||||
|
||||
log.info("Starting monitor for rollback and flip to 3.x");
|
||||
clientVersionChangeMonitor = new ClientVersionChangeMonitor(
|
||||
initializer.metricsFactory(),
|
||||
coordinatorStateDAO,
|
||||
stateMachineThreadPool,
|
||||
this::onClientVersionChange,
|
||||
clientVersion(),
|
||||
random);
|
||||
clientVersionChangeMonitor.startMonitor();
|
||||
entered = true;
|
||||
} else {
|
||||
log.info("Not entering {}", left ? "already exited state" : "already entered state");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void leave() {
|
||||
if (entered && !left) {
|
||||
log.info("Leaving {}", this);
|
||||
cancelMigrationReadyMonitor();
|
||||
cancelClientChangeVersionMonitor();
|
||||
entered = false;
|
||||
} else {
|
||||
log.info("Cannot leave {}", entered ? "already exited state" : "because state is not active");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName();
|
||||
}
|
||||
|
||||
private synchronized void onMigrationReady() {
|
||||
// this is invoked on the leader worker only
|
||||
if (!entered || left || migrationMonitor == null) {
|
||||
log.info("Ignoring migration ready monitor, state already transitioned");
|
||||
return;
|
||||
}
|
||||
// update dynamo with the state to toggle to 3.x
|
||||
// and let the clientVersionChange kick in to do state transition
|
||||
// this way both leader and non-leader worker all transition when
|
||||
// it discovers the update from ddb.
|
||||
if (updateDynamoStateForTransition()) {
|
||||
// successfully toggled the state, now we can cancel the monitor
|
||||
cancelMigrationReadyMonitor();
|
||||
}
|
||||
// else - either migration ready monitor will retry or
|
||||
// client Version change callback will initiate the next state transition.
|
||||
}
|
||||
|
||||
private void cancelMigrationReadyMonitor() {
|
||||
if (migrationMonitor != null) {
|
||||
final MigrationReadyMonitor localMigrationMonitor = migrationMonitor;
|
||||
CompletableFuture.supplyAsync(() -> {
|
||||
log.info("Cancelling migration ready monitor");
|
||||
localMigrationMonitor.cancel();
|
||||
return null;
|
||||
});
|
||||
migrationMonitor = null;
|
||||
}
|
||||
}
|
||||
|
||||
private void cancelClientChangeVersionMonitor() {
|
||||
if (clientVersionChangeMonitor != null) {
|
||||
final ClientVersionChangeMonitor localClientVersionChangeMonitor = clientVersionChangeMonitor;
|
||||
CompletableFuture.supplyAsync(() -> {
|
||||
log.info("Cancelling client change version monitor");
|
||||
localClientVersionChangeMonitor.cancel();
|
||||
return null;
|
||||
});
|
||||
clientVersionChangeMonitor = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback handler to handle client version changes in MigrationState in DDB.
|
||||
* @param newState current MigrationState read from DDB where client version is not CLIENT_VERSION_UPGRADE_FROM_2x
|
||||
* @throws InvalidStateException during transition to the next state based on the new ClientVersion
|
||||
* or if the new state in DDB is unexpected.
|
||||
*/
|
||||
private synchronized void onClientVersionChange(final MigrationState newState)
|
||||
throws InvalidStateException, DependencyException {
|
||||
if (!entered || left) {
|
||||
log.warn("Received client version change notification on inactive state {}", this);
|
||||
return;
|
||||
}
|
||||
final MetricsScope scope =
|
||||
MetricsUtil.createMetricsWithOperation(initializer.metricsFactory(), METRICS_OPERATION);
|
||||
try {
|
||||
switch (newState.getClientVersion()) {
|
||||
case CLIENT_VERSION_2x:
|
||||
log.info("A rollback has been initiated for the application. Transition to {}", CLIENT_VERSION_2x);
|
||||
// cancel monitor asynchronously
|
||||
cancelMigrationReadyMonitor();
|
||||
stateMachine.transitionTo(CLIENT_VERSION_2x, newState);
|
||||
break;
|
||||
case CLIENT_VERSION_3x_WITH_ROLLBACK:
|
||||
log.info("KCL workers are v3.x compliant, transition to {}", CLIENT_VERSION_3x_WITH_ROLLBACK);
|
||||
cancelMigrationReadyMonitor();
|
||||
stateMachine.transitionTo(CLIENT_VERSION_3x_WITH_ROLLBACK, newState);
|
||||
break;
|
||||
default:
|
||||
// This should not happen, so throw an exception that allows the monitor to continue monitoring
|
||||
// changes, this allows KCL to operate in the current state and keep monitoring until a valid
|
||||
// state transition is possible.
|
||||
// However, there could be a split brain here, new workers will use DDB value as source of truth,
|
||||
// so we could also write back CLIENT_VERSION_UPGRADE_FROM_2x to DDB to ensure all workers have
|
||||
// consistent behavior.
|
||||
// Ideally we don't expect modifications to DDB table out of the KCL migration tool scope,
|
||||
// so keeping it simple and not writing back to DDB, the error log below would help capture
|
||||
// any strange behavior if this happens.
|
||||
log.error("Migration state has invalid client version {}", newState);
|
||||
throw new InvalidStateException(String.format("Unexpected new state %s", newState));
|
||||
}
|
||||
} catch (final DependencyException | InvalidStateException e) {
|
||||
scope.addData(FAULT_METRIC, 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||
throw e;
|
||||
} finally {
|
||||
MetricsUtil.endScope(scope);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean updateDynamoStateForTransition() {
|
||||
final MetricsScope scope =
|
||||
MetricsUtil.createMetricsWithOperation(initializer.metricsFactory(), METRICS_OPERATION);
|
||||
try {
|
||||
final MigrationState newMigrationState = currentMigrationState
|
||||
.copy()
|
||||
.update(CLIENT_VERSION_3x_WITH_ROLLBACK, initializer.workerIdentifier());
|
||||
log.info("Updating Migration State in DDB with {} prev state {}", newMigrationState, currentMigrationState);
|
||||
return coordinatorStateDAO.updateCoordinatorStateWithExpectation(
|
||||
newMigrationState, currentMigrationState.getDynamoClientVersionExpectation());
|
||||
} catch (final Exception e) {
|
||||
log.warn(
|
||||
"Exception occurred when toggling to {}, upgradeReadyMonitor will retry the update"
|
||||
+ " if upgrade condition is still true",
|
||||
CLIENT_VERSION_3x_WITH_ROLLBACK,
|
||||
e);
|
||||
scope.addData(FAULT_METRIC, 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||
return false;
|
||||
} finally {
|
||||
MetricsUtil.endScope(scope);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,352 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.coordinator.migration;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.CompletionException;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.ScheduledFuture;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||
import software.amazon.kinesis.coordinator.LeaderDecider;
|
||||
import software.amazon.kinesis.leases.Lease;
|
||||
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||
import software.amazon.kinesis.metrics.MetricsScope;
|
||||
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||
import software.amazon.kinesis.worker.metricstats.WorkerMetricStats;
|
||||
import software.amazon.kinesis.worker.metricstats.WorkerMetricStatsDAO;
|
||||
|
||||
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.METRICS_OPERATION;
|
||||
|
||||
/**
|
||||
* Monitor for KCL workers 3.x readiness. This monitor is started on all workers but only
|
||||
* executed on the leader of the fleet. The leader determines 3.x readiness if GSI of the lease
|
||||
* table is active and all lease owners are emitting WorkerMetricStats. The monitor performs this
|
||||
* check periodically and will invoke callback if the readiness conditions are true. Monitor
|
||||
* needs to be explicitly cancelled after the readiness trigger has successfully been handled.
|
||||
*
|
||||
* Thread safety - Guard for safety against public method invocation and internal runnable method.
|
||||
*/
|
||||
@Slf4j
|
||||
@ThreadSafe
|
||||
public class MigrationReadyMonitor implements Runnable {
|
||||
private static final long MONITOR_INTERVAL_MILLIS = Duration.ofMinutes(1).toMillis();
|
||||
private static final long LOG_INTERVAL_NANOS = Duration.ofMinutes(5).toNanos();
|
||||
|
||||
/**
|
||||
* Default retry attempt for loading leases and workers before giving up.
|
||||
*/
|
||||
private static final int DDB_LOAD_RETRY_ATTEMPT = 1;
|
||||
|
||||
private final MetricsFactory metricsFactory;
|
||||
private final Callable<Long> timeProvider;
|
||||
private final LeaderDecider leaderDecider;
|
||||
private final String currentWorkerId;
|
||||
private final WorkerMetricStatsDAO workerMetricStatsDAO;
|
||||
private final long workerMetricStatsExpirySeconds;
|
||||
private final LeaseRefresher leaseRefresher;
|
||||
private final ScheduledExecutorService stateMachineThreadPool;
|
||||
private final MonitorTriggerStabilizer triggerStabilizer;
|
||||
|
||||
private final LogRateLimiter rateLimitedStatusLogger = new LogRateLimiter(LOG_INTERVAL_NANOS);
|
||||
private ScheduledFuture<?> scheduledFuture;
|
||||
private boolean gsiStatusReady;
|
||||
private boolean workerMetricsReady;
|
||||
private Set<String> lastKnownUniqueLeaseOwners = new HashSet<>();
|
||||
private Set<String> lastKnownWorkersWithActiveWorkerMetrics = new HashSet<>();
|
||||
|
||||
public MigrationReadyMonitor(
|
||||
final MetricsFactory metricsFactory,
|
||||
final Callable<Long> timeProvider,
|
||||
final LeaderDecider leaderDecider,
|
||||
final String currentWorkerId,
|
||||
final WorkerMetricStatsDAO workerMetricStatsDAO,
|
||||
final long workerMetricsExpirySeconds,
|
||||
final LeaseRefresher leaseRefresher,
|
||||
final ScheduledExecutorService stateMachineThreadPool,
|
||||
final Runnable callback,
|
||||
final long callbackStabilizationInSeconds) {
|
||||
this.metricsFactory = metricsFactory;
|
||||
this.timeProvider = timeProvider;
|
||||
this.leaderDecider = leaderDecider;
|
||||
this.currentWorkerId = currentWorkerId;
|
||||
this.workerMetricStatsDAO = workerMetricStatsDAO;
|
||||
this.workerMetricStatsExpirySeconds = workerMetricsExpirySeconds;
|
||||
this.leaseRefresher = leaseRefresher;
|
||||
this.stateMachineThreadPool = stateMachineThreadPool;
|
||||
this.triggerStabilizer =
|
||||
new MonitorTriggerStabilizer(timeProvider, callbackStabilizationInSeconds, callback, currentWorkerId);
|
||||
}
|
||||
|
||||
public synchronized void startMonitor() {
|
||||
if (Objects.isNull(scheduledFuture)) {
|
||||
|
||||
log.info("Starting migration ready monitor");
|
||||
scheduledFuture = stateMachineThreadPool.scheduleWithFixedDelay(
|
||||
this, MONITOR_INTERVAL_MILLIS, MONITOR_INTERVAL_MILLIS, TimeUnit.MILLISECONDS);
|
||||
} else {
|
||||
log.info("Ignoring monitor request, since it is already started");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Cancel the monitor. Once the method returns callback will not be invoked,
|
||||
* but callback can be invoked reentrantly before this method returns.
|
||||
*/
|
||||
public synchronized void cancel() {
|
||||
if (Objects.nonNull(scheduledFuture)) {
|
||||
log.info("Cancelled migration ready monitor");
|
||||
scheduledFuture.cancel(true);
|
||||
scheduledFuture = null;
|
||||
} else {
|
||||
log.info("{} is currently not active", this);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void run() {
|
||||
try {
|
||||
if (Thread.currentThread().isInterrupted()) {
|
||||
log.info("{} cancelled, exiting...", this);
|
||||
return;
|
||||
}
|
||||
if (!leaderDecider.isLeader(currentWorkerId)) {
|
||||
log.debug("Not the leader, not performing migration ready check {}", this);
|
||||
triggerStabilizer.reset();
|
||||
lastKnownUniqueLeaseOwners.clear();
|
||||
lastKnownWorkersWithActiveWorkerMetrics.clear();
|
||||
return;
|
||||
}
|
||||
|
||||
triggerStabilizer.call(isReadyForUpgradeTo3x());
|
||||
rateLimitedStatusLogger.log(() -> log.info("Monitor ran successfully {}", this));
|
||||
} catch (final Throwable t) {
|
||||
log.warn("{} failed, will retry after {}", this, MONITOR_INTERVAL_MILLIS, t);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new StringBuilder("UpgradeReadyMonitor[")
|
||||
.append("G=")
|
||||
.append(gsiStatusReady)
|
||||
.append(",W=")
|
||||
.append(workerMetricsReady)
|
||||
.append("]")
|
||||
.toString();
|
||||
}
|
||||
|
||||
private boolean isReadyForUpgradeTo3x() throws DependencyException {
|
||||
final MetricsScope scope = MetricsUtil.createMetricsWithOperation(metricsFactory, METRICS_OPERATION);
|
||||
try {
|
||||
// If GSI is not ready, optimize to not check if worker metrics are being emitted
|
||||
final boolean localGsiReadyStatus = leaseRefresher.isLeaseOwnerToLeaseKeyIndexActive();
|
||||
if (localGsiReadyStatus != gsiStatusReady) {
|
||||
gsiStatusReady = localGsiReadyStatus;
|
||||
log.info("Gsi ready status changed to {}", gsiStatusReady);
|
||||
} else {
|
||||
log.debug("GsiReady status {}", gsiStatusReady);
|
||||
}
|
||||
return gsiStatusReady && areLeaseOwnersEmittingWorkerMetrics();
|
||||
} finally {
|
||||
scope.addData("GsiReadyStatus", gsiStatusReady ? 1 : 0, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||
scope.addData(
|
||||
"WorkerMetricsReadyStatus", workerMetricsReady ? 1 : 0, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||
MetricsUtil.endScope(scope);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean areLeaseOwnersEmittingWorkerMetrics() {
|
||||
final CompletableFuture<List<Lease>> leaseListFuture = loadLeaseListAsync();
|
||||
final CompletableFuture<List<WorkerMetricStats>> workerMetricsFuture = loadWorkerMetricStats();
|
||||
|
||||
final List<Lease> leaseList = leaseListFuture.join();
|
||||
final Set<String> leaseOwners = getUniqueLeaseOwnersFromLeaseTable(leaseList);
|
||||
final List<WorkerMetricStats> workerMetricStatsList = workerMetricsFuture.join();
|
||||
final Set<String> workersWithActiveWorkerMetrics = getWorkersWithActiveWorkerMetricStats(workerMetricStatsList);
|
||||
|
||||
// Leases are not checked for expired condition because:
|
||||
// If some worker has gone down and is not active, but has lease assigned to it, those leases
|
||||
// maybe expired. Since the worker is down, it may not have worker-metrics, or worker-metrics may not be active,
|
||||
// In that case, the migration condition is not considered to be met.
|
||||
// However, those leases should be assigned to another worker and so the check in the next
|
||||
// iteration could succeed. This is intentional to make sure all leases owners are accounted for
|
||||
// and the old owner does not come back up without worker metrics and reacquires the lease.
|
||||
final boolean localWorkerMetricsReady = leaseOwners.equals(workersWithActiveWorkerMetrics);
|
||||
if (localWorkerMetricsReady != workerMetricsReady) {
|
||||
workerMetricsReady = localWorkerMetricsReady;
|
||||
log.info("WorkerMetricStats status changed to {}", workerMetricsReady);
|
||||
log.info("Lease List {}", leaseList);
|
||||
log.info("WorkerMetricStats {}", workerMetricStatsList);
|
||||
} else {
|
||||
log.debug("WorkerMetricStats ready status {}", workerMetricsReady);
|
||||
}
|
||||
|
||||
if (lastKnownUniqueLeaseOwners == null) {
|
||||
log.info("Unique lease owners {}", leaseOwners);
|
||||
} else if (!lastKnownUniqueLeaseOwners.equals(leaseOwners)) {
|
||||
log.info("Unique lease owners changed to {}", leaseOwners);
|
||||
}
|
||||
lastKnownUniqueLeaseOwners = leaseOwners;
|
||||
|
||||
if (lastKnownWorkersWithActiveWorkerMetrics == null) {
|
||||
log.info("Workers with active worker metric stats {}", workersWithActiveWorkerMetrics);
|
||||
} else if (!lastKnownWorkersWithActiveWorkerMetrics.equals(workersWithActiveWorkerMetrics)) {
|
||||
log.info("Workers with active worker metric stats changed {}", workersWithActiveWorkerMetrics);
|
||||
}
|
||||
lastKnownWorkersWithActiveWorkerMetrics = workersWithActiveWorkerMetrics;
|
||||
|
||||
return workerMetricsReady;
|
||||
}
|
||||
|
||||
private Set<String> getUniqueLeaseOwnersFromLeaseTable(final List<Lease> leaseList) {
|
||||
return leaseList.stream().map(Lease::leaseOwner).collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
private Set<String> getWorkersWithActiveWorkerMetricStats(final List<WorkerMetricStats> workerMetricStats) {
|
||||
final long nowInSeconds = Duration.ofMillis(now(timeProvider)).getSeconds();
|
||||
return workerMetricStats.stream()
|
||||
.filter(metricStats -> isWorkerMetricStatsActive(metricStats, nowInSeconds))
|
||||
.map(WorkerMetricStats::getWorkerId)
|
||||
.collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
private boolean isWorkerMetricStatsActive(final WorkerMetricStats metricStats, final long nowInSeconds) {
|
||||
return (metricStats.getLastUpdateTime() + workerMetricStatsExpirySeconds) > nowInSeconds;
|
||||
}
|
||||
|
||||
private CompletableFuture<List<WorkerMetricStats>> loadWorkerMetricStats() {
|
||||
return CompletableFuture.supplyAsync(() -> loadWithRetry(workerMetricStatsDAO::getAllWorkerMetricStats));
|
||||
}
|
||||
|
||||
private CompletableFuture<List<Lease>> loadLeaseListAsync() {
|
||||
return CompletableFuture.supplyAsync(() -> loadWithRetry(leaseRefresher::listLeases));
|
||||
}
|
||||
|
||||
private <T> T loadWithRetry(final Callable<T> loadFunction) {
|
||||
int retryAttempt = 0;
|
||||
while (true) {
|
||||
try {
|
||||
return loadFunction.call();
|
||||
} catch (final Exception e) {
|
||||
if (retryAttempt < DDB_LOAD_RETRY_ATTEMPT) {
|
||||
log.warn(
|
||||
"Failed to load : {}, retrying",
|
||||
loadFunction.getClass().getName(),
|
||||
e);
|
||||
retryAttempt++;
|
||||
} else {
|
||||
throw new CompletionException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static long now(final Callable<Long> timeProvider) {
|
||||
try {
|
||||
return timeProvider.call();
|
||||
} catch (final Exception e) {
|
||||
log.debug("Time provider threw exception, using System.currentTimeMillis", e);
|
||||
return System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Stabilize the monitor trigger before invoking the callback
|
||||
* to ensure we are consistently seeing the trigger for a configured
|
||||
* stabilizationDurationInMillis
|
||||
*/
|
||||
private static class MonitorTriggerStabilizer {
|
||||
private final Callable<Long> timeProvider;
|
||||
private final long stabilizationDurationInSeconds;
|
||||
private final Runnable callback;
|
||||
private final String currentWorkerId;
|
||||
private final LogRateLimiter rateLimitedTriggerStatusLogger;
|
||||
|
||||
private long lastToggleTimeInMillis;
|
||||
private boolean currentTriggerStatus;
|
||||
|
||||
public MonitorTriggerStabilizer(
|
||||
final Callable<Long> timeProvider,
|
||||
final long stabilizationDurationInSeconds,
|
||||
final Runnable callback,
|
||||
final String currentWorkerId) {
|
||||
this.timeProvider = timeProvider;
|
||||
this.stabilizationDurationInSeconds = stabilizationDurationInSeconds;
|
||||
this.callback = callback;
|
||||
this.currentWorkerId = currentWorkerId;
|
||||
this.rateLimitedTriggerStatusLogger = new LogRateLimiter(LOG_INTERVAL_NANOS);
|
||||
}
|
||||
|
||||
public void call(final boolean isMonitorTriggered) {
|
||||
final long now = now(timeProvider);
|
||||
if (currentTriggerStatus != isMonitorTriggered) {
|
||||
log.info("Trigger status has changed to {}", isMonitorTriggered);
|
||||
currentTriggerStatus = isMonitorTriggered;
|
||||
lastToggleTimeInMillis = now;
|
||||
}
|
||||
|
||||
if (currentTriggerStatus) {
|
||||
final long deltaSeconds =
|
||||
Duration.ofMillis(now - lastToggleTimeInMillis).getSeconds();
|
||||
if (deltaSeconds >= stabilizationDurationInSeconds) {
|
||||
log.info("Trigger has been consistently true for {}s, invoking callback", deltaSeconds);
|
||||
callback.run();
|
||||
} else {
|
||||
rateLimitedTriggerStatusLogger.log(() -> log.info(
|
||||
"Trigger has been true for {}s, waiting for stabilization time of {}s",
|
||||
deltaSeconds,
|
||||
stabilizationDurationInSeconds));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
if (currentTriggerStatus) {
|
||||
log.info("This worker {} is no longer the leader, reset current status", currentWorkerId);
|
||||
}
|
||||
currentTriggerStatus = false;
|
||||
}
|
||||
}
|
||||
|
||||
@RequiredArgsConstructor
|
||||
private static class LogRateLimiter {
|
||||
private final long logIntervalInNanos;
|
||||
|
||||
private long nextLogTime = System.nanoTime();
|
||||
|
||||
public void log(final Runnable logger) {
|
||||
final long now = System.nanoTime();
|
||||
if (now >= nextLogTime) {
|
||||
logger.run();
|
||||
nextLogTime = now + logIntervalInNanos;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,231 @@
|
|||
package software.amazon.kinesis.coordinator.migration;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.ToString;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.services.dynamodb.model.AttributeAction;
|
||||
import software.amazon.awssdk.services.dynamodb.model.AttributeValue;
|
||||
import software.amazon.awssdk.services.dynamodb.model.AttributeValueUpdate;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ExpectedAttributeValue;
|
||||
import software.amazon.kinesis.common.StackTraceUtils;
|
||||
import software.amazon.kinesis.coordinator.CoordinatorState;
|
||||
|
||||
/**
|
||||
* Data model of the Migration state. This is used to track the state related to migration
|
||||
* from KCLv2.x to KCLv3.x.
|
||||
*/
|
||||
@Getter
|
||||
@ToString(callSuper = true)
|
||||
@Slf4j
|
||||
public class MigrationState extends CoordinatorState {
|
||||
/**
|
||||
* Key value for the item in the CoordinatorState table
|
||||
*/
|
||||
public static final String MIGRATION_HASH_KEY = "Migration3.0";
|
||||
/**
|
||||
* Attribute name in migration state item, whose value is used during
|
||||
* the KCL v3.x migration process to know whether the workers need to
|
||||
* perform KCL v2.x compatible operations or can perform native KCL v3.x
|
||||
* operations.
|
||||
*/
|
||||
public static final String CLIENT_VERSION_ATTRIBUTE_NAME = "cv";
|
||||
|
||||
public static final String MODIFIED_BY_ATTRIBUTE_NAME = "mb";
|
||||
public static final String MODIFIED_TIMESTAMP_ATTRIBUTE_NAME = "mts";
|
||||
public static final String HISTORY_ATTRIBUTE_NAME = "h";
|
||||
private static final int MAX_HISTORY_ENTRIES = 10;
|
||||
|
||||
private ClientVersion clientVersion;
|
||||
private String modifiedBy;
|
||||
private long modifiedTimestamp;
|
||||
private final List<HistoryEntry> history;
|
||||
|
||||
/**
 * Internal constructor used by deserialization and {@code copy()}.
 *
 * @param key hash key of the item in the CoordinatorState table
 * @param clientVersion current client version of the fleet
 * @param modifiedBy worker id that last modified the state
 * @param modifiedTimestamp epoch millis of the last modification
 * @param historyEntries prior state transitions, newest first
 * @param others any unrecognized attributes carried through verbatim
 */
private MigrationState(
        final String key,
        final ClientVersion clientVersion,
        final String modifiedBy,
        final long modifiedTimestamp,
        final List<HistoryEntry> historyEntries,
        final Map<String, AttributeValue> others) {
    this.clientVersion = clientVersion;
    this.modifiedBy = modifiedBy;
    this.modifiedTimestamp = modifiedTimestamp;
    this.history = historyEntries;
    setKey(key);
    setAttributes(others);
}
|
||||
|
||||
/**
 * Create a fresh migration state at CLIENT_VERSION_INIT with an empty history,
 * stamped with the current wall-clock time.
 *
 * @param key hash key of the item in the CoordinatorState table
 * @param modifiedBy worker id creating the state
 */
public MigrationState(final String key, final String modifiedBy) {
    this(
            key,
            ClientVersion.CLIENT_VERSION_INIT,
            modifiedBy,
            System.currentTimeMillis(),
            new ArrayList<>(),
            new HashMap<>());
}
|
||||
|
||||
public HashMap<String, AttributeValue> serialize() {
|
||||
final HashMap<String, AttributeValue> result = new HashMap<>();
|
||||
result.put(CLIENT_VERSION_ATTRIBUTE_NAME, AttributeValue.fromS(clientVersion.name()));
|
||||
result.put(MODIFIED_BY_ATTRIBUTE_NAME, AttributeValue.fromS(modifiedBy));
|
||||
result.put(MODIFIED_TIMESTAMP_ATTRIBUTE_NAME, AttributeValue.fromN(String.valueOf(modifiedTimestamp)));
|
||||
|
||||
if (!history.isEmpty()) {
|
||||
final List<AttributeValue> historyList = new ArrayList<>();
|
||||
for (final HistoryEntry entry : history) {
|
||||
historyList.add(AttributeValue.builder().m(entry.serialize()).build());
|
||||
}
|
||||
result.put(
|
||||
HISTORY_ATTRIBUTE_NAME,
|
||||
AttributeValue.builder().l(historyList).build());
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public static MigrationState deserialize(final String key, final HashMap<String, AttributeValue> attributes) {
|
||||
if (!MIGRATION_HASH_KEY.equals(key)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
final HashMap<String, AttributeValue> mutableAttributes = new HashMap<>(attributes);
|
||||
final ClientVersion clientVersion = ClientVersion.valueOf(
|
||||
mutableAttributes.remove(CLIENT_VERSION_ATTRIBUTE_NAME).s());
|
||||
final String modifiedBy =
|
||||
mutableAttributes.remove(MODIFIED_BY_ATTRIBUTE_NAME).s();
|
||||
final long modifiedTimestamp = Long.parseLong(
|
||||
mutableAttributes.remove(MODIFIED_TIMESTAMP_ATTRIBUTE_NAME).n());
|
||||
|
||||
final List<HistoryEntry> historyList = new ArrayList<>();
|
||||
if (attributes.containsKey(HISTORY_ATTRIBUTE_NAME)) {
|
||||
mutableAttributes.remove(HISTORY_ATTRIBUTE_NAME).l().stream()
|
||||
.map(historyEntry -> HistoryEntry.deserialize(historyEntry.m()))
|
||||
.forEach(historyList::add);
|
||||
}
|
||||
final MigrationState migrationState = new MigrationState(
|
||||
MIGRATION_HASH_KEY, clientVersion, modifiedBy, modifiedTimestamp, historyList, mutableAttributes);
|
||||
|
||||
if (!mutableAttributes.isEmpty()) {
|
||||
log.info("Unknown attributes {} for state {}", mutableAttributes, migrationState);
|
||||
}
|
||||
return migrationState;
|
||||
|
||||
} catch (final Exception e) {
|
||||
log.warn("Unable to deserialize state with key {} and attributes {}", key, attributes, e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public Map<String, ExpectedAttributeValue> getDynamoClientVersionExpectation() {
|
||||
return new HashMap<String, ExpectedAttributeValue>() {
|
||||
{
|
||||
put(
|
||||
CLIENT_VERSION_ATTRIBUTE_NAME,
|
||||
ExpectedAttributeValue.builder()
|
||||
.value(AttributeValue.fromS(clientVersion.name()))
|
||||
.build());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
public MigrationState copy() {
|
||||
return new MigrationState(
|
||||
getKey(),
|
||||
getClientVersion(),
|
||||
getModifiedBy(),
|
||||
getModifiedTimestamp(),
|
||||
new ArrayList<>(getHistory()),
|
||||
new HashMap<>(getAttributes()));
|
||||
}
|
||||
|
||||
/**
 * Mutates this state to the given client version, first pushing the previous
 * (version, modifiedBy, timestamp) triple onto the history list, and stamps the
 * modification time with the current wall clock.
 *
 * @param clientVersion new client version to record
 * @param modifiedBy identifier of the worker performing the update
 * @return this instance, to allow call chaining
 */
public MigrationState update(final ClientVersion clientVersion, final String modifiedBy) {
    // The caller's stack trace is logged so the code path that triggered a migration
    // transition can be identified from logs alone.
    log.info(
            "Migration state is being updated to {} current state {} caller {}",
            clientVersion,
            this,
            StackTraceUtils.getPrintableStackTrace(Thread.currentThread().getStackTrace()));
    // Snapshot the outgoing values into history BEFORE overwriting the fields.
    addHistoryEntry(this.clientVersion, this.modifiedBy, this.modifiedTimestamp);
    this.clientVersion = clientVersion;
    this.modifiedBy = modifiedBy;
    this.modifiedTimestamp = System.currentTimeMillis();
    return this;
}
|
||||
|
||||
public void addHistoryEntry(
|
||||
final ClientVersion lastClientVersion, final String lastModifiedBy, final long lastModifiedTimestamp) {
|
||||
history.add(0, new HistoryEntry(lastClientVersion, lastModifiedBy, lastModifiedTimestamp));
|
||||
if (history.size() > MAX_HISTORY_ENTRIES) {
|
||||
log.info("Limit {} reached, dropping history {}", MAX_HISTORY_ENTRIES, history.remove(history.size() - 1));
|
||||
}
|
||||
}
|
||||
|
||||
public Map<String, AttributeValueUpdate> getDynamoUpdate() {
|
||||
final HashMap<String, AttributeValueUpdate> updates = new HashMap<>();
|
||||
updates.put(
|
||||
CLIENT_VERSION_ATTRIBUTE_NAME,
|
||||
AttributeValueUpdate.builder()
|
||||
.value(AttributeValue.fromS(clientVersion.name()))
|
||||
.action(AttributeAction.PUT)
|
||||
.build());
|
||||
updates.put(
|
||||
MODIFIED_BY_ATTRIBUTE_NAME,
|
||||
AttributeValueUpdate.builder()
|
||||
.value(AttributeValue.fromS(modifiedBy))
|
||||
.action(AttributeAction.PUT)
|
||||
.build());
|
||||
updates.put(
|
||||
MODIFIED_TIMESTAMP_ATTRIBUTE_NAME,
|
||||
AttributeValueUpdate.builder()
|
||||
.value(AttributeValue.fromN(String.valueOf(modifiedTimestamp)))
|
||||
.action(AttributeAction.PUT)
|
||||
.build());
|
||||
if (!history.isEmpty()) {
|
||||
updates.put(
|
||||
HISTORY_ATTRIBUTE_NAME,
|
||||
AttributeValueUpdate.builder()
|
||||
.value(AttributeValue.fromL(
|
||||
history.stream().map(HistoryEntry::toAv).collect(Collectors.toList())))
|
||||
.action(AttributeAction.PUT)
|
||||
.build());
|
||||
}
|
||||
return updates;
|
||||
}
|
||||
|
||||
@RequiredArgsConstructor
|
||||
@ToString
|
||||
public static class HistoryEntry {
|
||||
private final ClientVersion lastClientVersion;
|
||||
private final String lastModifiedBy;
|
||||
private final long lastModifiedTimestamp;
|
||||
|
||||
public AttributeValue toAv() {
|
||||
return AttributeValue.fromM(serialize());
|
||||
}
|
||||
|
||||
public Map<String, AttributeValue> serialize() {
|
||||
return new HashMap<String, AttributeValue>() {
|
||||
{
|
||||
put(CLIENT_VERSION_ATTRIBUTE_NAME, AttributeValue.fromS(lastClientVersion.name()));
|
||||
put(MODIFIED_BY_ATTRIBUTE_NAME, AttributeValue.fromS(lastModifiedBy));
|
||||
put(MODIFIED_TIMESTAMP_ATTRIBUTE_NAME, AttributeValue.fromN(String.valueOf(lastModifiedTimestamp)));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
public static HistoryEntry deserialize(final Map<String, AttributeValue> map) {
|
||||
return new HistoryEntry(
|
||||
ClientVersion.valueOf(map.get(CLIENT_VERSION_ATTRIBUTE_NAME).s()),
|
||||
map.get(MODIFIED_BY_ATTRIBUTE_NAME).s(),
|
||||
Long.parseLong(map.get(MODIFIED_TIMESTAMP_ATTRIBUTE_NAME).n()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.coordinator.migration;
|
||||
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
|
||||
/**
 * State machine that provides:
 * 1. Seamless upgrade from 2.x to 3.x - 3.x has introduced new algorithms that are not compatible with 2.x
 *      workers, so the state machine allows to seamlessly run the 2.x functionality to be compliant with any
 *      2.x worker in the fleet, and also seamlessly switch to 3.x functionality when all KCL workers are
 *      3.x compliant.
 * 2. Instant rollbacks - Rollbacks are supported using the KCL Migration tool to revert back to 2.x functionality
 *      if customer finds regressions in 3.x functionality.
 * 3. Instant roll-forwards - Once any issue has been mitigated, roll-forwards are supported instantly
 *      with KCL Migration tool.
 */
public interface MigrationStateMachine {
    /**
     * Initialize the state machine by identifying the initial state when the KCL worker comes up for the first time.
     * @throws DependencyException When unable to identify the initial state.
     */
    void initialize() throws DependencyException;

    /**
     * Shutdown state machine and perform necessary cleanup for the worker to gracefully shutdown.
     */
    void shutdown();

    /**
     * Terminate the state machine when it reaches a terminal state, which is a successful upgrade
     * to v3.x.
     */
    void terminate();

    /**
     * Perform transition from current state to the given new ClientVersion.
     * @param nextClientVersion clientVersion of the new state the state machine must transition to
     * @param state the current MigrationState in dynamo
     * @throws InvalidStateException when transition fails, this allows the state machine to stay
     *         in the current state until a valid transition is possible
     * @throws DependencyException when transition fails due to dependency on DDB failing in
     *         unexpected ways.
     */
    void transitionTo(final ClientVersion nextClientVersion, final MigrationState state)
            throws InvalidStateException, DependencyException;

    /**
     * Get the ClientVersion of current state machine state.
     * @return ClientVersion of current state machine state
     */
    ClientVersion getCurrentClientVersion();
}
|
||||
|
|
@ -0,0 +1,254 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.coordinator.migration;
|
||||
|
||||
import java.util.AbstractMap.SimpleEntry;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.coordinator.CoordinatorConfig.ClientVersionConfig;
|
||||
import software.amazon.kinesis.coordinator.CoordinatorStateDAO;
|
||||
import software.amazon.kinesis.coordinator.DynamicMigrationComponentsInitializer;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||
import software.amazon.kinesis.metrics.MetricsScope;
|
||||
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||
|
||||
/**
|
||||
* Implementation of {@link MigrationStateMachine}
|
||||
*/
|
||||
@KinesisClientInternalApi
|
||||
@Getter
|
||||
@Slf4j
|
||||
@ThreadSafe
|
||||
public class MigrationStateMachineImpl implements MigrationStateMachine {
|
||||
public static final String FAULT_METRIC = "Fault";
|
||||
public static final String METRICS_OPERATION = "Migration";
|
||||
|
||||
private static final long THREAD_POOL_SHUTDOWN_TIMEOUT_SECONDS = 5L;
|
||||
|
||||
private final MetricsFactory metricsFactory;
|
||||
private final Callable<Long> timeProvider;
|
||||
private final CoordinatorStateDAO coordinatorStateDAO;
|
||||
private final ScheduledExecutorService stateMachineThreadPool;
|
||||
private DynamicMigrationComponentsInitializer initializer;
|
||||
private final ClientVersionConfig clientVersionConfig;
|
||||
private final Random random;
|
||||
private final String workerId;
|
||||
private final long flipTo3XStabilizerTimeInSeconds;
|
||||
private MigrationState startingMigrationState;
|
||||
|
||||
@Getter
|
||||
private ClientVersion startingClientVersion;
|
||||
|
||||
private MigrationClientVersionState currentMigrationClientVersionState = new MigrationClientVersionState() {
|
||||
@Override
|
||||
public ClientVersion clientVersion() {
|
||||
return ClientVersion.CLIENT_VERSION_INIT;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void enter(final ClientVersion fromClientVersion) {
|
||||
log.info("Entered {}...", clientVersion());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void leave() {
|
||||
log.info("Left {}...", clientVersion());
|
||||
}
|
||||
};
|
||||
private boolean terminated = false;
|
||||
|
||||
public MigrationStateMachineImpl(
|
||||
final MetricsFactory metricsFactory,
|
||||
final Callable<Long> timeProvider,
|
||||
final CoordinatorStateDAO coordinatorStateDAO,
|
||||
final ScheduledExecutorService stateMachineThreadPool,
|
||||
final ClientVersionConfig clientVersionConfig,
|
||||
final Random random,
|
||||
final DynamicMigrationComponentsInitializer initializer,
|
||||
final String workerId,
|
||||
final long flipTo3XStabilizerTimeInSeconds) {
|
||||
this.metricsFactory = metricsFactory;
|
||||
this.timeProvider = timeProvider;
|
||||
this.coordinatorStateDAO = coordinatorStateDAO;
|
||||
this.stateMachineThreadPool = stateMachineThreadPool;
|
||||
this.clientVersionConfig = clientVersionConfig;
|
||||
this.random = random;
|
||||
this.initializer = initializer;
|
||||
this.workerId = workerId;
|
||||
this.flipTo3XStabilizerTimeInSeconds = flipTo3XStabilizerTimeInSeconds;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initialize() throws DependencyException {
|
||||
if (startingClientVersion == null) {
|
||||
log.info("Initializing MigrationStateMachine");
|
||||
coordinatorStateDAO.initialize();
|
||||
final MigrationClientVersionStateInitializer startingStateInitializer =
|
||||
new MigrationClientVersionStateInitializer(
|
||||
timeProvider, coordinatorStateDAO, clientVersionConfig, random, workerId);
|
||||
final SimpleEntry<ClientVersion, MigrationState> dataForInitialization =
|
||||
startingStateInitializer.getInitialState();
|
||||
initializer.initialize(dataForInitialization.getKey());
|
||||
transitionTo(dataForInitialization.getKey(), dataForInitialization.getValue());
|
||||
startingClientVersion = dataForInitialization.getKey();
|
||||
startingMigrationState = dataForInitialization.getValue();
|
||||
log.info("MigrationStateMachine initial clientVersion {}", startingClientVersion);
|
||||
} else {
|
||||
log.info("MigrationStateMachine already initialized with clientVersion {}", startingClientVersion);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void shutdown() {
|
||||
terminate();
|
||||
if (!stateMachineThreadPool.isShutdown()) {
|
||||
stateMachineThreadPool.shutdown();
|
||||
try {
|
||||
if (stateMachineThreadPool.awaitTermination(THREAD_POOL_SHUTDOWN_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
|
||||
log.info(
|
||||
"StateMachineThreadPool did not shutdown within {} seconds, forcefully shutting down",
|
||||
THREAD_POOL_SHUTDOWN_TIMEOUT_SECONDS);
|
||||
stateMachineThreadPool.shutdownNow();
|
||||
}
|
||||
} catch (final InterruptedException e) {
|
||||
log.info("Interrupted when shutting down StateMachineThreadPool, forcefully shutting down");
|
||||
stateMachineThreadPool.shutdownNow();
|
||||
}
|
||||
}
|
||||
log.info("Shutdown successfully");
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void terminate() {
|
||||
if (!terminated && currentMigrationClientVersionState != null) {
|
||||
log.info("State machine is about to terminate");
|
||||
currentMigrationClientVersionState.leave();
|
||||
currentMigrationClientVersionState = null;
|
||||
log.info("State machine reached a terminal state.");
|
||||
terminated = true;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void transitionTo(final ClientVersion nextClientVersion, final MigrationState migrationState)
|
||||
throws DependencyException {
|
||||
if (terminated) {
|
||||
throw new IllegalStateException(String.format(
|
||||
"Cannot transition to %s after state machine is terminated, %s",
|
||||
nextClientVersion.name(), migrationState));
|
||||
}
|
||||
|
||||
final MigrationClientVersionState nextMigrationClientVersionState =
|
||||
createMigrationClientVersionState(nextClientVersion, migrationState);
|
||||
log.info(
|
||||
"Attempting to transition from {} to {}",
|
||||
currentMigrationClientVersionState.clientVersion(),
|
||||
nextClientVersion);
|
||||
currentMigrationClientVersionState.leave();
|
||||
|
||||
enter(nextMigrationClientVersionState);
|
||||
}
|
||||
|
||||
/**
|
||||
* Enter with retry. When entering the state machine for the first time, the caller has retry so exceptions
|
||||
* will be re-thrown. Once the state machine has initialized all transitions will be an indefinite retry.
|
||||
* It is possible the DDB state has changed by the time enter succeeds but that will occur as a new
|
||||
* state transition after entering the state. Usually the failures are due to unexpected issues with
|
||||
* DDB which will be transitional and will recover on a retry.
|
||||
* @param nextMigrationClientVersionState the state to transition to
|
||||
* @throws DependencyException If entering fails during state machine initialization.
|
||||
*/
|
||||
private void enter(final MigrationClientVersionState nextMigrationClientVersionState) throws DependencyException {
|
||||
boolean success = false;
|
||||
while (!success) {
|
||||
try {
|
||||
// Enter should never fail unless it is the starting state and fails to create the GSI,
|
||||
// in which case it is an unrecoverable error that is bubbled up and KCL start up will fail.
|
||||
nextMigrationClientVersionState.enter(currentMigrationClientVersionState.clientVersion());
|
||||
|
||||
currentMigrationClientVersionState = nextMigrationClientVersionState;
|
||||
log.info("Successfully transitioned to {}", nextMigrationClientVersionState.clientVersion());
|
||||
if (currentMigrationClientVersionState.clientVersion() == ClientVersion.CLIENT_VERSION_3x) {
|
||||
terminate();
|
||||
}
|
||||
success = true;
|
||||
} catch (final DependencyException e) {
|
||||
if (currentMigrationClientVersionState.clientVersion() == ClientVersion.CLIENT_VERSION_INIT) {
|
||||
throw e;
|
||||
}
|
||||
log.info(
|
||||
"Transitioning from {} to {} failed, retrying after a minute",
|
||||
currentMigrationClientVersionState.clientVersion(),
|
||||
nextMigrationClientVersionState.clientVersion(),
|
||||
e);
|
||||
|
||||
final MetricsScope scope = MetricsUtil.createMetricsWithOperation(metricsFactory, METRICS_OPERATION);
|
||||
scope.addData(FAULT_METRIC, 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||
MetricsUtil.endScope(scope);
|
||||
|
||||
try {
|
||||
Thread.sleep(1000);
|
||||
} catch (final InterruptedException ie) {
|
||||
log.info("Interrupted while sleeping before retrying state machine transition", ie);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private MigrationClientVersionState createMigrationClientVersionState(
|
||||
final ClientVersion clientVersion, final MigrationState migrationState) {
|
||||
switch (clientVersion) {
|
||||
case CLIENT_VERSION_2x:
|
||||
return new MigrationClientVersion2xState(
|
||||
this, coordinatorStateDAO, stateMachineThreadPool, initializer, random);
|
||||
case CLIENT_VERSION_UPGRADE_FROM_2x:
|
||||
return new MigrationClientVersionUpgradeFrom2xState(
|
||||
this,
|
||||
timeProvider,
|
||||
coordinatorStateDAO,
|
||||
stateMachineThreadPool,
|
||||
initializer,
|
||||
random,
|
||||
migrationState,
|
||||
flipTo3XStabilizerTimeInSeconds);
|
||||
case CLIENT_VERSION_3x_WITH_ROLLBACK:
|
||||
return new MigrationClientVersion3xWithRollbackState(
|
||||
this, coordinatorStateDAO, stateMachineThreadPool, initializer, random);
|
||||
case CLIENT_VERSION_3x:
|
||||
return new MigrationClientVersion3xState(this, initializer);
|
||||
}
|
||||
throw new IllegalStateException(String.format("Unknown client version %s", clientVersion));
|
||||
}
|
||||
|
||||
public ClientVersion getCurrentClientVersion() {
|
||||
if (currentMigrationClientVersionState != null) {
|
||||
return currentMigrationClientVersionState.clientVersion();
|
||||
} else if (terminated) {
|
||||
return ClientVersion.CLIENT_VERSION_3x;
|
||||
}
|
||||
throw new UnsupportedOperationException(
|
||||
"No current state when state machine is either not initialized" + " or already terminated");
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,270 @@
|
|||
package software.amazon.kinesis.leader;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.AbstractMap;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
import com.amazonaws.services.dynamodbv2.AcquireLockOptions;
|
||||
import com.amazonaws.services.dynamodbv2.AmazonDynamoDBLockClient;
|
||||
import com.amazonaws.services.dynamodbv2.GetLockOptions;
|
||||
import com.amazonaws.services.dynamodbv2.LockItem;
|
||||
import com.amazonaws.services.dynamodbv2.model.LockCurrentlyUnavailableException;
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ResourceNotFoundException;
|
||||
import software.amazon.kinesis.coordinator.CoordinatorStateDAO;
|
||||
import software.amazon.kinesis.coordinator.LeaderDecider;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||
import software.amazon.kinesis.metrics.MetricsScope;
|
||||
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||
|
||||
import static java.util.Objects.isNull;
|
||||
import static software.amazon.kinesis.coordinator.CoordinatorState.LEADER_HASH_KEY;
|
||||
|
||||
/**
 * Implementation for LeaderDecider to elect leader using lock on dynamo db table. This class uses
 * AmazonDynamoDBLockClient library to perform the leader election.
 */
@RequiredArgsConstructor
@Slf4j
public class DynamoDBLockBasedLeaderDecider implements LeaderDecider {
    private static final Long DEFAULT_LEASE_DURATION_MILLIS =
            Duration.ofMinutes(2).toMillis();
    // Heartbeat frequency should be at least 3 times smaller than the lease duration
    // according to LockClient documentation.
    private static final Long DEFAULT_HEARTBEAT_PERIOD_MILLIS =
            Duration.ofSeconds(30).toMillis();

    private final CoordinatorStateDAO coordinatorStateDao;
    private final AmazonDynamoDBLockClient dynamoDBLockClient;
    private final Long heartbeatPeriodMillis;
    private final String workerId;
    private final MetricsFactory metricsFactory;

    // Timestamp and result of the last isLeader() check; used to throttle DDB calls.
    private long lastCheckTimeInMillis = 0L;
    private boolean lastIsLeaderResult = false;
    private final AtomicBoolean isShutdown = new AtomicBoolean(false);

    // Timestamp and cached result of the last isAnyLeaderElected() DDB read.
    private long lastIsAnyLeaderElectedDDBReadTimeMillis = 0L;
    private boolean lastIsAnyLeaderElectedResult = false;
    /**
     * Key value pair of LockItem to the time when it was first discovered.
     * If a new LockItem fetched from ddb has a different recordVersionNumber than the one in-memory,
     * it is considered a new LockItem, and the time when it was fetched is stored in memory to identify
     * lockItem expiry. This is used only in the context of the isAnyLeaderElected method.
     */
    private AbstractMap.SimpleEntry<LockItem, Long> lastIsAnyLeaderCheckLockItemToFirstEncounterTime = null;

    /**
     * Creates a decider with explicit lease/heartbeat timings; exposed for tests.
     * Spawns a background heartbeat thread inside the lock client.
     */
    @VisibleForTesting
    static DynamoDBLockBasedLeaderDecider create(
            final CoordinatorStateDAO coordinatorStateDao,
            final String workerId,
            final Long leaseDuration,
            final Long heartbeatPeriod,
            final MetricsFactory metricsFactory) {
        final AmazonDynamoDBLockClient dynamoDBLockClient = new AmazonDynamoDBLockClient(coordinatorStateDao
                .getDDBLockClientOptionsBuilder()
                .withTimeUnit(TimeUnit.MILLISECONDS)
                .withLeaseDuration(leaseDuration)
                .withHeartbeatPeriod(heartbeatPeriod)
                .withCreateHeartbeatBackgroundThread(true)
                .withOwnerName(workerId)
                .build());

        return new DynamoDBLockBasedLeaderDecider(
                coordinatorStateDao, dynamoDBLockClient, heartbeatPeriod, workerId, metricsFactory);
    }

    /**
     * Creates a decider with the default 2-minute lease duration and 30-second heartbeat.
     */
    public static DynamoDBLockBasedLeaderDecider create(
            final CoordinatorStateDAO coordinatorStateDao, final String workerId, final MetricsFactory metricsFactory) {
        return create(
                coordinatorStateDao,
                workerId,
                DEFAULT_LEASE_DURATION_MILLIS,
                DEFAULT_HEARTBEAT_PERIOD_MILLIS,
                metricsFactory);
    }

    @Override
    public void initialize() {
        log.info("Initializing DDB Lock based leader decider");
    }

    /**
     * Check the lockItem in storage and if the current worker is not leader worker, then tries to acquire lock and
     * returns true if it was able to acquire lock else false.
     * @param workerId ID of the worker
     * @return true if current worker is leader else false.
     */
    @Override
    public synchronized Boolean isLeader(final String workerId) {
        // if the decider has shutdown, then return false and don't try acquireLock anymore.
        if (isShutdown.get()) {
            publishIsLeaderMetrics(false);
            return false;
        }
        // If the last time we tried to take lock and didn't get lock, don't try to take again for
        // heartbeatPeriodMillis; this is to avoid unnecessary calls to dynamoDB.
        // Different modules in KCL can request an isLeader check within heartbeatPeriodMillis, and this
        // optimization will help in those cases.
        // In case the last call returned true, we want to check the source always to ensure the correctness
        // of leader.
        if (!lastIsLeaderResult && lastCheckTimeInMillis + heartbeatPeriodMillis > System.currentTimeMillis()) {
            publishIsLeaderMetrics(lastIsLeaderResult);
            return lastIsLeaderResult;
        }
        boolean response;
        // Get the lockItem from storage (if present).
        final Optional<LockItem> lockItem = dynamoDBLockClient.getLock(LEADER_HASH_KEY, Optional.empty());
        lockItem.ifPresent(item -> log.info("Worker : {} is the current leader.", item.getOwnerName()));

        // If the lockItem is absent or expired, the current worker does not hold an active lock.
        if (!lockItem.isPresent() || lockItem.get().isExpired()) {
            try {
                // Current worker does not hold the lock, try to acquire one.
                // withShouldSkipBlockingWait(true) makes the call non-blocking: it throws
                // LockCurrentlyUnavailableException instead of waiting for the lock to expire.
                final Optional<LockItem> leaderLockItem =
                        dynamoDBLockClient.tryAcquireLock(AcquireLockOptions.builder(LEADER_HASH_KEY)
                                .withRefreshPeriod(heartbeatPeriodMillis)
                                .withTimeUnit(TimeUnit.MILLISECONDS)
                                .withShouldSkipBlockingWait(true)
                                .build());
                leaderLockItem.ifPresent(item -> log.info("Worker : {} is new leader", item.getOwnerName()));
                // if leaderLockItem optional is empty, that means the lock is not acquired by this worker.
                response = leaderLockItem.isPresent();
            } catch (final InterruptedException e) {
                // Something bad happened, don't assume leadership and also release lock just in case the
                // lock was granted and still interrupt happened.
                releaseLeadershipIfHeld();
                log.error("Acquiring lock was interrupted in between", e);
                response = false;

            } catch (final LockCurrentlyUnavailableException e) {
                // Another worker currently holds an ACTIVE lock; this worker is not the leader.
                response = false;
            }

        } else {
            // An active lock exists; this worker is the leader only if it owns that lock.
            response = lockItem.get().getOwnerName().equals(workerId);
        }

        lastCheckTimeInMillis = System.currentTimeMillis();
        lastIsLeaderResult = response;
        publishIsLeaderMetrics(response);
        return response;
    }

    // Emits a DETAILED metric recording whether this worker considered itself leader.
    private void publishIsLeaderMetrics(final boolean response) {
        final MetricsScope metricsScope =
                MetricsUtil.createMetricsWithOperation(metricsFactory, METRIC_OPERATION_LEADER_DECIDER);
        metricsScope.addData(
                METRIC_OPERATION_LEADER_DECIDER_IS_LEADER, response ? 1 : 0, StandardUnit.COUNT, MetricsLevel.DETAILED);
        MetricsUtil.endScope(metricsScope);
    }

    /**
     * Releases the lock if held by current worker when this method is invoked.
     */
    @Override
    public void shutdown() {
        // getAndSet guarantees release happens at most once even under concurrent shutdown calls.
        if (!isShutdown.getAndSet(true)) {
            releaseLeadershipIfHeld();
        }
    }

    @Override
    public void releaseLeadershipIfHeld() {
        try {
            final Optional<LockItem> lockItem = dynamoDBLockClient.getLock(LEADER_HASH_KEY, Optional.empty());
            if (lockItem.isPresent()
                    && !lockItem.get().isExpired()
                    && lockItem.get().getOwnerName().equals(workerId)) {

                log.info(
                        "Current worker : {} holds the lock, releasing it.",
                        lockItem.get().getOwnerName());
                // LockItem.close() will release the lock if current worker owns it else this call is no op.
                lockItem.get().close();
            }
        } catch (final Exception e) {
            // Best-effort release: failure here must not prevent worker shutdown.
            log.error("Failed to complete releaseLeadershipIfHeld call.", e);
        }
    }

    /**
     * Returns if any ACTIVE leader exists that is elected by the current implementation which can be outside the
     * scope of this worker. That is leader elected by this implementation in any worker in fleet.
     * DynamoDBLockClient does not provide an interface which can tell if an active lock exists or not, thus
     * we need to put custom implementation.
     * The implementation performs DDB get every heartbeatPeriodMillis to have low RCU consumption, which means that
     * the leader could have been elected from the last time the check happened and before check happens again.
     * The information returned from this method has eventual consistency (up to heartbeatPeriodMillis interval).
     *
     * @return true, if any leader is elected else false.
     */
    @Override
    public synchronized boolean isAnyLeaderElected() {
        // Avoid going to ddb for every call and do it once every heartbeatPeriod to have low RCU usage.
        if (Duration.between(
                                Instant.ofEpochMilli(lastIsAnyLeaderElectedDDBReadTimeMillis),
                                Instant.ofEpochMilli(System.currentTimeMillis()))
                        .toMillis()
                > heartbeatPeriodMillis) {
            final MetricsScope metricsScope = MetricsUtil.createMetricsWithOperation(
                    metricsFactory, this.getClass().getSimpleName() + ":isAnyLeaderElected");
            final long startTime = System.currentTimeMillis();
            try {
                lastIsAnyLeaderElectedDDBReadTimeMillis = System.currentTimeMillis();
                final Optional<LockItem> lockItem = dynamoDBLockClient.getLockFromDynamoDB(
                        GetLockOptions.builder(LEADER_HASH_KEY).build());

                if (!lockItem.isPresent()) {
                    // There is no LockItem in the ddb table, that means no one is holding lock.
                    lastIsAnyLeaderElectedResult = false;
                    log.info("LockItem present : {}", false);
                } else {
                    final LockItem ddbLockItem = lockItem.get();
                    if (isNull(lastIsAnyLeaderCheckLockItemToFirstEncounterTime)
                            || !ddbLockItem
                                    .getRecordVersionNumber()
                                    .equals(lastIsAnyLeaderCheckLockItemToFirstEncounterTime
                                            .getKey()
                                            .getRecordVersionNumber())) {
                        // This is the first isAnyLeaderElected call, so we can't evaluate if the LockItem has expired
                        // or not yet so consider LOCK as ACTIVE.
                        // OR LockItem in ddb and in-memory LockItem have different RecordVersionNumber
                        // and thus the LOCK is still ACTIVE
                        lastIsAnyLeaderElectedResult = true;
                        lastIsAnyLeaderCheckLockItemToFirstEncounterTime =
                                new AbstractMap.SimpleEntry<>(ddbLockItem, lastIsAnyLeaderElectedDDBReadTimeMillis);
                        log.info(
                                "LockItem present : {}, and this is either first call OR lockItem has had "
                                        + "a heartbeat",
                                true);
                    } else {
                        // There is no change in the ddb lock item, so if the last update time is more than
                        // lease duration, the lock is expired else it is still ACTIVE,
                        lastIsAnyLeaderElectedResult = lastIsAnyLeaderCheckLockItemToFirstEncounterTime.getValue()
                                        + ddbLockItem.getLeaseDuration()
                                > lastIsAnyLeaderElectedDDBReadTimeMillis;
                        log.info("LockItem present : {}, and lease expiry: {}", true, lastIsAnyLeaderElectedResult);
                    }
                }
            } catch (final ResourceNotFoundException exception) {
                log.info("Lock table does not exists...");
                // If the table itself doesn't exist, there is no elected leader.
                lastIsAnyLeaderElectedResult = false;
            } finally {
                metricsScope.addData(
                        "Latency",
                        System.currentTimeMillis() - startTime,
                        StandardUnit.MILLISECONDS,
                        MetricsLevel.DETAILED);
                MetricsUtil.endScope(metricsScope);
            }
        }
        return lastIsAnyLeaderElectedResult;
    }
}
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
package software.amazon.kinesis.leader;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.coordinator.LeaderDecider;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||
import software.amazon.kinesis.metrics.MetricsScope;
|
||||
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||
|
||||
import static java.util.Objects.nonNull;
|
||||
|
||||
/**
|
||||
* MigrationAdaptiveLeaderDecider that wraps around the actual LeaderDecider which can dynamically
|
||||
* change based on the MigrationStateMachine.
|
||||
*/
|
||||
@Slf4j
|
||||
@KinesisClientInternalApi
|
||||
@ThreadSafe
|
||||
public class MigrationAdaptiveLeaderDecider implements LeaderDecider {
|
||||
|
||||
private final MetricsFactory metricsFactory;
|
||||
private LeaderDecider currentLeaderDecider;
|
||||
|
||||
public MigrationAdaptiveLeaderDecider(final MetricsFactory metricsFactory) {
|
||||
this.metricsFactory = metricsFactory;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized Boolean isLeader(final String workerId) {
|
||||
if (currentLeaderDecider == null) {
|
||||
throw new IllegalStateException("LeaderDecider uninitialized");
|
||||
}
|
||||
|
||||
final MetricsScope scope =
|
||||
MetricsUtil.createMetricsWithOperation(metricsFactory, METRIC_OPERATION_LEADER_DECIDER);
|
||||
try {
|
||||
publishSelectedLeaderDeciderMetrics(scope, currentLeaderDecider);
|
||||
return currentLeaderDecider.isLeader(workerId);
|
||||
} finally {
|
||||
MetricsUtil.endScope(scope);
|
||||
}
|
||||
}
|
||||
|
||||
private static void publishSelectedLeaderDeciderMetrics(
|
||||
final MetricsScope scope, final LeaderDecider leaderDecider) {
|
||||
scope.addData(
|
||||
String.format(leaderDecider.getClass().getSimpleName()), 1D, StandardUnit.COUNT, MetricsLevel.DETAILED);
|
||||
}
|
||||
|
||||
public synchronized void updateLeaderDecider(final LeaderDecider leaderDecider) {
|
||||
if (currentLeaderDecider != null) {
|
||||
currentLeaderDecider.shutdown();
|
||||
log.info(
|
||||
"Updating leader decider dynamically from {} to {}",
|
||||
this.currentLeaderDecider.getClass().getSimpleName(),
|
||||
leaderDecider.getClass().getSimpleName());
|
||||
} else {
|
||||
log.info(
|
||||
"Initializing dynamic leader decider with {}",
|
||||
leaderDecider.getClass().getSimpleName());
|
||||
}
|
||||
currentLeaderDecider = leaderDecider;
|
||||
currentLeaderDecider.initialize();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void shutdown() {
|
||||
if (nonNull(currentLeaderDecider)) {
|
||||
log.info("Shutting down current {}", currentLeaderDecider.getClass().getSimpleName());
|
||||
currentLeaderDecider.shutdown();
|
||||
currentLeaderDecider = null;
|
||||
} else {
|
||||
log.info("LeaderDecider has already been shutdown");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -81,8 +81,20 @@ public class DynamoUtils {
|
|||
}
|
||||
}
|
||||
|
||||
public static AttributeValue createAttributeValue(Double doubleValue) {
|
||||
if (doubleValue == null) {
|
||||
throw new IllegalArgumentException("Double attributeValues cannot be null.");
|
||||
}
|
||||
|
||||
return AttributeValue.builder().n(doubleValue.toString()).build();
|
||||
}
|
||||
|
||||
public static String safeGetString(Map<String, AttributeValue> dynamoRecord, String key) {
|
||||
AttributeValue av = dynamoRecord.get(key);
|
||||
return safeGetString(av);
|
||||
}
|
||||
|
||||
public static String safeGetString(AttributeValue av) {
|
||||
if (av == null) {
|
||||
return null;
|
||||
} else {
|
||||
|
|
@ -99,4 +111,13 @@ public class DynamoUtils {
|
|||
return av.ss();
|
||||
}
|
||||
}
|
||||
|
||||
public static Double safeGetDouble(Map<String, AttributeValue> dynamoRecord, String key) {
|
||||
AttributeValue av = dynamoRecord.get(key);
|
||||
if (av == null) {
|
||||
return null;
|
||||
} else {
|
||||
return new Double(av.n());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -103,26 +103,6 @@ public class KinesisShardDetector implements ShardDetector {
|
|||
|
||||
private static final Boolean THROW_RESOURCE_NOT_FOUND_EXCEPTION = true;
|
||||
|
||||
@Deprecated
|
||||
public KinesisShardDetector(
|
||||
KinesisAsyncClient kinesisClient,
|
||||
String streamName,
|
||||
long listShardsBackoffTimeInMillis,
|
||||
int maxListShardsRetryAttempts,
|
||||
long listShardsCacheAllowedAgeInSeconds,
|
||||
int maxCacheMissesBeforeReload,
|
||||
int cacheMissWarningModulus) {
|
||||
this(
|
||||
kinesisClient,
|
||||
StreamIdentifier.singleStreamInstance(streamName),
|
||||
listShardsBackoffTimeInMillis,
|
||||
maxListShardsRetryAttempts,
|
||||
listShardsCacheAllowedAgeInSeconds,
|
||||
maxCacheMissesBeforeReload,
|
||||
cacheMissWarningModulus,
|
||||
LeaseManagementConfig.DEFAULT_REQUEST_TIMEOUT);
|
||||
}
|
||||
|
||||
public KinesisShardDetector(
|
||||
KinesisAsyncClient kinesisClient,
|
||||
StreamIdentifier streamIdentifier,
|
||||
|
|
|
|||
|
|
@ -46,7 +46,11 @@ import software.amazon.kinesis.retrieval.kpl.ExtendedSequenceNumber;
|
|||
"lastCounterIncrementNanos",
|
||||
"childShardIds",
|
||||
"pendingCheckpointState",
|
||||
"isMarkedForLeaseSteal"
|
||||
"isMarkedForLeaseSteal",
|
||||
"throughputKBps",
|
||||
"checkpointOwner",
|
||||
"checkpointOwnerTimeoutTimestampMillis",
|
||||
"isExpiredOrUnassigned"
|
||||
})
|
||||
@ToString
|
||||
public class Lease {
|
||||
|
|
@ -104,6 +108,33 @@ public class Lease {
|
|||
@Setter
|
||||
private boolean isMarkedForLeaseSteal;
|
||||
|
||||
/**
|
||||
* If true, this indicates that lease is ready to be immediately reassigned.
|
||||
*/
|
||||
@Setter
|
||||
private boolean isExpiredOrUnassigned;
|
||||
|
||||
/**
|
||||
* Throughput in Kbps for the lease.
|
||||
*/
|
||||
private Double throughputKBps;
|
||||
|
||||
/**
|
||||
* Owner of the checkpoint. The attribute is used for graceful shutdowns to indicate the owner that
|
||||
* is allowed to write the checkpoint.
|
||||
*/
|
||||
@Setter
|
||||
private String checkpointOwner;
|
||||
|
||||
/**
|
||||
* This field is used for tracking when the shutdown was requested on the lease so we can expire it. This is
|
||||
* deliberately not persisted in DynamoDB because leaseOwner are expected to transfer lease from itself to the
|
||||
* next owner during shutdown. If the worker dies before shutdown the lease will just become expired then we can
|
||||
* pick it up. If for some reason worker is not able to shut down and continues holding onto the lease
|
||||
* this timeout will kick in and force a lease transfer.
|
||||
*/
|
||||
@Setter
|
||||
private Long checkpointOwnerTimeoutTimestampMillis;
|
||||
/**
|
||||
* Count of distinct lease holders between checkpoints.
|
||||
*/
|
||||
|
|
@ -242,6 +273,54 @@ public class Lease {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * @return true if the checkpoint owner is set, which indicates that a shutdown (graceful
 *         lease handoff) has been requested for this lease.
 */
public boolean shutdownRequested() {
    return checkpointOwner != null;
}
|
||||
|
||||
/**
|
||||
* Check whether lease should be blocked on pending checkpoint. We DON'T block if
|
||||
* - lease is expired (Expired lease should be assigned right away) OR
|
||||
* ----- at this point we know lease is assigned -----
|
||||
* - lease is shardEnd (No more processing possible) OR
|
||||
* - lease is NOT requested for shutdown OR
|
||||
* - lease shutdown expired
|
||||
*
|
||||
* @param currentTimeMillis current time in milliseconds
|
||||
* @return true if lease is blocked on pending checkpoint
|
||||
*/
|
||||
public boolean blockedOnPendingCheckpoint(long currentTimeMillis) {
|
||||
// using ORs and negate
|
||||
return !(isExpiredOrUnassigned
|
||||
|| ExtendedSequenceNumber.SHARD_END.equals(checkpoint)
|
||||
|| !shutdownRequested()
|
||||
// if shutdown requested then checkpointOwnerTimeoutTimestampMillis should present
|
||||
|| currentTimeMillis - checkpointOwnerTimeoutTimestampMillis >= 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether lease is eligible for graceful shutdown. It's eligible if
|
||||
* - lease is still assigned (not expired) AND
|
||||
* - lease is NOT shardEnd (No more processing possible AND
|
||||
* - lease is NOT requested for shutdown
|
||||
*
|
||||
* @return true if lease is eligible for graceful shutdown
|
||||
*/
|
||||
public boolean isEligibleForGracefulShutdown() {
|
||||
return !isExpiredOrUnassigned && !ExtendedSequenceNumber.SHARD_END.equals(checkpoint) && !shutdownRequested();
|
||||
}
|
||||
|
||||
/**
|
||||
* Need to handle the case during graceful shutdown where leaseOwner isn't the current owner
|
||||
*
|
||||
* @return the actual owner
|
||||
*/
|
||||
public String actualOwner() {
|
||||
return checkpointOwner == null ? leaseOwner : checkpointOwner;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if lease is not currently owned
|
||||
*/
|
||||
|
|
@ -343,6 +422,15 @@ public class Lease {
|
|||
this.childShardIds.addAll(childShardIds);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets throughputKbps.
|
||||
*
|
||||
* @param throughputKBps may not be null
|
||||
*/
|
||||
public void throughputKBps(double throughputKBps) {
|
||||
this.throughputKBps = throughputKBps;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the hash range key for this shard.
|
||||
* @param hashKeyRangeForLease
|
||||
|
|
@ -370,6 +458,8 @@ public class Lease {
|
|||
* @return A deep copy of this object.
|
||||
*/
|
||||
public Lease copy() {
|
||||
return new Lease(this);
|
||||
final Lease lease = new Lease(this);
|
||||
lease.checkpointOwner(this.checkpointOwner);
|
||||
return lease;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ import java.util.Collections;
|
|||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
import software.amazon.kinesis.coordinator.MigrationAdaptiveLeaseAssignmentModeProvider;
|
||||
import software.amazon.kinesis.leases.dynamodb.DynamoDBLeaseCoordinator;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
|
|
@ -38,11 +39,14 @@ public interface LeaseCoordinator {
|
|||
|
||||
/**
|
||||
* Start background LeaseHolder and LeaseTaker threads.
|
||||
* @param leaseAssignmentModeProvider provider of Lease Assignment mode to determine whether to start components
|
||||
* for both V2 and V3 functionality or only V3 functionality
|
||||
* @throws ProvisionedThroughputException If we can't talk to DynamoDB due to insufficient capacity.
|
||||
* @throws InvalidStateException If the lease table doesn't exist
|
||||
* @throws DependencyException If we encountered exception taking to DynamoDB
|
||||
*/
|
||||
void start() throws DependencyException, InvalidStateException, ProvisionedThroughputException;
|
||||
void start(final MigrationAdaptiveLeaseAssignmentModeProvider leaseAssignmentModeProvider)
|
||||
throws DependencyException, InvalidStateException, ProvisionedThroughputException;
|
||||
|
||||
/**
|
||||
* Runs a single iteration of the lease taker - used by integration tests.
|
||||
|
|
@ -152,4 +156,9 @@ public interface LeaseCoordinator {
|
|||
* @return LeaseCoordinator
|
||||
*/
|
||||
DynamoDBLeaseCoordinator initialLeaseTableReadCapacity(long readCapacity);
|
||||
|
||||
/**
|
||||
* @return instance of {@link LeaseStatsRecorder}
|
||||
*/
|
||||
LeaseStatsRecorder leaseStatsRecorder();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,20 @@
|
|||
package software.amazon.kinesis.leases;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||
|
||||
public interface LeaseDiscoverer {
    /**
     * Identifies the leases that are assigned to the current worker but are not being tracked and processed by the
     * current worker.
     *
     * @return list of leases assigned to this worker that it does not currently hold/track
     * @throws DependencyException if DynamoDB scan fails in an unexpected way
     * @throws InvalidStateException if lease table does not exist
     * @throws ProvisionedThroughputException if DynamoDB scan fails due to lack of capacity
     */
    List<Lease> discoverNewLeases() throws ProvisionedThroughputException, InvalidStateException, DependencyException;
}
|
||||
|
|
@ -16,7 +16,9 @@
|
|||
package software.amazon.kinesis.leases;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.SynchronousQueue;
|
||||
import java.util.concurrent.ThreadFactory;
|
||||
|
|
@ -25,7 +27,9 @@ import java.util.concurrent.TimeUnit;
|
|||
import java.util.function.Function;
|
||||
|
||||
import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.Getter;
|
||||
import lombok.NonNull;
|
||||
import lombok.experimental.Accessors;
|
||||
import org.apache.commons.lang3.Validate;
|
||||
|
|
@ -34,6 +38,7 @@ import software.amazon.awssdk.services.dynamodb.DynamoDbAsyncClient;
|
|||
import software.amazon.awssdk.services.dynamodb.model.BillingMode;
|
||||
import software.amazon.awssdk.services.dynamodb.model.Tag;
|
||||
import software.amazon.awssdk.services.kinesis.KinesisAsyncClient;
|
||||
import software.amazon.kinesis.common.DdbTableConfig;
|
||||
import software.amazon.kinesis.common.InitialPositionInStream;
|
||||
import software.amazon.kinesis.common.InitialPositionInStreamExtended;
|
||||
import software.amazon.kinesis.common.LeaseCleanupConfig;
|
||||
|
|
@ -42,6 +47,7 @@ import software.amazon.kinesis.leases.dynamodb.DynamoDBLeaseManagementFactory;
|
|||
import software.amazon.kinesis.leases.dynamodb.TableCreatorCallback;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
import software.amazon.kinesis.metrics.NullMetricsFactory;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||
|
||||
/**
|
||||
* Used by the KCL to configure lease management.
|
||||
|
|
@ -209,6 +215,9 @@ public class LeaseManagementConfig {
|
|||
|
||||
private BillingMode billingMode = BillingMode.PAY_PER_REQUEST;
|
||||
|
||||
private WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig =
|
||||
new WorkerUtilizationAwareAssignmentConfig();
|
||||
|
||||
/**
|
||||
* Whether to enable deletion protection on the DynamoDB lease table created by KCL. This does not update
|
||||
* already existing tables.
|
||||
|
|
@ -276,14 +285,17 @@ public class LeaseManagementConfig {
|
|||
}
|
||||
|
||||
public LeaseManagementConfig(
|
||||
String tableName,
|
||||
DynamoDbAsyncClient dynamoDBClient,
|
||||
KinesisAsyncClient kinesisClient,
|
||||
String workerIdentifier) {
|
||||
final String tableName,
|
||||
final String applicationName,
|
||||
final DynamoDbAsyncClient dynamoDBClient,
|
||||
final KinesisAsyncClient kinesisClient,
|
||||
final String workerIdentifier) {
|
||||
this.tableName = tableName;
|
||||
this.dynamoDBClient = dynamoDBClient;
|
||||
this.kinesisClient = kinesisClient;
|
||||
this.workerIdentifier = workerIdentifier;
|
||||
this.workerUtilizationAwareAssignmentConfig.workerMetricsTableConfig =
|
||||
new WorkerMetricsTableConfig(applicationName);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -361,6 +373,53 @@ public class LeaseManagementConfig {
|
|||
return hierarchicalShardSyncer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configuration class for controlling the graceful handoff of leases.
|
||||
* This configuration allows tuning of the shutdown behavior during lease transfers.
|
||||
* <p>
|
||||
* It provides settings to control the timeout period for waiting on the record processor
|
||||
* to shut down and an option to enable or disable graceful lease handoff.
|
||||
* </p>
|
||||
*/
|
||||
@Builder
|
||||
@Getter
|
||||
@Accessors(fluent = true)
|
||||
public static class GracefulLeaseHandoffConfig {
|
||||
/**
|
||||
* The minimum amount of time (in milliseconds) to wait for the current shard's RecordProcessor
|
||||
* to gracefully shut down before forcefully transferring the lease to the next owner.
|
||||
* <p>
|
||||
* If each call to {@code processRecords} is expected to run longer than the default value,
|
||||
* it makes sense to set this to a higher value to ensure the RecordProcessor has enough
|
||||
* time to complete its processing.
|
||||
* </p>
|
||||
* <p>
|
||||
* Default value is 30,000 milliseconds (30 seconds).
|
||||
* </p>
|
||||
*/
|
||||
@Builder.Default
|
||||
private long gracefulLeaseHandoffTimeoutMillis = 30_000L;
|
||||
/**
|
||||
* Flag to enable or disable the graceful lease handoff mechanism.
|
||||
* <p>
|
||||
* When set to {@code true}, the KCL will attempt to gracefully transfer leases by
|
||||
* allowing the shard's RecordProcessor sufficient time to complete processing before
|
||||
* handing off the lease to another worker. When {@code false}, the lease will be
|
||||
* handed off without waiting for the RecordProcessor to shut down gracefully. Note
|
||||
* that checkpointing is expected to be implemented inside {@code shutdownRequested}
|
||||
* for this feature to work end to end.
|
||||
* </p>
|
||||
* <p>
|
||||
* Default value is {@code true}.
|
||||
* </p>
|
||||
*/
|
||||
@Builder.Default
|
||||
private boolean isGracefulLeaseHandoffEnabled = true;
|
||||
}
|
||||
|
||||
private GracefulLeaseHandoffConfig gracefulLeaseHandoffConfig =
|
||||
GracefulLeaseHandoffConfig.builder().build();
|
||||
|
||||
@Deprecated
|
||||
public LeaseManagementFactory leaseManagementFactory() {
|
||||
if (leaseManagementFactory == null) {
|
||||
|
|
@ -440,7 +499,9 @@ public class LeaseManagementConfig {
|
|||
leaseSerializer,
|
||||
customShardDetectorProvider(),
|
||||
isMultiStreamingMode,
|
||||
leaseCleanupConfig());
|
||||
leaseCleanupConfig(),
|
||||
workerUtilizationAwareAssignmentConfig(),
|
||||
gracefulLeaseHandoffConfig);
|
||||
}
|
||||
return leaseManagementFactory;
|
||||
}
|
||||
|
|
@ -454,4 +515,89 @@ public class LeaseManagementConfig {
|
|||
this.leaseManagementFactory = leaseManagementFactory;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Data
|
||||
@Accessors(fluent = true)
|
||||
public static class WorkerUtilizationAwareAssignmentConfig {
|
||||
/**
|
||||
* This defines the frequency of capturing worker metric stats in memory. Default is 1s
|
||||
*/
|
||||
private long inMemoryWorkerMetricsCaptureFrequencyMillis =
|
||||
Duration.ofSeconds(1L).toMillis();
|
||||
/**
|
||||
* This defines the frequency of reporting worker metric stats to storage. Default is 30s
|
||||
*/
|
||||
private long workerMetricsReporterFreqInMillis = Duration.ofSeconds(30).toMillis();
|
||||
/**
|
||||
* These are the no. of metrics that are persisted in storage in WorkerMetricStats ddb table.
|
||||
*/
|
||||
private int noOfPersistedMetricsPerWorkerMetrics = 10;
|
||||
/**
|
||||
* Option to disable workerMetrics to use in lease balancing.
|
||||
*/
|
||||
private boolean disableWorkerMetrics = false;
|
||||
/**
|
||||
* List of workerMetrics for the application.
|
||||
*/
|
||||
private List<WorkerMetric> workerMetricList = new ArrayList<>();
|
||||
/**
|
||||
* Max throughput per host KBps, default is unlimited.
|
||||
*/
|
||||
private double maxThroughputPerHostKBps = Double.MAX_VALUE;
|
||||
/**
|
||||
* Percentage of value to achieve critical dampening during this case
|
||||
*/
|
||||
private int dampeningPercentage = 60;
|
||||
/**
|
||||
* Percentage value used to trigger reBalance. If fleet has workers which are have metrics value more or less
|
||||
* than 20% of fleet level average then reBalance is triggered.
|
||||
* Leases are taken from workers with metrics value more than fleet level average. The load to take from these
|
||||
* workers is determined by evaluating how far they are with respect to fleet level average.
|
||||
*/
|
||||
private int reBalanceThresholdPercentage = 10;
|
||||
|
||||
/**
|
||||
* The allowThroughputOvershoot flag determines whether leases should still be taken even if
|
||||
* it causes the total assigned throughput to exceed the desired throughput to take for re-balance.
|
||||
* Enabling this flag provides more flexibility for the LeaseAssignmentManager to explore additional
|
||||
* assignment possibilities, which can lead to faster throughput convergence.
|
||||
*/
|
||||
private boolean allowThroughputOvershoot = true;
|
||||
|
||||
/**
|
||||
* Duration after which workerMetrics entry from WorkerMetricStats table will be cleaned up. When an entry's
|
||||
* lastUpdateTime is older than staleWorkerMetricsEntryCleanupDuration from current time, entry will be removed
|
||||
* from the table.
|
||||
*/
|
||||
private Duration staleWorkerMetricsEntryCleanupDuration = Duration.ofDays(1);
|
||||
|
||||
/**
|
||||
* configuration to configure how to create the WorkerMetricStats table, such as table name,
|
||||
* billing mode, provisioned capacity. If no table name is specified, the table name will
|
||||
* default to applicationName-WorkerMetricStats. If no billing more is chosen, default is
|
||||
* On-Demand.
|
||||
*/
|
||||
private WorkerMetricsTableConfig workerMetricsTableConfig;
|
||||
|
||||
/**
|
||||
* Frequency to perform worker variance balancing frequency. This value is used with respect to the LAM freq,
|
||||
* that is every third (as default) iteration of LAM the worker variance balancing will be performed.
|
||||
* Setting it to 1 will make varianceBalancing run on every iteration of LAM and 2 on every 2nd iteration
|
||||
* and so on.
|
||||
*/
|
||||
private int varianceBalancingFrequency = 3;
|
||||
|
||||
/**
|
||||
* Alpha value used for calculating exponential moving average of worker's metrics values. Selecting
|
||||
* higher alpha value gives more weightage to recent value and thus low smoothing effect on computed average
|
||||
* and selecting smaller alpha values gives more weightage to past value and high smoothing effect.
|
||||
*/
|
||||
private double workerMetricsEMAAlpha = 0.5;
|
||||
}
|
||||
|
||||
public static class WorkerMetricsTableConfig extends DdbTableConfig {
|
||||
public WorkerMetricsTableConfig(final String applicationName) {
|
||||
super(applicationName, "WorkerMetricStats");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,9 +15,12 @@
|
|||
|
||||
package software.amazon.kinesis.leases;
|
||||
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
|
||||
import software.amazon.kinesis.common.StreamConfig;
|
||||
import software.amazon.kinesis.coordinator.DeletedStreamListProvider;
|
||||
import software.amazon.kinesis.leases.dynamodb.DynamoDBLeaseRefresher;
|
||||
import software.amazon.kinesis.lifecycle.ShardConsumer;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
|
||||
/**
|
||||
|
|
@ -26,6 +29,11 @@ import software.amazon.kinesis.metrics.MetricsFactory;
|
|||
public interface LeaseManagementFactory {
|
||||
LeaseCoordinator createLeaseCoordinator(MetricsFactory metricsFactory);
|
||||
|
||||
default LeaseCoordinator createLeaseCoordinator(
|
||||
MetricsFactory metricsFactory, ConcurrentMap<ShardInfo, ShardConsumer> shardInfoShardConsumerMap) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
ShardSyncTaskManager createShardSyncTaskManager(MetricsFactory metricsFactory);
|
||||
|
||||
default ShardSyncTaskManager createShardSyncTaskManager(MetricsFactory metricsFactory, StreamConfig streamConfig) {
|
||||
|
|
|
|||
|
|
@ -15,6 +15,9 @@
|
|||
package software.amazon.kinesis.leases;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import software.amazon.kinesis.common.StreamIdentifier;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
|
|
@ -75,6 +78,37 @@ public interface LeaseRefresher {
|
|||
*/
|
||||
boolean waitUntilLeaseTableExists(long secondsBetweenPolls, long timeoutSeconds) throws DependencyException;
|
||||
|
||||
/**
|
||||
* Creates the LeaseOwnerToLeaseKey index on the lease table if it doesn't exist and returns the status of index.
|
||||
*
|
||||
* @return indexStatus status of the index.
|
||||
* @throws DependencyException if storage's describe API fails in an unexpected way
|
||||
*/
|
||||
default String createLeaseOwnerToLeaseKeyIndexIfNotExists() throws DependencyException {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Blocks until the index exists by polling storage till either the index is ACTIVE or else timeout has
|
||||
* happened.
|
||||
*
|
||||
* @param secondsBetweenPolls time to wait between polls in seconds
|
||||
* @param timeoutSeconds total time to wait in seconds
|
||||
*
|
||||
* @return true if index on the table exists and is ACTIVE, false if timeout was reached
|
||||
*/
|
||||
default boolean waitUntilLeaseOwnerToLeaseKeyIndexExists(
|
||||
final long secondsBetweenPolls, final long timeoutSeconds) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if leaseOwner GSI is ACTIVE
|
||||
* @return true if index is active, false otherwise
|
||||
* @throws DependencyException if storage's describe API fails in an unexpected way
|
||||
*/
|
||||
boolean isLeaseOwnerToLeaseKeyIndexActive() throws DependencyException;
|
||||
|
||||
/**
|
||||
* List all leases for a given stream synchronously.
|
||||
*
|
||||
|
|
@ -87,6 +121,24 @@ public interface LeaseRefresher {
|
|||
List<Lease> listLeasesForStream(StreamIdentifier streamIdentifier)
|
||||
throws DependencyException, InvalidStateException, ProvisionedThroughputException;
|
||||
|
||||
/**
|
||||
* List all leases for a given workerIdentifier synchronously.
|
||||
* Default implementation calls listLeases() and filters the results.
|
||||
*
|
||||
* @throws DependencyException if DynamoDB scan fails in an unexpected way
|
||||
* @throws InvalidStateException if lease table does not exist
|
||||
* @throws ProvisionedThroughputException if DynamoDB scan fails due to lack of capacity
|
||||
*
|
||||
* @return list of leases
|
||||
*/
|
||||
default List<String> listLeaseKeysForWorker(final String workerIdentifier)
|
||||
throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
||||
return listLeases().stream()
|
||||
.filter(lease -> lease.leaseOwner().equals(workerIdentifier))
|
||||
.map(Lease::leaseKey)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
/**
|
||||
* List all objects in table synchronously.
|
||||
*
|
||||
|
|
@ -98,6 +150,23 @@ public interface LeaseRefresher {
|
|||
*/
|
||||
List<Lease> listLeases() throws DependencyException, InvalidStateException, ProvisionedThroughputException;
|
||||
|
||||
/**
 * List all leases from storage in parallel and deserialize them into Lease objects. The lease
 * keys of items that failed to deserialize are returned separately.
 *
 * @param threadPool thread pool to use for the parallel scan
 * @param parallelismFactor number of parallel scan segments
 * @return Pair of (successfully deserialized leases, lease keys of items that failed to deserialize)
 * @throws DependencyException if DynamoDB scan fails in an unexpected way
 * @throws InvalidStateException if lease table does not exist
 * @throws ProvisionedThroughputException if DynamoDB scan fails due to lack of capacity
 */
default Map.Entry<List<Lease>, List<String>> listLeasesParallely(
        final ExecutorService threadPool, final int parallelismFactor)
        throws DependencyException, InvalidStateException, ProvisionedThroughputException {
    throw new UnsupportedOperationException("listLeasesParallely is not implemented");
}
|
||||
|
||||
/**
|
||||
* Create a new lease. Conditional on a lease not already existing with this shardId.
|
||||
*
|
||||
|
|
@ -154,6 +223,47 @@ public interface LeaseRefresher {
|
|||
boolean takeLease(Lease lease, String owner)
|
||||
throws DependencyException, InvalidStateException, ProvisionedThroughputException;
|
||||
|
||||
/**
|
||||
* Assigns given lease to newOwner owner by incrementing its leaseCounter and setting its owner field. Conditional
|
||||
* on the leaseOwner in DynamoDB matching the leaseOwner of the input lease. Mutates the leaseCounter and owner of
|
||||
* the passed-in lease object after updating DynamoDB.
|
||||
*
|
||||
* @param lease the lease to be assigned
|
||||
* @param newOwner the new owner
|
||||
*
|
||||
* @return true if lease was successfully assigned, false otherwise
|
||||
*
|
||||
* @throws InvalidStateException if lease table does not exist
|
||||
* @throws ProvisionedThroughputException if DynamoDB update fails due to lack of capacity
|
||||
* @throws DependencyException if DynamoDB update fails in an unexpected way
|
||||
*/
|
||||
default boolean assignLease(final Lease lease, final String newOwner)
|
||||
throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
||||
|
||||
throw new UnsupportedOperationException("assignLease is not implemented");
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiates a graceful handoff of the given lease to the specified new owner, allowing the current owner
|
||||
* to complete its processing before transferring ownership.
|
||||
* <p>
|
||||
* This method updates the lease with the new owner information but ensures that the current owner
|
||||
* is given time to gracefully finish its work (e.g., processing records) before the lease is reassigned.
|
||||
* </p>
|
||||
*
|
||||
* @param lease the lease to be assigned
|
||||
* @param newOwner the new owner
|
||||
* @return true if a graceful handoff was successfully initiated
|
||||
* @throws InvalidStateException if lease table does not exist
|
||||
* @throws ProvisionedThroughputException if DynamoDB update fails due to lack of capacity
|
||||
* @throws DependencyException if DynamoDB update fails in an unexpected way
|
||||
*/
|
||||
default boolean initiateGracefulLeaseHandoff(final Lease lease, final String newOwner)
|
||||
throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
||||
|
||||
throw new UnsupportedOperationException("assignLeaseWithWait is not implemented");
|
||||
}
|
||||
|
||||
/**
|
||||
* Evict the current owner of lease by setting owner to null. Conditional on the owner in DynamoDB matching the owner of
|
||||
* the input. Mutates the lease counter and owner of the passed-in lease object after updating the record in DynamoDB.
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@
|
|||
package software.amazon.kinesis.leases;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
|
||||
import software.amazon.awssdk.services.dynamodb.model.AttributeDefinition;
|
||||
|
|
@ -100,6 +101,15 @@ public interface LeaseSerializer {
|
|||
*/
|
||||
Map<String, AttributeValueUpdate> getDynamoTakeLeaseUpdate(Lease lease, String newOwner);
|
||||
|
||||
/**
 * Builds the attribute-value update map that reassigns a lease to a new owner.
 *
 * @param lease lease that needs to be assigned
 * @param newOwner new lease owner to record
 * @return the attribute value update map that takes the lease for the new owner
 * @throws UnsupportedOperationException in this default implementation; serializers that
 *         support lease assignment must override it
 */
default Map<String, AttributeValueUpdate> getDynamoAssignLeaseUpdate(Lease lease, String newOwner) {
    throw new UnsupportedOperationException("getDynamoAssignLeaseUpdate is not implemented");
}
|
||||
|
||||
/**
|
||||
* @param lease
|
||||
* @return the attribute value map that voids a lease
|
||||
|
|
@ -127,8 +137,22 @@ public interface LeaseSerializer {
|
|||
*/
|
||||
Collection<KeySchemaElement> getKeySchema();
|
||||
|
||||
default Collection<KeySchemaElement> getWorkerIdToLeaseKeyIndexKeySchema() {
|
||||
return Collections.EMPTY_LIST;
|
||||
}
|
||||
|
||||
default Collection<AttributeDefinition> getWorkerIdToLeaseKeyIndexAttributeDefinitions() {
|
||||
return Collections.EMPTY_LIST;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return attribute definitions for creating a DynamoDB table to store leases
|
||||
*/
|
||||
Collection<AttributeDefinition> getAttributeDefinitions();
|
||||
|
||||
/**
|
||||
* @param lease
|
||||
* @return the attribute value map that includes lease throughput
|
||||
*/
|
||||
Map<String, AttributeValueUpdate> getDynamoLeaseThroughputKbpsUpdate(Lease lease);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,158 @@
|
|||
package software.amazon.kinesis.leases;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.Map;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.ToString;
|
||||
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.utils.ExponentialMovingAverage;
|
||||
|
||||
import static java.util.Objects.isNull;
|
||||
|
||||
/**
|
||||
* This class records the stats for the leases.
|
||||
* The stats are recorded in a thread safe queue, and the throughput is calculated by summing up the bytes and dividing
|
||||
* by interval in seconds.
|
||||
* This class is thread safe and backed by thread safe data structures.
|
||||
*/
|
||||
@RequiredArgsConstructor
|
||||
@KinesisClientInternalApi
|
||||
@ThreadSafe
|
||||
public class LeaseStatsRecorder {
|
||||
|
||||
/**
|
||||
* This default alpha is chosen based on the testing so far between simple average and moving average with 0.5.
|
||||
* In the future, if one value does not fit all use cases, inject this via config.
|
||||
*/
|
||||
private static final double DEFAULT_ALPHA = 0.5;
|
||||
|
||||
public static final int BYTES_PER_KB = 1024;
|
||||
|
||||
private final Long renewerFrequencyInMillis;
|
||||
private final Map<String, Queue<LeaseStats>> leaseStatsMap = new ConcurrentHashMap<>();
|
||||
private final Map<String, ExponentialMovingAverage> leaseKeyToExponentialMovingAverageMap =
|
||||
new ConcurrentHashMap<>();
|
||||
private final Callable<Long> timeProviderInMillis;
|
||||
|
||||
/**
|
||||
* This method provides happens-before semantics (i.e., the action (access or removal) from a thread happens
|
||||
* before the action from subsequent thread) for the stats recording in multithreaded environment.
|
||||
*/
|
||||
public void recordStats(@NonNull final LeaseStats leaseStats) {
|
||||
final Queue<LeaseStats> leaseStatsQueue =
|
||||
leaseStatsMap.computeIfAbsent(leaseStats.getLeaseKey(), lease -> new ConcurrentLinkedQueue<>());
|
||||
leaseStatsQueue.add(leaseStats);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the throughput in KBps for the given leaseKey.
|
||||
* Method first clears the items that are older than {@link #renewerFrequencyInMillis} from the queue and then
|
||||
* calculates the throughput per second during {@link #renewerFrequencyInMillis} interval and then returns the
|
||||
* ExponentialMovingAverage of the throughput. If method is called in quick succession with or without new stats
|
||||
* the result can be different as ExponentialMovingAverage decays old values on every new call.
|
||||
* This method is thread safe.
|
||||
* @param leaseKey leaseKey for which stats are required
|
||||
* @return throughput in Kbps, returns null if there is no stats available for the leaseKey.
|
||||
*/
|
||||
public Double getThroughputKBps(final String leaseKey) {
|
||||
final Queue<LeaseStats> leaseStatsQueue = leaseStatsMap.get(leaseKey);
|
||||
|
||||
if (isNull(leaseStatsQueue)) {
|
||||
// This means there is no entry for this leaseKey yet
|
||||
return null;
|
||||
}
|
||||
|
||||
filterExpiredEntries(leaseStatsQueue);
|
||||
|
||||
// Convert bytes into KB and divide by interval in second to get throughput per second.
|
||||
final ExponentialMovingAverage exponentialMovingAverage = leaseKeyToExponentialMovingAverageMap.computeIfAbsent(
|
||||
leaseKey, leaseId -> new ExponentialMovingAverage(DEFAULT_ALPHA));
|
||||
|
||||
// Specifically dividing by 1000.0 rather than using Duration class to get seconds, because Duration class
|
||||
// implementation rounds off to seconds and precision is lost.
|
||||
final double frequency = renewerFrequencyInMillis / 1000.0;
|
||||
final double throughput = readQueue(leaseStatsQueue).stream()
|
||||
.mapToDouble(LeaseStats::getBytes)
|
||||
.sum()
|
||||
/ BYTES_PER_KB
|
||||
/ frequency;
|
||||
exponentialMovingAverage.add(throughput);
|
||||
return exponentialMovingAverage.getValue();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the currentTimeMillis and then iterates over the queue to get the stats with creation time less than
|
||||
* currentTimeMillis.
|
||||
* This is specifically done to avoid potential race between with high-frequency put thread blocking get thread.
|
||||
*/
|
||||
private Queue<LeaseStats> readQueue(final Queue<LeaseStats> leaseStatsQueue) {
|
||||
final long currentTimeMillis = getCurrenTimeInMillis();
|
||||
final Queue<LeaseStats> response = new LinkedList<>();
|
||||
for (LeaseStats leaseStats : leaseStatsQueue) {
|
||||
if (leaseStats.creationTimeMillis > currentTimeMillis) {
|
||||
break;
|
||||
}
|
||||
response.add(leaseStats);
|
||||
}
|
||||
return response;
|
||||
}
|
||||
|
||||
private long getCurrenTimeInMillis() {
|
||||
try {
|
||||
return timeProviderInMillis.call();
|
||||
} catch (final Exception e) {
|
||||
// Fallback to using the System.currentTimeMillis if failed.
|
||||
return System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
|
||||
private void filterExpiredEntries(final Queue<LeaseStats> leaseStatsQueue) {
|
||||
final long currentTime = getCurrenTimeInMillis();
|
||||
while (!leaseStatsQueue.isEmpty()) {
|
||||
final LeaseStats leaseStats = leaseStatsQueue.peek();
|
||||
if (isNull(leaseStats) || currentTime - leaseStats.getCreationTimeMillis() < renewerFrequencyInMillis) {
|
||||
break;
|
||||
}
|
||||
leaseStatsQueue.poll();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear the in-memory stats for the lease when a lease is reassigned (due to shut down or lease stealing)
|
||||
* @param leaseKey leaseKey, for which stats are supposed to be clear.
|
||||
*/
|
||||
public void dropLeaseStats(final String leaseKey) {
|
||||
leaseStatsMap.remove(leaseKey);
|
||||
leaseKeyToExponentialMovingAverageMap.remove(leaseKey);
|
||||
}
|
||||
|
||||
@Builder
|
||||
@Getter
|
||||
@ToString
|
||||
@KinesisClientInternalApi
|
||||
public static final class LeaseStats {
|
||||
/**
|
||||
* Lease key for which this leaseStats object is created.
|
||||
*/
|
||||
private final String leaseKey;
|
||||
/**
|
||||
* Bytes that are processed for a lease
|
||||
*/
|
||||
private final long bytes;
|
||||
/**
|
||||
* Wall time in epoch millis at which this leaseStats object was created. This time is used to determine the
|
||||
* expiry of the lease stats.
|
||||
*/
|
||||
@Builder.Default
|
||||
private final long creationTimeMillis = System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
|
|
@ -19,6 +19,7 @@ import java.util.Collections;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.LinkedTransferQueue;
|
||||
|
|
@ -30,13 +31,17 @@ import java.util.concurrent.TimeUnit;
|
|||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.coordinator.MigrationAdaptiveLeaseAssignmentModeProvider;
|
||||
import software.amazon.kinesis.leases.Lease;
|
||||
import software.amazon.kinesis.leases.LeaseCoordinator;
|
||||
import software.amazon.kinesis.leases.LeaseDiscoverer;
|
||||
import software.amazon.kinesis.leases.LeaseManagementConfig;
|
||||
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||
import software.amazon.kinesis.leases.LeaseRenewer;
|
||||
import software.amazon.kinesis.leases.LeaseStatsRecorder;
|
||||
import software.amazon.kinesis.leases.LeaseTaker;
|
||||
import software.amazon.kinesis.leases.MultiStreamLease;
|
||||
import software.amazon.kinesis.leases.ShardInfo;
|
||||
|
|
@ -44,6 +49,8 @@ import software.amazon.kinesis.leases.exceptions.DependencyException;
|
|||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
import software.amazon.kinesis.leases.exceptions.LeasingException;
|
||||
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||
import software.amazon.kinesis.lifecycle.LeaseGracefulShutdownHandler;
|
||||
import software.amazon.kinesis.lifecycle.ShardConsumer;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||
import software.amazon.kinesis.metrics.MetricsScope;
|
||||
|
|
@ -70,115 +77,34 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
|||
.setNameFormat("LeaseRenewer-%04d")
|
||||
.setDaemon(true)
|
||||
.build();
|
||||
private static final ThreadFactory LEASE_DISCOVERY_THREAD_FACTORY = new ThreadFactoryBuilder()
|
||||
.setNameFormat("LeaseDiscovery-%04d")
|
||||
.setDaemon(true)
|
||||
.build();
|
||||
|
||||
private final LeaseRenewer leaseRenewer;
|
||||
private final LeaseTaker leaseTaker;
|
||||
private final LeaseDiscoverer leaseDiscoverer;
|
||||
private final long renewerIntervalMillis;
|
||||
private final long takerIntervalMillis;
|
||||
private final long leaseDiscovererIntervalMillis;
|
||||
private final ExecutorService leaseRenewalThreadpool;
|
||||
private final ExecutorService leaseDiscoveryThreadPool;
|
||||
private final LeaseRefresher leaseRefresher;
|
||||
private final LeaseStatsRecorder leaseStatsRecorder;
|
||||
private final LeaseGracefulShutdownHandler leaseGracefulShutdownHandler;
|
||||
private long initialLeaseTableReadCapacity;
|
||||
private long initialLeaseTableWriteCapacity;
|
||||
protected final MetricsFactory metricsFactory;
|
||||
|
||||
private final Object shutdownLock = new Object();
|
||||
|
||||
private final LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig;
|
||||
private ScheduledExecutorService leaseCoordinatorThreadPool;
|
||||
private ScheduledFuture<?> leaseDiscoveryFuture;
|
||||
private ScheduledFuture<?> takerFuture;
|
||||
|
||||
private volatile boolean running = false;
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* <p>NOTE: This constructor is deprecated and will be removed in a future release.</p>
|
||||
*
|
||||
* @param leaseRefresher
|
||||
* LeaseRefresher instance to use
|
||||
* @param workerIdentifier
|
||||
* Identifies the worker (e.g. useful to track lease ownership)
|
||||
* @param leaseDurationMillis
|
||||
* Duration of a lease
|
||||
* @param epsilonMillis
|
||||
* Allow for some variance when calculating lease expirations
|
||||
* @param maxLeasesForWorker
|
||||
* Max leases this Worker can handle at a time
|
||||
* @param maxLeasesToStealAtOneTime
|
||||
* Steal up to these many leases at a time (for load balancing)
|
||||
* @param metricsFactory
|
||||
* Used to publish metrics about lease operations
|
||||
*/
|
||||
@Deprecated
|
||||
public DynamoDBLeaseCoordinator(
|
||||
final LeaseRefresher leaseRefresher,
|
||||
final String workerIdentifier,
|
||||
final long leaseDurationMillis,
|
||||
final long epsilonMillis,
|
||||
final int maxLeasesForWorker,
|
||||
final int maxLeasesToStealAtOneTime,
|
||||
final int maxLeaseRenewerThreadCount,
|
||||
final MetricsFactory metricsFactory) {
|
||||
this(
|
||||
leaseRefresher,
|
||||
workerIdentifier,
|
||||
leaseDurationMillis,
|
||||
epsilonMillis,
|
||||
maxLeasesForWorker,
|
||||
maxLeasesToStealAtOneTime,
|
||||
maxLeaseRenewerThreadCount,
|
||||
TableConstants.DEFAULT_INITIAL_LEASE_TABLE_READ_CAPACITY,
|
||||
TableConstants.DEFAULT_INITIAL_LEASE_TABLE_WRITE_CAPACITY,
|
||||
metricsFactory);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param leaseRefresher
|
||||
* LeaseRefresher instance to use
|
||||
* @param workerIdentifier
|
||||
* Identifies the worker (e.g. useful to track lease ownership)
|
||||
* @param leaseDurationMillis
|
||||
* Duration of a lease
|
||||
* @param epsilonMillis
|
||||
* Allow for some variance when calculating lease expirations
|
||||
* @param maxLeasesForWorker
|
||||
* Max leases this Worker can handle at a time
|
||||
* @param maxLeasesToStealAtOneTime
|
||||
* Steal up to these many leases at a time (for load balancing)
|
||||
* @param initialLeaseTableReadCapacity
|
||||
* Initial dynamodb lease table read iops if creating the lease table
|
||||
* @param initialLeaseTableWriteCapacity
|
||||
* Initial dynamodb lease table write iops if creating the lease table
|
||||
* @param metricsFactory
|
||||
* Used to publish metrics about lease operations
|
||||
*/
|
||||
@Deprecated
|
||||
public DynamoDBLeaseCoordinator(
|
||||
final LeaseRefresher leaseRefresher,
|
||||
final String workerIdentifier,
|
||||
final long leaseDurationMillis,
|
||||
final long epsilonMillis,
|
||||
final int maxLeasesForWorker,
|
||||
final int maxLeasesToStealAtOneTime,
|
||||
final int maxLeaseRenewerThreadCount,
|
||||
final long initialLeaseTableReadCapacity,
|
||||
final long initialLeaseTableWriteCapacity,
|
||||
final MetricsFactory metricsFactory) {
|
||||
this(
|
||||
leaseRefresher,
|
||||
workerIdentifier,
|
||||
leaseDurationMillis,
|
||||
LeaseManagementConfig.DEFAULT_ENABLE_PRIORITY_LEASE_ASSIGNMENT,
|
||||
epsilonMillis,
|
||||
maxLeasesForWorker,
|
||||
maxLeasesToStealAtOneTime,
|
||||
maxLeaseRenewerThreadCount,
|
||||
TableConstants.DEFAULT_INITIAL_LEASE_TABLE_READ_CAPACITY,
|
||||
TableConstants.DEFAULT_INITIAL_LEASE_TABLE_WRITE_CAPACITY,
|
||||
metricsFactory);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
|
|
@ -214,17 +140,35 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
|||
final int maxLeaseRenewerThreadCount,
|
||||
final long initialLeaseTableReadCapacity,
|
||||
final long initialLeaseTableWriteCapacity,
|
||||
final MetricsFactory metricsFactory) {
|
||||
final MetricsFactory metricsFactory,
|
||||
final LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig,
|
||||
final LeaseManagementConfig.GracefulLeaseHandoffConfig gracefulLeaseHandoffConfig,
|
||||
final ConcurrentMap<ShardInfo, ShardConsumer> shardInfoShardConsumerMap) {
|
||||
this.leaseRefresher = leaseRefresher;
|
||||
this.leaseRenewalThreadpool = getLeaseRenewalExecutorService(maxLeaseRenewerThreadCount);
|
||||
this.leaseRenewalThreadpool = createExecutorService(maxLeaseRenewerThreadCount, LEASE_RENEWAL_THREAD_FACTORY);
|
||||
this.leaseTaker = new DynamoDBLeaseTaker(leaseRefresher, workerIdentifier, leaseDurationMillis, metricsFactory)
|
||||
.withMaxLeasesForWorker(maxLeasesForWorker)
|
||||
.withMaxLeasesToStealAtOneTime(maxLeasesToStealAtOneTime)
|
||||
.withEnablePriorityLeaseAssignment(enablePriorityLeaseAssignment);
|
||||
this.leaseRenewer = new DynamoDBLeaseRenewer(
|
||||
leaseRefresher, workerIdentifier, leaseDurationMillis, leaseRenewalThreadpool, metricsFactory);
|
||||
this.renewerIntervalMillis = getRenewerTakerIntervalMillis(leaseDurationMillis, epsilonMillis);
|
||||
this.takerIntervalMillis = (leaseDurationMillis + epsilonMillis) * 2;
|
||||
// Should run once every leaseDurationMillis to identify new leases before expiry.
|
||||
this.leaseDiscovererIntervalMillis = leaseDurationMillis - epsilonMillis;
|
||||
this.leaseStatsRecorder = new LeaseStatsRecorder(renewerIntervalMillis, System::currentTimeMillis);
|
||||
this.leaseGracefulShutdownHandler = LeaseGracefulShutdownHandler.create(
|
||||
gracefulLeaseHandoffConfig.gracefulLeaseHandoffTimeoutMillis(), shardInfoShardConsumerMap, this);
|
||||
this.leaseRenewer = new DynamoDBLeaseRenewer(
|
||||
leaseRefresher,
|
||||
workerIdentifier,
|
||||
leaseDurationMillis,
|
||||
leaseRenewalThreadpool,
|
||||
metricsFactory,
|
||||
leaseStatsRecorder,
|
||||
leaseGracefulShutdownHandler::enqueueShutdown);
|
||||
this.leaseDiscoveryThreadPool =
|
||||
createExecutorService(maxLeaseRenewerThreadCount, LEASE_DISCOVERY_THREAD_FACTORY);
|
||||
this.leaseDiscoverer = new DynamoDBLeaseDiscoverer(
|
||||
this.leaseRefresher, this.leaseRenewer, metricsFactory, workerIdentifier, leaseDiscoveryThreadPool);
|
||||
if (initialLeaseTableReadCapacity <= 0) {
|
||||
throw new IllegalArgumentException("readCapacity should be >= 1");
|
||||
}
|
||||
|
|
@ -234,6 +178,7 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
|||
}
|
||||
this.initialLeaseTableWriteCapacity = initialLeaseTableWriteCapacity;
|
||||
this.metricsFactory = metricsFactory;
|
||||
this.workerUtilizationAwareAssignmentConfig = workerUtilizationAwareAssignmentConfig;
|
||||
|
||||
log.info(
|
||||
"With failover time {} ms and epsilon {} ms, LeaseCoordinator will renew leases every {} ms, take"
|
||||
|
|
@ -246,11 +191,49 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
|||
maxLeasesToStealAtOneTime);
|
||||
}
|
||||
|
||||
private class TakerRunnable implements Runnable {
|
||||
@RequiredArgsConstructor
|
||||
private class LeaseDiscoveryRunnable implements Runnable {
|
||||
private final MigrationAdaptiveLeaseAssignmentModeProvider leaseAssignmentModeProvider;
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
// LeaseDiscoverer is run in WORKER_UTILIZATION_AWARE_ASSIGNMENT mode only
|
||||
synchronized (shutdownLock) {
|
||||
if (!leaseAssignmentModeProvider
|
||||
.getLeaseAssignmentMode()
|
||||
.equals(
|
||||
MigrationAdaptiveLeaseAssignmentModeProvider.LeaseAssignmentMode
|
||||
.WORKER_UTILIZATION_AWARE_ASSIGNMENT)) {
|
||||
return;
|
||||
}
|
||||
if (running) {
|
||||
leaseRenewer.addLeasesToRenew(leaseDiscoverer.discoverNewLeases());
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error("Failed to execute lease discovery", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@RequiredArgsConstructor
|
||||
private class TakerRunnable implements Runnable {
|
||||
private final MigrationAdaptiveLeaseAssignmentModeProvider leaseAssignmentModeProvider;
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
// LeaseTaker is run in DEFAULT_LEASE_COUNT_BASED_ASSIGNMENT mode only
|
||||
synchronized (shutdownLock) {
|
||||
if (!leaseAssignmentModeProvider
|
||||
.getLeaseAssignmentMode()
|
||||
.equals(
|
||||
MigrationAdaptiveLeaseAssignmentModeProvider.LeaseAssignmentMode
|
||||
.DEFAULT_LEASE_COUNT_BASED_ASSIGNMENT)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
runLeaseTaker();
|
||||
} catch (LeasingException e) {
|
||||
log.error("LeasingException encountered in lease taking thread", e);
|
||||
|
|
@ -290,18 +273,35 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void start() throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
||||
public void start(final MigrationAdaptiveLeaseAssignmentModeProvider leaseAssignmentModeProvider)
|
||||
throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
||||
leaseRenewer.initialize();
|
||||
// At max, we need 3 threads - lease renewer, lease taker, lease discoverer - to run without contention.
|
||||
leaseCoordinatorThreadPool = Executors.newScheduledThreadPool(3, LEASE_COORDINATOR_THREAD_FACTORY);
|
||||
|
||||
// 2 because we know we'll have at most 2 concurrent tasks at a time.
|
||||
leaseCoordinatorThreadPool = Executors.newScheduledThreadPool(2, LEASE_COORDINATOR_THREAD_FACTORY);
|
||||
|
||||
// Taker runs with fixed DELAY because we want it to run slower in the event of performance degredation.
|
||||
// During migration to KCLv3.x from KCLv2.x, lease assignment mode can change dynamically, so
|
||||
// both lease assignment algorithms will be started but only one will execute based on
|
||||
// leaseAssignmentModeProvider.getLeaseAssignmentMode(). However for new applications starting in
|
||||
// KCLv3.x or applications successfully migrated to KCLv3.x, lease assignment mode will not
|
||||
// change dynamically and will always be WORKER_UTILIZATION_AWARE_ASSIGNMENT, therefore
|
||||
// don't initialize KCLv2.x lease assignment algorithm components that are not needed.
|
||||
if (leaseAssignmentModeProvider.dynamicModeChangeSupportNeeded()) {
|
||||
// Taker runs with fixed DELAY because we want it to run slower in the event of performance degradation.
|
||||
takerFuture = leaseCoordinatorThreadPool.scheduleWithFixedDelay(
|
||||
new TakerRunnable(), 0L, takerIntervalMillis, TimeUnit.MILLISECONDS);
|
||||
// Renewer runs at fixed INTERVAL because we want it to run at the same rate in the event of degredation.
|
||||
new TakerRunnable(leaseAssignmentModeProvider), 0L, takerIntervalMillis, TimeUnit.MILLISECONDS);
|
||||
}
|
||||
|
||||
leaseDiscoveryFuture = leaseCoordinatorThreadPool.scheduleAtFixedRate(
|
||||
new LeaseDiscoveryRunnable(leaseAssignmentModeProvider),
|
||||
0L,
|
||||
leaseDiscovererIntervalMillis,
|
||||
TimeUnit.MILLISECONDS);
|
||||
|
||||
// Renewer runs at fixed INTERVAL because we want it to run at the same rate in the event of degradation.
|
||||
leaseCoordinatorThreadPool.scheduleAtFixedRate(
|
||||
new RenewerRunnable(), 0L, renewerIntervalMillis, TimeUnit.MILLISECONDS);
|
||||
|
||||
leaseGracefulShutdownHandler.start();
|
||||
running = true;
|
||||
}
|
||||
|
||||
|
|
@ -383,6 +383,8 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
|||
}
|
||||
|
||||
leaseRenewalThreadpool.shutdownNow();
|
||||
leaseCoordinatorThreadPool.shutdownNow();
|
||||
leaseGracefulShutdownHandler.stop();
|
||||
synchronized (shutdownLock) {
|
||||
leaseRenewer.clearCurrentlyHeldLeases();
|
||||
running = false;
|
||||
|
|
@ -393,6 +395,10 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
|||
public void stopLeaseTaker() {
|
||||
if (takerFuture != null) {
|
||||
takerFuture.cancel(false);
|
||||
leaseDiscoveryFuture.cancel(false);
|
||||
// the method is called in worker graceful shutdown. We want to stop any further lease shutdown
|
||||
// so we don't interrupt worker shutdown.
|
||||
leaseGracefulShutdownHandler.stop();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -418,20 +424,15 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
|||
}
|
||||
|
||||
/**
|
||||
* Returns executor service that should be used for lease renewal.
|
||||
* Returns executor service for given ThreadFactory.
|
||||
* @param maximumPoolSize Maximum allowed thread pool size
|
||||
* @return Executor service that should be used for lease renewal.
|
||||
* @return Executor service
|
||||
*/
|
||||
private static ExecutorService getLeaseRenewalExecutorService(int maximumPoolSize) {
|
||||
private static ExecutorService createExecutorService(final int maximumPoolSize, final ThreadFactory threadFactory) {
|
||||
int coreLeaseCount = Math.max(maximumPoolSize / 4, 2);
|
||||
|
||||
return new ThreadPoolExecutor(
|
||||
coreLeaseCount,
|
||||
maximumPoolSize,
|
||||
60,
|
||||
TimeUnit.SECONDS,
|
||||
new LinkedTransferQueue<>(),
|
||||
LEASE_RENEWAL_THREAD_FACTORY);
|
||||
coreLeaseCount, maximumPoolSize, 60, TimeUnit.SECONDS, new LinkedTransferQueue<>(), threadFactory);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -472,6 +473,8 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
|||
* {@inheritDoc}
|
||||
*
|
||||
* <p>NOTE: This method is deprecated. Please set the initial capacity through the constructor.</p>
|
||||
*
|
||||
* This is a method of the public lease coordinator interface.
|
||||
*/
|
||||
@Override
|
||||
@Deprecated
|
||||
|
|
@ -487,6 +490,8 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
|||
* {@inheritDoc}
|
||||
*
|
||||
* <p>NOTE: This method is deprecated. Please set the initial capacity through the constructor.</p>
|
||||
*
|
||||
* This is a method of the public lease coordinator interface.
|
||||
*/
|
||||
@Override
|
||||
@Deprecated
|
||||
|
|
@ -497,4 +502,9 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
|||
initialLeaseTableWriteCapacity = writeCapacity;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public LeaseStatsRecorder leaseStatsRecorder() {
|
||||
return leaseStatsRecorder;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,120 @@
|
|||
package software.amazon.kinesis.leases.dynamodb;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.kinesis.leases.Lease;
|
||||
import software.amazon.kinesis.leases.LeaseDiscoverer;
|
||||
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||
import software.amazon.kinesis.leases.LeaseRenewer;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||
import software.amazon.kinesis.metrics.MetricsScope;
|
||||
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||
|
||||
import static java.util.Objects.isNull;
|
||||
|
||||
/**
|
||||
* An implementation of {@link LeaseDiscoverer}, it uses {@link LeaseRefresher} to query
|
||||
* {@link DynamoDBLeaseRefresher#LEASE_OWNER_TO_LEASE_KEY_INDEX_NAME } and find the leases assigned
|
||||
* to current worker and then filter and returns the leases that have not started processing (looks at
|
||||
* {@link LeaseRenewer#getCurrentlyHeldLeases()} to find out which leases are currently held leases).
|
||||
*/
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
public class DynamoDBLeaseDiscoverer implements LeaseDiscoverer {
|
||||
|
||||
private final LeaseRefresher leaseRefresher;
|
||||
private final LeaseRenewer leaseRenewer;
|
||||
private final MetricsFactory metricsFactory;
|
||||
private final String workerIdentifier;
|
||||
private final ExecutorService executorService;
|
||||
|
||||
@Override
|
||||
public List<Lease> discoverNewLeases()
|
||||
throws ProvisionedThroughputException, InvalidStateException, DependencyException {
|
||||
final MetricsScope metricsScope = MetricsUtil.createMetricsWithOperation(metricsFactory, "LeaseDiscovery");
|
||||
long startTime = System.currentTimeMillis();
|
||||
boolean success = false;
|
||||
try {
|
||||
final Set<String> currentHeldLeaseKeys =
|
||||
leaseRenewer.getCurrentlyHeldLeases().keySet();
|
||||
|
||||
final long listLeaseKeysForWorkerStartTime = System.currentTimeMillis();
|
||||
final List<String> leaseKeys = leaseRefresher.listLeaseKeysForWorker(workerIdentifier);
|
||||
MetricsUtil.addLatency(
|
||||
metricsScope, "ListLeaseKeysForWorker", listLeaseKeysForWorkerStartTime, MetricsLevel.DETAILED);
|
||||
|
||||
final List<String> newLeaseKeys = leaseKeys.stream()
|
||||
.filter(leaseKey -> !currentHeldLeaseKeys.contains(leaseKey))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
final long fetchNewLeasesStartTime = System.currentTimeMillis();
|
||||
final List<CompletableFuture<Lease>> completableFutures = newLeaseKeys.stream()
|
||||
.map(leaseKey ->
|
||||
CompletableFuture.supplyAsync(() -> fetchLease(leaseKey, metricsScope), executorService))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
final List<Lease> newLeases = completableFutures.stream()
|
||||
.map(CompletableFuture::join)
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
log.info(
|
||||
"New leases assigned to worker : {}, count : {}, leases : {}",
|
||||
workerIdentifier,
|
||||
newLeases.size(),
|
||||
newLeases.stream().map(Lease::leaseKey).collect(Collectors.toList()));
|
||||
|
||||
MetricsUtil.addLatency(metricsScope, "FetchNewLeases", fetchNewLeasesStartTime, MetricsLevel.DETAILED);
|
||||
|
||||
success = true;
|
||||
MetricsUtil.addCount(metricsScope, "NewLeasesDiscovered", newLeases.size(), MetricsLevel.DETAILED);
|
||||
return newLeases;
|
||||
} finally {
|
||||
MetricsUtil.addWorkerIdentifier(metricsScope, workerIdentifier);
|
||||
MetricsUtil.addSuccessAndLatency(metricsScope, success, startTime, MetricsLevel.SUMMARY);
|
||||
MetricsUtil.endScope(metricsScope);
|
||||
}
|
||||
}
|
||||
|
||||
private Lease fetchLease(final String leaseKey, final MetricsScope metricsScope) {
|
||||
try {
|
||||
final Lease lease = leaseRefresher.getLease(leaseKey);
|
||||
if (isNull(lease)) {
|
||||
return null;
|
||||
}
|
||||
// GSI is eventually consistent thus, validate that the fetched lease is indeed assigned to this
|
||||
// worker, if not just pass in this run.
|
||||
if (!lease.leaseOwner().equals(workerIdentifier)) {
|
||||
MetricsUtil.addCount(metricsScope, "OwnerMismatch", 1, MetricsLevel.DETAILED);
|
||||
return null;
|
||||
}
|
||||
// if checkpointOwner is not null, it means that the lease is still pending shutdown for the last owner.
|
||||
// Don't add the lease to the in-memory map yet.
|
||||
if (lease.checkpointOwner() != null) {
|
||||
return null;
|
||||
}
|
||||
// when a new lease is discovered, set the lastCounterIncrementNanos to current time as the time
|
||||
// when it has become visible, on next renewer interval this will be updated by LeaseRenewer to
|
||||
// correct time.
|
||||
lease.lastCounterIncrementNanos(System.nanoTime());
|
||||
return lease;
|
||||
} catch (final Exception e) {
|
||||
// if getLease on some lease key fail, continue and fetch other leases, the one failed will
|
||||
// be fetched in the next iteration or will be reassigned if stayed idle for long.
|
||||
MetricsUtil.addCount(metricsScope, "GetLease:Error", 1, MetricsLevel.SUMMARY);
|
||||
log.error("GetLease failed for leaseKey : {}", leaseKey, e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -17,18 +17,21 @@ package software.amazon.kinesis.leases.dynamodb;
|
|||
|
||||
import java.time.Duration;
|
||||
import java.util.Collection;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.function.Function;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import software.amazon.awssdk.core.util.DefaultSdkAutoConstructList;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.services.dynamodb.DynamoDbAsyncClient;
|
||||
import software.amazon.awssdk.services.dynamodb.model.BillingMode;
|
||||
import software.amazon.awssdk.services.dynamodb.model.Tag;
|
||||
import software.amazon.awssdk.services.kinesis.KinesisAsyncClient;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.common.DdbTableConfig;
|
||||
import software.amazon.kinesis.common.InitialPositionInStreamExtended;
|
||||
import software.amazon.kinesis.common.LeaseCleanupConfig;
|
||||
import software.amazon.kinesis.common.StreamConfig;
|
||||
|
|
@ -42,12 +45,15 @@ import software.amazon.kinesis.leases.LeaseManagementConfig;
|
|||
import software.amazon.kinesis.leases.LeaseManagementFactory;
|
||||
import software.amazon.kinesis.leases.LeaseSerializer;
|
||||
import software.amazon.kinesis.leases.ShardDetector;
|
||||
import software.amazon.kinesis.leases.ShardInfo;
|
||||
import software.amazon.kinesis.leases.ShardSyncTaskManager;
|
||||
import software.amazon.kinesis.lifecycle.ShardConsumer;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
@Slf4j
|
||||
@Data
|
||||
@KinesisClientInternalApi
|
||||
public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
||||
|
|
@ -73,6 +79,8 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
|||
@NonNull
|
||||
private final LeaseSerializer leaseSerializer;
|
||||
|
||||
private final LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig;
|
||||
|
||||
@NonNull
|
||||
private StreamConfig streamConfig;
|
||||
|
||||
|
|
@ -103,434 +111,11 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
|||
private final Collection<Tag> tags;
|
||||
private final boolean isMultiStreamMode;
|
||||
private final LeaseCleanupConfig leaseCleanupConfig;
|
||||
private final LeaseManagementConfig.GracefulLeaseHandoffConfig gracefulLeaseHandoffConfig;
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* <p>NOTE: This constructor is deprecated and will be removed in a future release.</p>
|
||||
*
|
||||
* @param kinesisClient
|
||||
* @param streamName
|
||||
* @param dynamoDBClient
|
||||
* @param tableName
|
||||
* @param workerIdentifier
|
||||
* @param executorService
|
||||
* @param initialPositionInStream
|
||||
* @param failoverTimeMillis
|
||||
* @param epsilonMillis
|
||||
* @param maxLeasesForWorker
|
||||
* @param maxLeasesToStealAtOneTime
|
||||
* @param maxLeaseRenewalThreads
|
||||
* @param cleanupLeasesUponShardCompletion
|
||||
* @param ignoreUnexpectedChildShards
|
||||
* @param shardSyncIntervalMillis
|
||||
* @param consistentReads
|
||||
* @param listShardsBackoffTimeMillis
|
||||
* @param maxListShardsRetryAttempts
|
||||
* @param maxCacheMissesBeforeReload
|
||||
* @param listShardsCacheAllowedAgeInSeconds
|
||||
* @param cacheMissWarningModulus
|
||||
*/
|
||||
@Deprecated
|
||||
public DynamoDBLeaseManagementFactory(
|
||||
final KinesisAsyncClient kinesisClient,
|
||||
final String streamName,
|
||||
final DynamoDbAsyncClient dynamoDBClient,
|
||||
final String tableName,
|
||||
final String workerIdentifier,
|
||||
final ExecutorService executorService,
|
||||
final InitialPositionInStreamExtended initialPositionInStream,
|
||||
final long failoverTimeMillis,
|
||||
final long epsilonMillis,
|
||||
final int maxLeasesForWorker,
|
||||
final int maxLeasesToStealAtOneTime,
|
||||
final int maxLeaseRenewalThreads,
|
||||
final boolean cleanupLeasesUponShardCompletion,
|
||||
final boolean ignoreUnexpectedChildShards,
|
||||
final long shardSyncIntervalMillis,
|
||||
final boolean consistentReads,
|
||||
final long listShardsBackoffTimeMillis,
|
||||
final int maxListShardsRetryAttempts,
|
||||
final int maxCacheMissesBeforeReload,
|
||||
final long listShardsCacheAllowedAgeInSeconds,
|
||||
final int cacheMissWarningModulus) {
|
||||
this(
|
||||
kinesisClient,
|
||||
streamName,
|
||||
dynamoDBClient,
|
||||
tableName,
|
||||
workerIdentifier,
|
||||
executorService,
|
||||
initialPositionInStream,
|
||||
failoverTimeMillis,
|
||||
epsilonMillis,
|
||||
maxLeasesForWorker,
|
||||
maxLeasesToStealAtOneTime,
|
||||
maxLeaseRenewalThreads,
|
||||
cleanupLeasesUponShardCompletion,
|
||||
ignoreUnexpectedChildShards,
|
||||
shardSyncIntervalMillis,
|
||||
consistentReads,
|
||||
listShardsBackoffTimeMillis,
|
||||
maxListShardsRetryAttempts,
|
||||
maxCacheMissesBeforeReload,
|
||||
listShardsCacheAllowedAgeInSeconds,
|
||||
cacheMissWarningModulus,
|
||||
TableConstants.DEFAULT_INITIAL_LEASE_TABLE_READ_CAPACITY,
|
||||
TableConstants.DEFAULT_INITIAL_LEASE_TABLE_WRITE_CAPACITY);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* <p>
|
||||
* NOTE: This constructor is deprecated and will be removed in a future release.
|
||||
* </p>
|
||||
*
|
||||
* @param kinesisClient
|
||||
* @param streamName
|
||||
* @param dynamoDBClient
|
||||
* @param tableName
|
||||
* @param workerIdentifier
|
||||
* @param executorService
|
||||
* @param initialPositionInStream
|
||||
* @param failoverTimeMillis
|
||||
* @param epsilonMillis
|
||||
* @param maxLeasesForWorker
|
||||
* @param maxLeasesToStealAtOneTime
|
||||
* @param maxLeaseRenewalThreads
|
||||
* @param cleanupLeasesUponShardCompletion
|
||||
* @param ignoreUnexpectedChildShards
|
||||
* @param shardSyncIntervalMillis
|
||||
* @param consistentReads
|
||||
* @param listShardsBackoffTimeMillis
|
||||
* @param maxListShardsRetryAttempts
|
||||
* @param maxCacheMissesBeforeReload
|
||||
* @param listShardsCacheAllowedAgeInSeconds
|
||||
* @param cacheMissWarningModulus
|
||||
* @param initialLeaseTableReadCapacity
|
||||
* @param initialLeaseTableWriteCapacity
|
||||
*/
|
||||
@Deprecated
|
||||
public DynamoDBLeaseManagementFactory(
|
||||
final KinesisAsyncClient kinesisClient,
|
||||
final String streamName,
|
||||
final DynamoDbAsyncClient dynamoDBClient,
|
||||
final String tableName,
|
||||
final String workerIdentifier,
|
||||
final ExecutorService executorService,
|
||||
final InitialPositionInStreamExtended initialPositionInStream,
|
||||
final long failoverTimeMillis,
|
||||
final long epsilonMillis,
|
||||
final int maxLeasesForWorker,
|
||||
final int maxLeasesToStealAtOneTime,
|
||||
final int maxLeaseRenewalThreads,
|
||||
final boolean cleanupLeasesUponShardCompletion,
|
||||
final boolean ignoreUnexpectedChildShards,
|
||||
final long shardSyncIntervalMillis,
|
||||
final boolean consistentReads,
|
||||
final long listShardsBackoffTimeMillis,
|
||||
final int maxListShardsRetryAttempts,
|
||||
final int maxCacheMissesBeforeReload,
|
||||
final long listShardsCacheAllowedAgeInSeconds,
|
||||
final int cacheMissWarningModulus,
|
||||
final long initialLeaseTableReadCapacity,
|
||||
final long initialLeaseTableWriteCapacity) {
|
||||
this(
|
||||
kinesisClient,
|
||||
streamName,
|
||||
dynamoDBClient,
|
||||
tableName,
|
||||
workerIdentifier,
|
||||
executorService,
|
||||
initialPositionInStream,
|
||||
failoverTimeMillis,
|
||||
epsilonMillis,
|
||||
maxLeasesForWorker,
|
||||
maxLeasesToStealAtOneTime,
|
||||
maxLeaseRenewalThreads,
|
||||
cleanupLeasesUponShardCompletion,
|
||||
ignoreUnexpectedChildShards,
|
||||
shardSyncIntervalMillis,
|
||||
consistentReads,
|
||||
listShardsBackoffTimeMillis,
|
||||
maxListShardsRetryAttempts,
|
||||
maxCacheMissesBeforeReload,
|
||||
listShardsCacheAllowedAgeInSeconds,
|
||||
cacheMissWarningModulus,
|
||||
initialLeaseTableReadCapacity,
|
||||
initialLeaseTableWriteCapacity,
|
||||
new HierarchicalShardSyncer(),
|
||||
TableCreatorCallback.NOOP_TABLE_CREATOR_CALLBACK,
|
||||
LeaseManagementConfig.DEFAULT_REQUEST_TIMEOUT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param kinesisClient
|
||||
* @param streamName
|
||||
* @param dynamoDBClient
|
||||
* @param tableName
|
||||
* @param workerIdentifier
|
||||
* @param executorService
|
||||
* @param initialPositionInStream
|
||||
* @param failoverTimeMillis
|
||||
* @param epsilonMillis
|
||||
* @param maxLeasesForWorker
|
||||
* @param maxLeasesToStealAtOneTime
|
||||
* @param maxLeaseRenewalThreads
|
||||
* @param cleanupLeasesUponShardCompletion
|
||||
* @param ignoreUnexpectedChildShards
|
||||
* @param shardSyncIntervalMillis
|
||||
* @param consistentReads
|
||||
* @param listShardsBackoffTimeMillis
|
||||
* @param maxListShardsRetryAttempts
|
||||
* @param maxCacheMissesBeforeReload
|
||||
* @param listShardsCacheAllowedAgeInSeconds
|
||||
* @param cacheMissWarningModulus
|
||||
* @param initialLeaseTableReadCapacity
|
||||
* @param initialLeaseTableWriteCapacity
|
||||
* @param hierarchicalShardSyncer
|
||||
* @param tableCreatorCallback
|
||||
*/
|
||||
@Deprecated
|
||||
public DynamoDBLeaseManagementFactory(
|
||||
final KinesisAsyncClient kinesisClient,
|
||||
final String streamName,
|
||||
final DynamoDbAsyncClient dynamoDBClient,
|
||||
final String tableName,
|
||||
final String workerIdentifier,
|
||||
final ExecutorService executorService,
|
||||
final InitialPositionInStreamExtended initialPositionInStream,
|
||||
final long failoverTimeMillis,
|
||||
final long epsilonMillis,
|
||||
final int maxLeasesForWorker,
|
||||
final int maxLeasesToStealAtOneTime,
|
||||
final int maxLeaseRenewalThreads,
|
||||
final boolean cleanupLeasesUponShardCompletion,
|
||||
final boolean ignoreUnexpectedChildShards,
|
||||
final long shardSyncIntervalMillis,
|
||||
final boolean consistentReads,
|
||||
final long listShardsBackoffTimeMillis,
|
||||
final int maxListShardsRetryAttempts,
|
||||
final int maxCacheMissesBeforeReload,
|
||||
final long listShardsCacheAllowedAgeInSeconds,
|
||||
final int cacheMissWarningModulus,
|
||||
final long initialLeaseTableReadCapacity,
|
||||
final long initialLeaseTableWriteCapacity,
|
||||
final HierarchicalShardSyncer hierarchicalShardSyncer,
|
||||
final TableCreatorCallback tableCreatorCallback) {
|
||||
this(
|
||||
kinesisClient,
|
||||
streamName,
|
||||
dynamoDBClient,
|
||||
tableName,
|
||||
workerIdentifier,
|
||||
executorService,
|
||||
initialPositionInStream,
|
||||
failoverTimeMillis,
|
||||
epsilonMillis,
|
||||
maxLeasesForWorker,
|
||||
maxLeasesToStealAtOneTime,
|
||||
maxLeaseRenewalThreads,
|
||||
cleanupLeasesUponShardCompletion,
|
||||
ignoreUnexpectedChildShards,
|
||||
shardSyncIntervalMillis,
|
||||
consistentReads,
|
||||
listShardsBackoffTimeMillis,
|
||||
maxListShardsRetryAttempts,
|
||||
maxCacheMissesBeforeReload,
|
||||
listShardsCacheAllowedAgeInSeconds,
|
||||
cacheMissWarningModulus,
|
||||
initialLeaseTableReadCapacity,
|
||||
initialLeaseTableWriteCapacity,
|
||||
hierarchicalShardSyncer,
|
||||
tableCreatorCallback,
|
||||
LeaseManagementConfig.DEFAULT_REQUEST_TIMEOUT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param kinesisClient
|
||||
* @param streamName
|
||||
* @param dynamoDBClient
|
||||
* @param tableName
|
||||
* @param workerIdentifier
|
||||
* @param executorService
|
||||
* @param initialPositionInStream
|
||||
* @param failoverTimeMillis
|
||||
* @param epsilonMillis
|
||||
* @param maxLeasesForWorker
|
||||
* @param maxLeasesToStealAtOneTime
|
||||
* @param maxLeaseRenewalThreads
|
||||
* @param cleanupLeasesUponShardCompletion
|
||||
* @param ignoreUnexpectedChildShards
|
||||
* @param shardSyncIntervalMillis
|
||||
* @param consistentReads
|
||||
* @param listShardsBackoffTimeMillis
|
||||
* @param maxListShardsRetryAttempts
|
||||
* @param maxCacheMissesBeforeReload
|
||||
* @param listShardsCacheAllowedAgeInSeconds
|
||||
* @param cacheMissWarningModulus
|
||||
* @param initialLeaseTableReadCapacity
|
||||
* @param initialLeaseTableWriteCapacity
|
||||
* @param hierarchicalShardSyncer
|
||||
* @param tableCreatorCallback
|
||||
* @param dynamoDbRequestTimeout
|
||||
*/
|
||||
@Deprecated
|
||||
public DynamoDBLeaseManagementFactory(
|
||||
final KinesisAsyncClient kinesisClient,
|
||||
final String streamName,
|
||||
final DynamoDbAsyncClient dynamoDBClient,
|
||||
final String tableName,
|
||||
final String workerIdentifier,
|
||||
final ExecutorService executorService,
|
||||
final InitialPositionInStreamExtended initialPositionInStream,
|
||||
final long failoverTimeMillis,
|
||||
final long epsilonMillis,
|
||||
final int maxLeasesForWorker,
|
||||
final int maxLeasesToStealAtOneTime,
|
||||
final int maxLeaseRenewalThreads,
|
||||
final boolean cleanupLeasesUponShardCompletion,
|
||||
final boolean ignoreUnexpectedChildShards,
|
||||
final long shardSyncIntervalMillis,
|
||||
final boolean consistentReads,
|
||||
final long listShardsBackoffTimeMillis,
|
||||
final int maxListShardsRetryAttempts,
|
||||
final int maxCacheMissesBeforeReload,
|
||||
final long listShardsCacheAllowedAgeInSeconds,
|
||||
final int cacheMissWarningModulus,
|
||||
final long initialLeaseTableReadCapacity,
|
||||
final long initialLeaseTableWriteCapacity,
|
||||
final HierarchicalShardSyncer hierarchicalShardSyncer,
|
||||
final TableCreatorCallback tableCreatorCallback,
|
||||
Duration dynamoDbRequestTimeout) {
|
||||
this(
|
||||
kinesisClient,
|
||||
streamName,
|
||||
dynamoDBClient,
|
||||
tableName,
|
||||
workerIdentifier,
|
||||
executorService,
|
||||
initialPositionInStream,
|
||||
failoverTimeMillis,
|
||||
epsilonMillis,
|
||||
maxLeasesForWorker,
|
||||
maxLeasesToStealAtOneTime,
|
||||
maxLeaseRenewalThreads,
|
||||
cleanupLeasesUponShardCompletion,
|
||||
ignoreUnexpectedChildShards,
|
||||
shardSyncIntervalMillis,
|
||||
consistentReads,
|
||||
listShardsBackoffTimeMillis,
|
||||
maxListShardsRetryAttempts,
|
||||
maxCacheMissesBeforeReload,
|
||||
listShardsCacheAllowedAgeInSeconds,
|
||||
cacheMissWarningModulus,
|
||||
initialLeaseTableReadCapacity,
|
||||
initialLeaseTableWriteCapacity,
|
||||
hierarchicalShardSyncer,
|
||||
tableCreatorCallback,
|
||||
dynamoDbRequestTimeout,
|
||||
BillingMode.PAY_PER_REQUEST);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param kinesisClient
|
||||
* @param streamName
|
||||
* @param dynamoDBClient
|
||||
* @param tableName
|
||||
* @param workerIdentifier
|
||||
* @param executorService
|
||||
* @param initialPositionInStream
|
||||
* @param failoverTimeMillis
|
||||
* @param epsilonMillis
|
||||
* @param maxLeasesForWorker
|
||||
* @param maxLeasesToStealAtOneTime
|
||||
* @param maxLeaseRenewalThreads
|
||||
* @param cleanupLeasesUponShardCompletion
|
||||
* @param ignoreUnexpectedChildShards
|
||||
* @param shardSyncIntervalMillis
|
||||
* @param consistentReads
|
||||
* @param listShardsBackoffTimeMillis
|
||||
* @param maxListShardsRetryAttempts
|
||||
* @param maxCacheMissesBeforeReload
|
||||
* @param listShardsCacheAllowedAgeInSeconds
|
||||
* @param cacheMissWarningModulus
|
||||
* @param initialLeaseTableReadCapacity
|
||||
* @param initialLeaseTableWriteCapacity
|
||||
* @param hierarchicalShardSyncer
|
||||
* @param tableCreatorCallback
|
||||
* @param dynamoDbRequestTimeout
|
||||
* @param billingMode
|
||||
*/
|
||||
@Deprecated
|
||||
public DynamoDBLeaseManagementFactory(
|
||||
final KinesisAsyncClient kinesisClient,
|
||||
final String streamName,
|
||||
final DynamoDbAsyncClient dynamoDBClient,
|
||||
final String tableName,
|
||||
final String workerIdentifier,
|
||||
final ExecutorService executorService,
|
||||
final InitialPositionInStreamExtended initialPositionInStream,
|
||||
final long failoverTimeMillis,
|
||||
final long epsilonMillis,
|
||||
final int maxLeasesForWorker,
|
||||
final int maxLeasesToStealAtOneTime,
|
||||
final int maxLeaseRenewalThreads,
|
||||
final boolean cleanupLeasesUponShardCompletion,
|
||||
final boolean ignoreUnexpectedChildShards,
|
||||
final long shardSyncIntervalMillis,
|
||||
final boolean consistentReads,
|
||||
final long listShardsBackoffTimeMillis,
|
||||
final int maxListShardsRetryAttempts,
|
||||
final int maxCacheMissesBeforeReload,
|
||||
final long listShardsCacheAllowedAgeInSeconds,
|
||||
final int cacheMissWarningModulus,
|
||||
final long initialLeaseTableReadCapacity,
|
||||
final long initialLeaseTableWriteCapacity,
|
||||
final HierarchicalShardSyncer hierarchicalShardSyncer,
|
||||
final TableCreatorCallback tableCreatorCallback,
|
||||
Duration dynamoDbRequestTimeout,
|
||||
BillingMode billingMode) {
|
||||
|
||||
this(
|
||||
kinesisClient,
|
||||
new StreamConfig(StreamIdentifier.singleStreamInstance(streamName), initialPositionInStream),
|
||||
dynamoDBClient,
|
||||
tableName,
|
||||
workerIdentifier,
|
||||
executorService,
|
||||
failoverTimeMillis,
|
||||
epsilonMillis,
|
||||
maxLeasesForWorker,
|
||||
maxLeasesToStealAtOneTime,
|
||||
maxLeaseRenewalThreads,
|
||||
cleanupLeasesUponShardCompletion,
|
||||
ignoreUnexpectedChildShards,
|
||||
shardSyncIntervalMillis,
|
||||
consistentReads,
|
||||
listShardsBackoffTimeMillis,
|
||||
maxListShardsRetryAttempts,
|
||||
maxCacheMissesBeforeReload,
|
||||
listShardsCacheAllowedAgeInSeconds,
|
||||
cacheMissWarningModulus,
|
||||
initialLeaseTableReadCapacity,
|
||||
initialLeaseTableWriteCapacity,
|
||||
hierarchicalShardSyncer,
|
||||
tableCreatorCallback,
|
||||
dynamoDbRequestTimeout,
|
||||
billingMode,
|
||||
new DynamoDBLeaseSerializer());
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
* @deprecated this is used by the deprecated method in LeaseManagementConfig to construct the LeaseManagement factory
|
||||
*
|
||||
* @param kinesisClient
|
||||
* @param streamName
|
||||
|
|
@ -592,291 +177,6 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
|||
BillingMode billingMode,
|
||||
Collection<Tag> tags) {
|
||||
|
||||
this(
|
||||
kinesisClient,
|
||||
new StreamConfig(StreamIdentifier.singleStreamInstance(streamName), initialPositionInStream),
|
||||
dynamoDBClient,
|
||||
tableName,
|
||||
workerIdentifier,
|
||||
executorService,
|
||||
failoverTimeMillis,
|
||||
epsilonMillis,
|
||||
maxLeasesForWorker,
|
||||
maxLeasesToStealAtOneTime,
|
||||
maxLeaseRenewalThreads,
|
||||
cleanupLeasesUponShardCompletion,
|
||||
ignoreUnexpectedChildShards,
|
||||
shardSyncIntervalMillis,
|
||||
consistentReads,
|
||||
listShardsBackoffTimeMillis,
|
||||
maxListShardsRetryAttempts,
|
||||
maxCacheMissesBeforeReload,
|
||||
listShardsCacheAllowedAgeInSeconds,
|
||||
cacheMissWarningModulus,
|
||||
initialLeaseTableReadCapacity,
|
||||
initialLeaseTableWriteCapacity,
|
||||
hierarchicalShardSyncer,
|
||||
tableCreatorCallback,
|
||||
dynamoDbRequestTimeout,
|
||||
billingMode,
|
||||
new DynamoDBLeaseSerializer());
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param kinesisClient
|
||||
* @param streamConfig
|
||||
* @param dynamoDBClient
|
||||
* @param tableName
|
||||
* @param workerIdentifier
|
||||
* @param executorService
|
||||
* @param failoverTimeMillis
|
||||
* @param epsilonMillis
|
||||
* @param maxLeasesForWorker
|
||||
* @param maxLeasesToStealAtOneTime
|
||||
* @param maxLeaseRenewalThreads
|
||||
* @param cleanupLeasesUponShardCompletion
|
||||
* @param ignoreUnexpectedChildShards
|
||||
* @param shardSyncIntervalMillis
|
||||
* @param consistentReads
|
||||
* @param listShardsBackoffTimeMillis
|
||||
* @param maxListShardsRetryAttempts
|
||||
* @param maxCacheMissesBeforeReload
|
||||
* @param listShardsCacheAllowedAgeInSeconds
|
||||
* @param cacheMissWarningModulus
|
||||
* @param initialLeaseTableReadCapacity
|
||||
* @param initialLeaseTableWriteCapacity
|
||||
* @param deprecatedHierarchicalShardSyncer
|
||||
* @param tableCreatorCallback
|
||||
* @param dynamoDbRequestTimeout
|
||||
* @param billingMode
|
||||
*/
|
||||
@Deprecated
|
||||
private DynamoDBLeaseManagementFactory(
|
||||
final KinesisAsyncClient kinesisClient,
|
||||
final StreamConfig streamConfig,
|
||||
final DynamoDbAsyncClient dynamoDBClient,
|
||||
final String tableName,
|
||||
final String workerIdentifier,
|
||||
final ExecutorService executorService,
|
||||
final long failoverTimeMillis,
|
||||
final long epsilonMillis,
|
||||
final int maxLeasesForWorker,
|
||||
final int maxLeasesToStealAtOneTime,
|
||||
final int maxLeaseRenewalThreads,
|
||||
final boolean cleanupLeasesUponShardCompletion,
|
||||
final boolean ignoreUnexpectedChildShards,
|
||||
final long shardSyncIntervalMillis,
|
||||
final boolean consistentReads,
|
||||
final long listShardsBackoffTimeMillis,
|
||||
final int maxListShardsRetryAttempts,
|
||||
final int maxCacheMissesBeforeReload,
|
||||
final long listShardsCacheAllowedAgeInSeconds,
|
||||
final int cacheMissWarningModulus,
|
||||
final long initialLeaseTableReadCapacity,
|
||||
final long initialLeaseTableWriteCapacity,
|
||||
final HierarchicalShardSyncer deprecatedHierarchicalShardSyncer,
|
||||
final TableCreatorCallback tableCreatorCallback,
|
||||
Duration dynamoDbRequestTimeout,
|
||||
BillingMode billingMode,
|
||||
LeaseSerializer leaseSerializer) {
|
||||
this(
|
||||
kinesisClient,
|
||||
streamConfig,
|
||||
dynamoDBClient,
|
||||
tableName,
|
||||
workerIdentifier,
|
||||
executorService,
|
||||
failoverTimeMillis,
|
||||
epsilonMillis,
|
||||
maxLeasesForWorker,
|
||||
maxLeasesToStealAtOneTime,
|
||||
maxLeaseRenewalThreads,
|
||||
cleanupLeasesUponShardCompletion,
|
||||
ignoreUnexpectedChildShards,
|
||||
shardSyncIntervalMillis,
|
||||
consistentReads,
|
||||
listShardsBackoffTimeMillis,
|
||||
maxListShardsRetryAttempts,
|
||||
maxCacheMissesBeforeReload,
|
||||
listShardsCacheAllowedAgeInSeconds,
|
||||
cacheMissWarningModulus,
|
||||
initialLeaseTableReadCapacity,
|
||||
initialLeaseTableWriteCapacity,
|
||||
deprecatedHierarchicalShardSyncer,
|
||||
tableCreatorCallback,
|
||||
dynamoDbRequestTimeout,
|
||||
billingMode,
|
||||
LeaseManagementConfig.DEFAULT_LEASE_TABLE_DELETION_PROTECTION_ENABLED,
|
||||
DefaultSdkAutoConstructList.getInstance(),
|
||||
leaseSerializer);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param kinesisClient
|
||||
* @param streamConfig
|
||||
* @param dynamoDBClient
|
||||
* @param tableName
|
||||
* @param workerIdentifier
|
||||
* @param executorService
|
||||
* @param failoverTimeMillis
|
||||
* @param epsilonMillis
|
||||
* @param maxLeasesForWorker
|
||||
* @param maxLeasesToStealAtOneTime
|
||||
* @param maxLeaseRenewalThreads
|
||||
* @param cleanupLeasesUponShardCompletion
|
||||
* @param ignoreUnexpectedChildShards
|
||||
* @param shardSyncIntervalMillis
|
||||
* @param consistentReads
|
||||
* @param listShardsBackoffTimeMillis
|
||||
* @param maxListShardsRetryAttempts
|
||||
* @param maxCacheMissesBeforeReload
|
||||
* @param listShardsCacheAllowedAgeInSeconds
|
||||
* @param cacheMissWarningModulus
|
||||
* @param initialLeaseTableReadCapacity
|
||||
* @param initialLeaseTableWriteCapacity
|
||||
* @param deprecatedHierarchicalShardSyncer
|
||||
* @param tableCreatorCallback
|
||||
* @param dynamoDbRequestTimeout
|
||||
* @param billingMode
|
||||
* @param leaseTableDeletionProtectionEnabled
|
||||
* @param tags
|
||||
*/
|
||||
@Deprecated
|
||||
private DynamoDBLeaseManagementFactory(
|
||||
final KinesisAsyncClient kinesisClient,
|
||||
final StreamConfig streamConfig,
|
||||
final DynamoDbAsyncClient dynamoDBClient,
|
||||
final String tableName,
|
||||
final String workerIdentifier,
|
||||
final ExecutorService executorService,
|
||||
final long failoverTimeMillis,
|
||||
final long epsilonMillis,
|
||||
final int maxLeasesForWorker,
|
||||
final int maxLeasesToStealAtOneTime,
|
||||
final int maxLeaseRenewalThreads,
|
||||
final boolean cleanupLeasesUponShardCompletion,
|
||||
final boolean ignoreUnexpectedChildShards,
|
||||
final long shardSyncIntervalMillis,
|
||||
final boolean consistentReads,
|
||||
final long listShardsBackoffTimeMillis,
|
||||
final int maxListShardsRetryAttempts,
|
||||
final int maxCacheMissesBeforeReload,
|
||||
final long listShardsCacheAllowedAgeInSeconds,
|
||||
final int cacheMissWarningModulus,
|
||||
final long initialLeaseTableReadCapacity,
|
||||
final long initialLeaseTableWriteCapacity,
|
||||
final HierarchicalShardSyncer deprecatedHierarchicalShardSyncer,
|
||||
final TableCreatorCallback tableCreatorCallback,
|
||||
Duration dynamoDbRequestTimeout,
|
||||
BillingMode billingMode,
|
||||
final boolean leaseTableDeletionProtectionEnabled,
|
||||
Collection<Tag> tags,
|
||||
LeaseSerializer leaseSerializer) {
|
||||
this(
|
||||
kinesisClient,
|
||||
dynamoDBClient,
|
||||
tableName,
|
||||
workerIdentifier,
|
||||
executorService,
|
||||
failoverTimeMillis,
|
||||
epsilonMillis,
|
||||
maxLeasesForWorker,
|
||||
maxLeasesToStealAtOneTime,
|
||||
maxLeaseRenewalThreads,
|
||||
cleanupLeasesUponShardCompletion,
|
||||
ignoreUnexpectedChildShards,
|
||||
shardSyncIntervalMillis,
|
||||
consistentReads,
|
||||
listShardsBackoffTimeMillis,
|
||||
maxListShardsRetryAttempts,
|
||||
maxCacheMissesBeforeReload,
|
||||
listShardsCacheAllowedAgeInSeconds,
|
||||
cacheMissWarningModulus,
|
||||
initialLeaseTableReadCapacity,
|
||||
initialLeaseTableWriteCapacity,
|
||||
deprecatedHierarchicalShardSyncer,
|
||||
tableCreatorCallback,
|
||||
dynamoDbRequestTimeout,
|
||||
billingMode,
|
||||
leaseTableDeletionProtectionEnabled,
|
||||
tags,
|
||||
leaseSerializer,
|
||||
null,
|
||||
false,
|
||||
LeaseManagementConfig.DEFAULT_LEASE_CLEANUP_CONFIG);
|
||||
this.streamConfig = streamConfig;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
* @param kinesisClient
|
||||
* @param dynamoDBClient
|
||||
* @param tableName
|
||||
* @param workerIdentifier
|
||||
* @param executorService
|
||||
* @param failoverTimeMillis
|
||||
* @param epsilonMillis
|
||||
* @param maxLeasesForWorker
|
||||
* @param maxLeasesToStealAtOneTime
|
||||
* @param maxLeaseRenewalThreads
|
||||
* @param cleanupLeasesUponShardCompletion
|
||||
* @param ignoreUnexpectedChildShards
|
||||
* @param shardSyncIntervalMillis
|
||||
* @param consistentReads
|
||||
* @param listShardsBackoffTimeMillis
|
||||
* @param maxListShardsRetryAttempts
|
||||
* @param maxCacheMissesBeforeReload
|
||||
* @param listShardsCacheAllowedAgeInSeconds
|
||||
* @param cacheMissWarningModulus
|
||||
* @param initialLeaseTableReadCapacity
|
||||
* @param initialLeaseTableWriteCapacity
|
||||
* @param deprecatedHierarchicalShardSyncer
|
||||
* @param tableCreatorCallback
|
||||
* @param dynamoDbRequestTimeout
|
||||
* @param billingMode
|
||||
* @param leaseTableDeletionProtectionEnabled
|
||||
* @param leaseSerializer
|
||||
* @param customShardDetectorProvider
|
||||
* @param isMultiStreamMode
|
||||
* @param leaseCleanupConfig
|
||||
*/
|
||||
@Deprecated
|
||||
public DynamoDBLeaseManagementFactory(
|
||||
final KinesisAsyncClient kinesisClient,
|
||||
final DynamoDbAsyncClient dynamoDBClient,
|
||||
final String tableName,
|
||||
final String workerIdentifier,
|
||||
final ExecutorService executorService,
|
||||
final long failoverTimeMillis,
|
||||
final long epsilonMillis,
|
||||
final int maxLeasesForWorker,
|
||||
final int maxLeasesToStealAtOneTime,
|
||||
final int maxLeaseRenewalThreads,
|
||||
final boolean cleanupLeasesUponShardCompletion,
|
||||
final boolean ignoreUnexpectedChildShards,
|
||||
final long shardSyncIntervalMillis,
|
||||
final boolean consistentReads,
|
||||
final long listShardsBackoffTimeMillis,
|
||||
final int maxListShardsRetryAttempts,
|
||||
final int maxCacheMissesBeforeReload,
|
||||
final long listShardsCacheAllowedAgeInSeconds,
|
||||
final int cacheMissWarningModulus,
|
||||
final long initialLeaseTableReadCapacity,
|
||||
final long initialLeaseTableWriteCapacity,
|
||||
final HierarchicalShardSyncer deprecatedHierarchicalShardSyncer,
|
||||
final TableCreatorCallback tableCreatorCallback,
|
||||
Duration dynamoDbRequestTimeout,
|
||||
BillingMode billingMode,
|
||||
final boolean leaseTableDeletionProtectionEnabled,
|
||||
Collection<Tag> tags,
|
||||
LeaseSerializer leaseSerializer,
|
||||
Function<StreamConfig, ShardDetector> customShardDetectorProvider,
|
||||
boolean isMultiStreamMode,
|
||||
LeaseCleanupConfig leaseCleanupConfig) {
|
||||
this(
|
||||
kinesisClient,
|
||||
dynamoDBClient,
|
||||
|
|
@ -900,16 +200,21 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
|||
cacheMissWarningModulus,
|
||||
initialLeaseTableReadCapacity,
|
||||
initialLeaseTableWriteCapacity,
|
||||
deprecatedHierarchicalShardSyncer,
|
||||
hierarchicalShardSyncer,
|
||||
tableCreatorCallback,
|
||||
dynamoDbRequestTimeout,
|
||||
billingMode,
|
||||
leaseTableDeletionProtectionEnabled,
|
||||
LeaseManagementConfig.DEFAULT_LEASE_TABLE_DELETION_PROTECTION_ENABLED,
|
||||
LeaseManagementConfig.DEFAULT_LEASE_TABLE_PITR_ENABLED,
|
||||
tags,
|
||||
leaseSerializer,
|
||||
customShardDetectorProvider,
|
||||
isMultiStreamMode,
|
||||
leaseCleanupConfig);
|
||||
new DynamoDBLeaseSerializer(),
|
||||
null,
|
||||
false,
|
||||
LeaseManagementConfig.DEFAULT_LEASE_CLEANUP_CONFIG,
|
||||
new LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig(),
|
||||
LeaseManagementConfig.GracefulLeaseHandoffConfig.builder().build());
|
||||
this.streamConfig =
|
||||
new StreamConfig(StreamIdentifier.singleStreamInstance(streamName), initialPositionInStream);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -947,75 +252,6 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
|||
* @param leaseCleanupConfig
|
||||
*/
|
||||
@Deprecated
|
||||
public DynamoDBLeaseManagementFactory(
|
||||
final KinesisAsyncClient kinesisClient,
|
||||
final DynamoDbAsyncClient dynamoDBClient,
|
||||
final String tableName,
|
||||
final String workerIdentifier,
|
||||
final ExecutorService executorService,
|
||||
final long failoverTimeMillis,
|
||||
final boolean enablePriorityLeaseAssignment,
|
||||
final long epsilonMillis,
|
||||
final int maxLeasesForWorker,
|
||||
final int maxLeasesToStealAtOneTime,
|
||||
final int maxLeaseRenewalThreads,
|
||||
final boolean cleanupLeasesUponShardCompletion,
|
||||
final boolean ignoreUnexpectedChildShards,
|
||||
final long shardSyncIntervalMillis,
|
||||
final boolean consistentReads,
|
||||
final long listShardsBackoffTimeMillis,
|
||||
final int maxListShardsRetryAttempts,
|
||||
final int maxCacheMissesBeforeReload,
|
||||
final long listShardsCacheAllowedAgeInSeconds,
|
||||
final int cacheMissWarningModulus,
|
||||
final long initialLeaseTableReadCapacity,
|
||||
final long initialLeaseTableWriteCapacity,
|
||||
final HierarchicalShardSyncer deprecatedHierarchicalShardSyncer,
|
||||
final TableCreatorCallback tableCreatorCallback,
|
||||
Duration dynamoDbRequestTimeout,
|
||||
BillingMode billingMode,
|
||||
final boolean leaseTableDeletionProtectionEnabled,
|
||||
Collection<Tag> tags,
|
||||
LeaseSerializer leaseSerializer,
|
||||
Function<StreamConfig, ShardDetector> customShardDetectorProvider,
|
||||
boolean isMultiStreamMode,
|
||||
LeaseCleanupConfig leaseCleanupConfig) {
|
||||
this(
|
||||
kinesisClient,
|
||||
dynamoDBClient,
|
||||
tableName,
|
||||
workerIdentifier,
|
||||
executorService,
|
||||
failoverTimeMillis,
|
||||
enablePriorityLeaseAssignment,
|
||||
epsilonMillis,
|
||||
maxLeasesForWorker,
|
||||
maxLeasesToStealAtOneTime,
|
||||
maxLeaseRenewalThreads,
|
||||
cleanupLeasesUponShardCompletion,
|
||||
ignoreUnexpectedChildShards,
|
||||
shardSyncIntervalMillis,
|
||||
consistentReads,
|
||||
listShardsBackoffTimeMillis,
|
||||
maxListShardsRetryAttempts,
|
||||
maxCacheMissesBeforeReload,
|
||||
listShardsCacheAllowedAgeInSeconds,
|
||||
cacheMissWarningModulus,
|
||||
initialLeaseTableReadCapacity,
|
||||
initialLeaseTableWriteCapacity,
|
||||
deprecatedHierarchicalShardSyncer,
|
||||
tableCreatorCallback,
|
||||
dynamoDbRequestTimeout,
|
||||
billingMode,
|
||||
leaseTableDeletionProtectionEnabled,
|
||||
LeaseManagementConfig.DEFAULT_LEASE_TABLE_PITR_ENABLED,
|
||||
tags,
|
||||
leaseSerializer,
|
||||
customShardDetectorProvider,
|
||||
isMultiStreamMode,
|
||||
leaseCleanupConfig);
|
||||
}
|
||||
|
||||
public DynamoDBLeaseManagementFactory(
|
||||
final KinesisAsyncClient kinesisClient,
|
||||
final DynamoDbAsyncClient dynamoDBClient,
|
||||
|
|
@ -1049,7 +285,9 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
|||
LeaseSerializer leaseSerializer,
|
||||
Function<StreamConfig, ShardDetector> customShardDetectorProvider,
|
||||
boolean isMultiStreamMode,
|
||||
LeaseCleanupConfig leaseCleanupConfig) {
|
||||
LeaseCleanupConfig leaseCleanupConfig,
|
||||
final LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig,
|
||||
final LeaseManagementConfig.GracefulLeaseHandoffConfig gracefulLeaseHandoffConfig) {
|
||||
this.kinesisClient = kinesisClient;
|
||||
this.dynamoDBClient = dynamoDBClient;
|
||||
this.tableName = tableName;
|
||||
|
|
@ -1083,10 +321,19 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
|||
this.isMultiStreamMode = isMultiStreamMode;
|
||||
this.leaseCleanupConfig = leaseCleanupConfig;
|
||||
this.tags = tags;
|
||||
this.workerUtilizationAwareAssignmentConfig = workerUtilizationAwareAssignmentConfig;
|
||||
this.gracefulLeaseHandoffConfig = gracefulLeaseHandoffConfig;
|
||||
}
|
||||
|
||||
@Override
|
||||
public LeaseCoordinator createLeaseCoordinator(@NonNull final MetricsFactory metricsFactory) {
|
||||
return createLeaseCoordinator(metricsFactory, new ConcurrentHashMap<>());
|
||||
}
|
||||
|
||||
@Override
|
||||
public LeaseCoordinator createLeaseCoordinator(
|
||||
@NonNull final MetricsFactory metricsFactory,
|
||||
@NonNull final ConcurrentMap<ShardInfo, ShardConsumer> shardInfoShardConsumerMap) {
|
||||
return new DynamoDBLeaseCoordinator(
|
||||
this.createLeaseRefresher(),
|
||||
workerIdentifier,
|
||||
|
|
@ -1098,9 +345,15 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
|||
maxLeaseRenewalThreads,
|
||||
initialLeaseTableReadCapacity,
|
||||
initialLeaseTableWriteCapacity,
|
||||
metricsFactory);
|
||||
metricsFactory,
|
||||
workerUtilizationAwareAssignmentConfig,
|
||||
gracefulLeaseHandoffConfig,
|
||||
shardInfoShardConsumerMap);
|
||||
}
|
||||
|
||||
/**
|
||||
* Even though this is deprecated, this is a method part of the public interface in LeaseManagementFactory
|
||||
*/
|
||||
@Override
|
||||
@Deprecated
|
||||
public ShardSyncTaskManager createShardSyncTaskManager(@NonNull final MetricsFactory metricsFactory) {
|
||||
|
|
@ -1155,6 +408,10 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
|||
|
||||
@Override
|
||||
public DynamoDBLeaseRefresher createLeaseRefresher() {
|
||||
final DdbTableConfig ddbTableConfig = new DdbTableConfig();
|
||||
ddbTableConfig.billingMode(billingMode);
|
||||
ddbTableConfig.readCapacity(initialLeaseTableReadCapacity);
|
||||
ddbTableConfig.writeCapacity(initialLeaseTableWriteCapacity);
|
||||
return new DynamoDBLeaseRefresher(
|
||||
tableName,
|
||||
dynamoDBClient,
|
||||
|
|
@ -1162,12 +419,15 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
|||
consistentReads,
|
||||
tableCreatorCallback,
|
||||
dynamoDbRequestTimeout,
|
||||
billingMode,
|
||||
ddbTableConfig,
|
||||
leaseTableDeletionProtectionEnabled,
|
||||
leaseTablePitrEnabled,
|
||||
tags);
|
||||
}
|
||||
|
||||
/**
|
||||
* Even though this is deprecated, this is a method part of the public interface in LeaseManagementFactory
|
||||
*/
|
||||
@Override
|
||||
@Deprecated
|
||||
public ShardDetector createShardDetector() {
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -14,6 +14,8 @@
|
|||
*/
|
||||
package software.amazon.kinesis.leases.dynamodb;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
import java.math.RoundingMode;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
|
|
@ -26,8 +28,10 @@ import java.util.concurrent.ConcurrentNavigableMap;
|
|||
import java.util.concurrent.ConcurrentSkipListMap;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
|
@ -39,6 +43,7 @@ import software.amazon.kinesis.common.StreamIdentifier;
|
|||
import software.amazon.kinesis.leases.Lease;
|
||||
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||
import software.amazon.kinesis.leases.LeaseRenewer;
|
||||
import software.amazon.kinesis.leases.LeaseStatsRecorder;
|
||||
import software.amazon.kinesis.leases.MultiStreamLease;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
|
|
@ -48,21 +53,32 @@ import software.amazon.kinesis.metrics.MetricsLevel;
|
|||
import software.amazon.kinesis.metrics.MetricsScope;
|
||||
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||
|
||||
import static java.util.Objects.nonNull;
|
||||
import static software.amazon.kinesis.leases.LeaseStatsRecorder.BYTES_PER_KB;
|
||||
|
||||
/**
|
||||
* An implementation of {@link LeaseRenewer} that uses DynamoDB via {@link LeaseRefresher}.
|
||||
*/
|
||||
@Slf4j
|
||||
@KinesisClientInternalApi
|
||||
public class DynamoDBLeaseRenewer implements LeaseRenewer {
|
||||
|
||||
/**
|
||||
* 6 digit after decimal gives the granularity of 0.001 byte per second.
|
||||
*/
|
||||
private static final int DEFAULT_THROUGHPUT_DIGIT_AFTER_DECIMAL = 6;
|
||||
|
||||
private static final int RENEWAL_RETRIES = 2;
|
||||
private static final String RENEW_ALL_LEASES_DIMENSION = "RenewAllLeases";
|
||||
private static final String LEASE_RENEWER_INITIALIZE = "LeaseRenewerInitialize";
|
||||
|
||||
private final LeaseRefresher leaseRefresher;
|
||||
private final String workerIdentifier;
|
||||
private final long leaseDurationNanos;
|
||||
private final ExecutorService executorService;
|
||||
private final MetricsFactory metricsFactory;
|
||||
|
||||
private final LeaseStatsRecorder leaseStatsRecorder;
|
||||
private final Consumer<Lease> leaseGracefulShutdownCallback;
|
||||
private final ConcurrentNavigableMap<String, Lease> ownedLeases = new ConcurrentSkipListMap<>();
|
||||
|
||||
/**
|
||||
|
|
@ -82,12 +98,16 @@ public class DynamoDBLeaseRenewer implements LeaseRenewer {
|
|||
final String workerIdentifier,
|
||||
final long leaseDurationMillis,
|
||||
final ExecutorService executorService,
|
||||
final MetricsFactory metricsFactory) {
|
||||
final MetricsFactory metricsFactory,
|
||||
final LeaseStatsRecorder leaseStatsRecorder,
|
||||
final Consumer<Lease> leaseGracefulShutdownCallback) {
|
||||
this.leaseRefresher = leaseRefresher;
|
||||
this.workerIdentifier = workerIdentifier;
|
||||
this.leaseDurationNanos = TimeUnit.MILLISECONDS.toNanos(leaseDurationMillis);
|
||||
this.executorService = executorService;
|
||||
this.metricsFactory = metricsFactory;
|
||||
this.leaseStatsRecorder = leaseStatsRecorder;
|
||||
this.leaseGracefulShutdownCallback = leaseGracefulShutdownCallback;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -187,11 +207,21 @@ public class DynamoDBLeaseRenewer implements LeaseRenewer {
|
|||
// ShutdownException).
|
||||
boolean isLeaseExpired = lease.isExpired(leaseDurationNanos, System.nanoTime());
|
||||
if (renewEvenIfExpired || !isLeaseExpired) {
|
||||
final Double throughputPerKBps = this.leaseStatsRecorder.getThroughputKBps(leaseKey);
|
||||
if (nonNull(throughputPerKBps)) {
|
||||
lease.throughputKBps(BigDecimal.valueOf(throughputPerKBps)
|
||||
.setScale(DEFAULT_THROUGHPUT_DIGIT_AFTER_DECIMAL, RoundingMode.HALF_UP)
|
||||
.doubleValue());
|
||||
}
|
||||
renewedLease = leaseRefresher.renewLease(lease);
|
||||
}
|
||||
if (renewedLease) {
|
||||
lease.lastCounterIncrementNanos(System.nanoTime());
|
||||
}
|
||||
if (lease.shutdownRequested()) {
|
||||
// the underlying function will dedup
|
||||
leaseGracefulShutdownCallback.accept(lease.copy());
|
||||
}
|
||||
}
|
||||
|
||||
if (renewedLease) {
|
||||
|
|
@ -391,6 +421,12 @@ public class DynamoDBLeaseRenewer implements LeaseRenewer {
|
|||
* every time we acquire a lease, it gets a new concurrency token.
|
||||
*/
|
||||
authoritativeLease.concurrencyToken(UUID.randomUUID());
|
||||
if (nonNull(lease.throughputKBps())) {
|
||||
leaseStatsRecorder.recordStats(LeaseStatsRecorder.LeaseStats.builder()
|
||||
.leaseKey(lease.leaseKey())
|
||||
.bytes(Math.round(lease.throughputKBps() * BYTES_PER_KB)) // Convert KB to Bytes
|
||||
.build());
|
||||
}
|
||||
ownedLeases.put(authoritativeLease.leaseKey(), authoritativeLease);
|
||||
}
|
||||
}
|
||||
|
|
@ -409,6 +445,7 @@ public class DynamoDBLeaseRenewer implements LeaseRenewer {
|
|||
*/
|
||||
@Override
|
||||
public void dropLease(Lease lease) {
|
||||
leaseStatsRecorder.dropLeaseStats(lease.leaseKey());
|
||||
ownedLeases.remove(lease.leaseKey());
|
||||
}
|
||||
|
||||
|
|
@ -417,15 +454,27 @@ public class DynamoDBLeaseRenewer implements LeaseRenewer {
|
|||
*/
|
||||
@Override
|
||||
public void initialize() throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
||||
Collection<Lease> leases = leaseRefresher.listLeases();
|
||||
List<Lease> myLeases = new LinkedList<>();
|
||||
final MetricsScope scope = MetricsUtil.createMetricsWithOperation(metricsFactory, LEASE_RENEWER_INITIALIZE);
|
||||
final ExecutorService singleThreadExecutorService = Executors.newSingleThreadExecutor();
|
||||
boolean success = false;
|
||||
try {
|
||||
final Map.Entry<List<Lease>, List<String>> response =
|
||||
leaseRefresher.listLeasesParallely(singleThreadExecutorService, 1);
|
||||
|
||||
if (!response.getValue().isEmpty()) {
|
||||
log.warn("List of leaseKeys failed to deserialize : {} ", response.getValue());
|
||||
}
|
||||
|
||||
final List<Lease> myLeases = new LinkedList<>();
|
||||
boolean renewEvenIfExpired = true;
|
||||
|
||||
for (Lease lease : leases) {
|
||||
for (Lease lease : response.getKey()) {
|
||||
if (workerIdentifier.equals(lease.leaseOwner())) {
|
||||
log.info(" Worker {} found lease {}", workerIdentifier, lease);
|
||||
// Okay to renew even if lease is expired, because we start with an empty list and we add the lease to
|
||||
// our list only after a successful renew. So we don't need to worry about the edge case where we could
|
||||
// Okay to renew even if lease is expired, because we start with an empty list and we add the lease
|
||||
// to
|
||||
// our list only after a successful renew. So we don't need to worry about the edge case where we
|
||||
// could
|
||||
// continue renewing a lease after signaling a lease loss to the application.
|
||||
|
||||
if (renewLease(lease, renewEvenIfExpired)) {
|
||||
|
|
@ -437,6 +486,16 @@ public class DynamoDBLeaseRenewer implements LeaseRenewer {
|
|||
}
|
||||
|
||||
addLeasesToRenew(myLeases);
|
||||
success = true;
|
||||
} catch (final Exception e) {
|
||||
// It's ok to swollow exception here fail to discover all leases here, as the assignment logic takes
|
||||
// care of reassignment if some lease is expired.
|
||||
log.warn("LeaseRefresher failed in initialization during renewing of pre assigned leases", e);
|
||||
} finally {
|
||||
singleThreadExecutorService.shutdown();
|
||||
MetricsUtil.addCount(scope, "Fault", success ? 0 : 1, MetricsLevel.DETAILED);
|
||||
MetricsUtil.endScope(scope);
|
||||
}
|
||||
}
|
||||
|
||||
private void verifyNotNull(Object object, String message) {
|
||||
|
|
|
|||
|
|
@ -44,11 +44,8 @@ import software.amazon.kinesis.retrieval.kpl.ExtendedSequenceNumber;
|
|||
*/
|
||||
@KinesisClientInternalApi
|
||||
public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
||||
private static final String LEASE_KEY_KEY = "leaseKey";
|
||||
private static final String LEASE_OWNER_KEY = "leaseOwner";
|
||||
private static final String LEASE_COUNTER_KEY = "leaseCounter";
|
||||
private static final String OWNER_SWITCHES_KEY = "ownerSwitchesSinceCheckpoint";
|
||||
private static final String CHECKPOINT_SEQUENCE_NUMBER_KEY = "checkpoint";
|
||||
private static final String CHECKPOINT_SUBSEQUENCE_NUMBER_KEY = "checkpointSubSequenceNumber";
|
||||
private static final String PENDING_CHECKPOINT_SEQUENCE_KEY = "pendingCheckpoint";
|
||||
private static final String PENDING_CHECKPOINT_SUBSEQUENCE_KEY = "pendingCheckpointSubSequenceNumber";
|
||||
|
|
@ -57,6 +54,11 @@ public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
|||
private static final String CHILD_SHARD_IDS_KEY = "childShardIds";
|
||||
private static final String STARTING_HASH_KEY = "startingHashKey";
|
||||
private static final String ENDING_HASH_KEY = "endingHashKey";
|
||||
private static final String THROUGHOUT_PUT_KBPS = "throughputKBps";
|
||||
private static final String CHECKPOINT_SEQUENCE_NUMBER_KEY = "checkpoint";
|
||||
static final String CHECKPOINT_OWNER = "checkpointOwner";
|
||||
static final String LEASE_OWNER_KEY = "leaseOwner";
|
||||
static final String LEASE_KEY_KEY = "leaseKey";
|
||||
|
||||
@Override
|
||||
public Map<String, AttributeValue> toDynamoRecord(final Lease lease) {
|
||||
|
|
@ -110,6 +112,13 @@ public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
|||
lease.hashKeyRangeForLease().serializedEndingHashKey()));
|
||||
}
|
||||
|
||||
if (lease.throughputKBps() != null) {
|
||||
result.put(THROUGHOUT_PUT_KBPS, DynamoUtils.createAttributeValue(lease.throughputKBps()));
|
||||
}
|
||||
|
||||
if (lease.checkpointOwner() != null) {
|
||||
result.put(CHECKPOINT_OWNER, DynamoUtils.createAttributeValue(lease.checkpointOwner()));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
@ -146,6 +155,14 @@ public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
|||
leaseToUpdate.hashKeyRange(HashKeyRangeForLease.deserialize(startingHashKey, endingHashKey));
|
||||
}
|
||||
|
||||
if (DynamoUtils.safeGetDouble(dynamoRecord, THROUGHOUT_PUT_KBPS) != null) {
|
||||
leaseToUpdate.throughputKBps(DynamoUtils.safeGetDouble(dynamoRecord, THROUGHOUT_PUT_KBPS));
|
||||
}
|
||||
|
||||
if (DynamoUtils.safeGetString(dynamoRecord, CHECKPOINT_OWNER) != null) {
|
||||
leaseToUpdate.checkpointOwner(DynamoUtils.safeGetString(dynamoRecord, CHECKPOINT_OWNER));
|
||||
}
|
||||
|
||||
return leaseToUpdate;
|
||||
}
|
||||
|
||||
|
|
@ -181,18 +198,9 @@ public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
|||
|
||||
@Override
|
||||
public Map<String, ExpectedAttributeValue> getDynamoLeaseOwnerExpectation(final Lease lease) {
|
||||
Map<String, ExpectedAttributeValue> result = new HashMap<>();
|
||||
|
||||
ExpectedAttributeValue.Builder eavBuilder = ExpectedAttributeValue.builder();
|
||||
|
||||
if (lease.leaseOwner() == null) {
|
||||
eavBuilder = eavBuilder.exists(false);
|
||||
} else {
|
||||
eavBuilder = eavBuilder.value(DynamoUtils.createAttributeValue(lease.leaseOwner()));
|
||||
}
|
||||
|
||||
result.put(LEASE_OWNER_KEY, eavBuilder.build());
|
||||
|
||||
final Map<String, ExpectedAttributeValue> result = new HashMap<>();
|
||||
result.put(LEASE_OWNER_KEY, buildExpectedAttributeValueIfExistsOrValue(lease.leaseOwner()));
|
||||
result.put(CHECKPOINT_OWNER, buildExpectedAttributeValueIfExistsOrValue(lease.checkpointOwner()));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
@ -247,9 +255,17 @@ public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
|||
.value(DynamoUtils.createAttributeValue(owner))
|
||||
.action(AttributeAction.PUT)
|
||||
.build());
|
||||
// this method is currently used by assignLease and takeLease. In both case we want the checkpoint owner to be
|
||||
// deleted as this is a fresh assignment
|
||||
result.put(
|
||||
CHECKPOINT_OWNER,
|
||||
AttributeValueUpdate.builder().action(AttributeAction.DELETE).build());
|
||||
|
||||
String oldOwner = lease.leaseOwner();
|
||||
if (oldOwner != null && !oldOwner.equals(owner)) {
|
||||
String checkpointOwner = lease.checkpointOwner();
|
||||
// if checkpoint owner is not null, this update is supposed to remove the checkpoint owner
|
||||
// and transfer the lease ownership to the leaseOwner so incrementing the owner switch key
|
||||
if (oldOwner != null && !oldOwner.equals(owner) || (checkpointOwner != null && checkpointOwner.equals(owner))) {
|
||||
result.put(
|
||||
OWNER_SWITCHES_KEY,
|
||||
AttributeValueUpdate.builder()
|
||||
|
|
@ -261,18 +277,38 @@ public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
|||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* AssignLease performs the PUT action on the LeaseOwner and ADD (1) action on the leaseCounter.
|
||||
* @param lease lease that needs to be assigned
|
||||
* @param newOwner newLeaseOwner
|
||||
* @return Map of AttributeName to update operation
|
||||
*/
|
||||
@Override
|
||||
public Map<String, AttributeValueUpdate> getDynamoAssignLeaseUpdate(final Lease lease, final String newOwner) {
|
||||
Map<String, AttributeValueUpdate> result = getDynamoTakeLeaseUpdate(lease, newOwner);
|
||||
|
||||
result.put(LEASE_COUNTER_KEY, getAttributeValueUpdateForAdd());
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, AttributeValueUpdate> getDynamoEvictLeaseUpdate(final Lease lease) {
|
||||
Map<String, AttributeValueUpdate> result = new HashMap<>();
|
||||
AttributeValue value = null;
|
||||
|
||||
final Map<String, AttributeValueUpdate> result = new HashMap<>();
|
||||
// if checkpointOwner is not null, it means lease handoff is initiated. In this case we just remove the
|
||||
// checkpoint owner so the next owner (leaseOwner) can pick up the lease without waiting for assignment.
|
||||
// Otherwise, remove the leaseOwner
|
||||
if (lease.checkpointOwner() == null) {
|
||||
result.put(
|
||||
LEASE_OWNER_KEY,
|
||||
AttributeValueUpdate.builder()
|
||||
.value(value)
|
||||
.action(AttributeAction.DELETE)
|
||||
.build());
|
||||
|
||||
}
|
||||
// We always want to remove checkpointOwner, it's ok even if it's null
|
||||
result.put(
|
||||
CHECKPOINT_OWNER,
|
||||
AttributeValueUpdate.builder().action(AttributeAction.DELETE).build());
|
||||
result.put(LEASE_COUNTER_KEY, getAttributeValueUpdateForAdd());
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
@ -394,4 +430,58 @@ public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
|||
|
||||
return definitions;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<KeySchemaElement> getWorkerIdToLeaseKeyIndexKeySchema() {
|
||||
final List<KeySchemaElement> keySchema = new ArrayList<>();
|
||||
keySchema.add(KeySchemaElement.builder()
|
||||
.attributeName(LEASE_OWNER_KEY)
|
||||
.keyType(KeyType.HASH)
|
||||
.build());
|
||||
keySchema.add(KeySchemaElement.builder()
|
||||
.attributeName(LEASE_KEY_KEY)
|
||||
.keyType(KeyType.RANGE)
|
||||
.build());
|
||||
return keySchema;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<AttributeDefinition> getWorkerIdToLeaseKeyIndexAttributeDefinitions() {
|
||||
final List<AttributeDefinition> definitions = new ArrayList<>();
|
||||
definitions.add(AttributeDefinition.builder()
|
||||
.attributeName(LEASE_OWNER_KEY)
|
||||
.attributeType(ScalarAttributeType.S)
|
||||
.build());
|
||||
definitions.add(AttributeDefinition.builder()
|
||||
.attributeName(LEASE_KEY_KEY)
|
||||
.attributeType(ScalarAttributeType.S)
|
||||
.build());
|
||||
return definitions;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, AttributeValueUpdate> getDynamoLeaseThroughputKbpsUpdate(Lease lease) {
|
||||
final Map<String, AttributeValueUpdate> result = new HashMap<>();
|
||||
final AttributeValueUpdate avu = AttributeValueUpdate.builder()
|
||||
.value(DynamoUtils.createAttributeValue(lease.throughputKBps()))
|
||||
.action(AttributeAction.PUT)
|
||||
.build();
|
||||
result.put(THROUGHOUT_PUT_KBPS, avu);
|
||||
return result;
|
||||
}
|
||||
|
||||
private static ExpectedAttributeValue buildExpectedAttributeValueIfExistsOrValue(String value) {
|
||||
return value == null
|
||||
? ExpectedAttributeValue.builder().exists(false).build()
|
||||
: ExpectedAttributeValue.builder()
|
||||
.value(DynamoUtils.createAttributeValue(value))
|
||||
.build();
|
||||
}
|
||||
|
||||
private static AttributeValueUpdate getAttributeValueUpdateForAdd() {
|
||||
return AttributeValueUpdate.builder()
|
||||
.value(DynamoUtils.createAttributeValue(1L))
|
||||
.action(AttributeAction.ADD)
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -106,15 +106,6 @@ public class DynamoDBLeaseTaker implements LeaseTaker {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Misspelled method, use {@link DynamoDBLeaseTaker#withVeryOldLeaseDurationNanosMultiplier(int)}
|
||||
*/
|
||||
@Deprecated
|
||||
public DynamoDBLeaseTaker withVeryOldLeaseDurationNanosMultipler(long veryOldLeaseDurationNanosMultipler) {
|
||||
this.veryOldLeaseDurationNanosMultiplier = (int) veryOldLeaseDurationNanosMultipler;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Overrides the default very old lease duration nanos multiplier to increase the threshold for taking very old leases.
|
||||
* Setting this to a higher value than 3 will increase the threshold for very old lease taking.
|
||||
|
|
|
|||
|
|
@ -266,7 +266,8 @@ class ConsumerStates {
|
|||
argument.idleTimeInMilliseconds(),
|
||||
argument.aggregatorUtil(),
|
||||
argument.metricsFactory(),
|
||||
argument.schemaRegistryDecoder());
|
||||
argument.schemaRegistryDecoder(),
|
||||
argument.leaseCoordinator().leaseStatsRecorder());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -336,7 +337,8 @@ class ConsumerStates {
|
|||
argument.shardRecordProcessor(),
|
||||
argument.recordProcessorCheckpointer(),
|
||||
consumer.shutdownNotification(),
|
||||
argument.shardInfo());
|
||||
argument.shardInfo(),
|
||||
consumer.shardConsumerArgument().leaseCoordinator());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -0,0 +1,213 @@
|
|||
package software.amazon.kinesis.lifecycle;
|
||||
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.leases.Lease;
|
||||
import software.amazon.kinesis.leases.LeaseCoordinator;
|
||||
import software.amazon.kinesis.leases.ShardInfo;
|
||||
import software.amazon.kinesis.leases.dynamodb.DynamoDBLeaseCoordinator;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||
|
||||
/**
|
||||
* This class handles the graceful shutdown of shard consumers. When a lease is requested for shutdown, it will be
|
||||
* enqueued from the lease renewal thread which will call the shard consumer of the lease to enqueue a shutdown request.
|
||||
* The class monitors those leases and check if the shutdown is properly completed.
|
||||
* If the shard consumer doesn't shut down within the given timeout, it will trigger a lease transfer.
|
||||
*/
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@KinesisClientInternalApi
|
||||
public class LeaseGracefulShutdownHandler {
|
||||
|
||||
// Arbitrary number to run a similar frequency as the scheduler based on shardConsumerDispatchPollIntervalMillis
|
||||
// which is how fast scheduler triggers state change. It's ok to add few extra second delay to call shutdown since
|
||||
// the leases should still be processing by the current owner so there should not be processing delay due to this.
|
||||
private static final long SHUTDOWN_CHECK_INTERVAL_MILLIS = 2000;
|
||||
|
||||
private final long shutdownTimeoutMillis;
|
||||
private final ConcurrentMap<ShardInfo, ShardConsumer> shardInfoShardConsumerMap;
|
||||
private final LeaseCoordinator leaseCoordinator;
|
||||
private final Supplier<Long> currentTimeSupplier;
|
||||
private final ConcurrentMap<ShardInfo, LeasePendingShutdown> shardInfoLeasePendingShutdownMap =
|
||||
new ConcurrentHashMap<>();
|
||||
private final ScheduledExecutorService executorService;
|
||||
|
||||
private volatile boolean isRunning = false;
|
||||
|
||||
/**
|
||||
* Factory method to create a new instance of LeaseGracefulShutdownHandler.
|
||||
*
|
||||
* @param shutdownTimeoutMillis Timeout for graceful shutdown of shard consumers.
|
||||
* @param shardInfoShardConsumerMap Map of shard info to shard consumer instances.
|
||||
* @param leaseCoordinator Lease coordinator instance to access lease information.
|
||||
* @return A new instance of LeaseGracefulShutdownHandler.
|
||||
*/
|
||||
public static LeaseGracefulShutdownHandler create(
|
||||
long shutdownTimeoutMillis,
|
||||
ConcurrentMap<ShardInfo, ShardConsumer> shardInfoShardConsumerMap,
|
||||
LeaseCoordinator leaseCoordinator) {
|
||||
return new LeaseGracefulShutdownHandler(
|
||||
shutdownTimeoutMillis,
|
||||
shardInfoShardConsumerMap,
|
||||
leaseCoordinator,
|
||||
System::currentTimeMillis,
|
||||
Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder()
|
||||
.setNameFormat("LeaseGracefulShutdown-%04d")
|
||||
.setDaemon(true)
|
||||
.build()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Starts the shard consumer shutdown handler thread.
|
||||
*/
|
||||
public void start() {
|
||||
if (!isRunning) {
|
||||
log.info("Starting graceful lease handoff thread.");
|
||||
executorService.scheduleAtFixedRate(
|
||||
this::monitorGracefulShutdownLeases, 0, SHUTDOWN_CHECK_INTERVAL_MILLIS, TimeUnit.MILLISECONDS);
|
||||
isRunning = true;
|
||||
} else {
|
||||
log.info("Graceful lease handoff thread already running, no need to start.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Stops the shard consumer shutdown handler thread.
|
||||
*/
|
||||
public void stop() {
|
||||
if (isRunning) {
|
||||
log.info("Stopping graceful lease handoff thread.");
|
||||
executorService.shutdown();
|
||||
isRunning = false;
|
||||
} else {
|
||||
log.info("Graceful lease handoff thread already stopped.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Enqueue a shutdown request for the given lease if the lease has requested shutdown and the shard consumer
|
||||
* is not already shutdown.
|
||||
*
|
||||
* @param lease The lease to enqueue a shutdown request for.
|
||||
*/
|
||||
public void enqueueShutdown(Lease lease) {
|
||||
if (lease == null || !lease.shutdownRequested() || !isRunning) {
|
||||
return;
|
||||
}
|
||||
final ShardInfo shardInfo = DynamoDBLeaseCoordinator.convertLeaseToAssignment(lease);
|
||||
final ShardConsumer consumer = shardInfoShardConsumerMap.get(shardInfo);
|
||||
if (consumer == null || consumer.isShutdown()) {
|
||||
shardInfoLeasePendingShutdownMap.remove(shardInfo);
|
||||
} else {
|
||||
// there could be change shard get enqueued after getting removed. This should be okay because
|
||||
// this enqueue will be no-op and will be removed again because the shardConsumer associated with the
|
||||
// shardInfo is shutdown by then.
|
||||
shardInfoLeasePendingShutdownMap.computeIfAbsent(shardInfo, key -> {
|
||||
log.info("Calling graceful shutdown for lease {}", lease.leaseKey());
|
||||
LeasePendingShutdown leasePendingShutdown = new LeasePendingShutdown(lease, consumer);
|
||||
initiateShutdown(leasePendingShutdown);
|
||||
return leasePendingShutdown;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait for shutdown to complete or transfer ownership of lease to the next owner if timeout is met.
|
||||
*/
|
||||
private void monitorGracefulShutdownLeases() {
|
||||
String leaseKey = null;
|
||||
try {
|
||||
for (ConcurrentMap.Entry<ShardInfo, LeasePendingShutdown> entry :
|
||||
shardInfoLeasePendingShutdownMap.entrySet()) {
|
||||
final LeasePendingShutdown leasePendingShutdown = entry.getValue();
|
||||
final ShardInfo shardInfo = entry.getKey();
|
||||
leaseKey = leasePendingShutdown.lease.leaseKey();
|
||||
|
||||
if (leasePendingShutdown.shardConsumer.isShutdown()
|
||||
|| shardInfoShardConsumerMap.get(shardInfo) == null
|
||||
|| leaseCoordinator.getCurrentlyHeldLease(leaseKey) == null) {
|
||||
logTimeoutMessage(leasePendingShutdown);
|
||||
shardInfoLeasePendingShutdownMap.remove(shardInfo);
|
||||
} else if (getCurrentTimeMillis() >= leasePendingShutdown.timeoutTimestampMillis
|
||||
&& !leasePendingShutdown.leaseTransferCalled) {
|
||||
try {
|
||||
log.info(
|
||||
"Timeout {} millisecond reached waiting for lease {} to graceful handoff."
|
||||
+ " Attempting to transfer the lease to {}",
|
||||
shutdownTimeoutMillis,
|
||||
leaseKey,
|
||||
leasePendingShutdown.lease.leaseOwner());
|
||||
transferLeaseIfOwner(leasePendingShutdown);
|
||||
} catch (DependencyException | InvalidStateException | ProvisionedThroughputException e) {
|
||||
log.warn("Failed to transfer lease for key {}. Will retry", leaseKey, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error("Error in graceful shutdown for lease {}", leaseKey, e);
|
||||
}
|
||||
}
|
||||
|
||||
private void initiateShutdown(LeasePendingShutdown tracker) {
|
||||
tracker.shardConsumer.gracefulShutdown(null);
|
||||
tracker.shutdownRequested = true;
|
||||
tracker.timeoutTimestampMillis = getCurrentTimeMillis() + shutdownTimeoutMillis;
|
||||
}
|
||||
|
||||
private void logTimeoutMessage(LeasePendingShutdown leasePendingShutdown) {
|
||||
if (leasePendingShutdown.leaseTransferCalled) {
|
||||
final long timeElapsedSinceShutdownInitiated =
|
||||
getCurrentTimeMillis() - leasePendingShutdown.timeoutTimestampMillis + shutdownTimeoutMillis;
|
||||
log.info(
|
||||
"Lease {} took {} milliseconds to complete the shutdown. "
|
||||
+ "Consider tuning the GracefulLeaseHandoffTimeoutMillis to prevent timeouts, "
|
||||
+ "if necessary.",
|
||||
leasePendingShutdown.lease.leaseKey(),
|
||||
timeElapsedSinceShutdownInitiated);
|
||||
}
|
||||
}
|
||||
|
||||
private void transferLeaseIfOwner(LeasePendingShutdown leasePendingShutdown)
|
||||
throws ProvisionedThroughputException, InvalidStateException, DependencyException {
|
||||
final Lease lease = leasePendingShutdown.lease;
|
||||
if (leaseCoordinator.workerIdentifier().equals(lease.checkpointOwner())) {
|
||||
// assignLease will increment the leaseCounter which will cause the heartbeat to stop on the current owner
|
||||
// for the lease
|
||||
leaseCoordinator.leaseRefresher().assignLease(lease, lease.leaseOwner());
|
||||
} else {
|
||||
// the worker ID check is just for sanity. We don't expect it to be different from the current worker.
|
||||
log.error(
|
||||
"Lease {} checkpoint owner mismatch found {} but it should be {}",
|
||||
lease.leaseKey(),
|
||||
lease.checkpointOwner(),
|
||||
leaseCoordinator.workerIdentifier());
|
||||
}
|
||||
// mark it true because we don't want to enter the method again because update is not possible anymore.
|
||||
leasePendingShutdown.leaseTransferCalled = true;
|
||||
}
|
||||
|
||||
private long getCurrentTimeMillis() {
|
||||
return currentTimeSupplier.get();
|
||||
}
|
||||
|
||||
@Data
|
||||
private static class LeasePendingShutdown {
|
||||
final Lease lease;
|
||||
final ShardConsumer shardConsumer;
|
||||
long timeoutTimestampMillis;
|
||||
boolean shutdownRequested = false;
|
||||
boolean leaseTransferCalled = false;
|
||||
}
|
||||
}
|
||||
|
|
@ -24,6 +24,7 @@ import software.amazon.awssdk.services.kinesis.model.Shard;
|
|||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.checkpoint.ShardRecordProcessorCheckpointer;
|
||||
import software.amazon.kinesis.common.StreamIdentifier;
|
||||
import software.amazon.kinesis.leases.LeaseStatsRecorder;
|
||||
import software.amazon.kinesis.leases.ShardDetector;
|
||||
import software.amazon.kinesis.leases.ShardInfo;
|
||||
import software.amazon.kinesis.lifecycle.events.ProcessRecordsInput;
|
||||
|
|
@ -65,6 +66,7 @@ public class ProcessTask implements ConsumerTask {
|
|||
private final AggregatorUtil aggregatorUtil;
|
||||
private final String shardInfoId;
|
||||
private final SchemaRegistryDecoder schemaRegistryDecoder;
|
||||
private final LeaseStatsRecorder leaseStatsRecorder;
|
||||
|
||||
public ProcessTask(
|
||||
@NonNull ShardInfo shardInfo,
|
||||
|
|
@ -79,7 +81,8 @@ public class ProcessTask implements ConsumerTask {
|
|||
long idleTimeInMilliseconds,
|
||||
@NonNull AggregatorUtil aggregatorUtil,
|
||||
@NonNull MetricsFactory metricsFactory,
|
||||
SchemaRegistryDecoder schemaRegistryDecoder) {
|
||||
SchemaRegistryDecoder schemaRegistryDecoder,
|
||||
@NonNull LeaseStatsRecorder leaseStatsRecorder) {
|
||||
this.shardInfo = shardInfo;
|
||||
this.shardInfoId = ShardInfo.getLeaseKey(shardInfo);
|
||||
this.shardRecordProcessor = shardRecordProcessor;
|
||||
|
|
@ -91,6 +94,7 @@ public class ProcessTask implements ConsumerTask {
|
|||
this.idleTimeInMilliseconds = idleTimeInMilliseconds;
|
||||
this.metricsFactory = metricsFactory;
|
||||
this.schemaRegistryDecoder = schemaRegistryDecoder;
|
||||
this.leaseStatsRecorder = leaseStatsRecorder;
|
||||
|
||||
if (!skipShardSyncAtWorkerInitializationIfLeasesExist) {
|
||||
this.shard = shardDetector.shard(shardInfo.shardId());
|
||||
|
|
@ -173,6 +177,7 @@ public class ProcessTask implements ConsumerTask {
|
|||
recordProcessorCheckpointer.largestPermittedCheckpointValue()));
|
||||
|
||||
if (shouldCallProcessRecords(records)) {
|
||||
publishLeaseStats(records);
|
||||
callProcessRecords(processRecordsInput, records);
|
||||
}
|
||||
success = true;
|
||||
|
|
@ -197,6 +202,15 @@ public class ProcessTask implements ConsumerTask {
|
|||
}
|
||||
}
|
||||
|
||||
private void publishLeaseStats(final List<KinesisClientRecord> records) {
|
||||
leaseStatsRecorder.recordStats(LeaseStatsRecorder.LeaseStats.builder()
|
||||
.bytes(records.stream()
|
||||
.mapToInt(record -> record.data().limit())
|
||||
.sum())
|
||||
.leaseKey(ShardInfo.getLeaseKey(shardInfo))
|
||||
.build());
|
||||
}
|
||||
|
||||
private List<KinesisClientRecord> deaggregateAnyKplRecords(List<KinesisClientRecord> records) {
|
||||
if (shard == null) {
|
||||
return aggregatorUtil.deaggregate(records);
|
||||
|
|
|
|||
|
|
@ -21,7 +21,6 @@ import java.util.concurrent.CompletableFuture;
|
|||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.RejectedExecutionException;
|
||||
import java.util.function.Function;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import lombok.AccessLevel;
|
||||
|
|
@ -35,8 +34,6 @@ import software.amazon.kinesis.exceptions.internal.BlockedOnParentShardException
|
|||
import software.amazon.kinesis.leases.ShardInfo;
|
||||
import software.amazon.kinesis.lifecycle.events.ProcessRecordsInput;
|
||||
import software.amazon.kinesis.lifecycle.events.TaskExecutionListenerInput;
|
||||
import software.amazon.kinesis.metrics.MetricsCollectingTaskDecorator;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
import software.amazon.kinesis.retrieval.RecordsPublisher;
|
||||
|
||||
/**
|
||||
|
|
@ -59,12 +56,6 @@ public class ShardConsumer {
|
|||
@NonNull
|
||||
private final Optional<Long> logWarningForTaskAfterMillis;
|
||||
|
||||
/**
|
||||
* @deprecated unused; to be removed in a "major" version bump
|
||||
*/
|
||||
@Deprecated
|
||||
private final Function<ConsumerTask, ConsumerTask> taskMetricsDecorator;
|
||||
|
||||
private final int bufferSize;
|
||||
private final TaskExecutionListener taskExecutionListener;
|
||||
private final String streamIdentifier;
|
||||
|
|
@ -95,27 +86,6 @@ public class ShardConsumer {
|
|||
|
||||
private ProcessRecordsInput shardEndProcessRecordsInput;
|
||||
|
||||
@Deprecated
|
||||
public ShardConsumer(
|
||||
RecordsPublisher recordsPublisher,
|
||||
ExecutorService executorService,
|
||||
ShardInfo shardInfo,
|
||||
Optional<Long> logWarningForTaskAfterMillis,
|
||||
ShardConsumerArgument shardConsumerArgument,
|
||||
TaskExecutionListener taskExecutionListener) {
|
||||
this(
|
||||
recordsPublisher,
|
||||
executorService,
|
||||
shardInfo,
|
||||
logWarningForTaskAfterMillis,
|
||||
shardConsumerArgument,
|
||||
ConsumerStates.INITIAL_STATE,
|
||||
ShardConsumer.metricsWrappingFunction(shardConsumerArgument.metricsFactory()),
|
||||
8,
|
||||
taskExecutionListener,
|
||||
LifecycleConfig.DEFAULT_READ_TIMEOUTS_TO_IGNORE);
|
||||
}
|
||||
|
||||
public ShardConsumer(
|
||||
RecordsPublisher recordsPublisher,
|
||||
ExecutorService executorService,
|
||||
|
|
@ -131,36 +101,11 @@ public class ShardConsumer {
|
|||
logWarningForTaskAfterMillis,
|
||||
shardConsumerArgument,
|
||||
ConsumerStates.INITIAL_STATE,
|
||||
ShardConsumer.metricsWrappingFunction(shardConsumerArgument.metricsFactory()),
|
||||
8,
|
||||
taskExecutionListener,
|
||||
readTimeoutsToIgnoreBeforeWarning);
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public ShardConsumer(
|
||||
RecordsPublisher recordsPublisher,
|
||||
ExecutorService executorService,
|
||||
ShardInfo shardInfo,
|
||||
Optional<Long> logWarningForTaskAfterMillis,
|
||||
ShardConsumerArgument shardConsumerArgument,
|
||||
ConsumerState initialState,
|
||||
Function<ConsumerTask, ConsumerTask> taskMetricsDecorator,
|
||||
int bufferSize,
|
||||
TaskExecutionListener taskExecutionListener) {
|
||||
this(
|
||||
recordsPublisher,
|
||||
executorService,
|
||||
shardInfo,
|
||||
logWarningForTaskAfterMillis,
|
||||
shardConsumerArgument,
|
||||
initialState,
|
||||
taskMetricsDecorator,
|
||||
bufferSize,
|
||||
taskExecutionListener,
|
||||
LifecycleConfig.DEFAULT_READ_TIMEOUTS_TO_IGNORE);
|
||||
}
|
||||
|
||||
//
|
||||
// TODO: Make bufferSize configurable
|
||||
//
|
||||
|
|
@ -171,7 +116,6 @@ public class ShardConsumer {
|
|||
Optional<Long> logWarningForTaskAfterMillis,
|
||||
ShardConsumerArgument shardConsumerArgument,
|
||||
ConsumerState initialState,
|
||||
Function<ConsumerTask, ConsumerTask> taskMetricsDecorator,
|
||||
int bufferSize,
|
||||
TaskExecutionListener taskExecutionListener,
|
||||
int readTimeoutsToIgnoreBeforeWarning) {
|
||||
|
|
@ -183,7 +127,6 @@ public class ShardConsumer {
|
|||
this.logWarningForTaskAfterMillis = logWarningForTaskAfterMillis;
|
||||
this.taskExecutionListener = taskExecutionListener;
|
||||
this.currentState = initialState;
|
||||
this.taskMetricsDecorator = taskMetricsDecorator;
|
||||
subscriber = new ShardConsumerSubscriber(
|
||||
recordsPublisher, executorService, bufferSize, this, readTimeoutsToIgnoreBeforeWarning);
|
||||
this.bufferSize = bufferSize;
|
||||
|
|
@ -484,17 +427,18 @@ public class ShardConsumer {
|
|||
}
|
||||
|
||||
    /**
     * Requests the shutdown of the ShardConsumer. This should give the record processor a chance to checkpoint
     * before being shutdown.
     *
     * @param shutdownNotification used to signal that the record processor has been given the chance to shut down;
     *        may be null for a shard-level graceful shutdown (no worker-level notification to complete)
     */
    public void gracefulShutdown(ShutdownNotification shutdownNotification) {
        if (subscriber != null) {
            // Cancel the subscription first — NOTE(review): presumably stops further record
            // delivery before the shutdown is signaled; confirm against ShardConsumerSubscriber.
            subscriber.cancel();
        }
        if (shutdownNotification != null) {
            this.shutdownNotification = shutdownNotification;
        }
        markForShutdown(ShutdownReason.REQUESTED);
    }
|
||||
|
||||
|
|
@ -542,21 +486,4 @@ public class ShardConsumer {
|
|||
return shutdownReason != null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Default task wrapping function for metrics
|
||||
*
|
||||
* @param metricsFactory
|
||||
* the factory used for reporting metrics
|
||||
* @return a function that will wrap the task with a metrics reporter
|
||||
*/
|
||||
private static Function<ConsumerTask, ConsumerTask> metricsWrappingFunction(MetricsFactory metricsFactory) {
|
||||
return (task) -> {
|
||||
if (task == null) {
|
||||
return null;
|
||||
} else {
|
||||
return new MetricsCollectingTaskDecorator(task, metricsFactory);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,7 +18,12 @@ import lombok.AccessLevel;
|
|||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.leases.Lease;
|
||||
import software.amazon.kinesis.leases.LeaseCoordinator;
|
||||
import software.amazon.kinesis.leases.ShardInfo;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||
import software.amazon.kinesis.lifecycle.events.ShutdownRequestedInput;
|
||||
import software.amazon.kinesis.processor.RecordProcessorCheckpointer;
|
||||
import software.amazon.kinesis.processor.ShardRecordProcessor;
|
||||
|
|
@ -33,23 +38,41 @@ public class ShutdownNotificationTask implements ConsumerTask {
|
|||
private final ShardRecordProcessor shardRecordProcessor;
|
||||
private final RecordProcessorCheckpointer recordProcessorCheckpointer;
|
||||
private final ShutdownNotification shutdownNotification;
|
||||
// TODO: remove if not used
|
||||
private final ShardInfo shardInfo;
|
||||
private final LeaseCoordinator leaseCoordinator;
|
||||
|
||||
    /**
     * Notifies the record processor that shutdown has been requested — giving it a chance to
     * checkpoint — then attempts the lease transfer, and finally either completes the worker-level
     * shutdown notification or drops the lease (shard-level shutdown) so the consumer can
     * transition to its next state.
     *
     * @return a {@link TaskResult} whose exception is null on success, or carries any exception
     *         thrown by the processor callback or the lease transfer
     */
    @Override
    public TaskResult call() {
        final String leaseKey = ShardInfo.getLeaseKey(shardInfo);
        final Lease currentShardLease = leaseCoordinator.getCurrentlyHeldLease(leaseKey);
        try {
            try {
                // Give the record processor its last chance to checkpoint before shutdown.
                shardRecordProcessor.shutdownRequested(ShutdownRequestedInput.builder()
                        .checkpointer(recordProcessorCheckpointer)
                        .build());
                attemptLeaseTransfer(currentShardLease);
            } catch (Exception ex) {
                return new TaskResult(ex);
            }

            return new TaskResult(null);
        } finally {
            if (shutdownNotification != null) {
                shutdownNotification.shutdownNotificationComplete();
            } else {
                // shutdownNotification is null if this is a shard level graceful shutdown instead of a worker level
                // one. We need to drop lease like what's done in the shutdownNotificationComplete so we can
                // transition to next state.
                leaseCoordinator.dropLease(currentShardLease);
            }
        }
    }
|
||||
|
||||
private void attemptLeaseTransfer(Lease lease)
|
||||
throws ProvisionedThroughputException, InvalidStateException, DependencyException {
|
||||
if (lease != null && lease.shutdownRequested()) {
|
||||
if (leaseCoordinator.workerIdentifier().equals(lease.checkpointOwner())) {
|
||||
leaseCoordinator.leaseRefresher().assignLease(lease, lease.leaseOwner());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -164,7 +164,6 @@ public class ShutdownTask implements ConsumerTask {
|
|||
} else {
|
||||
throwOnApplicationException(leaseKey, leaseLostAction, scope, startTime);
|
||||
}
|
||||
|
||||
log.debug("Shutting down retrieval strategy for shard {}.", leaseKey);
|
||||
recordsPublisher.shutdown();
|
||||
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ public class RetrievalConfig {
|
|||
*/
|
||||
public static final String KINESIS_CLIENT_LIB_USER_AGENT = "amazon-kinesis-client-library-java";
|
||||
|
||||
public static final String KINESIS_CLIENT_LIB_USER_AGENT_VERSION = "2.6.1-SNAPSHOT";
|
||||
public static final String KINESIS_CLIENT_LIB_USER_AGENT_VERSION = "3.0.0";
|
||||
|
||||
/**
|
||||
* Client used to make calls to Kinesis for records retrieval
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ import lombok.NonNull;
|
|||
import lombok.Setter;
|
||||
import lombok.ToString;
|
||||
import lombok.experimental.Accessors;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.services.kinesis.KinesisAsyncClient;
|
||||
import software.amazon.awssdk.services.kinesis.model.GetRecordsRequest;
|
||||
import software.amazon.kinesis.retrieval.DataFetcherProviderConfig;
|
||||
|
|
@ -38,12 +39,15 @@ import software.amazon.kinesis.retrieval.RetrievalSpecificConfig;
|
|||
@Setter
|
||||
@ToString
|
||||
@EqualsAndHashCode
|
||||
@Slf4j
|
||||
public class PollingConfig implements RetrievalSpecificConfig {
|
||||
|
||||
public static final Duration DEFAULT_REQUEST_TIMEOUT = Duration.ofSeconds(30);
|
||||
|
||||
public static final int DEFAULT_MAX_RECORDS = 10000;
|
||||
|
||||
public static final long MIN_IDLE_MILLIS_BETWEEN_READS = 200L;
|
||||
|
||||
/**
|
||||
* Configurable functional interface to override the existing DataFetcher.
|
||||
*/
|
||||
|
|
@ -138,9 +142,18 @@ public class PollingConfig implements RetrievalSpecificConfig {
|
|||
/**
|
||||
* Set the value for how long the ShardConsumer should sleep in between calls to
|
||||
* {@link KinesisAsyncClient#getRecords(GetRecordsRequest)}. If this is not specified here the value provided in
|
||||
* {@link RecordsFetcherFactory} will be used.
|
||||
* {@link RecordsFetcherFactory} will be used. Cannot set value below MIN_IDLE_MILLIS_BETWEEN_READS.
|
||||
*/
|
||||
public PollingConfig idleTimeBetweenReadsInMillis(long idleTimeBetweenReadsInMillis) {
|
||||
if (idleTimeBetweenReadsInMillis < MIN_IDLE_MILLIS_BETWEEN_READS) {
|
||||
log.warn(
|
||||
"idleTimeBetweenReadsInMillis must be greater than or equal to {} but current value is {}."
|
||||
+ " Defaulting to minimum {}.",
|
||||
MIN_IDLE_MILLIS_BETWEEN_READS,
|
||||
idleTimeBetweenReadsInMillis,
|
||||
MIN_IDLE_MILLIS_BETWEEN_READS);
|
||||
idleTimeBetweenReadsInMillis = MIN_IDLE_MILLIS_BETWEEN_READS;
|
||||
}
|
||||
usePollingConfigIdleTimeValue = true;
|
||||
this.idleTimeBetweenReadsInMillis = idleTimeBetweenReadsInMillis;
|
||||
return this;
|
||||
|
|
|
|||
|
|
@ -61,6 +61,7 @@ import software.amazon.kinesis.retrieval.RecordsDeliveryAck;
|
|||
import software.amazon.kinesis.retrieval.RecordsPublisher;
|
||||
import software.amazon.kinesis.retrieval.RecordsRetrieved;
|
||||
import software.amazon.kinesis.retrieval.RetryableRetrievalException;
|
||||
import software.amazon.kinesis.retrieval.ThrottlingReporter;
|
||||
import software.amazon.kinesis.retrieval.kpl.ExtendedSequenceNumber;
|
||||
|
||||
import static software.amazon.kinesis.common.DiagnosticUtils.takeDelayedDeliveryActionIfRequired;
|
||||
|
|
@ -109,6 +110,7 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
|
|||
private boolean wasReset = false;
|
||||
private Instant lastEventDeliveryTime = Instant.EPOCH;
|
||||
private final RequestDetails lastSuccessfulRequestDetails = new RequestDetails();
|
||||
private final ThrottlingReporter throttlingReporter;
|
||||
|
||||
@Data
|
||||
@Accessors(fluent = true)
|
||||
|
|
@ -233,6 +235,7 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
|
|||
@NonNull final MetricsFactory metricsFactory,
|
||||
@NonNull final String operation,
|
||||
@NonNull final String shardId,
|
||||
final ThrottlingReporter throttlingReporter,
|
||||
final long awaitTerminationTimeoutMillis) {
|
||||
this.getRecordsRetrievalStrategy = getRecordsRetrievalStrategy;
|
||||
this.maxRecordsPerCall = maxRecordsPerCall;
|
||||
|
|
@ -248,6 +251,7 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
|
|||
this.idleMillisBetweenCalls = idleMillisBetweenCalls;
|
||||
this.defaultGetRecordsCacheDaemon = new DefaultGetRecordsCacheDaemon();
|
||||
Validate.notEmpty(operation, "Operation cannot be empty");
|
||||
this.throttlingReporter = throttlingReporter;
|
||||
this.operation = operation;
|
||||
this.streamId = this.getRecordsRetrievalStrategy.dataFetcher().getStreamIdentifier();
|
||||
this.streamAndShardId = this.streamId.serialize() + ":" + shardId;
|
||||
|
|
@ -279,7 +283,8 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
|
|||
final long idleMillisBetweenCalls,
|
||||
final MetricsFactory metricsFactory,
|
||||
final String operation,
|
||||
final String shardId) {
|
||||
final String shardId,
|
||||
final ThrottlingReporter throttlingReporter) {
|
||||
this(
|
||||
maxPendingProcessRecordsInput,
|
||||
maxByteSize,
|
||||
|
|
@ -291,6 +296,7 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
|
|||
metricsFactory,
|
||||
operation,
|
||||
shardId,
|
||||
throttlingReporter,
|
||||
DEFAULT_AWAIT_TERMINATION_TIMEOUT_MILLIS);
|
||||
}
|
||||
|
||||
|
|
@ -555,6 +561,7 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
|
|||
recordsRetrieved.lastBatchSequenceNumber);
|
||||
addArrivedRecordsInput(recordsRetrieved);
|
||||
drainQueueForRequests();
|
||||
throttlingReporter.success();
|
||||
} catch (PositionResetException pse) {
|
||||
throw pse;
|
||||
} catch (RetryableRetrievalException rre) {
|
||||
|
|
@ -584,10 +591,11 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
|
|||
|
||||
publisherSession.dataFetcher().restartIterator();
|
||||
} catch (ProvisionedThroughputExceededException e) {
|
||||
// Update the lastSuccessfulCall if we get a throttling exception so that we back off idleMillis
|
||||
// for the next call
|
||||
lastSuccessfulCall = Instant.now();
|
||||
log.error("{} : Exception thrown while fetching records from Kinesis", streamAndShardId, e);
|
||||
log.error(
|
||||
"{} : ProvisionedThroughputExceededException thrown while fetching records from Kinesis",
|
||||
streamAndShardId,
|
||||
e);
|
||||
throttlingReporter.throttled();
|
||||
} catch (SdkException e) {
|
||||
log.error("{} : Exception thrown while fetching records from Kinesis", streamAndShardId, e);
|
||||
} finally {
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ import software.amazon.kinesis.retrieval.DataFetchingStrategy;
|
|||
import software.amazon.kinesis.retrieval.GetRecordsRetrievalStrategy;
|
||||
import software.amazon.kinesis.retrieval.RecordsFetcherFactory;
|
||||
import software.amazon.kinesis.retrieval.RecordsPublisher;
|
||||
import software.amazon.kinesis.retrieval.ThrottlingReporter;
|
||||
|
||||
@Slf4j
|
||||
@KinesisClientInternalApi
|
||||
|
|
@ -32,6 +33,7 @@ public class SimpleRecordsFetcherFactory implements RecordsFetcherFactory {
|
|||
private int maxByteSize = 8 * 1024 * 1024;
|
||||
private int maxRecordsCount = 30000;
|
||||
private long idleMillisBetweenCalls = 1500L;
|
||||
private int maxConsecutiveThrottles = 5;
|
||||
private DataFetchingStrategy dataFetchingStrategy = DataFetchingStrategy.DEFAULT;
|
||||
|
||||
@Override
|
||||
|
|
@ -56,7 +58,8 @@ public class SimpleRecordsFetcherFactory implements RecordsFetcherFactory {
|
|||
idleMillisBetweenCalls,
|
||||
metricsFactory,
|
||||
"ProcessTask",
|
||||
shardId);
|
||||
shardId,
|
||||
new ThrottlingReporter(maxConsecutiveThrottles, shardId));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -7,7 +7,6 @@ import java.util.List;
|
|||
import com.amazonaws.services.schemaregistry.common.Schema;
|
||||
import com.amazonaws.services.schemaregistry.deserializers.GlueSchemaRegistryDeserializer;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.kinesis.common.KinesisClientLibraryPackage;
|
||||
import software.amazon.kinesis.retrieval.KinesisClientRecord;
|
||||
|
||||
/**
|
||||
|
|
@ -15,7 +14,7 @@ import software.amazon.kinesis.retrieval.KinesisClientRecord;
|
|||
*/
|
||||
@Slf4j
|
||||
public class SchemaRegistryDecoder {
|
||||
private static final String USER_AGENT_APP_NAME = "kcl" + "-" + KinesisClientLibraryPackage.VERSION;
|
||||
private static final String USER_AGENT_APP_NAME = "kcl" + "-" + "3.0.0";
|
||||
private final GlueSchemaRegistryDeserializer glueSchemaRegistryDeserializer;
|
||||
|
||||
public SchemaRegistryDecoder(GlueSchemaRegistryDeserializer glueSchemaRegistryDeserializer) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,61 @@
|
|||
package software.amazon.kinesis.utils;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
/**
 * Static helpers for reading Linux cgroup files (cpu quotas, cpu sets).
 */
public class Cgroup {

    private Cgroup() {
        // static utility class; no instances
    }

    /**
     * Reads the first line of the file at {@code path}.
     *
     * @param path path of the file to read
     * @return the first line of the file, or null if the file is empty
     * @throws IllegalArgumentException if the file does not exist or cannot be read
     */
    public static String readSingleLineFile(final String path) {
        final File file = new File(path);
        if (!file.exists()) {
            throw new IllegalArgumentException(String.format("Failed to read file. %s does not exist", path));
        }
        // try-with-resources guarantees the reader is closed even when readLine throws,
        // replacing the previous manual close in a finally block.
        try (BufferedReader bufferedReader = new BufferedReader(new FileReader(file))) {
            return bufferedReader.readLine();
        } catch (final Exception e) {
            throw new IllegalArgumentException("Failed to read file.", e);
        }
    }

    /**
     * Calculates the number of available cpus from the cpuset
     * See https://docs.kernel.org/admin-guide/cgroup-v2.html#cpuset for more information
     * "0-7" represents 8 cores
     * "0-4,6,8-10" represents 9 cores (cores 0,1,2,3,4 and core 6 and core 8,9,10)
     * @param cpuSet a single line from the cgroup cpuset file
     * @return the number of available cpus
     */
    public static int getAvailableCpusFromEffectiveCpuSet(final String cpuSet) {
        int sumCpus = 0;
        for (final String cpuSetGroup : cpuSet.split(",")) {
            if (cpuSetGroup.contains("-")) {
                final String[] bounds = cpuSetGroup.split("-");
                // Range bounds are inclusive: "0-7" is 8 cpus.
                sumCpus += Integer.parseInt(bounds[1]) - Integer.parseInt(bounds[0]) + 1;
            } else {
                // A bare entry such as "6" is a single cpu.
                sumCpus += 1;
            }
        }
        return sumCpus;
    }
}
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
package software.amazon.kinesis.utils;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
/**
 * Uses the formula mentioned below for simple ExponentialMovingAverage
 * <a href="https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average"/>
 *
 * Values of alpha close to 1 have less of a smoothing effect and give greater weight to recent changes in the data,
 * while values of alpha closer to 0 have a greater smoothing effect and are less responsive to recent changes.
 */
public class ExponentialMovingAverage {

    // Smoothing factor: weight applied to the most recent observation.
    private final double alpha;

    // Current smoothed value; meaningful only after the first add() call.
    private double value;

    // True once the first observation has seeded the average.
    private boolean initialized = false;

    public ExponentialMovingAverage(final double alpha) {
        this.alpha = alpha;
    }

    public double getValue() {
        return value;
    }

    /**
     * Folds a new observation into the average. The first observation seeds the average
     * directly; subsequent ones are blended as {@code alpha * new + (1 - alpha) * old}.
     *
     * @param newValue the observation to fold in
     */
    public void add(final double newValue) {
        if (initialized) {
            this.value = alpha * newValue + (1 - alpha) * this.value;
        } else {
            this.value = newValue;
            initialized = true;
        }
    }
}
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
package software.amazon.kinesis.utils;
|
||||
|
||||
import java.util.AbstractMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Static helpers for simple descriptive statistics over lists of doubles.
 */
public class Statistics {

    private Statistics() {
        // static utility class; no instances
    }

    /**
     * Calculates the simple mean of the given values
     * @param values list of values (double)
     * @return mean of the given values, if the {@param values} is empty then returns 0;
     */
    public static double calculateSimpleMean(final List<Double> values) {
        if (values.isEmpty()) {
            return 0D;
        }
        double sum = 0.0;
        for (final double value : values) {
            sum += value;
        }
        return sum / values.size();
    }

    /**
     * For the given values find the standard deviation (SD).
     * For details of SD calculation ref : <a href="https://en.wikipedia.org/wiki/Standard_deviation"/>
     * @param values list of values (double)
     * @return Map.Entry of mean to standard deviation for {@param values}, if {@param values} is empty then return
     * Map.Entry with 0 as mean and 0 as SD.
     */
    public static Map.Entry<Double, Double> calculateStandardDeviationAndMean(final List<Double> values) {
        if (values.isEmpty()) {
            return new AbstractMap.SimpleEntry<>(0D, 0D);
        }
        final double mean = calculateSimpleMean(values);
        // Population standard deviation: divide by N (not N - 1).
        double sumSquaredDeviations = 0.0;
        for (final double value : values) {
            final double deviation = value - mean;
            sumSquaredDeviations += deviation * deviation;
        }
        return new AbstractMap.SimpleEntry<>(mean, Math.sqrt(sumSquaredDeviations / values.size()));
    }
}
|
||||
|
|
@ -0,0 +1,92 @@
|
|||
package software.amazon.kinesis.worker;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||
import software.amazon.kinesis.worker.metric.impl.container.Cgroupv1CpuWorkerMetric;
|
||||
import software.amazon.kinesis.worker.metric.impl.container.Cgroupv2CpuWorkerMetric;
|
||||
import software.amazon.kinesis.worker.metric.impl.container.EcsCpuWorkerMetric;
|
||||
import software.amazon.kinesis.worker.metric.impl.linux.LinuxCpuWorkerMetric;
|
||||
import software.amazon.kinesis.worker.platform.Ec2Resource;
|
||||
import software.amazon.kinesis.worker.platform.EcsResource;
|
||||
import software.amazon.kinesis.worker.platform.EksResource;
|
||||
import software.amazon.kinesis.worker.platform.OperatingRangeDataProvider;
|
||||
import software.amazon.kinesis.worker.platform.ResourceMetadataProvider;
|
||||
|
||||
/**
|
||||
* Class to select appropriate WorkerMetricStats based on the operating range provider that is available on the instance.
|
||||
*/
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@KinesisClientInternalApi
|
||||
public class WorkerMetricsSelector {
|
||||
|
||||
private static final OperatingRange DEFAULT_100_PERC_UTILIZED_OPERATING_RANGE =
|
||||
OperatingRange.builder().maxUtilization(100).build();
|
||||
|
||||
private final List<ResourceMetadataProvider> workerComputePlatforms;
|
||||
|
||||
/**
|
||||
* Factory method to create an instance of WorkerMetricsSelector.
|
||||
*
|
||||
* @return WorkerMetricsSelector instance
|
||||
*/
|
||||
public static WorkerMetricsSelector create() {
|
||||
final List<ResourceMetadataProvider> resourceMetadataProviders = new ArrayList<>();
|
||||
resourceMetadataProviders.add(EcsResource.create());
|
||||
resourceMetadataProviders.add(EksResource.create());
|
||||
// ec2 has to be the last one to check
|
||||
resourceMetadataProviders.add(Ec2Resource.create());
|
||||
return new WorkerMetricsSelector(resourceMetadataProviders);
|
||||
}
|
||||
|
||||
private Optional<OperatingRangeDataProvider> getOperatingRangeDataProvider() {
|
||||
for (ResourceMetadataProvider platform : workerComputePlatforms) {
|
||||
if (platform.isOnPlatform()) {
|
||||
final ResourceMetadataProvider.ComputePlatform computePlatform = platform.getPlatform();
|
||||
log.info("Worker is running on {}", computePlatform);
|
||||
return platform.getOperatingRangeDataProvider();
|
||||
}
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of WorkerMetricStats based on the operating range provider the worker uses.
|
||||
*
|
||||
* @return List of WorkerMetricStats
|
||||
*/
|
||||
public List<WorkerMetric> getDefaultWorkerMetrics() {
|
||||
final List<WorkerMetric> workerMetrics = new ArrayList<>();
|
||||
final Optional<OperatingRangeDataProvider> optionalProvider = getOperatingRangeDataProvider();
|
||||
if (!optionalProvider.isPresent()) {
|
||||
log.warn("Did not find an operating range metadata provider.");
|
||||
return workerMetrics;
|
||||
}
|
||||
final OperatingRangeDataProvider dataProvider = optionalProvider.get();
|
||||
log.info("Worker has operating range metadata provider {} ", dataProvider);
|
||||
switch (dataProvider) {
|
||||
case LINUX_PROC:
|
||||
workerMetrics.add(new LinuxCpuWorkerMetric(DEFAULT_100_PERC_UTILIZED_OPERATING_RANGE));
|
||||
break;
|
||||
case LINUX_ECS_METADATA_KEY_V4:
|
||||
workerMetrics.add(new EcsCpuWorkerMetric(DEFAULT_100_PERC_UTILIZED_OPERATING_RANGE));
|
||||
break;
|
||||
case LINUX_EKS_CGROUP_V2:
|
||||
workerMetrics.add(new Cgroupv2CpuWorkerMetric(DEFAULT_100_PERC_UTILIZED_OPERATING_RANGE));
|
||||
break;
|
||||
case LINUX_EKS_CGROUP_V1:
|
||||
workerMetrics.add(new Cgroupv1CpuWorkerMetric(DEFAULT_100_PERC_UTILIZED_OPERATING_RANGE));
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return workerMetrics;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
package software.amazon.kinesis.worker.metric;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
public class OperatingRange {
|
||||
|
||||
/**
|
||||
* Max utilization percentage allowed for the workerMetrics.
|
||||
*/
|
||||
private final int maxUtilization;
|
||||
|
||||
private OperatingRange(final int maxUtilization) {
|
||||
Preconditions.checkArgument(!(maxUtilization < 0 || maxUtilization > 100), "Invalid maxUtilization value");
|
||||
this.maxUtilization = maxUtilization;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
package software.amazon.kinesis.worker.metric;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.NonNull;
|
||||
|
||||
/**
 * A self-reported utilization metric for a worker (e.g. CPU). Implementations expose a
 * short storage name, an operating range, a metric type, and a normalized 0-100
 * percentage reading via {@link #capture()}.
 */
public interface WorkerMetric {
    /**
     * WorkerMetricStats short name that is used as attribute name for it in storage.
     * @return short name for the WorkerMetricStats
     */
    String getShortName();

    /**
     * Current WorkerMetricValue. WorkerMetricValue is a normalized percentage value to its max configured limits.
     * E.g., if for a worker max network bandwidth is 10Gbps and current used bandwidth is 2Gbps, then WorkerMetricValue for
     * NetworkWorkerMetrics will be 20 (%).
     *
     * @return WorkerMetricValue between 0 and 100 (both inclusive)
     */
    WorkerMetricValue capture();

    /**
     * Gets the operating range for this workerMetrics
     * @return Operating range for this workerMetrics
     */
    OperatingRange getOperatingRange();

    /**
     * Type of the current WorkerMetricStats.
     * @return WorkerMetricType
     */
    WorkerMetricType getWorkerMetricType();

    /**
     * WorkerMetricValue model class is used as return type for the capture() method to have a strong checks at the build
     * time of the object itself.
     */
    @Builder
    class WorkerMetricValue {

        @Getter
        private final Double value;

        // The generated builder funnels construction through this constructor, so every
        // WorkerMetricValue is guaranteed to hold a non-null percentage in [0, 100].
        private WorkerMetricValue(@NonNull final Double value) {
            Preconditions.checkArgument(
                    !(value < 0 || value > 100), value + " is either less than 0 or greater than 100");
            this.value = value;
        }
    }
}
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
package software.amazon.kinesis.worker.metric;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
/**
 * The kinds of worker resource metrics, each with the short name used for it in storage.
 */
public enum WorkerMetricType {
    CPU("C"),
    MEMORY("M"),
    NETWORK_IN("NI"),
    NETWORK_OUT("NO"),
    THROUGHPUT("T");

    // Compact storage/attribute name for this metric type.
    private final String shortName;

    WorkerMetricType(final String shortName) {
        this.shortName = shortName;
    }

    public String getShortName() {
        return shortName;
    }
}
|
||||
|
|
@ -0,0 +1,128 @@
|
|||
package software.amazon.kinesis.worker.metric.impl.container;
|
||||
|
||||
import java.time.Clock;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||
|
||||
import static software.amazon.kinesis.utils.Cgroup.getAvailableCpusFromEffectiveCpuSet;
|
||||
import static software.amazon.kinesis.utils.Cgroup.readSingleLineFile;
|
||||
|
||||
/**
|
||||
* Utilizes Linux Control Groups by reading cpu time and available cpu from cgroup directory.This works for Elastic
|
||||
* Kubernetes Service (EKS) containers running on Linux instances which use cgroupv1.
|
||||
*
|
||||
* EC2 instances must use a Linux instance that uses cgroupv1. Amazon Linux 2 uses cgroupv1.
|
||||
* Fargate versions 1.4.0 and 1.3.0 use Amazon Linux 2 and can use this.
|
||||
*
|
||||
* CPU time is measured in CPU cores time. A container is limited by amount of CPU core time it is allocated. So if over
|
||||
* a second the container uses 0.5 CPU core time and is allocated 2 CPU cores, the cpu utilization would be 25%.
|
||||
*
|
||||
* When this is invoked for the first time, the value returned is always 0 as the prev values are not available
|
||||
* to calculate the diff.
|
||||
* In case the file is not present or any other exception occurs, this throws IllegalArgumentException.
|
||||
*/
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor(access = AccessLevel.PACKAGE)
|
||||
public class Cgroupv1CpuWorkerMetric implements WorkerMetric {
|
||||
|
||||
private static final Object LOCK_OBJECT = new Object();
|
||||
private static final WorkerMetricType CPU_WORKER_METRICS_TYPE = WorkerMetricType.CPU;
|
||||
private static final String CGROUP_ROOT = "/sys/fs/cgroup/";
|
||||
private static final String CPU_TIME_FILE = CGROUP_ROOT + "cpu/cpuacct.usage";
|
||||
private static final String CPU_CFS_QUOTA_FILE = CGROUP_ROOT + "cpu/cpu.cfs_quota_us";
|
||||
private static final String CPU_CFS_PERIOD_FILE = CGROUP_ROOT + "cpu/cpu.cfs_period_us";
|
||||
private static final String EFFECTIVE_CPU_SET_FILE = CGROUP_ROOT + "cpuset/cpuset.effective_cpus";
|
||||
private final OperatingRange operatingRange;
|
||||
private final String cpuTimeFile;
|
||||
private final String cfsQuotaFile;
|
||||
private final String cfsPeriodFile;
|
||||
private final String effectiveCpuSetFile;
|
||||
private final Clock clock;
|
||||
private double cpuLimit = -1;
|
||||
private long lastCpuUseTimeNanos = 0;
|
||||
private long lastSystemTimeNanos = 0;
|
||||
|
||||
public Cgroupv1CpuWorkerMetric(final OperatingRange operatingRange) {
|
||||
this(
|
||||
operatingRange,
|
||||
CPU_TIME_FILE,
|
||||
CPU_CFS_QUOTA_FILE,
|
||||
CPU_CFS_PERIOD_FILE,
|
||||
EFFECTIVE_CPU_SET_FILE,
|
||||
Clock.systemUTC());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getShortName() {
|
||||
return CPU_WORKER_METRICS_TYPE.getShortName();
|
||||
}
|
||||
|
||||
@Override
|
||||
public WorkerMetricValue capture() {
|
||||
return WorkerMetricValue.builder().value(calculateCpuUsage()).build();
|
||||
}
|
||||
|
||||
private double calculateCpuUsage() {
|
||||
if (cpuLimit == -1) {
|
||||
cpuLimit = calculateCpuLimit();
|
||||
}
|
||||
|
||||
final long cpuTimeNanos = Long.parseLong(readSingleLineFile(cpuTimeFile));
|
||||
final long currentTimeNanos = TimeUnit.MILLISECONDS.toNanos(clock.millis());
|
||||
|
||||
boolean skip = false;
|
||||
double cpuCoreTimeUsed;
|
||||
synchronized (LOCK_OBJECT) {
|
||||
if (lastCpuUseTimeNanos == 0 && lastSystemTimeNanos == 0) {
|
||||
// Case where this is a first call so no diff available
|
||||
skip = true;
|
||||
}
|
||||
|
||||
final long nanoTimeDiff = currentTimeNanos - lastSystemTimeNanos;
|
||||
final long cpuUseDiff = cpuTimeNanos - lastCpuUseTimeNanos;
|
||||
// This value is not a percent, but rather how much CPU core time was consumed. i.e. this number can be
|
||||
// 2.2 which stands for 2.2 CPU cores were fully utilized. If this number is less than 1 than that means
|
||||
// that less than 1 CPU core was used.
|
||||
cpuCoreTimeUsed = ((double) cpuUseDiff / nanoTimeDiff);
|
||||
|
||||
lastCpuUseTimeNanos = cpuTimeNanos;
|
||||
lastSystemTimeNanos = currentTimeNanos;
|
||||
}
|
||||
|
||||
if (skip) {
|
||||
return 0D;
|
||||
} else {
|
||||
// In case of rounding error, treat everything above 100% as 100%
|
||||
return Math.min(100.0, cpuCoreTimeUsed / cpuLimit * 100.0);
|
||||
}
|
||||
}
|
||||
|
||||
private double calculateCpuLimit() {
|
||||
// Documentation on these values:
|
||||
// https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/6/html/resource_management_guide/sec-cpu#sect-cfs
|
||||
final long cfsQuota = Long.parseLong(readSingleLineFile(cfsQuotaFile));
|
||||
final long cfsPeriod = Long.parseLong(readSingleLineFile(cfsPeriodFile));
|
||||
if (cfsQuota == -1) {
|
||||
// If quota is -1, a limit is not set on the container. The container can use all available cores.
|
||||
return getAvailableCpusFromEffectiveCpuSet(readSingleLineFile(effectiveCpuSetFile));
|
||||
} else {
|
||||
return ((double) cfsQuota) / cfsPeriod;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public OperatingRange getOperatingRange() {
|
||||
return operatingRange;
|
||||
}
|
||||
|
||||
@Override
|
||||
public WorkerMetricType getWorkerMetricType() {
|
||||
return CPU_WORKER_METRICS_TYPE;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,128 @@
|
|||
package software.amazon.kinesis.worker.metric.impl.container;
|
||||
|
||||
import java.time.Clock;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||
|
||||
import static software.amazon.kinesis.utils.Cgroup.getAvailableCpusFromEffectiveCpuSet;
|
||||
import static software.amazon.kinesis.utils.Cgroup.readSingleLineFile;
|
||||
|
||||
/**
|
||||
* Utilizes Linux Control Groups by reading cpu time and available cpu from cgroup directory. This works for Elastic
|
||||
* Kubernetes Service (EKS) containers running on Linux instances which use cgroupv2.
|
||||
*
|
||||
* EC2 instances must use a Linux instance that uses cgroupv2. Amazon Linux 2023 uses cgroupv2.
|
||||
*
|
||||
* CPU time is measured in CPU cores time. A container is limited by amount of CPU core time it is allocated. So if over
|
||||
* a second the container uses 0.5 CPU core time and is allocated 2 CPU cores, the cpu utilization would be 25%.
|
||||
*
|
||||
* When this is invoked for the first time, the value returned is always 0 as the prev values are not available
|
||||
* to calculate the diff.
|
||||
* In case the file is not present or any other exception occurs, this throws IllegalArgumentException.
|
||||
*/
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor(access = AccessLevel.PACKAGE)
|
||||
public class Cgroupv2CpuWorkerMetric implements WorkerMetric {
|
||||
|
||||
private static final Object LOCK_OBJECT = new Object();
|
||||
private static final WorkerMetricType CPU_WORKER_METRICS_TYPE = WorkerMetricType.CPU;
|
||||
private static final String CGROUP_ROOT = "/sys/fs/cgroup/";
|
||||
private static final String CPU_MAX_FILE = CGROUP_ROOT + "cpu.max";
|
||||
private static final String EFFECTIVE_CPU_SET_FILE = CGROUP_ROOT + "cpuset.cpus.effective";
|
||||
private static final String CPU_STAT_FILE = CGROUP_ROOT + "cpu.stat";
|
||||
private final OperatingRange operatingRange;
|
||||
private final String cpuMaxFile;
|
||||
private final String effectiveCpuSetFile;
|
||||
private final String cpuStatFile;
|
||||
private final Clock clock;
|
||||
private double cpuLimit = -1;
|
||||
private long lastCpuUseTimeMicros = 0;
|
||||
private long lastSystemTimeMicros = 0;
|
||||
|
||||
public Cgroupv2CpuWorkerMetric(final OperatingRange operatingRange) {
|
||||
this(operatingRange, CPU_MAX_FILE, EFFECTIVE_CPU_SET_FILE, CPU_STAT_FILE, Clock.systemUTC());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getShortName() {
|
||||
return CPU_WORKER_METRICS_TYPE.getShortName();
|
||||
}
|
||||
|
||||
@Override
|
||||
public WorkerMetricValue capture() {
|
||||
return WorkerMetricValue.builder().value(calculateCpuUsage()).build();
|
||||
}
|
||||
|
||||
private double calculateCpuUsage() {
|
||||
if (cpuLimit == -1) {
|
||||
cpuLimit = calculateCpuLimit();
|
||||
}
|
||||
|
||||
// The first line of this file is of the format
|
||||
// usage_usec $MICROSECONDS
|
||||
// where $MICROSECONDS is always a number
|
||||
final String cpuUsageStat = readSingleLineFile(cpuStatFile);
|
||||
final long cpuTimeMicros = Long.parseLong(cpuUsageStat.split(" ")[1]);
|
||||
final long currentTimeMicros = TimeUnit.MILLISECONDS.toMicros(clock.millis());
|
||||
|
||||
boolean skip = false;
|
||||
double cpuCoreTimeUsed;
|
||||
synchronized (LOCK_OBJECT) {
|
||||
if (lastCpuUseTimeMicros == 0 && lastSystemTimeMicros == 0) {
|
||||
// Case where this is a first call so no diff available
|
||||
skip = true;
|
||||
}
|
||||
|
||||
final long microTimeDiff = currentTimeMicros - lastSystemTimeMicros;
|
||||
final long cpuUseDiff = cpuTimeMicros - lastCpuUseTimeMicros;
|
||||
// This value is not a percent, but rather how much CPU core time was consumed. i.e. this number can be
|
||||
// 2.2 which stands for 2.2 CPU cores were fully utilized. If this number is less than 1 than that means
|
||||
// that less than 1 CPU core was used.
|
||||
cpuCoreTimeUsed = ((double) cpuUseDiff / microTimeDiff);
|
||||
|
||||
lastCpuUseTimeMicros = cpuTimeMicros;
|
||||
lastSystemTimeMicros = currentTimeMicros;
|
||||
}
|
||||
|
||||
if (skip) {
|
||||
return 0D;
|
||||
} else {
|
||||
// In case of rounding error, treat everything above 100% as 100%
|
||||
return Math.min(100.0, cpuCoreTimeUsed / cpuLimit * 100.0);
|
||||
}
|
||||
}
|
||||
|
||||
private double calculateCpuLimit() {
|
||||
// This file contains two values separated by space ($MAX $PERIOD).
|
||||
// $MAX is either a number or "max"
|
||||
// $PERIOD is always a number
|
||||
final String cpuMax = readSingleLineFile(cpuMaxFile);
|
||||
final String[] cpuMaxArr = cpuMax.split(" ");
|
||||
final String max = cpuMaxArr[0];
|
||||
final String period = cpuMaxArr[1];
|
||||
|
||||
if (max.equals("max")) {
|
||||
// if first value in file is "max", a limit is not set on the container. The container can use all available
|
||||
// cores
|
||||
return getAvailableCpusFromEffectiveCpuSet(readSingleLineFile(effectiveCpuSetFile));
|
||||
} else {
|
||||
return Double.parseDouble(max) / Long.parseLong(period);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public OperatingRange getOperatingRange() {
|
||||
return operatingRange;
|
||||
}
|
||||
|
||||
@Override
|
||||
public WorkerMetricType getWorkerMetricType() {
|
||||
return CPU_WORKER_METRICS_TYPE;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,203 @@
|
|||
package software.amazon.kinesis.worker.metric.impl.container;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Iterator;
|
||||
|
||||
import com.fasterxml.jackson.databind.JsonNode;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||
|
||||
/**
|
||||
* Queries the Amazon ECS task metadata endpoint version 4 to get CPU metric stats as well as allocated CPU to the ECS task and
|
||||
* containers to calculate percent CPU utilization. This works for all ECS containers running on the following
|
||||
* platforms:
|
||||
*
|
||||
* Fargate agent version 1.4.0
|
||||
* EC2 instance running at least 1.39.0 of the Amazon ECS container agent
|
||||
*
|
||||
* For more information, see
|
||||
* https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-metadata-endpoint-v4.html
|
||||
*/
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor(access = AccessLevel.PACKAGE)
|
||||
public class EcsCpuWorkerMetric implements WorkerMetric {
|
||||
|
||||
private static final WorkerMetricType CPU_WORKER_METRICS_TYPE = WorkerMetricType.CPU;
|
||||
private static final String SYS_VAR_ECS_METADATA_URI = "ECS_CONTAINER_METADATA_URI_V4";
|
||||
private final OperatingRange operatingRange;
|
||||
private final String containerStatsUri;
|
||||
private final String taskMetadataUri;
|
||||
private final String containerMetadataUri;
|
||||
private double containerCpuLimit = -1;
|
||||
private double onlineCpus = -1;
|
||||
|
||||
public EcsCpuWorkerMetric(final OperatingRange operatingRange) {
|
||||
this.operatingRange = operatingRange;
|
||||
|
||||
final String ecsMetadataRootUri = System.getenv(SYS_VAR_ECS_METADATA_URI);
|
||||
if (ecsMetadataRootUri != null) {
|
||||
this.containerStatsUri = ecsMetadataRootUri + "/stats";
|
||||
this.taskMetadataUri = ecsMetadataRootUri + "/task";
|
||||
this.containerMetadataUri = ecsMetadataRootUri;
|
||||
} else {
|
||||
this.containerStatsUri = null;
|
||||
this.taskMetadataUri = null;
|
||||
this.containerMetadataUri = null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getShortName() {
|
||||
return CPU_WORKER_METRICS_TYPE.getShortName();
|
||||
}
|
||||
|
||||
@Override
|
||||
public WorkerMetricValue capture() {
|
||||
return WorkerMetricValue.builder().value(calculateCpuUsage()).build();
|
||||
}
|
||||
|
||||
private double calculateCpuUsage() {
|
||||
// Read current container metrics
|
||||
final JsonNode containerStatsRootNode = readEcsMetadata(containerStatsUri);
|
||||
|
||||
final long cpuUsage = containerStatsRootNode
|
||||
.path("cpu_stats")
|
||||
.path("cpu_usage")
|
||||
.path("total_usage")
|
||||
.asLong();
|
||||
final long systemCpuUsage = containerStatsRootNode
|
||||
.path("cpu_stats")
|
||||
.path("system_cpu_usage")
|
||||
.asLong();
|
||||
final long prevCpuUsage = containerStatsRootNode
|
||||
.path("precpu_stats")
|
||||
.path("cpu_usage")
|
||||
.path("total_usage")
|
||||
.asLong();
|
||||
final long prevSystemCpuUsage = containerStatsRootNode
|
||||
.path("precpu_stats")
|
||||
.path("system_cpu_usage")
|
||||
.asLong();
|
||||
|
||||
if (containerCpuLimit == -1 && onlineCpus == -1) {
|
||||
onlineCpus =
|
||||
containerStatsRootNode.path("cpu_stats").path("online_cpus").asDouble();
|
||||
containerCpuLimit = calculateContainerCpuLimit(onlineCpus);
|
||||
}
|
||||
|
||||
// precpu_stats values will be 0 if it is the first call
|
||||
if (prevCpuUsage == 0 && prevSystemCpuUsage == 0) {
|
||||
return 0D;
|
||||
}
|
||||
|
||||
final long cpuUsageDiff = cpuUsage - prevCpuUsage;
|
||||
final long systemCpuUsageDiff = systemCpuUsage - prevSystemCpuUsage;
|
||||
|
||||
// Edge case when there is no systemCpu usage, then that means that 100% of the cpu is used.
|
||||
if (systemCpuUsageDiff == 0) {
|
||||
return 100D;
|
||||
}
|
||||
|
||||
// This value is not a percent, but rather how much CPU core time was consumed. i.e. this number can be
|
||||
// 2.2 which stands for 2.2 CPU cores were fully utilized. If this number is less than 1 than that means
|
||||
// that less than 1 CPU core was used.
|
||||
final double cpuCoreTimeUsed = ((double) cpuUsageDiff) / systemCpuUsageDiff * onlineCpus;
|
||||
|
||||
// This calculated value is cpu utilization percent. This can burst past 100%, but we will take min with 100%
|
||||
// because only this amount is guaranteed CPU time to the container
|
||||
return Math.min(100.0, cpuCoreTimeUsed / containerCpuLimit * 100.0);
|
||||
}
|
||||
|
||||
/**
|
||||
* All containers in an ECS task can use up to the task level CPU limit. However, CPU is shared among all containers
|
||||
* in the task according to the relative ratio of CPU shares allocated to each container.
|
||||
* i.e.
|
||||
* CPU limit of task is 8 cores
|
||||
* Container 1 with 10 CPU shares
|
||||
* Container 2 with 30 CPU shares
|
||||
* Sum of CPU shares is 40
|
||||
* Container 1 can use 25% of the 8 cores in CPU core time, so this function returns 2
|
||||
* Container 2 can use 75% of the 8 cores in CPU core time, so this function returns 6
|
||||
* @return the CPU core time allocated to the container
|
||||
*/
|
||||
private double calculateContainerCpuLimit(double onlineCpus) {
|
||||
// Read task metadata
|
||||
final JsonNode taskStatsRootNode = readEcsMetadata(taskMetadataUri);
|
||||
double taskCpuLimit = calculateTaskCpuLimit(taskStatsRootNode, onlineCpus);
|
||||
|
||||
// Read current container metadata
|
||||
final String currentContainerId =
|
||||
readEcsMetadata(containerMetadataUri).path("DockerId").asText();
|
||||
final Iterator<JsonNode> containersIterator =
|
||||
taskStatsRootNode.path("Containers").iterator();
|
||||
|
||||
// The default if this value is not provided is 2 CPU shares (in ECS agent versions >= 1.2.0)
|
||||
int currentContainerCpuShare = 2;
|
||||
int containersCpuShareSum = 0;
|
||||
while (containersIterator.hasNext()) {
|
||||
final JsonNode containerNode = containersIterator.next();
|
||||
final int containerCpuShare =
|
||||
containerNode.path("Limits").path("CPU").asInt();
|
||||
if (containerNode.path("DockerId").asText().equals(currentContainerId)) {
|
||||
currentContainerCpuShare = containerCpuShare;
|
||||
}
|
||||
containersCpuShareSum += containerCpuShare;
|
||||
}
|
||||
return ((double) currentContainerCpuShare) / containersCpuShareSum * taskCpuLimit;
|
||||
}
|
||||
|
||||
private double calculateTaskCpuLimit(JsonNode taskStatsRootNode, double onlineCpus) {
|
||||
final JsonNode limitsNode = taskStatsRootNode.path("Limits");
|
||||
if (limitsNode.isMissingNode()) {
|
||||
// Neither a memory limit nor cpu limit is set at the task level (possible on EC2 instances)
|
||||
return onlineCpus;
|
||||
}
|
||||
final JsonNode cpuLimitsNode = limitsNode.path("CPU");
|
||||
if (cpuLimitsNode.isMissingNode()) {
|
||||
// When only a memory limit is set at the task level (possible on ec2 instances)
|
||||
return onlineCpus;
|
||||
}
|
||||
return cpuLimitsNode.asDouble();
|
||||
}
|
||||
|
||||
private JsonNode readEcsMetadata(String uri) {
|
||||
if (this.containerMetadataUri == null) {
|
||||
throw new IllegalArgumentException("No ECS metadata endpoint found from environment variables.");
|
||||
}
|
||||
|
||||
URL url;
|
||||
try {
|
||||
url = new URL(uri);
|
||||
} catch (MalformedURLException e) {
|
||||
throw new IllegalArgumentException(
|
||||
"CpuWorkerMetrics is not configured properly. ECS metadata url is malformed", e);
|
||||
}
|
||||
try {
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
final JsonNode rootNode =
|
||||
mapper.readValue(new InputStreamReader(url.openStream(), Charset.defaultCharset()), JsonNode.class);
|
||||
return rootNode;
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException("Error in parsing ECS metadata", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public OperatingRange getOperatingRange() {
|
||||
return operatingRange;
|
||||
}
|
||||
|
||||
@Override
|
||||
public WorkerMetricType getWorkerMetricType() {
|
||||
return CPU_WORKER_METRICS_TYPE;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,108 @@
|
|||
package software.amazon.kinesis.worker.metric.impl.jmx;
|
||||
|
||||
import java.lang.management.ManagementFactory;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import javax.management.MBeanServerConnection;
|
||||
import javax.management.ObjectName;
|
||||
import javax.management.openmbean.CompositeDataSupport;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||
|
||||
/**
|
||||
* Memory WorkerMetricStats that reads the heap memory after GC. The way memory usage is calculated that, all the
|
||||
* available memory pools are read except Eden (as this is allocation buffer) and used memory and total memory is
|
||||
* computed.
|
||||
* Then percentage is computed by dividing used memory by total memory.
|
||||
*
|
||||
*/
|
||||
@RequiredArgsConstructor
|
||||
public class HeapMemoryAfterGCWorkerMetric implements WorkerMetric {
|
||||
|
||||
private static final WorkerMetricType MEMORY_WORKER_METRICS_TYPE = WorkerMetricType.MEMORY;
|
||||
|
||||
private final OperatingRange operatingRange;
|
||||
|
||||
private Set<ObjectName> garbageCollectorMxBeans;
|
||||
private Set<String> memoryPoolNames;
|
||||
|
||||
@Override
|
||||
public String getShortName() {
|
||||
return MEMORY_WORKER_METRICS_TYPE.getShortName();
|
||||
}
|
||||
|
||||
@Override
|
||||
public WorkerMetricValue capture() {
|
||||
return WorkerMetricValue.builder()
|
||||
.value(getAfterGCMemoryUsage(ManagementFactory.getPlatformMBeanServer()))
|
||||
.build();
|
||||
}
|
||||
|
||||
private double getAfterGCMemoryUsage(final MBeanServerConnection connection) {
|
||||
try {
|
||||
if (garbageCollectorMxBeans == null) {
|
||||
garbageCollectorMxBeans = connection.queryNames(
|
||||
new ObjectName(ManagementFactory.GARBAGE_COLLECTOR_MXBEAN_DOMAIN_TYPE + ",*"), null);
|
||||
|
||||
memoryPoolNames = new HashSet<String>();
|
||||
for (ObjectName on : garbageCollectorMxBeans) {
|
||||
String[] poolNames = (String[]) connection.getAttribute(on, "MemoryPoolNames");
|
||||
// A given MemoryPool may be associated with multiple GarbageCollectors,
|
||||
// but will appear only once in memoryPoolNames
|
||||
Collections.addAll(memoryPoolNames, poolNames);
|
||||
}
|
||||
}
|
||||
|
||||
// Report on the sum of non-Eden HEAP spaces after the last gc
|
||||
Long used, max;
|
||||
long usedKb = 0, totalKb = 0;
|
||||
|
||||
for (String poolName : memoryPoolNames) {
|
||||
if (!poolName.contains("Eden")) {
|
||||
// Ignore Eden, since it's just an allocation buffer
|
||||
ObjectName on =
|
||||
new ObjectName(ManagementFactory.MEMORY_POOL_MXBEAN_DOMAIN_TYPE + ",name=" + poolName);
|
||||
String mt = (String) connection.getAttribute(on, "Type");
|
||||
if (mt.equals("HEAP")) {
|
||||
// Paranoia: ignore non-HEAP memory pools
|
||||
CompositeDataSupport data =
|
||||
(CompositeDataSupport) connection.getAttribute(on, "CollectionUsage");
|
||||
|
||||
used = (Long) data.get("used");
|
||||
usedKb += used / 1024;
|
||||
|
||||
max = (Long) data.get("max");
|
||||
// max can be undefined (-1)
|
||||
// http://docs.oracle.com/javase/7/docs/api/java/lang/management/MemoryUsage.html
|
||||
totalKb += max == -1 ? 0 : max / 1024;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (totalKb <= 0) {
|
||||
throw new IllegalArgumentException("Total memory value for JVM is greater than zero");
|
||||
}
|
||||
|
||||
return 100.0 * (double) usedKb / (double) totalKb;
|
||||
} catch (final Exception e) {
|
||||
if (e instanceof IllegalArgumentException) {
|
||||
throw (IllegalArgumentException) e;
|
||||
}
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public OperatingRange getOperatingRange() {
|
||||
return operatingRange;
|
||||
}
|
||||
|
||||
@Override
|
||||
public WorkerMetricType getWorkerMetricType() {
|
||||
return MEMORY_WORKER_METRICS_TYPE;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,133 @@
|
|||
package software.amazon.kinesis.worker.metric.impl.linux;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||
|
||||
/**
|
||||
* Reads CPU usage statistics out of /proc/stat file that is present on the EC2 instances. The value is % utilization
|
||||
* of the CPU.
|
||||
* When this is invoked for the first time, the value returned is always 0 as the prev values are not available
|
||||
* to calculate the diff. If the file hasn't changed this also returns 0.
|
||||
* In case the file is not present or any other exception occurs, this throws IllegalArgumentException.
|
||||
*/
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor(access = AccessLevel.PACKAGE)
|
||||
public class LinuxCpuWorkerMetric implements WorkerMetric {
|
||||
|
||||
private static final Object LOCK_OBJECT = new Object();
|
||||
private static final WorkerMetricType CPU_WORKER_METRICS_TYPE = WorkerMetricType.CPU;
|
||||
private final OperatingRange operatingRange;
|
||||
private final String statFile;
|
||||
private long lastUsr, lastIow, lastSys, lastIdl, lastTot;
|
||||
private String lastLine;
|
||||
|
||||
public LinuxCpuWorkerMetric(final OperatingRange operatingRange) {
|
||||
this(operatingRange, "/proc/stat");
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getShortName() {
|
||||
return CPU_WORKER_METRICS_TYPE.getShortName();
|
||||
}
|
||||
|
||||
@Override
|
||||
public WorkerMetricValue capture() {
|
||||
return WorkerMetricValue.builder().value(calculateCpuUsage()).build();
|
||||
}
|
||||
|
||||
private double calculateCpuUsage() {
|
||||
BufferedReader bufferedReader = null;
|
||||
try {
|
||||
|
||||
final File stat = new File(statFile);
|
||||
if (stat.exists()) {
|
||||
|
||||
bufferedReader = new BufferedReader(new FileReader(stat));
|
||||
final String line = bufferedReader.readLine();
|
||||
final String[] lineVals = line.split("\\s+");
|
||||
|
||||
long usr = Long.parseLong(lineVals[1]) + Long.parseLong(lineVals[2]);
|
||||
long sys = Long.parseLong(lineVals[3]);
|
||||
long idl = Long.parseLong(lineVals[4]);
|
||||
long iow = Long.parseLong(lineVals[5]);
|
||||
long tot = usr + sys + idl + iow;
|
||||
long diffIdl = -1;
|
||||
long diffTot = -1;
|
||||
|
||||
boolean skip = false;
|
||||
synchronized (LOCK_OBJECT) {
|
||||
if (lastUsr == 0 || line.equals(lastLine)) {
|
||||
// Case where this is a first call so no diff available or
|
||||
// /proc/stat file is not updated since last time.
|
||||
skip = true;
|
||||
}
|
||||
|
||||
diffIdl = Math.abs(idl - lastIdl);
|
||||
diffTot = Math.abs(tot - lastTot);
|
||||
if (diffTot < diffIdl) {
|
||||
log.warn(
|
||||
"diffTot is less than diff_idle. \nPrev cpu line : {} and current cpu line : {} ",
|
||||
lastLine,
|
||||
line);
|
||||
if (iow < lastIow) {
|
||||
// this is case where current iow value less than prev, this can happen in rare cases as per
|
||||
// https://docs.kernel.org/filesystems/proc.html, and when the worker is idle
|
||||
// there is no increase in usr or sys values as well resulting in diffTot < diffIdl as
|
||||
// current tot increases less than current idl
|
||||
// return 0 in this case as this is the case where worker is not doing anything anyways.
|
||||
skip = true;
|
||||
}
|
||||
}
|
||||
lastUsr = usr;
|
||||
lastSys = sys;
|
||||
lastIdl = idl;
|
||||
lastIow = iow;
|
||||
lastTot = usr + sys + idl + iow;
|
||||
lastLine = line;
|
||||
}
|
||||
|
||||
if (skip) {
|
||||
return 0D;
|
||||
}
|
||||
|
||||
return ((double) (diffTot - diffIdl) / (double) diffTot) * 100.0;
|
||||
|
||||
} else {
|
||||
throw new IllegalArgumentException(String.format(
|
||||
"LinuxCpuWorkerMetric is not configured properly, file : %s does not exists", this.statFile));
|
||||
}
|
||||
} catch (final Throwable t) {
|
||||
if (t instanceof IllegalArgumentException) {
|
||||
throw (IllegalArgumentException) t;
|
||||
}
|
||||
throw new IllegalArgumentException(
|
||||
"LinuxCpuWorkerMetric failed to read metric stats or not configured properly.", t);
|
||||
} finally {
|
||||
try {
|
||||
if (bufferedReader != null) {
|
||||
bufferedReader.close();
|
||||
}
|
||||
} catch (Throwable x) {
|
||||
log.warn("Failed to close bufferedReader ", x);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public OperatingRange getOperatingRange() {
|
||||
return operatingRange;
|
||||
}
|
||||
|
||||
@Override
|
||||
public WorkerMetricType getWorkerMetricType() {
|
||||
return CPU_WORKER_METRICS_TYPE;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
package software.amazon.kinesis.worker.metric.impl.linux;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Stopwatch;
|
||||
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||
|
||||
/**
|
||||
* Ref java doc for {@link LinuxNetworkWorkerMetricBase}
|
||||
*/
|
||||
public class LinuxNetworkInWorkerMetric extends LinuxNetworkWorkerMetricBase {
|
||||
private static final WorkerMetricType NETWORK_IN_WORKER_METRICS_TYPE = WorkerMetricType.NETWORK_IN;
|
||||
|
||||
public LinuxNetworkInWorkerMetric(
|
||||
final OperatingRange operatingRange, final String interfaceName, final double maxBandwidthInMB) {
|
||||
this(operatingRange, interfaceName, DEFAULT_NETWORK_STAT_FILE, maxBandwidthInMB, Stopwatch.createUnstarted());
|
||||
}
|
||||
|
||||
public LinuxNetworkInWorkerMetric(final OperatingRange operatingRange, final double maxBandwidthInMB) {
|
||||
this(
|
||||
operatingRange,
|
||||
DEFAULT_INTERFACE_NAME,
|
||||
DEFAULT_NETWORK_STAT_FILE,
|
||||
maxBandwidthInMB,
|
||||
Stopwatch.createUnstarted());
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
LinuxNetworkInWorkerMetric(
|
||||
final OperatingRange operatingRange,
|
||||
final String interfaceName,
|
||||
final String statFile,
|
||||
final double maxBandwidthInMB,
|
||||
final Stopwatch stopwatch) {
|
||||
super(operatingRange, interfaceName, statFile, maxBandwidthInMB, stopwatch);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected WorkerMetricType getWorkerMetricsType() {
|
||||
return NETWORK_IN_WORKER_METRICS_TYPE;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
package software.amazon.kinesis.worker.metric.impl.linux;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Stopwatch;
|
||||
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||
|
||||
/**
|
||||
* Ref java doc for {@link LinuxNetworkWorkerMetricBase}
|
||||
*/
|
||||
public class LinuxNetworkOutWorkerMetric extends LinuxNetworkWorkerMetricBase {
|
||||
private static final WorkerMetricType NETWORK_OUT_WORKER_METRICS_TYPE = WorkerMetricType.NETWORK_OUT;
|
||||
|
||||
public LinuxNetworkOutWorkerMetric(
|
||||
final OperatingRange operatingRange, final String interfaceName, final double maxBandwidthInMB) {
|
||||
this(operatingRange, interfaceName, DEFAULT_NETWORK_STAT_FILE, maxBandwidthInMB, Stopwatch.createUnstarted());
|
||||
}
|
||||
|
||||
public LinuxNetworkOutWorkerMetric(final OperatingRange operatingRange, final double maxBandwidthInMB) {
|
||||
this(
|
||||
operatingRange,
|
||||
DEFAULT_INTERFACE_NAME,
|
||||
DEFAULT_NETWORK_STAT_FILE,
|
||||
maxBandwidthInMB,
|
||||
Stopwatch.createUnstarted());
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
LinuxNetworkOutWorkerMetric(
|
||||
final OperatingRange operatingRange,
|
||||
final String interfaceName,
|
||||
final String statFile,
|
||||
final double maxBandwidthInMB,
|
||||
final Stopwatch stopwatch) {
|
||||
super(operatingRange, interfaceName, statFile, maxBandwidthInMB, stopwatch);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected WorkerMetricType getWorkerMetricsType() {
|
||||
return NETWORK_OUT_WORKER_METRICS_TYPE;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,188 @@
|
|||
package software.amazon.kinesis.worker.metric.impl.linux;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.time.Duration;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.base.Stopwatch;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||
|
||||
/**
|
||||
* Base class for EC2NetworkWorkerMetrics, this reads and parses /proc/net/dev file and look for the specific
|
||||
* interface and reads received and transmitted bytes.
|
||||
* To get the percentage of bandwidth consumed, the fetch bytes are converted to per second (based on the interval
|
||||
* between invocation) and percentage is calculated by dividing it by the maximum bandwidth in MBps.
|
||||
*
|
||||
* When this is invoked for the first time, the value returned is always 0 as the prev values are not available
|
||||
* to calculate the diff.
|
||||
* In case the stat file is not present or any other exception occurs, this throws IllegalArgumentException.
|
||||
*/
|
||||
@Slf4j
|
||||
public abstract class LinuxNetworkWorkerMetricBase implements WorkerMetric {
|
||||
|
||||
protected static final String DEFAULT_NETWORK_STAT_FILE = "/proc/net/dev";
|
||||
protected static final String DEFAULT_INTERFACE_NAME = "eth0";
|
||||
private final Object lockObject = new Object();
|
||||
|
||||
private final OperatingRange operatingRange;
|
||||
private final String interfaceName;
|
||||
private final String statFile;
|
||||
private final double maxBandwidthInMBps;
|
||||
// Stopwatch to keep track of elapsed time between invocation.
|
||||
private final Stopwatch stopwatch;
|
||||
|
||||
public LinuxNetworkWorkerMetricBase(
|
||||
final OperatingRange operatingRange,
|
||||
final String interfaceName,
|
||||
final String statFile,
|
||||
final double maxBandwidthInMBps,
|
||||
final Stopwatch stopwatch) {
|
||||
Preconditions.checkArgument(maxBandwidthInMBps > 0, "maxBandwidthInMBps should be greater than 0.");
|
||||
this.operatingRange = operatingRange;
|
||||
this.interfaceName = interfaceName;
|
||||
this.statFile = statFile;
|
||||
this.maxBandwidthInMBps = maxBandwidthInMBps;
|
||||
this.stopwatch = stopwatch;
|
||||
}
|
||||
|
||||
private long lastRx = -1;
|
||||
private long lastTx = -1;
|
||||
|
||||
@Override
|
||||
public String getShortName() {
|
||||
return getWorkerMetricsType().getShortName();
|
||||
}
|
||||
|
||||
@Override
|
||||
public OperatingRange getOperatingRange() {
|
||||
return this.operatingRange;
|
||||
}
|
||||
|
||||
@Override
|
||||
public WorkerMetricType getWorkerMetricType() {
|
||||
return getWorkerMetricsType();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the stat file and find the total bytes (in and out) and divide it by the time elapsed since last read to
|
||||
* get the bytes per second.
|
||||
* Converts the bytes per second to MBps and then normalizes it to a percentage of the maximum bandwidth.
|
||||
* @return WorkerMetricValue with the % of network bandwidth consumed.
|
||||
*/
|
||||
@Override
|
||||
public WorkerMetricValue capture() {
|
||||
final double percentageOfMaxBandwidth =
|
||||
convertToMBps(calculateNetworkUsage().get(getWorkerMetricsType())) / maxBandwidthInMBps * 100;
|
||||
return WorkerMetricValue.builder()
|
||||
// If maxBandwidthInMBps is less than utilized (could be wrong configuration),
|
||||
// default to 100 % bandwidth utilization.
|
||||
.value(Math.min(100, percentageOfMaxBandwidth))
|
||||
.build();
|
||||
}
|
||||
|
||||
private double convertToMBps(final long bytes) {
|
||||
final double elapsedTimeInSecond;
|
||||
if (!stopwatch.isRunning()) {
|
||||
// stopwatch is not running during the first request only, in this case assume 1 second as elapsed as
|
||||
// during the first request even bytes are zero, any value of elapsedTimeInSecond does not have any effect.
|
||||
elapsedTimeInSecond = 1.0;
|
||||
} else {
|
||||
// Specifically, getting nanos and converting to seconds to get the decimal precision.
|
||||
elapsedTimeInSecond = (double) stopwatch.elapsed().toNanos()
|
||||
/ Duration.ofSeconds(1).toNanos();
|
||||
}
|
||||
stopwatch.reset().start();
|
||||
// Convert bytes to MB
|
||||
final double totalDataMB = (double) bytes / (1024 * 1024);
|
||||
if (elapsedTimeInSecond == 0) {
|
||||
// This should never happen, as getting called twice within 1 nanoSecond is never expected.
|
||||
// If this happens something is real wrong.
|
||||
throw new IllegalArgumentException("elapsedTimeInSecond is zero which in incorrect");
|
||||
}
|
||||
return totalDataMB / elapsedTimeInSecond;
|
||||
}
|
||||
|
||||
protected abstract WorkerMetricType getWorkerMetricsType();
|
||||
|
||||
/**
|
||||
* Returns the absolute bytes in and out since the last invocation of the method.
|
||||
* @return Map of WorkerMetricType to bytes
|
||||
*/
|
||||
private Map<WorkerMetricType, Long> calculateNetworkUsage() {
|
||||
BufferedReader bufferedReader = null;
|
||||
try {
|
||||
final File net = new File(statFile);
|
||||
if (net.exists()) {
|
||||
bufferedReader = new BufferedReader(new FileReader(net));
|
||||
|
||||
// skip over header lines
|
||||
bufferedReader.readLine();
|
||||
bufferedReader.readLine();
|
||||
|
||||
// find specified interface
|
||||
String line = bufferedReader.readLine();
|
||||
while (line != null && !line.matches("^\\s*" + interfaceName + ":.*")) {
|
||||
line = bufferedReader.readLine();
|
||||
}
|
||||
if (line == null) {
|
||||
throw new IllegalArgumentException(
|
||||
"Failed to parse the file and find interface : " + interfaceName);
|
||||
}
|
||||
|
||||
int n = line.indexOf(':') + 1;
|
||||
line = line.substring(n).trim();
|
||||
String[] parts = line.split("\\s+");
|
||||
|
||||
long rx = Long.parseLong(parts[0]);
|
||||
long tx = Long.parseLong(parts[8]);
|
||||
long diffRx = -1, diffTx = -1;
|
||||
boolean skip = false;
|
||||
synchronized (lockObject) {
|
||||
if (lastRx == -1) {
|
||||
skip = true;
|
||||
} else {
|
||||
diffRx = Math.abs(rx - lastRx);
|
||||
diffTx = Math.abs(tx - lastTx);
|
||||
}
|
||||
lastRx = rx;
|
||||
lastTx = tx;
|
||||
}
|
||||
|
||||
if (skip) {
|
||||
return createResponse(0L, 0L);
|
||||
}
|
||||
|
||||
return createResponse(diffRx, diffTx);
|
||||
} else {
|
||||
throw new IllegalArgumentException(String.format(
|
||||
"NetworkWorkerMetrics is not configured properly, file : %s does not exists", this.statFile));
|
||||
}
|
||||
} catch (final Throwable t) {
|
||||
if (t instanceof IllegalArgumentException) {
|
||||
throw (IllegalArgumentException) t;
|
||||
}
|
||||
throw new IllegalArgumentException("Cannot read/parse " + this.statFile, t);
|
||||
} finally {
|
||||
try {
|
||||
if (bufferedReader != null) {
|
||||
bufferedReader.close();
|
||||
}
|
||||
} catch (Throwable x) {
|
||||
log.warn("Failed to close bufferedReader ", x);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Map<WorkerMetricType, Long> createResponse(final long diffRx, final long diffTx) {
|
||||
return ImmutableMap.of(
|
||||
WorkerMetricType.NETWORK_IN, diffRx,
|
||||
WorkerMetricType.NETWORK_OUT, diffTx);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,302 @@
|
|||
package software.amazon.kinesis.worker.metricstats;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.enhanced.dynamodb.mapper.annotations.DynamoDbAttribute;
|
||||
import software.amazon.awssdk.enhanced.dynamodb.mapper.annotations.DynamoDbBean;
|
||||
import software.amazon.awssdk.enhanced.dynamodb.mapper.annotations.DynamoDbIgnore;
|
||||
import software.amazon.awssdk.enhanced.dynamodb.mapper.annotations.DynamoDbPartitionKey;
|
||||
import software.amazon.kinesis.utils.ExponentialMovingAverage;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||
|
||||
import static java.util.Objects.isNull;
|
||||
|
||||
/**
|
||||
* DataModel for a WorkerMetric, this data model is used to store the current state of a Worker in terms of relevant
|
||||
* WorkerMetric(CPU, Memory, Network).
|
||||
*
|
||||
* workerId : unique worker identifier, this is equivalent to the owner attribute from the lease table.
|
||||
* lastUpdateTime : wall epoch in seconds when the entry was last updated
|
||||
* metricStats : Map of WorkerMetric to last N values for it. e.g. entry "CPU" : [10,20,12,10] etc
|
||||
* operatingRange : Map of WorkerMetric to its operating range. First item in the list of values defines the max limit.
|
||||
* metricStatsMap : runtime computed WorkerMetric name to its average value map. This field is not stored in ddb
|
||||
* and is used during Lease assignment only
|
||||
*/
|
||||
@Data
@Builder
@DynamoDbBean
@NoArgsConstructor
@AllArgsConstructor(access = AccessLevel.PRIVATE)
@Slf4j
public class WorkerMetricStats {

    // Short DDB attribute names to keep item size small.
    static final String KEY_LAST_UPDATE_TIME = "lut";
    static final String KEY_WORKER_ID = "wid";

    // Partition key: unique worker identifier (same value as the lease-table owner attribute).
    @Getter(onMethod_ = {@DynamoDbPartitionKey, @DynamoDbAttribute(KEY_WORKER_ID)})
    private String workerId;

    // Wall-clock epoch seconds when this entry was last updated by the worker.
    @Getter(onMethod_ = {@DynamoDbAttribute(KEY_LAST_UPDATE_TIME)})
    private Long lastUpdateTime;

    // WorkerMetric name -> last N raw values reported, e.g. "CPU" : [10, 20, 12, 10].
    @Getter(onMethod_ = {@DynamoDbAttribute("sts")})
    private Map<String, List<Double>> metricStats;

    // WorkerMetric name -> operating range values; index 0 is the max utilization limit.
    @Getter(onMethod_ = {@DynamoDbAttribute("opr")})
    private Map<String, List<Long>> operatingRange;

    /**
     * This map contains the WorkerMetric to its metric stat value. Metric stat value stored in this is exponentially averaged over
     * available number of different datapoints.
     */
    @Getter(onMethod_ = {@DynamoDbIgnore})
    @EqualsAndHashCode.Exclude
    @Builder.Default
    private Map<String, Double> metricStatsMap = new HashMap<>();

    /**
     * Alpha value used to compute the exponential moving average for worker metrics values.
     */
    @Getter(onMethod_ = {@DynamoDbIgnore})
    @EqualsAndHashCode.Exclude
    @Builder.Default
    private double emaAlpha = 0.2;

    /**
     * Returns true if given {@param workerMetricName} is available for the current worker else false
     */
    public boolean containsMetricStat(final String workerMetricName) {
        return metricStats.containsKey(workerMetricName);
    }

    /**
     * Returns the value for given WorkerMetricStats name.
     * Lazily computes (and caches in metricStatsMap) the exponential moving average of the raw values.
     */
    public double getMetricStat(final String workerMetricName) {
        return metricStatsMap.computeIfAbsent(workerMetricName, (key) -> computeAverage(metricStats.get(key)));
    }

    /**
     * Increase the WorkerMetricStats value by given increaseLoadPercentage. This is done during execution of LAM and
     * as assignments are happening the current metric stat value is increased based on increaseLoadPercentage.
     */
    public void extrapolateMetricStatValuesForAddedThroughput(
            final Map<String, Double> workerMetricsToFleetLevelAverageMap,
            final double averageThroughput,
            final double increaseThroughput,
            final double averageLeaseCount) {

        // Replace every cached average with its extrapolated value in place.
        metricStatsMap.replaceAll((key, value) -> extrapolateMetricsValue(
                key,
                workerMetricsToFleetLevelAverageMap.get(key),
                averageThroughput,
                increaseThroughput,
                averageLeaseCount));
    }

    /**
     * Projects the metric value after taking on additional throughput: the added load is assumed to scale
     * the fleet-level average proportionally to throughput; when throughput is unknown (<= 0), falls back
     * to a per-lease share of the fleet-level average.
     */
    private double extrapolateMetricsValue(
            final String metricName,
            final double fleetLevelMetricAverage,
            final double averageThroughput,
            final double increaseThroughput,
            final double averageLeaseCount) {

        if (averageThroughput > 0) {
            return metricStatsMap.get(metricName) + increaseThroughput * fleetLevelMetricAverage / averageThroughput;
        } else {
            return metricStatsMap.get(metricName) + fleetLevelMetricAverage / averageLeaseCount;
        }
    }

    /**
     * Returns true if adding the given throughput would push any metric stat above either the fleet-level
     * average for that metric or its configured operating-range limit (index 0 of the operatingRange list).
     */
    public boolean willAnyMetricStatsGoAboveAverageUtilizationOrOperatingRange(
            final Map<String, Double> workerMetricsToFleetLevelAverageMap,
            final double averageThroughput,
            final double increaseThroughput,
            final double averageLeaseCount) {
        for (final String metricStatName : metricStats.keySet()) {
            final double fleetLevelAverageForMetric = workerMetricsToFleetLevelAverageMap.get(metricStatName);
            final double updatedValueToBe = extrapolateMetricsValue(
                    metricStatName,
                    fleetLevelAverageForMetric,
                    averageThroughput,
                    increaseThroughput,
                    averageLeaseCount);

            if (updatedValueToBe > fleetLevelAverageForMetric
                    || updatedValueToBe > operatingRange.get(metricStatName).get(0)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Increase the metric stat value corresponding to the added single lease. This is done during execution of LAM and
     * as assignments are happening the load is increase for LAM to determine workers for assignment.
     * The increase is done considering that for a WorkerMetric the fleet level average would be met when fleet level
     * average leases are assigned to a worker and thus 1 lease addition increases the metric stat value by fleet level
     * average of metric stat by averageLeaseCount
     */
    public void extrapolateMetricStatValuesForAddedLease(
            final Map<String, Double> workerMetricToFleetLevelAverage, final int averageLeaseCount) {
        for (Map.Entry<String, Double> workerMetricToMetricStat : metricStatsMap.entrySet()) {
            final String workerMetricName = workerMetricToMetricStat.getKey();
            final Double updatedValue = workerMetricToMetricStat.getValue()
                    + workerMetricToFleetLevelAverage.get(workerMetricName) / averageLeaseCount;
            metricStatsMap.replace(workerMetricName, updatedValue);
        }
    }

    /**
     * Determines percentage of load to reach the mean for the worker. In case of multiple worker metrics the metric stat
     * value closest to mean is used to determine the percentage value. This value is indication of how much load in
     * percentage to current load the worker can take to reach mean value.
     * @param workerMetricToFleetLevelAverage : WorkerMetric to fleet level mean value.
     * @return percentage to reach mean based on the WorkerMetric closest to its corresponding average.
     */
    public double computePercentageToReachAverage(final Map<String, Double> workerMetricToFleetLevelAverage) {
        // NOTE(review): if metricStats is empty this returns Double.MAX_VALUE — confirm callers handle that.
        double minDifferencePercentage = Double.MAX_VALUE;
        for (final String workerMetricName : metricStats.keySet()) {
            final double metricStatValue = getMetricStat(workerMetricName);
            final double differenceRatio;
            if (metricStatValue == 0D) {
                // If metric stat value is 0 that means this worker does not have any load so we assume that this worker
                // can take 100% more load than the current to reach average.
                differenceRatio = 1;
            } else {
                differenceRatio =
                        (workerMetricToFleetLevelAverage.get(workerMetricName) - metricStatValue) / metricStatValue;
            }
            minDifferencePercentage = Math.min(minDifferencePercentage, differenceRatio);
        }
        return minDifferencePercentage;
    }

    /**
     * Computes the exponential moving average (using emaAlpha) of the given raw values;
     * returns 0 for an empty list.
     */
    private Double computeAverage(final List<Double> values) {
        if (values.isEmpty()) {
            return 0D;
        }
        final ExponentialMovingAverage average = new ExponentialMovingAverage(emaAlpha);
        // Ignore -1 which denotes the WorkerMetric failure when calculating average, as it possible in past
        // one of the value is -1 due to some intermediate failure, and it has recovered since.
        values.forEach(value -> {
            if (value != -1) {
                average.add(value);
            }
        });
        return average.getValue();
    }

    /**
     * Returns true if any of the metric stat values has -1 in last index which represents that the metric stat value
     * was not successfully fetched in last attempt by worker.
     *
     * @return true if any metric stat value has -1 in last index, false otherwise.
     */
    public boolean isAnyWorkerMetricFailing() {
        boolean response = false;
        // Workers on the default (throughput-based) WorkerMetric have no explicit stats to fail.
        if (isUsingDefaultWorkerMetric()) {
            return response;
        }
        for (final Map.Entry<String, List<Double>> resourceStatsEntry : metricStats.entrySet()) {
            if (resourceStatsEntry.getValue().isEmpty()) {
                continue;
            }
            // Only the most recent value determines "currently failing".
            final Double lastEntry = resourceStatsEntry
                    .getValue()
                    .get(resourceStatsEntry.getValue().size() - 1);
            if (lastEntry != null && lastEntry == -1D) {
                response = true;
                break;
            }
        }
        if (response) {
            log.warn("WorkerStats: {} has a WorkerMetric which is failing.", this);
        }
        return response;
    }

    /**
     * WorkerMetricStats entry is invalid
     * if any of the field from lastUpdateTime, operatingRange, resourcesStats are not present or
     * if resourcesStats is empty or
     * if any of the WorkerMetrics having resourceStats does not have operatingRange or
     * if operating range values are not present or
     * if maxUtilization is 0 for any WorkerMetric
     * @return true if the entry is valid false otherwise.
     */
    public boolean isValidWorkerMetric() {
        if (isNull(lastUpdateTime)) {
            return false;
        }
        // Default-WorkerMetric workers carry no stats/operating range, which is still valid.
        if (isUsingDefaultWorkerMetric()) {
            return true;
        }
        if (isNull(metricStats) || isNull(operatingRange)) {
            return false;
        }
        // Every reported metric must have a corresponding operating range.
        for (final Map.Entry<String, List<Double>> entry : metricStats.entrySet()) {
            if (!operatingRange.containsKey(entry.getKey())) {
                return false;
            }
        }
        for (final Map.Entry<String, List<Long>> operatingRangeEntry : operatingRange.entrySet()) {
            // If operatingRange for a WorkerMetric is missing or if maxUtilization is 0 then its not valid entry.
            if (operatingRangeEntry.getValue().isEmpty()
                    || operatingRangeEntry.getValue().get(0) == 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns true if any current metric stat value is above the fleet-level average for that metric,
     * or above its operating-range limit.
     */
    public boolean isAnyWorkerMetricAboveAverageUtilizationOrOperatingRange(
            final Map<String, Double> workerMetricToFleetLevelAverage) {
        for (final String workerMetricName : metricStats.keySet()) {
            final double value = getMetricStat(workerMetricName);
            if (value > workerMetricToFleetLevelAverage.get(workerMetricName)) {
                return true;
            }
        }
        // check if any metric stat value is above operating range.
        return workerMetricToFleetLevelAverage.keySet().stream().anyMatch(this::isWorkerMetricAboveOperatingRange);
    }

    /**
     * If a worker is not using an explicit WorkerMetric such as CPU, Memory, or Network, then it
     * is said to be using the default WorkerMetric. Load management then falls back to throughput.
     * @return true if the worker is not using an explicit WorkerMetric.
     */
    public boolean isUsingDefaultWorkerMetric() {
        if ((metricStats == null || metricStats.isEmpty()) && (operatingRange == null || operatingRange.isEmpty())) {
            return true;
        }
        // A stats entry keyed by THROUGHPUT also indicates the default WorkerMetric.
        if (metricStats != null) {
            return metricStats.entrySet().stream()
                    .anyMatch(entry -> entry.getKey().equals(WorkerMetricType.THROUGHPUT.name()));
        }
        return false;
    }

    /**
     * Evaluates if the given metric stat is above operatingRange for the given WorkerMetric name. If the WorkerMetric
     * does not exist returns false
     * @param workerMetricName WorkerMetric name to evaluate
     * @return true if metric stat exists and is above operatingRange for the WorkerMetric
     */
    public boolean isWorkerMetricAboveOperatingRange(final String workerMetricName) {
        return metricStatsMap.containsKey(workerMetricName)
                && metricStatsMap.get(workerMetricName)
                        > operatingRange.get(workerMetricName).get(0);
    }
}
|
||||
|
|
@ -0,0 +1,219 @@
|
|||
package software.amazon.kinesis.worker.metricstats;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.CompletionException;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import software.amazon.awssdk.core.waiters.WaiterResponse;
|
||||
import software.amazon.awssdk.enhanced.dynamodb.DynamoDbAsyncTable;
|
||||
import software.amazon.awssdk.enhanced.dynamodb.DynamoDbEnhancedAsyncClient;
|
||||
import software.amazon.awssdk.enhanced.dynamodb.Expression;
|
||||
import software.amazon.awssdk.enhanced.dynamodb.Key;
|
||||
import software.amazon.awssdk.enhanced.dynamodb.TableSchema;
|
||||
import software.amazon.awssdk.enhanced.dynamodb.model.DeleteItemEnhancedRequest;
|
||||
import software.amazon.awssdk.enhanced.dynamodb.model.UpdateItemEnhancedRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.DynamoDbAsyncClient;
|
||||
import software.amazon.awssdk.services.dynamodb.model.AttributeValue;
|
||||
import software.amazon.awssdk.services.dynamodb.model.BillingMode;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ConditionalCheckFailedException;
|
||||
import software.amazon.awssdk.services.dynamodb.model.DescribeTableRequest;
|
||||
import software.amazon.awssdk.services.dynamodb.model.DescribeTableResponse;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ProvisionedThroughput;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ResourceNotFoundException;
|
||||
import software.amazon.awssdk.services.dynamodb.model.TableDescription;
|
||||
import software.amazon.awssdk.services.dynamodb.model.TableStatus;
|
||||
import software.amazon.awssdk.services.dynamodb.waiters.DynamoDbAsyncWaiter;
|
||||
import software.amazon.kinesis.leases.LeaseManagementConfig.WorkerMetricsTableConfig;
|
||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||
|
||||
import static java.util.Objects.nonNull;
|
||||
import static software.amazon.kinesis.worker.metricstats.WorkerMetricStats.KEY_LAST_UPDATE_TIME;
|
||||
import static software.amazon.kinesis.worker.metricstats.WorkerMetricStats.KEY_WORKER_ID;
|
||||
|
||||
@Slf4j
public class WorkerMetricStatsDAO {
    private final DynamoDbEnhancedAsyncClient dynamoDbEnhancedAsyncClient;
    // Enhanced-client view of the worker metric stats table, mapped to the WorkerMetricStats bean.
    private final DynamoDbAsyncTable<WorkerMetricStats> table;
    private final DynamoDbAsyncClient dynamoDbAsyncClient;
    private final WorkerMetricsTableConfig tableConfig;
    // Reporter interval in millis; used to judge staleness of lastUpdateTime in validateWorkerMetrics.
    private final Long workerMetricsReporterFrequencyMillis;

    /**
     * @param dynamoDbAsyncClient low-level async DynamoDB client
     * @param tableConfig table name/billing configuration for the worker metrics table
     * @param workerMetricsReporterFrequencyMillis how often workers report stats (millis)
     */
    public WorkerMetricStatsDAO(
            final DynamoDbAsyncClient dynamoDbAsyncClient,
            final WorkerMetricsTableConfig tableConfig,
            final Long workerMetricsReporterFrequencyMillis) {
        this.dynamoDbAsyncClient = dynamoDbAsyncClient;
        this.dynamoDbEnhancedAsyncClient = DynamoDbEnhancedAsyncClient.builder()
                .dynamoDbClient(dynamoDbAsyncClient)
                .build();
        this.table = dynamoDbEnhancedAsyncClient.table(
                tableConfig.tableName(), TableSchema.fromBean(WorkerMetricStats.class));
        this.tableConfig = tableConfig;
        this.workerMetricsReporterFrequencyMillis = workerMetricsReporterFrequencyMillis;
    }

    /**
     * Performs initialization of the WorkerMetricStats DAO and table.
     * This will create the table if it doesn't exist.
     */
    public void initialize() throws DependencyException {
        createTableIfDoesNotExist();
    }

    /**
     * Updates the workerMetrics for the provided worker, method ignores the null attributes and overrides
     * the only non-null from {@param workerMetrics}. This is a blocking call.
     *
     * @param workerMetrics : Updated WorkerMetricStats object, resourceStats, workerId and lastUpdateTime are
     *                        required fields from {@param workerMetrics}
     */
    public void updateMetrics(final WorkerMetricStats workerMetrics) {
        validateWorkerMetrics(workerMetrics);
        // ignoreNulls(true) makes this a partial update: null attributes are left untouched in DDB.
        final UpdateItemEnhancedRequest<WorkerMetricStats> request = UpdateItemEnhancedRequest.builder(
                        WorkerMetricStats.class)
                .item(workerMetrics)
                .ignoreNulls(true)
                .build();
        unwrappingFuture(() -> table.updateItem(request));
    }

    /**
     * Deletes the WorkerMetricStats entry with conditional check on lastUpdateTime, if the worker has come alive and
     * updated the lastUpdateTime then we no longer need to perform the deletion.
     * @param workerMetrics WorkerMetricStats that needs to be deleted.
     * @return true if the delete succeeded, false if the conditional check failed (worker updated since).
     */
    public boolean deleteMetrics(final WorkerMetricStats workerMetrics) {
        Preconditions.checkArgument(nonNull(workerMetrics.getWorkerId()), "WorkerID is not provided");
        Preconditions.checkArgument(nonNull(workerMetrics.getLastUpdateTime()), "LastUpdateTime is not provided");

        // Condition: the stored lastUpdateTime (#key -> "lut") still equals the provided value AND the
        // item still exists (attribute_exists on the partition-key attribute "wid").
        // NOTE(review): the "#key" placeholder maps to KEY_LAST_UPDATE_TIME while the format string inserts
        // KEY_WORKER_ID into attribute_exists — this reads as intentional but is easy to misread; verify.
        final DeleteItemEnhancedRequest request = DeleteItemEnhancedRequest.builder()
                .key(Key.builder().partitionValue(workerMetrics.getWorkerId()).build())
                .conditionExpression(Expression.builder()
                        .expression(String.format("#key = :value AND attribute_exists (%s)", KEY_WORKER_ID))
                        .expressionNames(ImmutableMap.of("#key", KEY_LAST_UPDATE_TIME))
                        .expressionValues(ImmutableMap.of(
                                ":value", AttributeValue.fromN(Long.toString(workerMetrics.getLastUpdateTime()))))
                        .build())
                .build();

        try {
            unwrappingFuture(() -> table.deleteItem(request));
            return true;
        } catch (final ConditionalCheckFailedException e) {
            // Expected when the worker has reported again since; deletion is simply skipped.
            log.warn(
                    "Failed to delete the WorkerMetricStats due to conditional failure for worker : {}",
                    workerMetrics,
                    e);
            return false;
        }
    }

    /**
     * Validates that the given stats carry non-empty metric values and a fresh lastUpdateTime.
     *
     * @throws IllegalArgumentException if metric stats are missing/empty or lastUpdateTime is absent/stale.
     */
    private void validateWorkerMetrics(final WorkerMetricStats workerMetrics) {
        Preconditions.checkArgument(nonNull(workerMetrics.getMetricStats()), "ResourceMetrics not provided");

        final List<String> entriesWithoutValues = workerMetrics.getMetricStats().entrySet().stream()
                .filter(entry -> entry.getValue() == null || entry.getValue().isEmpty())
                .map(Map.Entry::getKey)
                .collect(Collectors.toList());

        Preconditions.checkArgument(
                entriesWithoutValues.isEmpty(), "Following metric stats dont have any values " + entriesWithoutValues);

        Preconditions.checkArgument(nonNull(workerMetrics.getLastUpdateTime()), "LastUpdateTime field not set");

        // If the LastUpdateTime field is 2x older than the reporter interval, it is considered stale.
        // lastUpdateTime is epoch seconds; the elapsed duration is compared in millis.
        Preconditions.checkArgument(
                Duration.between(Instant.ofEpochSecond(workerMetrics.getLastUpdateTime()), Instant.now())
                                .toMillis()
                        < 2 * workerMetricsReporterFrequencyMillis,
                "LastUpdateTime is more than 2x older than workerMetricsReporterFrequencyMillis");
    }

    /**
     * Performs the scan on the storage and returns list of all workerMetricStats objects.
     *
     * @return : List of all worker metric stats
     */
    public List<WorkerMetricStats> getAllWorkerMetricStats() {
        log.debug("Scanning DDB table {}", table.tableName());
        final List<WorkerMetricStats> workerMetricStats = new ArrayList<>();
        // subscribe() returns a CompletableFuture that completes when the full scan is drained;
        // unwrappingFuture blocks on it, so the list is fully populated on return.
        unwrappingFuture(() -> table.scan().items().subscribe(workerMetricStats::add));
        return workerMetricStats;
    }

    /**
     * Describes the table, returning null (instead of throwing) when it does not exist.
     */
    private TableDescription getTableDescription() {
        try {
            final DescribeTableResponse response = unwrappingFuture(() -> dynamoDbAsyncClient.describeTable(
                    DescribeTableRequest.builder().tableName(table.tableName()).build()));
            return response.table();
        } catch (final ResourceNotFoundException e) {
            return null;
        }
    }

    /**
     * Creates the worker metric stats table if absent, then blocks (up to 10 minutes) until it is ACTIVE.
     *
     * @throws DependencyException if waiting for table creation times out.
     */
    private void createTableIfDoesNotExist() throws DependencyException {
        TableDescription tableDescription = getTableDescription();
        if (tableDescription == null) {
            unwrappingFuture(getWorkerMetricsDynamoTableCreator());
            tableDescription = getTableDescription();
            log.info("Table : {} created.", table.tableName());
        } else {
            log.info("Table : {} already existing, skipping creation...", table.tableName());
        }

        if (tableDescription.tableStatus() != TableStatus.ACTIVE) {
            log.info("Waiting for DDB Table: {} to become active", table.tableName());
            try (final DynamoDbAsyncWaiter waiter = dynamoDbAsyncClient.waiter()) {
                final WaiterResponse<DescribeTableResponse> response =
                        unwrappingFuture(() -> waiter.waitUntilTableExists(
                                r -> r.tableName(table.tableName()), o -> o.waitTimeout(Duration.ofMinutes(10))));
                // An empty matched().response() means the waiter gave up; surface as DependencyException.
                response.matched()
                        .response()
                        .orElseThrow(() -> new DependencyException(new IllegalStateException(
                                "Creating WorkerMetricStats table timed out",
                                response.matched().exception().orElse(null))));
            }
        }
    }

    /**
     * Returns a supplier that creates the table in PROVISIONED mode (with configured RCU/WCU)
     * or on-demand otherwise, per the table config's billing mode.
     */
    @NotNull
    private Supplier<CompletableFuture<Void>> getWorkerMetricsDynamoTableCreator() {
        final Supplier<CompletableFuture<Void>> tableCreator;
        if (tableConfig.billingMode() == BillingMode.PROVISIONED) {
            log.info(
                    "Creating worker metric stats table {} in provisioned mode with {}wcu and {}rcu",
                    tableConfig.tableName(),
                    tableConfig.writeCapacity(),
                    tableConfig.readCapacity());
            tableCreator = () -> table.createTable(r -> r.provisionedThroughput(ProvisionedThroughput.builder()
                    .readCapacityUnits(tableConfig.readCapacity())
                    .writeCapacityUnits(tableConfig.writeCapacity())
                    .build()));
        } else {
            tableCreator = table::createTable;
        }
        return tableCreator;
    }

    /**
     * Blocks on the supplied future, unwrapping a CompletionException to rethrow the underlying
     * RuntimeException cause (so callers can catch e.g. ConditionalCheckFailedException directly).
     */
    static <T> T unwrappingFuture(final Supplier<CompletableFuture<T>> supplier) {
        try {
            return supplier.get().join();
        } catch (final CompletionException e) {
            if (e.getCause() instanceof RuntimeException) {
                throw (RuntimeException) e.getCause();
            }
            throw e;
        }
    }
}
|
||||
|
|
@ -0,0 +1,227 @@
|
|||
package software.amazon.kinesis.worker.metricstats;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
import java.math.RoundingMode;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.ScheduledFuture;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import com.google.common.collect.EvictingQueue;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.Queues;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||
import software.amazon.awssdk.utils.ThreadFactoryBuilder;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||
import software.amazon.kinesis.metrics.MetricsScope;
|
||||
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||
|
||||
/**
|
||||
* WorkerMetricStatsManager is a class that manages the collection of raw WorkerMetricStats values for the list of WorkerMetricStats
|
||||
* periodically and store in a bounded in-memory queue.
|
||||
* This class runs a periodic thread at every {@link #inMemoryStatsCaptureThreadFrequencyMillis} interval which
|
||||
* captures each WorkerMetricStats's raw value and stores them in {@link #workerMetricsToRawHighFreqValuesMap} for each.
|
||||
* When computeStats is invoked, the method drains the in-memory raw values queue for each WorkerMetricStats and computes the
|
||||
* average and stores the computed average in #computedAverageStats for each WorkerMetricStats.
|
||||
* For each WorkerMetricStats last {@link #maxMetricStatsCount} values are captured in {@link #computedAverageMetrics}
|
||||
*
|
||||
* This class is thread safe.
|
||||
*/
|
||||
@Slf4j
|
||||
@KinesisClientInternalApi
|
||||
public final class WorkerMetricStatsManager {
|
||||
|
||||
/**
|
||||
* 6 digit after decimal
|
||||
*/
|
||||
private static final int DEFAULT_AVERAGE_VALUES_DIGIT_AFTER_DECIMAL = 6;
|
||||
|
||||
private static final String METRICS_OPERATION_WORKER_STATS_REPORTER = "WorkerMetricStatsReporter";
|
||||
static final String METRICS_IN_MEMORY_REPORTER_FAILURE = "InMemoryMetricStatsReporterFailure";
|
||||
// 1 value per sec gives 5 minutes worth of past data for 300 count which is sufficient.
|
||||
// In case of reporter running more frequently than 5 minutes the queue will not reach this value anyway.
|
||||
private static final int HIGH_FREQUENCY_STATS_COUNT = 300;
|
||||
private static final long SCHEDULER_SHUTDOWN_TIMEOUT_SECONDS = 60L;
|
||||
|
||||
private final ScheduledExecutorService scheduledExecutorService;
|
||||
/**
|
||||
* Max count of values per WorkerMetricStats that is recorded in the storage.
|
||||
*/
|
||||
private final int maxMetricStatsCount;
|
||||
/**
|
||||
* List of WorkerMetricStats configured for the application, the values from these will be recorded in the storage.
|
||||
*/
|
||||
private final List<WorkerMetric> workerMetricList;
|
||||
/**
|
||||
* Map of WorkerMetricStats to its trailing (#maxMetricStatsCount) values.
|
||||
*/
|
||||
@Getter(AccessLevel.PACKAGE)
|
||||
private final Map<WorkerMetric, Queue<Double>> computedAverageMetrics;
|
||||
/**
|
||||
* Map of the WorkerMetricStats to its raw values since the last flush to storage was done.
|
||||
*/
|
||||
@Getter(AccessLevel.PACKAGE)
|
||||
private final Map<WorkerMetric, Queue<Double>> workerMetricsToRawHighFreqValuesMap;
|
||||
/**
|
||||
* Frequency for capturing raw WorkerMetricsValues in millis.
|
||||
*/
|
||||
private final long inMemoryStatsCaptureThreadFrequencyMillis;
|
||||
|
||||
private final MetricsFactory metricsFactory;
|
||||
private ScheduledFuture<?> managerProcessFuture;
|
||||
|
||||
public WorkerMetricStatsManager(
|
||||
final int maxMetricStatsCount,
|
||||
final List<WorkerMetric> workerMetricList,
|
||||
final MetricsFactory metricsFactory,
|
||||
long inMemoryStatsCaptureThreadFrequencyMillis) {
|
||||
// Set thread as daemon to not block VM from exit.
|
||||
this.scheduledExecutorService = Executors.newScheduledThreadPool(
|
||||
1,
|
||||
new ThreadFactoryBuilder()
|
||||
.daemonThreads(true)
|
||||
.threadNamePrefix("worker-metrics-manager")
|
||||
.build());
|
||||
this.maxMetricStatsCount = maxMetricStatsCount;
|
||||
this.workerMetricList = workerMetricList;
|
||||
this.computedAverageMetrics = new HashMap<>();
|
||||
this.workerMetricsToRawHighFreqValuesMap = new HashMap<>();
|
||||
this.metricsFactory = metricsFactory;
|
||||
this.inMemoryStatsCaptureThreadFrequencyMillis = inMemoryStatsCaptureThreadFrequencyMillis;
|
||||
init();
|
||||
}
|
||||
|
||||
private void init() {
|
||||
for (final WorkerMetric workerMetric : workerMetricList) {
|
||||
computedAverageMetrics.put(workerMetric, EvictingQueue.create(maxMetricStatsCount));
|
||||
workerMetricsToRawHighFreqValuesMap.put(
|
||||
workerMetric, Queues.synchronizedQueue(EvictingQueue.create(HIGH_FREQUENCY_STATS_COUNT)));
|
||||
}
|
||||
log.info(
|
||||
"Completed initialization with maxMetricStatsCount : {} and total WorkerMetricStats : {}",
|
||||
maxMetricStatsCount,
|
||||
workerMetricList.size());
|
||||
}
|
||||
|
||||
public void startManager() {
|
||||
managerProcessFuture = scheduledExecutorService.scheduleWithFixedDelay(
|
||||
this::recordWorkerMetrics, 0, inMemoryStatsCaptureThreadFrequencyMillis, TimeUnit.MILLISECONDS);
|
||||
log.info("Started manager process...");
|
||||
}
|
||||
|
||||
public void stopManager() {
|
||||
if (managerProcessFuture != null) {
|
||||
managerProcessFuture.cancel(false);
|
||||
}
|
||||
if (!scheduledExecutorService.isShutdown()) {
|
||||
scheduledExecutorService.shutdown();
|
||||
try {
|
||||
if (scheduledExecutorService.awaitTermination(SCHEDULER_SHUTDOWN_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
|
||||
scheduledExecutorService.shutdownNow();
|
||||
}
|
||||
} catch (final InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
log.warn("Interrupted when shutting down the scheduler, forcing shutdown", e);
|
||||
scheduledExecutorService.shutdownNow();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void recordWorkerMetrics() {
|
||||
for (final WorkerMetric workerMetric : workerMetricList) {
|
||||
final Optional<Double> value = fetchWorkerMetricsValue(workerMetric);
|
||||
value.ifPresent(aDouble ->
|
||||
workerMetricsToRawHighFreqValuesMap.get(workerMetric).add(aDouble));
|
||||
}
|
||||
}
|
||||
|
||||
private Optional<Double> fetchWorkerMetricsValue(final WorkerMetric workerMetric) {
|
||||
try {
|
||||
final Double value = workerMetric.capture().getValue();
|
||||
return Optional.of(value);
|
||||
} catch (final Throwable throwable) {
|
||||
log.error(
|
||||
"WorkerMetricStats {} failure : ",
|
||||
workerMetric.getWorkerMetricType().name(),
|
||||
throwable);
|
||||
final MetricsScope scope =
|
||||
MetricsUtil.createMetricsWithOperation(metricsFactory, METRICS_OPERATION_WORKER_STATS_REPORTER);
|
||||
try {
|
||||
scope.addData(METRICS_IN_MEMORY_REPORTER_FAILURE, 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||
} finally {
|
||||
MetricsUtil.endScope(scope);
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the metric stats for each WorkerMetricStats by averaging the values in inMemoryQueue and returns last
|
||||
* {@link WorkerMetricStatsManager#maxMetricStatsCount } averaged values for each WorkerMetricStats.
|
||||
*
|
||||
* In the case of empty inMemoryQueue, computedStats has -1 value to denote that specific WorkerMetricStats has failed.
|
||||
* @return Map of WorkerMetricStats shortName to averaged {@link WorkerMetricStatsManager#maxMetricStatsCount } values.
|
||||
*/
|
||||
public synchronized Map<String, List<Double>> computeMetrics() {
|
||||
final Map<String, List<Double>> result = new HashMap<>();
|
||||
workerMetricsToRawHighFreqValuesMap.forEach((workerMetrics, statsQueue) -> {
|
||||
final List<Double> currentWorkerMetricsStats = drainQueue(statsQueue);
|
||||
|
||||
final Queue<Double> computedMetrics = computedAverageMetrics.get(workerMetrics);
|
||||
|
||||
if (currentWorkerMetricsStats.isEmpty()) {
|
||||
// In case currentWorkerMetricsStats is empty that means values from workerMetrics were not capture due
|
||||
// to some
|
||||
// reason, and thus there are no recent values, compute the value to be -1 to denote workerMetrics
|
||||
// failure
|
||||
computedMetrics.add(-1D);
|
||||
} else {
|
||||
computedMetrics.add(computeAverage(currentWorkerMetricsStats));
|
||||
}
|
||||
|
||||
result.put(workerMetrics.getShortName(), new ArrayList<>(computedMetrics));
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the operating range for each WorkerMetricStats that is registered.
|
||||
* @return Map of WorkerMetricStats to list of two values, first value is max utilization, and second value is variance %.
|
||||
*/
|
||||
public Map<String, List<Long>> getOperatingRange() {
|
||||
final Map<String, List<Long>> operatingRange = new HashMap<>();
|
||||
workerMetricList.forEach(
|
||||
workerMetrics -> operatingRange.put(workerMetrics.getShortName(), ImmutableList.of((long)
|
||||
workerMetrics.getOperatingRange().getMaxUtilization())));
|
||||
return operatingRange;
|
||||
}
|
||||
|
||||
private static List<Double> drainQueue(final Queue<Double> queue) {
|
||||
final List<Double> elements = new ArrayList<>();
|
||||
final int queueLength = queue.size();
|
||||
for (int i = 0; i < queueLength; ++i) {
|
||||
elements.add(queue.poll());
|
||||
}
|
||||
return elements;
|
||||
}
|
||||
|
||||
private Double computeAverage(final List<Double> values) {
|
||||
final double average =
|
||||
values.stream().mapToDouble(Double::doubleValue).average().orElse(0D);
|
||||
return BigDecimal.valueOf(average)
|
||||
.setScale(DEFAULT_AVERAGE_VALUES_DIGIT_AFTER_DECIMAL, RoundingMode.HALF_UP)
|
||||
.doubleValue();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
/*
|
||||
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||
* Licensed under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package software.amazon.kinesis.worker.metricstats;
|
||||
|
||||
import java.time.Instant;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||
import software.amazon.kinesis.metrics.MetricsScope;
|
||||
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||
|
||||
/**
 * Reporter that is periodically executed to report WorkerMetricStats. It collects
 * the in memory metric stats and writes into the DDB WorkerMetricStats table.
 *
 * <p>Each run computes the current averages, attaches the operating range and an
 * epoch-second timestamp, and persists the entry via {@link WorkerMetricStatsDAO};
 * success/latency are emitted under the "WorkerMetricStatsReporter" operation.
 */
@Slf4j
@RequiredArgsConstructor
@KinesisClientInternalApi
public class WorkerMetricStatsReporter implements Runnable {
    private final MetricsFactory metricsFactory;
    // Identifier of this worker; attached to the stats entry and to the emitted metrics.
    private final String workerIdentifier;
    // Source of the in-memory computed averages and operating range.
    private final WorkerMetricStatsManager workerMetricsManager;
    // Persistence layer for the DDB WorkerMetricStats table.
    private final WorkerMetricStatsDAO workerMetricsDAO;

    @Override
    public void run() {
        final MetricsScope scope = MetricsUtil.createMetricsWithOperation(metricsFactory, "WorkerMetricStatsReporter");
        final long startTime = System.currentTimeMillis();
        boolean success = false;
        try {
            /*
             * OperatingRange value fetched during the initialization and is same afterwards. It's possible
             * to update OperatingRange only in first call and then skip, but we do not want to do that to avoid
             * case where a worker can have a failure for some time and thus does not update the workerMetrics entry
             * and LeaseAssigmentManager cleans it and then worker ends updating entry without operating range.
             */
            final WorkerMetricStats workerMetrics = WorkerMetricStats.builder()
                    .workerId(workerIdentifier)
                    .metricStats(workerMetricsManager.computeMetrics())
                    .operatingRange(workerMetricsManager.getOperatingRange())
                    .lastUpdateTime(Instant.now().getEpochSecond())
                    .build();
            workerMetricsDAO.updateMetrics(workerMetrics);
            success = true;
        } catch (final Exception e) {
            // Best-effort: a failed report is logged and surfaced via the success metric; the
            // next scheduled run retries with freshly computed stats.
            log.error("Failed to update worker metric stats for worker : {}", workerIdentifier, e);
        } finally {
            // Scope must always be closed so metrics are flushed even on failure.
            MetricsUtil.addWorkerIdentifier(scope, workerIdentifier);
            MetricsUtil.addSuccessAndLatency(scope, success, startTime, MetricsLevel.SUMMARY);
            MetricsUtil.endScope(scope);
        }
    }
}
|
||||
|
|
@ -0,0 +1,111 @@
|
|||
package software.amazon.kinesis.worker.platform;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.jetbrains.annotations.VisibleForTesting;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
|
||||
import static software.amazon.kinesis.worker.platform.OperatingRangeDataProvider.LINUX_PROC;
|
||||
|
||||
/**
 * Provides resource metadata for EC2.
 *
 * <p>Platform detection is performed by querying the EC2 Instance Metadata Service (IMDS):
 * an IMDSv2 token is requested first, then the instance-identity document is fetched with
 * that token. A 200 response on the identity document means the worker is on EC2.
 */
@KinesisClientInternalApi
@Slf4j
public class Ec2Resource implements ResourceMetadataProvider {
    // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/retrieve-iid.html
    private static final String IMDS_URL = "http://169.254.169.254/latest/dynamic/instance-identity/document";
    private static final String TOKEN_URL = "http://169.254.169.254/latest/api/token";
    // Applied to both connect and read timeouts on every IMDS call.
    private static final int EC2_INSTANCE_METADATA_TIMEOUT_MILLIS = 5000;

    private final UrlOpener identityDocumentUrl;
    private final UrlOpener tokenUrl;

    @VisibleForTesting
    Ec2Resource(UrlOpener identityDocumentUrl, UrlOpener tokenUrl) {
        this.identityDocumentUrl = identityDocumentUrl;
        this.tokenUrl = tokenUrl;
    }

    /**
     * Factory method to create an instance of Ec2Resource.
     *
     * @return Ec2Resource instance
     */
    public static Ec2Resource create() {
        try {
            return new Ec2Resource(new UrlOpener(new URL(IMDS_URL)), new UrlOpener(new URL(TOKEN_URL)));
        } catch (MalformedURLException e) {
            // It should not throw unless it's unit testing.
            throw new IllegalArgumentException(e);
        }
    }

    // Returns true only when the identity-document endpoint answers 200; any exception
    // (timeout, unreachable endpoint off-EC2, etc.) is treated as "not EC2".
    private boolean isEc2() {
        try {
            final HttpURLConnection connection = identityDocumentUrl.openConnection();
            connection.setRequestMethod("GET");
            // IMDS v2 requires IMDS token
            // NOTE(review): fetchImdsToken() may return null (IMDSv1 or non-EC2); header is then
            // set with a null value — confirm HttpURLConnection tolerates this on all JDKs.
            connection.setRequestProperty("X-aws-ec2-metadata-token", fetchImdsToken());
            connection.setConnectTimeout(EC2_INSTANCE_METADATA_TIMEOUT_MILLIS);
            connection.setReadTimeout(EC2_INSTANCE_METADATA_TIMEOUT_MILLIS);
            if (connection.getResponseCode() == 200) {
                return true;
            }
        } catch (Exception e) {
            // TODO: probably need to add retries as well.
            log.error("Unable to retrieve instance metadata", e);
        }
        return false;
    }

    // Requests an IMDSv2 session token (TTL 600s) via PUT; returns null on any failure so the
    // caller can still attempt the identity-document call.
    private String fetchImdsToken() {
        try {
            final HttpURLConnection connection = tokenUrl.openConnection();
            connection.setRequestMethod("PUT");
            connection.setRequestProperty("X-aws-ec2-metadata-token-ttl-seconds", "600");
            connection.setConnectTimeout(EC2_INSTANCE_METADATA_TIMEOUT_MILLIS);
            connection.setReadTimeout(EC2_INSTANCE_METADATA_TIMEOUT_MILLIS);
            if (connection.getResponseCode() == 200) {
                // NOTE(review): the reader is never closed; relies on connection teardown/GC.
                return new BufferedReader(new InputStreamReader(tokenUrl.getInputStream(connection)))
                        .lines()
                        .collect(Collectors.joining());
            }
        } catch (Exception e) {
            log.warn(
                    "Unable to retrieve IMDS token. It could mean that the instance is not EC2 or is using IMDS V1", e);
        }
        return null;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean isOnPlatform() {
        return isEc2();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public ComputePlatform getPlatform() {
        return ComputePlatform.EC2;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Optional<OperatingRangeDataProvider> getOperatingRangeDataProvider() {
        // EC2 hosts are read via /proc; empty when the provider is unavailable (e.g. non-Linux).
        return Optional.of(LINUX_PROC).filter(OperatingRangeDataProvider::isProvider);
    }
}
|
||||
|
|
@ -0,0 +1,59 @@
|
|||
package software.amazon.kinesis.worker.platform;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.jetbrains.annotations.VisibleForTesting;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
|
||||
import static software.amazon.kinesis.worker.platform.OperatingRangeDataProvider.LINUX_ECS_METADATA_KEY_V4;
|
||||
|
||||
/**
|
||||
* Provides resource metadata for ECS.
|
||||
*/
|
||||
@KinesisClientInternalApi
|
||||
public class EcsResource implements ResourceMetadataProvider {
|
||||
static final String ECS_METADATA_KEY_V3 = "ECS_CONTAINER_METADATA_URI";
|
||||
static final String ECS_METADATA_KEY_V4 = "ECS_CONTAINER_METADATA_URI_V4";
|
||||
|
||||
private final Map<String, String> sysEnv;
|
||||
|
||||
@VisibleForTesting
|
||||
EcsResource(Map<String, String> sysEnv) {
|
||||
this.sysEnv = sysEnv;
|
||||
}
|
||||
|
||||
/**
|
||||
* Factory method to create an instance of EcsResource.
|
||||
*
|
||||
* @return an instance of EcsResource
|
||||
*/
|
||||
public static EcsResource create() {
|
||||
return new EcsResource(System.getenv());
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public boolean isOnPlatform() {
|
||||
return !sysEnv.getOrDefault(ECS_METADATA_KEY_V3, "").isEmpty()
|
||||
|| !sysEnv.getOrDefault(ECS_METADATA_KEY_V4, "").isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public ComputePlatform getPlatform() {
|
||||
return ComputePlatform.ECS;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public Optional<OperatingRangeDataProvider> getOperatingRangeDataProvider() {
|
||||
return Optional.of(LINUX_ECS_METADATA_KEY_V4).filter(OperatingRangeDataProvider::isProvider);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,61 @@
|
|||
package software.amazon.kinesis.worker.platform;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.jetbrains.annotations.VisibleForTesting;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
|
||||
import static software.amazon.kinesis.worker.platform.OperatingRangeDataProvider.LINUX_EKS_CGROUP_V1;
|
||||
import static software.amazon.kinesis.worker.platform.OperatingRangeDataProvider.LINUX_EKS_CGROUP_V2;
|
||||
|
||||
/**
|
||||
* Provides resource metadata for EKS.
|
||||
*/
|
||||
@KinesisClientInternalApi
|
||||
public class EksResource implements ResourceMetadataProvider {
|
||||
private static final String K8S_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token";
|
||||
private final String k8sTokenPath;
|
||||
|
||||
@VisibleForTesting
|
||||
EksResource(String k8sTokenPath) {
|
||||
this.k8sTokenPath = k8sTokenPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Factory method to create an instance of EksResource.
|
||||
*
|
||||
* @return an instance of EksResource
|
||||
*/
|
||||
public static EksResource create() {
|
||||
return new EksResource(K8S_TOKEN_PATH);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public boolean isOnPlatform() {
|
||||
return new File(this.k8sTokenPath).exists();
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public ComputePlatform getPlatform() {
|
||||
return ComputePlatform.EKS;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public Optional<OperatingRangeDataProvider> getOperatingRangeDataProvider() {
|
||||
// It is only possible that either cgroupv1 or cgroupv2 is mounted
|
||||
return Stream.of(LINUX_EKS_CGROUP_V2, LINUX_EKS_CGROUP_V1)
|
||||
.filter(OperatingRangeDataProvider::isProvider)
|
||||
.findFirst();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
package software.amazon.kinesis.worker.platform;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import static software.amazon.kinesis.worker.platform.EcsResource.ECS_METADATA_KEY_V4;
|
||||
|
||||
/**
|
||||
* Enum representing the different operating range metadata providers.
|
||||
*/
|
||||
public enum OperatingRangeDataProvider {
|
||||
LINUX_EKS_CGROUP_V1 {
|
||||
@Override
|
||||
public boolean isProvider() {
|
||||
if (!OperatingRangeDataProvider.isLinux()) {
|
||||
return false;
|
||||
}
|
||||
// Check if the cgroup v2 specific file does NOT exist
|
||||
final File cgroupV2File = new File("/sys/fs/cgroup/cgroup.controllers");
|
||||
if (cgroupV2File.exists()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check for common cgroup v1 directories like memory or cpu
|
||||
final File memoryCgroup = new File("/sys/fs/cgroup/memory");
|
||||
final File cpuCgroup = new File("/sys/fs/cgroup/cpu");
|
||||
|
||||
return memoryCgroup.exists() || cpuCgroup.exists();
|
||||
}
|
||||
},
|
||||
LINUX_EKS_CGROUP_V2 {
|
||||
@Override
|
||||
public boolean isProvider() {
|
||||
if (!OperatingRangeDataProvider.isLinux()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if the cgroup v2 specific file exists
|
||||
final File cgroupV2File = new File("/sys/fs/cgroup/cgroup.controllers");
|
||||
|
||||
return cgroupV2File.exists();
|
||||
}
|
||||
},
|
||||
LINUX_ECS_METADATA_KEY_V4 {
|
||||
@Override
|
||||
public boolean isProvider() {
|
||||
if (!OperatingRangeDataProvider.isLinux()) {
|
||||
return false;
|
||||
}
|
||||
return !System.getenv().getOrDefault(ECS_METADATA_KEY_V4, "").isEmpty();
|
||||
}
|
||||
},
|
||||
LINUX_PROC {
|
||||
@Override
|
||||
public boolean isProvider() {
|
||||
if (!OperatingRangeDataProvider.isLinux()) {
|
||||
return false;
|
||||
}
|
||||
// Check if /proc directory exists (common in Linux environments)
|
||||
return new File("/proc").exists();
|
||||
}
|
||||
};
|
||||
|
||||
private static boolean isLinux() {
|
||||
return System.getProperty("os.name").toLowerCase().contains("linux");
|
||||
}
|
||||
|
||||
/**
|
||||
* Abstract method to check if the provider is supported on the current platform.
|
||||
*
|
||||
* @return true if the provider is supported, false otherwise.
|
||||
*/
|
||||
public abstract boolean isProvider();
|
||||
}
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
package software.amazon.kinesis.worker.platform;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
|
||||
/**
 * Interface for providing resource metadata for worker.
 *
 * <p>Implementations detect a specific compute platform (EC2, ECS, EKS) and expose the
 * operating-range data provider appropriate for that platform.
 */
@KinesisClientInternalApi
public interface ResourceMetadataProvider {
    /**
     * Enum representing the different compute platforms.
     */
    enum ComputePlatform {
        EC2,
        ECS,
        EKS,
        UNKNOWN
    }

    /**
     * Check if the worker is running on the specific platform.
     *
     * @return true if the worker is running on the specific platform, false otherwise.
     */
    boolean isOnPlatform();

    /**
     * Get the name of the compute platform.
     *
     * @return the platform represented by the class.
     */
    ComputePlatform getPlatform();

    /**
     * Get the operating range data provider.
     *
     * @return the operating range data provider, empty when none is usable on this host.
     */
    Optional<OperatingRangeDataProvider> getOperatingRangeDataProvider();
}
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
package software.amazon.kinesis.worker.platform;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||
|
||||
/**
|
||||
* Utility class to open a URL and get the input stream.
|
||||
*/
|
||||
@RequiredArgsConstructor
|
||||
@KinesisClientInternalApi
|
||||
class UrlOpener {
|
||||
private final URL url;
|
||||
|
||||
/**
|
||||
* Open the URL and return the connection.
|
||||
*
|
||||
* @return a HttpURLConnection.
|
||||
* @throws IOException if a connection cannot be established.
|
||||
*/
|
||||
public HttpURLConnection openConnection() throws IOException {
|
||||
return (HttpURLConnection) url.openConnection();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the input stream from the connection.
|
||||
*
|
||||
* @param connection the connection to get the input stream from.
|
||||
* @return the InputStream for the data.
|
||||
* @throws IOException if an error occurs while getting the input stream.
|
||||
*/
|
||||
public InputStream getInputStream(HttpURLConnection connection) throws IOException {
|
||||
return connection.getInputStream();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1 @@
|
|||
Sample test ECS metadata for Amazon ECS task metadata v4. For more information, see https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-metadata-endpoint-v4-examples.html
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
{
|
||||
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||
"Name": "curl",
|
||||
"DockerName": "ecs-curltest-24-curl-cca48e8dcadd97805600",
|
||||
"Image": "111122223333.dkr.ecr.us-west-2.amazonaws.com/curltest:latest",
|
||||
"ImageID": "sha256:d691691e9652791a60114e67b365688d20d19940dde7c4736ea30e660d8d3553",
|
||||
"Labels": {
|
||||
"com.amazonaws.ecs.cluster": "default",
|
||||
"com.amazonaws.ecs.container-name": "curl",
|
||||
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/8f03e41243824aea923aca126495f665",
|
||||
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||
"com.amazonaws.ecs.task-definition-version": "24"
|
||||
},
|
||||
"DesiredStatus": "RUNNING",
|
||||
"KnownStatus": "RUNNING",
|
||||
"Limits": {
|
||||
"CPU": 50,
|
||||
"Memory": 128
|
||||
},
|
||||
"CreatedAt": "2020-10-02T00:15:07.620912337Z",
|
||||
"StartedAt": "2020-10-02T00:15:08.062559351Z",
|
||||
"Type": "NORMAL",
|
||||
"LogDriver": "awslogs",
|
||||
"LogOptions": {
|
||||
"awslogs-create-group": "true",
|
||||
"awslogs-group": "/ecs/metadata",
|
||||
"awslogs-region": "us-west-2",
|
||||
"awslogs-stream": "ecs/curl/8f03e41243824aea923aca126495f665"
|
||||
},
|
||||
"ContainerARN": "arn:aws:ecs:us-west-2:111122223333:container/0206b271-b33f-47ab-86c6-a0ba208a70a9",
|
||||
"Networks": [
|
||||
{
|
||||
"NetworkMode": "awsvpc",
|
||||
"IPv4Addresses": [
|
||||
"10.0.2.100"
|
||||
],
|
||||
"AttachmentIndex": 0,
|
||||
"MACAddress": "0e:9e:32:c7:48:85",
|
||||
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||
"PrivateDNSName": "ip-10-0-2-100.us-west-2.compute.internal",
|
||||
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,130 @@
|
|||
{
|
||||
"read": "2020-10-02T00:51:13.410254284Z",
|
||||
"preread": "2020-10-02T00:51:12.406202398Z",
|
||||
"pids_stats": {
|
||||
"current": 3
|
||||
},
|
||||
"blkio_stats": {
|
||||
"io_service_bytes_recursive": [
|
||||
|
||||
],
|
||||
"io_serviced_recursive": [
|
||||
|
||||
],
|
||||
"io_queue_recursive": [
|
||||
|
||||
],
|
||||
"io_service_time_recursive": [
|
||||
|
||||
],
|
||||
"io_wait_time_recursive": [
|
||||
|
||||
],
|
||||
"io_merged_recursive": [
|
||||
|
||||
],
|
||||
"io_time_recursive": [
|
||||
|
||||
],
|
||||
"sectors_recursive": [
|
||||
|
||||
]
|
||||
},
|
||||
"num_procs": 0,
|
||||
"storage_stats": {
|
||||
|
||||
},
|
||||
"cpu_stats": {
|
||||
"cpu_usage": {
|
||||
"total_usage": 150000000,
|
||||
"percpu_usage": [
|
||||
182359190,
|
||||
178608875
|
||||
],
|
||||
"usage_in_kernelmode": 40000000,
|
||||
"usage_in_usermode": 290000000
|
||||
},
|
||||
"system_cpu_usage": 200000000,
|
||||
"online_cpus": 2,
|
||||
"throttling_data": {
|
||||
"periods": 0,
|
||||
"throttled_periods": 0,
|
||||
"throttled_time": 0
|
||||
}
|
||||
},
|
||||
"precpu_stats": {
|
||||
"cpu_usage": {
|
||||
"total_usage": 0,
|
||||
"percpu_usage": [
|
||||
182359190,
|
||||
178608875
|
||||
],
|
||||
"usage_in_kernelmode": 40000000,
|
||||
"usage_in_usermode": 290000000
|
||||
},
|
||||
"system_cpu_usage": 0,
|
||||
"online_cpus": 2,
|
||||
"throttling_data": {
|
||||
"periods": 0,
|
||||
"throttled_periods": 0,
|
||||
"throttled_time": 0
|
||||
}
|
||||
},
|
||||
"memory_stats": {
|
||||
"usage": 1806336,
|
||||
"max_usage": 6299648,
|
||||
"stats": {
|
||||
"active_anon": 606208,
|
||||
"active_file": 0,
|
||||
"cache": 0,
|
||||
"dirty": 0,
|
||||
"hierarchical_memory_limit": 134217728,
|
||||
"hierarchical_memsw_limit": 268435456,
|
||||
"inactive_anon": 0,
|
||||
"inactive_file": 0,
|
||||
"mapped_file": 0,
|
||||
"pgfault": 4185,
|
||||
"pgmajfault": 0,
|
||||
"pgpgin": 2926,
|
||||
"pgpgout": 2778,
|
||||
"rss": 606208,
|
||||
"rss_huge": 0,
|
||||
"total_active_anon": 606208,
|
||||
"total_active_file": 0,
|
||||
"total_cache": 0,
|
||||
"total_dirty": 0,
|
||||
"total_inactive_anon": 0,
|
||||
"total_inactive_file": 0,
|
||||
"total_mapped_file": 0,
|
||||
"total_pgfault": 4185,
|
||||
"total_pgmajfault": 0,
|
||||
"total_pgpgin": 2926,
|
||||
"total_pgpgout": 2778,
|
||||
"total_rss": 606208,
|
||||
"total_rss_huge": 0,
|
||||
"total_unevictable": 0,
|
||||
"total_writeback": 0,
|
||||
"unevictable": 0,
|
||||
"writeback": 0
|
||||
},
|
||||
"limit": 134217728
|
||||
},
|
||||
"name": "/ecs-curltest-26-curl-c2e5f6e0cf91b0bead01",
|
||||
"id": "5fc21e5b015f899d22618f8aede80b6d70d71b2a75465ea49d9462c8f3d2d3af",
|
||||
"networks": {
|
||||
"eth0": {
|
||||
"rx_bytes": 84,
|
||||
"rx_packets": 2,
|
||||
"rx_errors": 0,
|
||||
"rx_dropped": 0,
|
||||
"tx_bytes": 84,
|
||||
"tx_packets": 2,
|
||||
"tx_errors": 0,
|
||||
"tx_dropped": 0
|
||||
}
|
||||
},
|
||||
"network_rate_stats": {
|
||||
"rx_bytes_per_sec": 0,
|
||||
"tx_bytes_per_sec": 0
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
{
|
||||
"Cluster": "default",
|
||||
"TaskARN": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||
"Family": "curltest",
|
||||
"ServiceName": "MyService",
|
||||
"Revision": "26",
|
||||
"DesiredStatus": "RUNNING",
|
||||
"KnownStatus": "RUNNING",
|
||||
"Limits": {
|
||||
"CPU": 4,
|
||||
"Memory": 128
|
||||
},
|
||||
"PullStartedAt": "2020-10-02T00:43:06.202617438Z",
|
||||
"PullStoppedAt": "2020-10-02T00:43:06.31288465Z",
|
||||
"AvailabilityZone": "us-west-2d",
|
||||
"VPCID": "vpc-1234567890abcdef0",
|
||||
"LaunchType": "EC2",
|
||||
"Containers": [
|
||||
{
|
||||
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||
"Name": "~internal~ecs~pause",
|
||||
"DockerName": "ecs-curltest-26-internalecspause-e292d586b6f9dade4a00",
|
||||
"Image": "amazon/amazon-ecs-pause:0.1.0",
|
||||
"ImageID": "",
|
||||
"Labels": {
|
||||
"com.amazonaws.ecs.cluster": "default",
|
||||
"com.amazonaws.ecs.container-name": "~internal~ecs~pause",
|
||||
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||
"com.amazonaws.ecs.task-definition-version": "26"
|
||||
},
|
||||
"DesiredStatus": "RESOURCES_PROVISIONED",
|
||||
"KnownStatus": "RESOURCES_PROVISIONED",
|
||||
"Limits": {
|
||||
"CPU": 50,
|
||||
"Memory": 128
|
||||
},
|
||||
"CreatedAt": "2020-10-02T00:43:05.602352471Z",
|
||||
"StartedAt": "2020-10-02T00:43:06.076707576Z",
|
||||
"Type": "CNI_PAUSE",
|
||||
"Networks": [
|
||||
{
|
||||
"NetworkMode": "awsvpc",
|
||||
"IPv4Addresses": [
|
||||
"10.0.2.61"
|
||||
],
|
||||
"AttachmentIndex": 0,
|
||||
"MACAddress": "0e:10:e2:01:bd:91",
|
||||
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||
"PrivateDNSName": "ip-10-0-2-61.us-west-2.compute.internal",
|
||||
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
{
|
||||
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||
"Name": "curl",
|
||||
"DockerName": "ecs-curltest-24-curl-cca48e8dcadd97805600",
|
||||
"Image": "111122223333.dkr.ecr.us-west-2.amazonaws.com/curltest:latest",
|
||||
"ImageID": "sha256:d691691e9652791a60114e67b365688d20d19940dde7c4736ea30e660d8d3553",
|
||||
"Labels": {
|
||||
"com.amazonaws.ecs.cluster": "default",
|
||||
"com.amazonaws.ecs.container-name": "curl",
|
||||
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/8f03e41243824aea923aca126495f665",
|
||||
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||
"com.amazonaws.ecs.task-definition-version": "24"
|
||||
},
|
||||
"DesiredStatus": "RUNNING",
|
||||
"KnownStatus": "RUNNING",
|
||||
"Limits": {
|
||||
"CPU": 50,
|
||||
"Memory": 128
|
||||
},
|
||||
"CreatedAt": "2020-10-02T00:15:07.620912337Z",
|
||||
"StartedAt": "2020-10-02T00:15:08.062559351Z",
|
||||
"Type": "NORMAL",
|
||||
"LogDriver": "awslogs",
|
||||
"LogOptions": {
|
||||
"awslogs-create-group": "true",
|
||||
"awslogs-group": "/ecs/metadata",
|
||||
"awslogs-region": "us-west-2",
|
||||
"awslogs-stream": "ecs/curl/8f03e41243824aea923aca126495f665"
|
||||
},
|
||||
"ContainerARN": "arn:aws:ecs:us-west-2:111122223333:container/0206b271-b33f-47ab-86c6-a0ba208a70a9",
|
||||
"Networks": [
|
||||
{
|
||||
"NetworkMode": "awsvpc",
|
||||
"IPv4Addresses": [
|
||||
"10.0.2.100"
|
||||
],
|
||||
"AttachmentIndex": 0,
|
||||
"MACAddress": "0e:9e:32:c7:48:85",
|
||||
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||
"PrivateDNSName": "ip-10-0-2-100.us-west-2.compute.internal",
|
||||
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,130 @@
|
|||
{
|
||||
"read": "2020-10-02T00:61:13.410254284Z",
|
||||
"preread": "2020-10-02T00:51:12.406202398Z",
|
||||
"pids_stats": {
|
||||
"current": 3
|
||||
},
|
||||
"blkio_stats": {
|
||||
"io_service_bytes_recursive": [
|
||||
|
||||
],
|
||||
"io_serviced_recursive": [
|
||||
|
||||
],
|
||||
"io_queue_recursive": [
|
||||
|
||||
],
|
||||
"io_service_time_recursive": [
|
||||
|
||||
],
|
||||
"io_wait_time_recursive": [
|
||||
|
||||
],
|
||||
"io_merged_recursive": [
|
||||
|
||||
],
|
||||
"io_time_recursive": [
|
||||
|
||||
],
|
||||
"sectors_recursive": [
|
||||
|
||||
]
|
||||
},
|
||||
"num_procs": 0,
|
||||
"storage_stats": {
|
||||
|
||||
},
|
||||
"cpu_stats": {
|
||||
"cpu_usage": {
|
||||
"total_usage": 150000000,
|
||||
"percpu_usage": [
|
||||
182359190,
|
||||
178608875
|
||||
],
|
||||
"usage_in_kernelmode": 40000000,
|
||||
"usage_in_usermode": 290000000
|
||||
},
|
||||
"system_cpu_usage": 100000000,
|
||||
"online_cpus": 2,
|
||||
"throttling_data": {
|
||||
"periods": 0,
|
||||
"throttled_periods": 0,
|
||||
"throttled_time": 0
|
||||
}
|
||||
},
|
||||
"precpu_stats": {
|
||||
"cpu_usage": {
|
||||
"total_usage": 100000000,
|
||||
"percpu_usage": [
|
||||
182359190,
|
||||
178608875
|
||||
],
|
||||
"usage_in_kernelmode": 40000000,
|
||||
"usage_in_usermode": 290000000
|
||||
},
|
||||
"system_cpu_usage": 100000000,
|
||||
"online_cpus": 2,
|
||||
"throttling_data": {
|
||||
"periods": 0,
|
||||
"throttled_periods": 0,
|
||||
"throttled_time": 0
|
||||
}
|
||||
},
|
||||
"memory_stats": {
|
||||
"usage": 1806336,
|
||||
"max_usage": 6299648,
|
||||
"stats": {
|
||||
"active_anon": 606208,
|
||||
"active_file": 0,
|
||||
"cache": 0,
|
||||
"dirty": 0,
|
||||
"hierarchical_memory_limit": 134217728,
|
||||
"hierarchical_memsw_limit": 268435456,
|
||||
"inactive_anon": 0,
|
||||
"inactive_file": 0,
|
||||
"mapped_file": 0,
|
||||
"pgfault": 4185,
|
||||
"pgmajfault": 0,
|
||||
"pgpgin": 2926,
|
||||
"pgpgout": 2778,
|
||||
"rss": 606208,
|
||||
"rss_huge": 0,
|
||||
"total_active_anon": 606208,
|
||||
"total_active_file": 0,
|
||||
"total_cache": 0,
|
||||
"total_dirty": 0,
|
||||
"total_inactive_anon": 0,
|
||||
"total_inactive_file": 0,
|
||||
"total_mapped_file": 0,
|
||||
"total_pgfault": 4185,
|
||||
"total_pgmajfault": 0,
|
||||
"total_pgpgin": 2926,
|
||||
"total_pgpgout": 2778,
|
||||
"total_rss": 606208,
|
||||
"total_rss_huge": 0,
|
||||
"total_unevictable": 0,
|
||||
"total_writeback": 0,
|
||||
"unevictable": 0,
|
||||
"writeback": 0
|
||||
},
|
||||
"limit": 134217728
|
||||
},
|
||||
"name": "/ecs-curltest-26-curl-c2e5f6e0cf91b0bead01",
|
||||
"id": "5fc21e5b015f899d22618f8aede80b6d70d71b2a75465ea49d9462c8f3d2d3af",
|
||||
"networks": {
|
||||
"eth0": {
|
||||
"rx_bytes": 84,
|
||||
"rx_packets": 2,
|
||||
"rx_errors": 0,
|
||||
"rx_dropped": 0,
|
||||
"tx_bytes": 84,
|
||||
"tx_packets": 2,
|
||||
"tx_errors": 0,
|
||||
"tx_dropped": 0
|
||||
}
|
||||
},
|
||||
"network_rate_stats": {
|
||||
"rx_bytes_per_sec": 0,
|
||||
"tx_bytes_per_sec": 0
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
{
|
||||
"Cluster": "default",
|
||||
"TaskARN": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||
"Family": "curltest",
|
||||
"ServiceName": "MyService",
|
||||
"Revision": "26",
|
||||
"DesiredStatus": "RUNNING",
|
||||
"KnownStatus": "RUNNING",
|
||||
"Limits": {
|
||||
"CPU": 4,
|
||||
"Memory": 128
|
||||
},
|
||||
"PullStartedAt": "2020-10-02T00:43:06.202617438Z",
|
||||
"PullStoppedAt": "2020-10-02T00:43:06.31288465Z",
|
||||
"AvailabilityZone": "us-west-2d",
|
||||
"VPCID": "vpc-1234567890abcdef0",
|
||||
"LaunchType": "EC2",
|
||||
"Containers": [
|
||||
{
|
||||
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||
"Name": "~internal~ecs~pause",
|
||||
"DockerName": "ecs-curltest-26-internalecspause-e292d586b6f9dade4a00",
|
||||
"Image": "amazon/amazon-ecs-pause:0.1.0",
|
||||
"ImageID": "",
|
||||
"Labels": {
|
||||
"com.amazonaws.ecs.cluster": "default",
|
||||
"com.amazonaws.ecs.container-name": "~internal~ecs~pause",
|
||||
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||
"com.amazonaws.ecs.task-definition-version": "26"
|
||||
},
|
||||
"DesiredStatus": "RESOURCES_PROVISIONED",
|
||||
"KnownStatus": "RESOURCES_PROVISIONED",
|
||||
"Limits": {
|
||||
"CPU": 50,
|
||||
"Memory": 128
|
||||
},
|
||||
"CreatedAt": "2020-10-02T00:43:05.602352471Z",
|
||||
"StartedAt": "2020-10-02T00:43:06.076707576Z",
|
||||
"Type": "CNI_PAUSE",
|
||||
"Networks": [
|
||||
{
|
||||
"NetworkMode": "awsvpc",
|
||||
"IPv4Addresses": [
|
||||
"10.0.2.61"
|
||||
],
|
||||
"AttachmentIndex": 0,
|
||||
"MACAddress": "0e:10:e2:01:bd:91",
|
||||
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||
"PrivateDNSName": "ip-10-0-2-61.us-west-2.compute.internal",
|
||||
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
{
|
||||
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||
"Name": "curl",
|
||||
"DockerName": "ecs-curltest-24-curl-cca48e8dcadd97805600",
|
||||
"Image": "111122223333.dkr.ecr.us-west-2.amazonaws.com/curltest:latest",
|
||||
"ImageID": "sha256:d691691e9652791a60114e67b365688d20d19940dde7c4736ea30e660d8d3553",
|
||||
"Labels": {
|
||||
"com.amazonaws.ecs.cluster": "default",
|
||||
"com.amazonaws.ecs.container-name": "curl",
|
||||
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/8f03e41243824aea923aca126495f665",
|
||||
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||
"com.amazonaws.ecs.task-definition-version": "24"
|
||||
},
|
||||
"DesiredStatus": "RUNNING",
|
||||
"KnownStatus": "RUNNING",
|
||||
"Limits": {
|
||||
"CPU": 50,
|
||||
"Memory": 128
|
||||
},
|
||||
"CreatedAt": "2020-10-02T00:15:07.620912337Z",
|
||||
"StartedAt": "2020-10-02T00:15:08.062559351Z",
|
||||
"Type": "NORMAL",
|
||||
"LogDriver": "awslogs",
|
||||
"LogOptions": {
|
||||
"awslogs-create-group": "true",
|
||||
"awslogs-group": "/ecs/metadata",
|
||||
"awslogs-region": "us-west-2",
|
||||
"awslogs-stream": "ecs/curl/8f03e41243824aea923aca126495f665"
|
||||
},
|
||||
"ContainerARN": "arn:aws:ecs:us-west-2:111122223333:container/0206b271-b33f-47ab-86c6-a0ba208a70a9",
|
||||
"Networks": [
|
||||
{
|
||||
"NetworkMode": "awsvpc",
|
||||
"IPv4Addresses": [
|
||||
"10.0.2.100"
|
||||
],
|
||||
"AttachmentIndex": 0,
|
||||
"MACAddress": "0e:9e:32:c7:48:85",
|
||||
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||
"PrivateDNSName": "ip-10-0-2-100.us-west-2.compute.internal",
|
||||
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,130 @@
|
|||
{
|
||||
"read": "2020-10-02T00:51:13.410254284Z",
|
||||
"preread": "2020-10-02T00:51:12.406202398Z",
|
||||
"pids_stats": {
|
||||
"current": 3
|
||||
},
|
||||
"blkio_stats": {
|
||||
"io_service_bytes_recursive": [
|
||||
|
||||
],
|
||||
"io_serviced_recursive": [
|
||||
|
||||
],
|
||||
"io_queue_recursive": [
|
||||
|
||||
],
|
||||
"io_service_time_recursive": [
|
||||
|
||||
],
|
||||
"io_wait_time_recursive": [
|
||||
|
||||
],
|
||||
"io_merged_recursive": [
|
||||
|
||||
],
|
||||
"io_time_recursive": [
|
||||
|
||||
],
|
||||
"sectors_recursive": [
|
||||
|
||||
]
|
||||
},
|
||||
"num_procs": 0,
|
||||
"storage_stats": {
|
||||
|
||||
},
|
||||
"cpu_stats": {
|
||||
"cpu_usage": {
|
||||
"total_usage": 150000000,
|
||||
"percpu_usage": [
|
||||
182359190,
|
||||
178608875
|
||||
],
|
||||
"usage_in_kernelmode": 40000000,
|
||||
"usage_in_usermode": 290000000
|
||||
},
|
||||
"system_cpu_usage": 200000000,
|
||||
"online_cpus": 2,
|
||||
"throttling_data": {
|
||||
"periods": 0,
|
||||
"throttled_periods": 0,
|
||||
"throttled_time": 0
|
||||
}
|
||||
},
|
||||
"precpu_stats": {
|
||||
"cpu_usage": {
|
||||
"total_usage": 100000000,
|
||||
"percpu_usage": [
|
||||
182359190,
|
||||
178608875
|
||||
],
|
||||
"usage_in_kernelmode": 40000000,
|
||||
"usage_in_usermode": 290000000
|
||||
},
|
||||
"system_cpu_usage": 100000000,
|
||||
"online_cpus": 2,
|
||||
"throttling_data": {
|
||||
"periods": 0,
|
||||
"throttled_periods": 0,
|
||||
"throttled_time": 0
|
||||
}
|
||||
},
|
||||
"memory_stats": {
|
||||
"usage": 1806336,
|
||||
"max_usage": 6299648,
|
||||
"stats": {
|
||||
"active_anon": 606208,
|
||||
"active_file": 0,
|
||||
"cache": 0,
|
||||
"dirty": 0,
|
||||
"hierarchical_memory_limit": 134217728,
|
||||
"hierarchical_memsw_limit": 268435456,
|
||||
"inactive_anon": 0,
|
||||
"inactive_file": 0,
|
||||
"mapped_file": 0,
|
||||
"pgfault": 4185,
|
||||
"pgmajfault": 0,
|
||||
"pgpgin": 2926,
|
||||
"pgpgout": 2778,
|
||||
"rss": 606208,
|
||||
"rss_huge": 0,
|
||||
"total_active_anon": 606208,
|
||||
"total_active_file": 0,
|
||||
"total_cache": 0,
|
||||
"total_dirty": 0,
|
||||
"total_inactive_anon": 0,
|
||||
"total_inactive_file": 0,
|
||||
"total_mapped_file": 0,
|
||||
"total_pgfault": 4185,
|
||||
"total_pgmajfault": 0,
|
||||
"total_pgpgin": 2926,
|
||||
"total_pgpgout": 2778,
|
||||
"total_rss": 606208,
|
||||
"total_rss_huge": 0,
|
||||
"total_unevictable": 0,
|
||||
"total_writeback": 0,
|
||||
"unevictable": 0,
|
||||
"writeback": 0
|
||||
},
|
||||
"limit": 134217728
|
||||
},
|
||||
"name": "/ecs-curltest-26-curl-c2e5f6e0cf91b0bead01",
|
||||
"id": "5fc21e5b015f899d22618f8aede80b6d70d71b2a75465ea49d9462c8f3d2d3af",
|
||||
"networks": {
|
||||
"eth0": {
|
||||
"rx_bytes": 84,
|
||||
"rx_packets": 2,
|
||||
"rx_errors": 0,
|
||||
"rx_dropped": 0,
|
||||
"tx_bytes": 84,
|
||||
"tx_packets": 2,
|
||||
"tx_errors": 0,
|
||||
"tx_dropped": 0
|
||||
}
|
||||
},
|
||||
"network_rate_stats": {
|
||||
"rx_bytes_per_sec": 0,
|
||||
"tx_bytes_per_sec": 0
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
{
|
||||
"Cluster": "default",
|
||||
"TaskARN": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||
"Family": "curltest",
|
||||
"ServiceName": "MyService",
|
||||
"Revision": "26",
|
||||
"DesiredStatus": "RUNNING",
|
||||
"KnownStatus": "RUNNING",
|
||||
"Limits": {
|
||||
"Memory": 128
|
||||
},
|
||||
"PullStartedAt": "2020-10-02T00:43:06.202617438Z",
|
||||
"PullStoppedAt": "2020-10-02T00:43:06.31288465Z",
|
||||
"AvailabilityZone": "us-west-2d",
|
||||
"VPCID": "vpc-1234567890abcdef0",
|
||||
"LaunchType": "EC2",
|
||||
"Containers": [
|
||||
{
|
||||
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||
"Name": "~internal~ecs~pause",
|
||||
"DockerName": "ecs-curltest-26-internalecspause-e292d586b6f9dade4a00",
|
||||
"Image": "amazon/amazon-ecs-pause:0.1.0",
|
||||
"ImageID": "",
|
||||
"Labels": {
|
||||
"com.amazonaws.ecs.cluster": "default",
|
||||
"com.amazonaws.ecs.container-name": "~internal~ecs~pause",
|
||||
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||
"com.amazonaws.ecs.task-definition-version": "26"
|
||||
},
|
||||
"DesiredStatus": "RESOURCES_PROVISIONED",
|
||||
"KnownStatus": "RESOURCES_PROVISIONED",
|
||||
"Limits": {
|
||||
"CPU": 50,
|
||||
"Memory": 128
|
||||
},
|
||||
"CreatedAt": "2020-10-02T00:43:05.602352471Z",
|
||||
"StartedAt": "2020-10-02T00:43:06.076707576Z",
|
||||
"Type": "CNI_PAUSE",
|
||||
"Networks": [
|
||||
{
|
||||
"NetworkMode": "awsvpc",
|
||||
"IPv4Addresses": [
|
||||
"10.0.2.61"
|
||||
],
|
||||
"AttachmentIndex": 0,
|
||||
"MACAddress": "0e:10:e2:01:bd:91",
|
||||
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||
"PrivateDNSName": "ip-10-0-2-61.us-west-2.compute.internal",
|
||||
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
{
|
||||
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||
"Name": "curl",
|
||||
"DockerName": "ecs-curltest-24-curl-cca48e8dcadd97805600",
|
||||
"Image": "111122223333.dkr.ecr.us-west-2.amazonaws.com/curltest:latest",
|
||||
"ImageID": "sha256:d691691e9652791a60114e67b365688d20d19940dde7c4736ea30e660d8d3553",
|
||||
"Labels": {
|
||||
"com.amazonaws.ecs.cluster": "default",
|
||||
"com.amazonaws.ecs.container-name": "curl",
|
||||
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/8f03e41243824aea923aca126495f665",
|
||||
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||
"com.amazonaws.ecs.task-definition-version": "24"
|
||||
},
|
||||
"DesiredStatus": "RUNNING",
|
||||
"KnownStatus": "RUNNING",
|
||||
"Limits": {
|
||||
"CPU": 50,
|
||||
"Memory": 128
|
||||
},
|
||||
"CreatedAt": "2020-10-02T00:15:07.620912337Z",
|
||||
"StartedAt": "2020-10-02T00:15:08.062559351Z",
|
||||
"Type": "NORMAL",
|
||||
"LogDriver": "awslogs",
|
||||
"LogOptions": {
|
||||
"awslogs-create-group": "true",
|
||||
"awslogs-group": "/ecs/metadata",
|
||||
"awslogs-region": "us-west-2",
|
||||
"awslogs-stream": "ecs/curl/8f03e41243824aea923aca126495f665"
|
||||
},
|
||||
"ContainerARN": "arn:aws:ecs:us-west-2:111122223333:container/0206b271-b33f-47ab-86c6-a0ba208a70a9",
|
||||
"Networks": [
|
||||
{
|
||||
"NetworkMode": "awsvpc",
|
||||
"IPv4Addresses": [
|
||||
"10.0.2.100"
|
||||
],
|
||||
"AttachmentIndex": 0,
|
||||
"MACAddress": "0e:9e:32:c7:48:85",
|
||||
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||
"PrivateDNSName": "ip-10-0-2-100.us-west-2.compute.internal",
|
||||
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,130 @@
|
|||
{
|
||||
"read": "2020-10-02T00:51:13.410254284Z",
|
||||
"preread": "2020-10-02T00:51:12.406202398Z",
|
||||
"pids_stats": {
|
||||
"current": 3
|
||||
},
|
||||
"blkio_stats": {
|
||||
"io_service_bytes_recursive": [
|
||||
|
||||
],
|
||||
"io_serviced_recursive": [
|
||||
|
||||
],
|
||||
"io_queue_recursive": [
|
||||
|
||||
],
|
||||
"io_service_time_recursive": [
|
||||
|
||||
],
|
||||
"io_wait_time_recursive": [
|
||||
|
||||
],
|
||||
"io_merged_recursive": [
|
||||
|
||||
],
|
||||
"io_time_recursive": [
|
||||
|
||||
],
|
||||
"sectors_recursive": [
|
||||
|
||||
]
|
||||
},
|
||||
"num_procs": 0,
|
||||
"storage_stats": {
|
||||
|
||||
},
|
||||
"cpu_stats": {
|
||||
"cpu_usage": {
|
||||
"total_usage": 150000000,
|
||||
"percpu_usage": [
|
||||
182359190,
|
||||
178608875
|
||||
],
|
||||
"usage_in_kernelmode": 40000000,
|
||||
"usage_in_usermode": 290000000
|
||||
},
|
||||
"system_cpu_usage": 200000000,
|
||||
"online_cpus": 2,
|
||||
"throttling_data": {
|
||||
"periods": 0,
|
||||
"throttled_periods": 0,
|
||||
"throttled_time": 0
|
||||
}
|
||||
},
|
||||
"precpu_stats": {
|
||||
"cpu_usage": {
|
||||
"total_usage": 100000000,
|
||||
"percpu_usage": [
|
||||
182359190,
|
||||
178608875
|
||||
],
|
||||
"usage_in_kernelmode": 40000000,
|
||||
"usage_in_usermode": 290000000
|
||||
},
|
||||
"system_cpu_usage": 100000000,
|
||||
"online_cpus": 2,
|
||||
"throttling_data": {
|
||||
"periods": 0,
|
||||
"throttled_periods": 0,
|
||||
"throttled_time": 0
|
||||
}
|
||||
},
|
||||
"memory_stats": {
|
||||
"usage": 1806336,
|
||||
"max_usage": 6299648,
|
||||
"stats": {
|
||||
"active_anon": 606208,
|
||||
"active_file": 0,
|
||||
"cache": 0,
|
||||
"dirty": 0,
|
||||
"hierarchical_memory_limit": 134217728,
|
||||
"hierarchical_memsw_limit": 268435456,
|
||||
"inactive_anon": 0,
|
||||
"inactive_file": 0,
|
||||
"mapped_file": 0,
|
||||
"pgfault": 4185,
|
||||
"pgmajfault": 0,
|
||||
"pgpgin": 2926,
|
||||
"pgpgout": 2778,
|
||||
"rss": 606208,
|
||||
"rss_huge": 0,
|
||||
"total_active_anon": 606208,
|
||||
"total_active_file": 0,
|
||||
"total_cache": 0,
|
||||
"total_dirty": 0,
|
||||
"total_inactive_anon": 0,
|
||||
"total_inactive_file": 0,
|
||||
"total_mapped_file": 0,
|
||||
"total_pgfault": 4185,
|
||||
"total_pgmajfault": 0,
|
||||
"total_pgpgin": 2926,
|
||||
"total_pgpgout": 2778,
|
||||
"total_rss": 606208,
|
||||
"total_rss_huge": 0,
|
||||
"total_unevictable": 0,
|
||||
"total_writeback": 0,
|
||||
"unevictable": 0,
|
||||
"writeback": 0
|
||||
},
|
||||
"limit": 134217728
|
||||
},
|
||||
"name": "/ecs-curltest-26-curl-c2e5f6e0cf91b0bead01",
|
||||
"id": "5fc21e5b015f899d22618f8aede80b6d70d71b2a75465ea49d9462c8f3d2d3af",
|
||||
"networks": {
|
||||
"eth0": {
|
||||
"rx_bytes": 84,
|
||||
"rx_packets": 2,
|
||||
"rx_errors": 0,
|
||||
"rx_dropped": 0,
|
||||
"tx_bytes": 84,
|
||||
"tx_packets": 2,
|
||||
"tx_errors": 0,
|
||||
"tx_dropped": 0
|
||||
}
|
||||
},
|
||||
"network_rate_stats": {
|
||||
"rx_bytes_per_sec": 0,
|
||||
"tx_bytes_per_sec": 0
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
{
|
||||
"Cluster": "default",
|
||||
"TaskARN": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||
"Family": "curltest",
|
||||
"ServiceName": "MyService",
|
||||
"Revision": "26",
|
||||
"DesiredStatus": "RUNNING",
|
||||
"KnownStatus": "RUNNING",
|
||||
"PullStartedAt": "2020-10-02T00:43:06.202617438Z",
|
||||
"PullStoppedAt": "2020-10-02T00:43:06.31288465Z",
|
||||
"AvailabilityZone": "us-west-2d",
|
||||
"VPCID": "vpc-1234567890abcdef0",
|
||||
"LaunchType": "EC2",
|
||||
"Containers": [
|
||||
{
|
||||
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||
"Name": "~internal~ecs~pause",
|
||||
"DockerName": "ecs-curltest-26-internalecspause-e292d586b6f9dade4a00",
|
||||
"Image": "amazon/amazon-ecs-pause:0.1.0",
|
||||
"ImageID": "",
|
||||
"Labels": {
|
||||
"com.amazonaws.ecs.cluster": "default",
|
||||
"com.amazonaws.ecs.container-name": "~internal~ecs~pause",
|
||||
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||
"com.amazonaws.ecs.task-definition-version": "26"
|
||||
},
|
||||
"DesiredStatus": "RESOURCES_PROVISIONED",
|
||||
"KnownStatus": "RESOURCES_PROVISIONED",
|
||||
"Limits": {
|
||||
"CPU": 50,
|
||||
"Memory": 128
|
||||
},
|
||||
"CreatedAt": "2020-10-02T00:43:05.602352471Z",
|
||||
"StartedAt": "2020-10-02T00:43:06.076707576Z",
|
||||
"Type": "CNI_PAUSE",
|
||||
"Networks": [
|
||||
{
|
||||
"NetworkMode": "awsvpc",
|
||||
"IPv4Addresses": [
|
||||
"10.0.2.61"
|
||||
],
|
||||
"AttachmentIndex": 0,
|
||||
"MACAddress": "0e:10:e2:01:bd:91",
|
||||
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||
"PrivateDNSName": "ip-10-0-2-61.us-west-2.compute.internal",
|
||||
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
{
|
||||
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||
"Name": "curl",
|
||||
"DockerName": "ecs-curltest-24-curl-cca48e8dcadd97805600",
|
||||
"Image": "111122223333.dkr.ecr.us-west-2.amazonaws.com/curltest:latest",
|
||||
"ImageID": "sha256:d691691e9652791a60114e67b365688d20d19940dde7c4736ea30e660d8d3553",
|
||||
"Labels": {
|
||||
"com.amazonaws.ecs.cluster": "default",
|
||||
"com.amazonaws.ecs.container-name": "curl",
|
||||
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/8f03e41243824aea923aca126495f665",
|
||||
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||
"com.amazonaws.ecs.task-definition-version": "24"
|
||||
},
|
||||
"DesiredStatus": "RUNNING",
|
||||
"KnownStatus": "RUNNING",
|
||||
"Limits": {
|
||||
"CPU": 50,
|
||||
"Memory": 128
|
||||
},
|
||||
"CreatedAt": "2020-10-02T00:15:07.620912337Z",
|
||||
"StartedAt": "2020-10-02T00:15:08.062559351Z",
|
||||
"Type": "NORMAL",
|
||||
"LogDriver": "awslogs",
|
||||
"LogOptions": {
|
||||
"awslogs-create-group": "true",
|
||||
"awslogs-group": "/ecs/metadata",
|
||||
"awslogs-region": "us-west-2",
|
||||
"awslogs-stream": "ecs/curl/8f03e41243824aea923aca126495f665"
|
||||
},
|
||||
"ContainerARN": "arn:aws:ecs:us-west-2:111122223333:container/0206b271-b33f-47ab-86c6-a0ba208a70a9",
|
||||
"Networks": [
|
||||
{
|
||||
"NetworkMode": "awsvpc",
|
||||
"IPv4Addresses": [
|
||||
"10.0.2.100"
|
||||
],
|
||||
"AttachmentIndex": 0,
|
||||
"MACAddress": "0e:9e:32:c7:48:85",
|
||||
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||
"PrivateDNSName": "ip-10-0-2-100.us-west-2.compute.internal",
|
||||
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||
}
|
||||
]
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue