KCLv3 merge
This commit is contained in:
parent
a159fa31fb
commit
a754364d29
175 changed files with 18424 additions and 2349 deletions
|
|
@ -21,7 +21,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>amazon-kinesis-client-pom</artifactId>
|
<artifactId>amazon-kinesis-client-pom</artifactId>
|
||||||
<groupId>software.amazon.kinesis</groupId>
|
<groupId>software.amazon.kinesis</groupId>
|
||||||
<version>2.6.1-SNAPSHOT</version>
|
<version>3.0.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
@ -72,7 +72,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.projectlombok</groupId>
|
<groupId>org.projectlombok</groupId>
|
||||||
<artifactId>lombok</artifactId>
|
<artifactId>lombok</artifactId>
|
||||||
<version>1.18.24</version>
|
<version>1.18.28</version>
|
||||||
<scope>provided</scope>
|
<scope>provided</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
|
|
|
||||||
|
|
@ -23,7 +23,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>software.amazon.kinesis</groupId>
|
<groupId>software.amazon.kinesis</groupId>
|
||||||
<artifactId>amazon-kinesis-client-pom</artifactId>
|
<artifactId>amazon-kinesis-client-pom</artifactId>
|
||||||
<version>2.6.1-SNAPSHOT</version>
|
<version>3.0.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
<artifactId>amazon-kinesis-client</artifactId>
|
<artifactId>amazon-kinesis-client</artifactId>
|
||||||
|
|
@ -68,6 +68,18 @@
|
||||||
<artifactId>dynamodb</artifactId>
|
<artifactId>dynamodb</artifactId>
|
||||||
<version>${awssdk.version}</version>
|
<version>${awssdk.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<!-- https://mvnrepository.com/artifact/software.amazon.awssdk/dynamodb-enhanced -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>software.amazon.awssdk</groupId>
|
||||||
|
<artifactId>dynamodb-enhanced</artifactId>
|
||||||
|
<version>${awssdk.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<!-- https://mvnrepository.com/artifact/com.amazonaws/dynamodb-lock-client -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.amazonaws</groupId>
|
||||||
|
<artifactId>dynamodb-lock-client</artifactId>
|
||||||
|
<version>1.3.0</version>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>software.amazon.awssdk</groupId>
|
<groupId>software.amazon.awssdk</groupId>
|
||||||
<artifactId>cloudwatch</artifactId>
|
<artifactId>cloudwatch</artifactId>
|
||||||
|
|
@ -103,11 +115,23 @@
|
||||||
<artifactId>commons-lang3</artifactId>
|
<artifactId>commons-lang3</artifactId>
|
||||||
<version>3.14.0</version>
|
<version>3.14.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<!-- https://mvnrepository.com/artifact/commons-collections/commons-collections -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-collections</groupId>
|
||||||
|
<artifactId>commons-collections</artifactId>
|
||||||
|
<version>3.2.2</version>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.slf4j</groupId>
|
<groupId>org.slf4j</groupId>
|
||||||
<artifactId>slf4j-api</artifactId>
|
<artifactId>slf4j-api</artifactId>
|
||||||
<version>${slf4j.version}</version>
|
<version>${slf4j.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<!-- https://mvnrepository.com/artifact/org.jetbrains/annotations -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.jetbrains</groupId>
|
||||||
|
<artifactId>annotations</artifactId>
|
||||||
|
<version>26.0.1</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>io.reactivex.rxjava3</groupId>
|
<groupId>io.reactivex.rxjava3</groupId>
|
||||||
|
|
@ -123,35 +147,47 @@
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<!-- Test -->
|
<!-- Test -->
|
||||||
|
<!-- TODO: Migrate all tests to Junit5 -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.junit.jupiter</groupId>
|
||||||
|
<artifactId>junit-jupiter-api</artifactId>
|
||||||
|
<version>5.11.3</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>junit</groupId>
|
<groupId>junit</groupId>
|
||||||
<artifactId>junit</artifactId>
|
<artifactId>junit</artifactId>
|
||||||
<version>4.13.2</version>
|
<version>4.13.2</version>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<!-- https://mvnrepository.com/artifact/org.junit.jupiter/junit-jupiter-params -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.mockito</groupId>
|
<groupId>org.junit.jupiter</groupId>
|
||||||
<artifactId>mockito-all</artifactId>
|
<artifactId>junit-jupiter-params</artifactId>
|
||||||
<version>1.10.19</version>
|
<version>5.11.3</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<!-- Using older version to be compatible with Java 8 -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.mockito</groupId>
|
||||||
|
<artifactId>mockito-junit-jupiter</artifactId>
|
||||||
|
<version>3.12.4</version>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.hamcrest</groupId>
|
<groupId>org.hamcrest</groupId>
|
||||||
<artifactId>hamcrest-all</artifactId>
|
<artifactId>hamcrest-all</artifactId>
|
||||||
<version>1.3</version>
|
<version>1.3</version>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<!-- Using older version to be compatible with Java 8 -->
|
||||||
|
<!-- https://mvnrepository.com/artifact/com.amazonaws/DynamoDBLocal -->
|
||||||
<!--<dependency>-->
|
<dependency>
|
||||||
<!--<groupId>com.amazonaws</groupId>-->
|
<groupId>com.amazonaws</groupId>
|
||||||
<!--<artifactId>DynamoDBLocal</artifactId>-->
|
<artifactId>DynamoDBLocal</artifactId>
|
||||||
<!--<version>1.11.86</version>-->
|
<version>1.25.0</version>
|
||||||
<!--<scope>test</scope>-->
|
<scope>test</scope>
|
||||||
<!--</dependency>-->
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>ch.qos.logback</groupId>
|
<groupId>ch.qos.logback</groupId>
|
||||||
<artifactId>logback-classic</artifactId>
|
<artifactId>logback-classic</artifactId>
|
||||||
|
|
@ -162,11 +198,11 @@
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
<!--<repositories>-->
|
<!--<repositories>-->
|
||||||
<!--<repository>-->
|
<!--<repository>-->
|
||||||
<!--<id>dynamodblocal</id>-->
|
<!--<id>dynamodblocal</id>-->
|
||||||
<!--<name>AWS DynamoDB Local Release Repository</name>-->
|
<!--<name>AWS DynamoDB Local Release Repository</name>-->
|
||||||
<!--<url>https://s3-us-west-2.amazonaws.com/dynamodb-local/release</url>-->
|
<!--<url>https://s3-us-west-2.amazonaws.com/dynamodb-local/release</url>-->
|
||||||
<!--</repository>-->
|
<!--</repository>-->
|
||||||
<!--</repositories>-->
|
<!--</repositories>-->
|
||||||
|
|
||||||
<developers>
|
<developers>
|
||||||
|
|
@ -203,20 +239,20 @@
|
||||||
</pluginManagement>
|
</pluginManagement>
|
||||||
|
|
||||||
<plugins>
|
<plugins>
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>org.xolstice.maven.plugins</groupId>
|
<groupId>org.xolstice.maven.plugins</groupId>
|
||||||
<artifactId>protobuf-maven-plugin</artifactId>
|
<artifactId>protobuf-maven-plugin</artifactId>
|
||||||
<version>0.6.1</version>
|
<version>0.6.1</version>
|
||||||
<executions>
|
<executions>
|
||||||
<execution>
|
<execution>
|
||||||
<goals>
|
<goals>
|
||||||
<goal>compile</goal>
|
<goal>compile</goal>
|
||||||
</goals>
|
</goals>
|
||||||
</execution>
|
</execution>
|
||||||
</executions>
|
</executions>
|
||||||
<configuration>
|
<configuration>
|
||||||
<protocArtifact>com.google.protobuf:protoc:${protobuf.version}:exe:${os.detected.classifier}</protocArtifact>
|
<protocArtifact>com.google.protobuf:protoc:${protobuf.version}:exe:${os.detected.classifier}</protocArtifact>
|
||||||
</configuration>
|
</configuration>
|
||||||
</plugin>
|
</plugin>
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
|
@ -437,4 +473,4 @@
|
||||||
</profile>
|
</profile>
|
||||||
</profiles>
|
</profiles>
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
|
@ -256,7 +256,8 @@ public class ConfigsBuilder {
|
||||||
* @return LeaseManagementConfig
|
* @return LeaseManagementConfig
|
||||||
*/
|
*/
|
||||||
public LeaseManagementConfig leaseManagementConfig() {
|
public LeaseManagementConfig leaseManagementConfig() {
|
||||||
return new LeaseManagementConfig(tableName(), dynamoDBClient(), kinesisClient(), workerIdentifier());
|
return new LeaseManagementConfig(
|
||||||
|
tableName(), applicationName(), dynamoDBClient(), kinesisClient(), workerIdentifier());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,57 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.common;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.experimental.Accessors;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.BillingMode;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Configurations of a DDB table created by KCL for its internal operations.
|
||||||
|
*/
|
||||||
|
@Data
|
||||||
|
@Accessors(fluent = true)
|
||||||
|
@NoArgsConstructor
|
||||||
|
public class DdbTableConfig {
|
||||||
|
|
||||||
|
protected DdbTableConfig(final String applicationName, final String tableSuffix) {
|
||||||
|
this.tableName = applicationName + "-" + tableSuffix;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* name to use for the DDB table. If null, it will default to
|
||||||
|
* applicationName-tableSuffix. If multiple KCL applications
|
||||||
|
* run in the same account, a unique tableName must be provided.
|
||||||
|
*/
|
||||||
|
private String tableName;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Billing mode used to create the DDB table.
|
||||||
|
*/
|
||||||
|
private BillingMode billingMode = BillingMode.PAY_PER_REQUEST;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* read capacity to provision during DDB table creation,
|
||||||
|
* if billing mode is PROVISIONED.
|
||||||
|
*/
|
||||||
|
private long readCapacity;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* write capacity to provision during DDB table creation,
|
||||||
|
* if billing mode is PROVISIONED.
|
||||||
|
*/
|
||||||
|
private long writeCapacity;
|
||||||
|
}
|
||||||
|
|
@ -15,10 +15,13 @@
|
||||||
package software.amazon.kinesis.common;
|
package software.amazon.kinesis.common;
|
||||||
|
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
|
import java.util.concurrent.CompletableFuture;
|
||||||
|
import java.util.concurrent.CompletionException;
|
||||||
import java.util.concurrent.ExecutionException;
|
import java.util.concurrent.ExecutionException;
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.concurrent.TimeoutException;
|
import java.util.concurrent.TimeoutException;
|
||||||
|
import java.util.function.Supplier;
|
||||||
|
|
||||||
public class FutureUtils {
|
public class FutureUtils {
|
||||||
|
|
||||||
|
|
@ -31,4 +34,15 @@ public class FutureUtils {
|
||||||
throw te;
|
throw te;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static <T> T unwrappingFuture(final Supplier<CompletableFuture<T>> supplier) {
|
||||||
|
try {
|
||||||
|
return supplier.get().join();
|
||||||
|
} catch (CompletionException e) {
|
||||||
|
if (e.getCause() instanceof RuntimeException) {
|
||||||
|
throw (RuntimeException) e.getCause();
|
||||||
|
}
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright 2019 Amazon.com, Inc. or its affiliates.
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
* Licensed under the Apache License, Version 2.0 (the
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
* "License"); you may not use this file except in compliance
|
* "License"); you may not use this file except in compliance
|
||||||
* with the License. You may obtain a copy of the License at
|
* with the License. You may obtain a copy of the License at
|
||||||
|
|
@ -12,18 +12,16 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
package software.amazon.kinesis.common;
|
||||||
|
|
||||||
package software.amazon.kinesis.leases.dynamodb;
|
public class StackTraceUtils {
|
||||||
|
public static String getPrintableStackTrace(final StackTraceElement[] stackTrace) {
|
||||||
|
final StringBuilder stackTraceString = new StringBuilder();
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
for (final StackTraceElement traceElement : stackTrace) {
|
||||||
import lombok.NoArgsConstructor;
|
stackTraceString.append("\tat ").append(traceElement).append("\n");
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
return stackTraceString.toString();
|
||||||
* This class is just a holder for initial lease table IOPs units. This class will be removed in a future release.
|
}
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
@NoArgsConstructor(access = AccessLevel.PRIVATE)
|
|
||||||
public class TableConstants {
|
|
||||||
public static final long DEFAULT_INITIAL_LEASE_TABLE_READ_CAPACITY = 10L;
|
|
||||||
public static final long DEFAULT_INITIAL_LEASE_TABLE_WRITE_CAPACITY = 10L;
|
|
||||||
}
|
}
|
||||||
|
|
@ -18,6 +18,7 @@ package software.amazon.kinesis.coordinator;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.NonNull;
|
import lombok.NonNull;
|
||||||
import lombok.experimental.Accessors;
|
import lombok.experimental.Accessors;
|
||||||
|
import software.amazon.kinesis.common.DdbTableConfig;
|
||||||
import software.amazon.kinesis.leases.NoOpShardPrioritization;
|
import software.amazon.kinesis.leases.NoOpShardPrioritization;
|
||||||
import software.amazon.kinesis.leases.ShardPrioritization;
|
import software.amazon.kinesis.leases.ShardPrioritization;
|
||||||
|
|
||||||
|
|
@ -27,6 +28,14 @@ import software.amazon.kinesis.leases.ShardPrioritization;
|
||||||
@Data
|
@Data
|
||||||
@Accessors(fluent = true)
|
@Accessors(fluent = true)
|
||||||
public class CoordinatorConfig {
|
public class CoordinatorConfig {
|
||||||
|
|
||||||
|
private static final int PERIODIC_SHARD_SYNC_MAX_WORKERS_DEFAULT = 1;
|
||||||
|
|
||||||
|
public CoordinatorConfig(final String applicationName) {
|
||||||
|
this.applicationName = applicationName;
|
||||||
|
this.coordinatorStateConfig = new CoordinatorStateTableConfig(applicationName);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Application name used by checkpointer to checkpoint.
|
* Application name used by checkpointer to checkpoint.
|
||||||
*
|
*
|
||||||
|
|
@ -96,4 +105,53 @@ public class CoordinatorConfig {
|
||||||
* <p>Default value: 1000L</p>
|
* <p>Default value: 1000L</p>
|
||||||
*/
|
*/
|
||||||
private long schedulerInitializationBackoffTimeMillis = 1000L;
|
private long schedulerInitializationBackoffTimeMillis = 1000L;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Version the KCL needs to operate in. For more details check the KCLv3 migration
|
||||||
|
* documentation.
|
||||||
|
*/
|
||||||
|
public enum ClientVersionConfig {
|
||||||
|
/**
|
||||||
|
* For an application that was operating with previous KCLv2.x, during
|
||||||
|
* upgrade to KCLv3.x, a migration process is needed due to the incompatible
|
||||||
|
* changes between the 2 versions. During the migration process, application
|
||||||
|
* must use ClientVersion=CLIENT_VERSION_COMPATIBLE_WITH_2x so that it runs in
|
||||||
|
* a compatible mode until all workers in the cluster have upgraded to the version
|
||||||
|
* running 3.x version (which is determined based on workers emitting WorkerMetricStats)
|
||||||
|
* Once all known workers are in 3.x mode, the library auto toggles to 3.x mode;
|
||||||
|
* but prior to that it runs in a mode compatible with 2.x workers.
|
||||||
|
* This version also allows rolling back to the compatible mode from the
|
||||||
|
* auto-toggled 3.x mode.
|
||||||
|
*/
|
||||||
|
CLIENT_VERSION_CONFIG_COMPATIBLE_WITH_2x,
|
||||||
|
/**
|
||||||
|
* A new application operating with KCLv3.x will use this value. Also, an application
|
||||||
|
* that has successfully upgraded to 3.x version and no longer needs the ability
|
||||||
|
* for a rollback to a 2.x compatible version, will use this value. In this version,
|
||||||
|
* KCL will operate with new algorithms introduced in 3.x which is not compatible
|
||||||
|
* with prior versions. And once in this version, rollback to 2.x is not supported.
|
||||||
|
*/
|
||||||
|
CLIENT_VERSION_CONFIG_3x,
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Client version KCL must operate in, by default it operates in 3.x version which is not
|
||||||
|
* compatible with prior versions.
|
||||||
|
*/
|
||||||
|
private ClientVersionConfig clientVersionConfig = ClientVersionConfig.CLIENT_VERSION_CONFIG_3x;
|
||||||
|
|
||||||
|
public static class CoordinatorStateTableConfig extends DdbTableConfig {
|
||||||
|
private CoordinatorStateTableConfig(final String applicationName) {
|
||||||
|
super(applicationName, "CoordinatorState");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Configuration to control how the CoordinatorState DDB table is created, such as table name,
|
||||||
|
* billing mode, provisioned capacity. If no table name is specified, the table name will
|
||||||
|
* default to applicationName-CoordinatorState. If no billing more is chosen, default is
|
||||||
|
* On-Demand.
|
||||||
|
*/
|
||||||
|
@NonNull
|
||||||
|
private final CoordinatorStateTableConfig coordinatorStateConfig;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,52 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.coordinator;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.AttributeValue;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* DataModel for CoordinatorState, this data model is used to store various state information required
|
||||||
|
* for coordination across the KCL worker fleet. Therefore, the model follows a flexible schema.
|
||||||
|
*/
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor(access = AccessLevel.PRIVATE)
|
||||||
|
@Slf4j
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
public class CoordinatorState {
|
||||||
|
public static final String COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME = "key";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Key value for the item in the CoordinatorState table used for leader
|
||||||
|
* election among the KCL workers. The attributes relevant to this item
|
||||||
|
* is dictated by the DDB Lock client implementation that is used to
|
||||||
|
* provide mutual exclusion.
|
||||||
|
*/
|
||||||
|
public static final String LEADER_HASH_KEY = "Leader";
|
||||||
|
|
||||||
|
private String key;
|
||||||
|
|
||||||
|
private Map<String, AttributeValue> attributes;
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,417 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.coordinator;
|
||||||
|
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
import com.amazonaws.services.dynamodbv2.AmazonDynamoDBLockClientOptions;
|
||||||
|
import com.amazonaws.services.dynamodbv2.AmazonDynamoDBLockClientOptions.AmazonDynamoDBLockClientOptionsBuilder;
|
||||||
|
import lombok.NonNull;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.apache.commons.collections4.MapUtils;
|
||||||
|
import software.amazon.awssdk.core.waiters.WaiterResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.DynamoDbAsyncClient;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.DynamoDbClient;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.AttributeAction;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.AttributeDefinition;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.AttributeValue;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.AttributeValueUpdate;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.BillingMode;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ConditionalCheckFailedException;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.CreateTableRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.CreateTableResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.DescribeTableRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.DescribeTableResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.DynamoDbException;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ExpectedAttributeValue;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.GetItemRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.GetItemResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.KeySchemaElement;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.KeyType;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ProvisionedThroughput;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ProvisionedThroughputExceededException;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.PutItemRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ResourceNotFoundException;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ScalarAttributeType;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ScanRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ScanResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.TableDescription;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.TableStatus;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.UpdateItemRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.waiters.DynamoDbAsyncWaiter;
|
||||||
|
import software.amazon.awssdk.utils.CollectionUtils;
|
||||||
|
import software.amazon.kinesis.common.FutureUtils;
|
||||||
|
import software.amazon.kinesis.coordinator.CoordinatorConfig.CoordinatorStateTableConfig;
|
||||||
|
import software.amazon.kinesis.coordinator.migration.MigrationState;
|
||||||
|
import software.amazon.kinesis.leases.DynamoUtils;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||||
|
|
||||||
|
import static software.amazon.kinesis.common.FutureUtils.unwrappingFuture;
|
||||||
|
import static software.amazon.kinesis.coordinator.CoordinatorState.COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Data Access Object to abstract accessing {@link CoordinatorState} from
|
||||||
|
* the CoordinatorState DDB table.
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
public class CoordinatorStateDAO {
|
||||||
|
private final DynamoDbAsyncClient dynamoDbAsyncClient;
|
||||||
|
private final DynamoDbClient dynamoDbSyncClient;
|
||||||
|
|
||||||
|
private final CoordinatorStateTableConfig config;
|
||||||
|
|
||||||
|
public CoordinatorStateDAO(
|
||||||
|
final DynamoDbAsyncClient dynamoDbAsyncClient, final CoordinatorStateTableConfig config) {
|
||||||
|
this.dynamoDbAsyncClient = dynamoDbAsyncClient;
|
||||||
|
this.config = config;
|
||||||
|
this.dynamoDbSyncClient = createDelegateClient();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void initialize() throws DependencyException {
|
||||||
|
createTableIfNotExists();
|
||||||
|
}
|
||||||
|
|
||||||
|
private DynamoDbClient createDelegateClient() {
|
||||||
|
return new DynamoDbAsyncToSyncClientAdapter(dynamoDbAsyncClient);
|
||||||
|
}
|
||||||
|
|
||||||
|
public AmazonDynamoDBLockClientOptionsBuilder getDDBLockClientOptionsBuilder() {
|
||||||
|
return AmazonDynamoDBLockClientOptions.builder(dynamoDbSyncClient, config.tableName())
|
||||||
|
.withPartitionKeyName(COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* List all the {@link CoordinatorState} from the DDB table synchronously
|
||||||
|
*
|
||||||
|
* @throws DependencyException if DynamoDB scan fails in an unexpected way
|
||||||
|
* @throws InvalidStateException if ddb table does not exist
|
||||||
|
* @throws ProvisionedThroughputException if DynamoDB scan fails due to lack of capacity
|
||||||
|
*
|
||||||
|
* @return list of state
|
||||||
|
*/
|
||||||
|
public List<CoordinatorState> listCoordinatorState()
|
||||||
|
throws ProvisionedThroughputException, DependencyException, InvalidStateException {
|
||||||
|
log.debug("Listing coordinatorState");
|
||||||
|
|
||||||
|
final ScanRequest request =
|
||||||
|
ScanRequest.builder().tableName(config.tableName()).build();
|
||||||
|
|
||||||
|
try {
|
||||||
|
ScanResponse response = FutureUtils.unwrappingFuture(() -> dynamoDbAsyncClient.scan(request));
|
||||||
|
final List<CoordinatorState> stateList = new ArrayList<>();
|
||||||
|
while (Objects.nonNull(response)) {
|
||||||
|
log.debug("Scan response {}", response);
|
||||||
|
|
||||||
|
response.items().stream().map(this::fromDynamoRecord).forEach(stateList::add);
|
||||||
|
if (!CollectionUtils.isNullOrEmpty(response.lastEvaluatedKey())) {
|
||||||
|
final ScanRequest continuationRequest = request.toBuilder()
|
||||||
|
.exclusiveStartKey(response.lastEvaluatedKey())
|
||||||
|
.build();
|
||||||
|
log.debug("Scan request {}", continuationRequest);
|
||||||
|
response = FutureUtils.unwrappingFuture(() -> dynamoDbAsyncClient.scan(continuationRequest));
|
||||||
|
} else {
|
||||||
|
log.debug("Scan finished");
|
||||||
|
response = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return stateList;
|
||||||
|
} catch (final ProvisionedThroughputExceededException e) {
|
||||||
|
log.warn(
|
||||||
|
"Provisioned throughput on {} has exceeded. It is recommended to increase the IOPs"
|
||||||
|
+ " on the table.",
|
||||||
|
config.tableName());
|
||||||
|
throw new ProvisionedThroughputException(e);
|
||||||
|
} catch (final ResourceNotFoundException e) {
|
||||||
|
throw new InvalidStateException(
|
||||||
|
String.format("Cannot list coordinatorState, because table %s does not exist", config.tableName()));
|
||||||
|
} catch (final DynamoDbException e) {
|
||||||
|
throw new DependencyException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new {@link CoordinatorState} if it does not exist.
|
||||||
|
* @param state the state to create
|
||||||
|
* @return true if state was created, false if it already exists
|
||||||
|
*
|
||||||
|
* @throws DependencyException if DynamoDB put fails in an unexpected way
|
||||||
|
* @throws InvalidStateException if lease table does not exist
|
||||||
|
* @throws ProvisionedThroughputException if DynamoDB put fails due to lack of capacity
|
||||||
|
*/
|
||||||
|
public boolean createCoordinatorStateIfNotExists(final CoordinatorState state)
|
||||||
|
throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
||||||
|
log.debug("Creating coordinatorState {}", state);
|
||||||
|
|
||||||
|
final PutItemRequest request = PutItemRequest.builder()
|
||||||
|
.tableName(config.tableName())
|
||||||
|
.item(toDynamoRecord(state))
|
||||||
|
.expected(getDynamoNonExistentExpectation())
|
||||||
|
.build();
|
||||||
|
|
||||||
|
try {
|
||||||
|
FutureUtils.unwrappingFuture(() -> dynamoDbAsyncClient.putItem(request));
|
||||||
|
} catch (final ConditionalCheckFailedException e) {
|
||||||
|
log.info("Not creating coordinator state because the key already exists");
|
||||||
|
return false;
|
||||||
|
} catch (final ProvisionedThroughputExceededException e) {
|
||||||
|
log.warn(
|
||||||
|
"Provisioned throughput on {} has exceeded. It is recommended to increase the IOPs"
|
||||||
|
+ " on the table.",
|
||||||
|
config.tableName());
|
||||||
|
throw new ProvisionedThroughputException(e);
|
||||||
|
} catch (final ResourceNotFoundException e) {
|
||||||
|
throw new InvalidStateException(String.format(
|
||||||
|
"Cannot create coordinatorState %s, because table %s does not exist", state, config.tableName()));
|
||||||
|
} catch (final DynamoDbException e) {
|
||||||
|
throw new DependencyException(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
log.info("Created CoordinatorState: {}", state);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param key Get the CoordinatorState for this key
|
||||||
|
*
|
||||||
|
* @throws InvalidStateException if ddb table does not exist
|
||||||
|
* @throws ProvisionedThroughputException if DynamoDB get fails due to lack of capacity
|
||||||
|
* @throws DependencyException if DynamoDB get fails in an unexpected way
|
||||||
|
*
|
||||||
|
* @return state for the specified key, or null if one doesn't exist
|
||||||
|
*/
|
||||||
|
public CoordinatorState getCoordinatorState(@NonNull final String key)
|
||||||
|
throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
||||||
|
log.debug("Getting coordinatorState with key {}", key);
|
||||||
|
|
||||||
|
final GetItemRequest request = GetItemRequest.builder()
|
||||||
|
.tableName(config.tableName())
|
||||||
|
.key(getCoordinatorStateKey(key))
|
||||||
|
.consistentRead(true)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
try {
|
||||||
|
final GetItemResponse result = FutureUtils.unwrappingFuture(() -> dynamoDbAsyncClient.getItem(request));
|
||||||
|
|
||||||
|
final Map<String, AttributeValue> dynamoRecord = result.item();
|
||||||
|
if (CollectionUtils.isNullOrEmpty(dynamoRecord)) {
|
||||||
|
log.debug("No coordinatorState found with key {}, returning null.", key);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return fromDynamoRecord(dynamoRecord);
|
||||||
|
} catch (final ProvisionedThroughputExceededException e) {
|
||||||
|
log.warn(
|
||||||
|
"Provisioned throughput on {} has exceeded. It is recommended to increase the IOPs"
|
||||||
|
+ " on the table.",
|
||||||
|
config.tableName());
|
||||||
|
throw new ProvisionedThroughputException(e);
|
||||||
|
} catch (final ResourceNotFoundException e) {
|
||||||
|
throw new InvalidStateException(String.format(
|
||||||
|
"Cannot get coordinatorState for key %s, because table %s does not exist",
|
||||||
|
key, config.tableName()));
|
||||||
|
} catch (final DynamoDbException e) {
|
||||||
|
throw new DependencyException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Update fields of the given coordinator state in DynamoDB. Conditional on the provided expectation.
|
||||||
|
*
|
||||||
|
* @return true if update succeeded, false otherwise when expectations are not met
|
||||||
|
*
|
||||||
|
* @throws InvalidStateException if table does not exist
|
||||||
|
* @throws ProvisionedThroughputException if DynamoDB update fails due to lack of capacity
|
||||||
|
* @throws DependencyException if DynamoDB update fails in an unexpected way
|
||||||
|
*/
|
||||||
|
public boolean updateCoordinatorStateWithExpectation(
|
||||||
|
@NonNull final CoordinatorState state, final Map<String, ExpectedAttributeValue> expectations)
|
||||||
|
throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
||||||
|
final Map<String, ExpectedAttributeValue> expectationMap = getDynamoExistentExpectation(state.getKey());
|
||||||
|
expectationMap.putAll(MapUtils.emptyIfNull(expectations));
|
||||||
|
|
||||||
|
final Map<String, AttributeValueUpdate> updateMap = getDynamoCoordinatorStateUpdate(state);
|
||||||
|
|
||||||
|
final UpdateItemRequest request = UpdateItemRequest.builder()
|
||||||
|
.tableName(config.tableName())
|
||||||
|
.key(getCoordinatorStateKey(state.getKey()))
|
||||||
|
.expected(expectationMap)
|
||||||
|
.attributeUpdates(updateMap)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
try {
|
||||||
|
FutureUtils.unwrappingFuture(() -> dynamoDbAsyncClient.updateItem(request));
|
||||||
|
} catch (final ConditionalCheckFailedException e) {
|
||||||
|
log.debug("CoordinatorState update {} failed because conditions were not met", state);
|
||||||
|
return false;
|
||||||
|
} catch (final ProvisionedThroughputExceededException e) {
|
||||||
|
log.warn(
|
||||||
|
"Provisioned throughput on {} has exceeded. It is recommended to increase the IOPs"
|
||||||
|
+ " on the table.",
|
||||||
|
config.tableName());
|
||||||
|
throw new ProvisionedThroughputException(e);
|
||||||
|
} catch (final ResourceNotFoundException e) {
|
||||||
|
throw new InvalidStateException(String.format(
|
||||||
|
"Cannot update coordinatorState for key %s, because table %s does not exist",
|
||||||
|
state.getKey(), config.tableName()));
|
||||||
|
} catch (final DynamoDbException e) {
|
||||||
|
throw new DependencyException(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
log.info("Coordinator state updated {}", state);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void createTableIfNotExists() throws DependencyException {
|
||||||
|
TableDescription tableDescription = getTableDescription();
|
||||||
|
if (tableDescription == null) {
|
||||||
|
final CreateTableResponse response = unwrappingFuture(() -> dynamoDbAsyncClient.createTable(getRequest()));
|
||||||
|
tableDescription = response.tableDescription();
|
||||||
|
log.info("DDB Table: {} created", config.tableName());
|
||||||
|
} else {
|
||||||
|
log.info("Skipping DDB table {} creation as it already exists", config.tableName());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tableDescription.tableStatus() != TableStatus.ACTIVE) {
|
||||||
|
log.info("Waiting for DDB Table: {} to become active", config.tableName());
|
||||||
|
try (final DynamoDbAsyncWaiter waiter = dynamoDbAsyncClient.waiter()) {
|
||||||
|
final WaiterResponse<DescribeTableResponse> response =
|
||||||
|
unwrappingFuture(() -> waiter.waitUntilTableExists(
|
||||||
|
r -> r.tableName(config.tableName()), o -> o.waitTimeout(Duration.ofMinutes(10))));
|
||||||
|
response.matched()
|
||||||
|
.response()
|
||||||
|
.orElseThrow(() -> new DependencyException(new IllegalStateException(
|
||||||
|
"Creating CoordinatorState table timed out",
|
||||||
|
response.matched().exception().orElse(null))));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private CreateTableRequest getRequest() {
|
||||||
|
final CreateTableRequest.Builder requestBuilder = CreateTableRequest.builder()
|
||||||
|
.tableName(config.tableName())
|
||||||
|
.keySchema(KeySchemaElement.builder()
|
||||||
|
.attributeName(COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME)
|
||||||
|
.keyType(KeyType.HASH)
|
||||||
|
.build())
|
||||||
|
.attributeDefinitions(AttributeDefinition.builder()
|
||||||
|
.attributeName(COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME)
|
||||||
|
.attributeType(ScalarAttributeType.S)
|
||||||
|
.build());
|
||||||
|
|
||||||
|
switch (config.billingMode()) {
|
||||||
|
case PAY_PER_REQUEST:
|
||||||
|
requestBuilder.billingMode(BillingMode.PAY_PER_REQUEST);
|
||||||
|
break;
|
||||||
|
case PROVISIONED:
|
||||||
|
requestBuilder.billingMode(BillingMode.PROVISIONED);
|
||||||
|
|
||||||
|
final ProvisionedThroughput throughput = ProvisionedThroughput.builder()
|
||||||
|
.readCapacityUnits(config.readCapacity())
|
||||||
|
.writeCapacityUnits(config.writeCapacity())
|
||||||
|
.build();
|
||||||
|
requestBuilder.provisionedThroughput(throughput);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return requestBuilder.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, AttributeValue> getCoordinatorStateKey(@NonNull final String key) {
|
||||||
|
return Collections.singletonMap(
|
||||||
|
COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME, DynamoUtils.createAttributeValue(key));
|
||||||
|
}
|
||||||
|
|
||||||
|
private CoordinatorState fromDynamoRecord(final Map<String, AttributeValue> dynamoRecord) {
|
||||||
|
final HashMap<String, AttributeValue> attributes = new HashMap<>(dynamoRecord);
|
||||||
|
final String keyValue =
|
||||||
|
DynamoUtils.safeGetString(attributes.remove(COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME));
|
||||||
|
|
||||||
|
final MigrationState migrationState = MigrationState.deserialize(keyValue, attributes);
|
||||||
|
if (migrationState != null) {
|
||||||
|
log.debug("Retrieved MigrationState {}", migrationState);
|
||||||
|
return migrationState;
|
||||||
|
}
|
||||||
|
|
||||||
|
final CoordinatorState c =
|
||||||
|
CoordinatorState.builder().key(keyValue).attributes(attributes).build();
|
||||||
|
log.debug("Retrieved coordinatorState {}", c);
|
||||||
|
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, AttributeValue> toDynamoRecord(final CoordinatorState state) {
|
||||||
|
final Map<String, AttributeValue> result = new HashMap<>();
|
||||||
|
result.put(COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME, DynamoUtils.createAttributeValue(state.getKey()));
|
||||||
|
if (state instanceof MigrationState) {
|
||||||
|
result.putAll(((MigrationState) state).serialize());
|
||||||
|
}
|
||||||
|
if (!CollectionUtils.isNullOrEmpty(state.getAttributes())) {
|
||||||
|
result.putAll(state.getAttributes());
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, ExpectedAttributeValue> getDynamoNonExistentExpectation() {
|
||||||
|
final Map<String, ExpectedAttributeValue> result = new HashMap<>();
|
||||||
|
|
||||||
|
final ExpectedAttributeValue expectedAV =
|
||||||
|
ExpectedAttributeValue.builder().exists(false).build();
|
||||||
|
result.put(COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME, expectedAV);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, ExpectedAttributeValue> getDynamoExistentExpectation(final String keyValue) {
|
||||||
|
final Map<String, ExpectedAttributeValue> result = new HashMap<>();
|
||||||
|
|
||||||
|
final ExpectedAttributeValue expectedAV = ExpectedAttributeValue.builder()
|
||||||
|
.value(AttributeValue.fromS(keyValue))
|
||||||
|
.build();
|
||||||
|
result.put(COORDINATOR_STATE_TABLE_HASH_KEY_ATTRIBUTE_NAME, expectedAV);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, AttributeValueUpdate> getDynamoCoordinatorStateUpdate(final CoordinatorState state) {
|
||||||
|
final HashMap<String, AttributeValueUpdate> updates = new HashMap<>();
|
||||||
|
if (state instanceof MigrationState) {
|
||||||
|
updates.putAll(((MigrationState) state).getDynamoUpdate());
|
||||||
|
}
|
||||||
|
state.getAttributes()
|
||||||
|
.forEach((attribute, value) -> updates.put(
|
||||||
|
attribute,
|
||||||
|
AttributeValueUpdate.builder()
|
||||||
|
.value(value)
|
||||||
|
.action(AttributeAction.PUT)
|
||||||
|
.build()));
|
||||||
|
return updates;
|
||||||
|
}
|
||||||
|
|
||||||
|
private TableDescription getTableDescription() {
|
||||||
|
try {
|
||||||
|
final DescribeTableResponse response = unwrappingFuture(() -> dynamoDbAsyncClient.describeTable(
|
||||||
|
DescribeTableRequest.builder().tableName(config.tableName()).build()));
|
||||||
|
return response.table();
|
||||||
|
} catch (final ResourceNotFoundException e) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -28,12 +28,17 @@ import java.util.function.BooleanSupplier;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||||
import software.amazon.awssdk.utils.CollectionUtils;
|
import software.amazon.awssdk.utils.CollectionUtils;
|
||||||
import software.amazon.kinesis.leases.Lease;
|
import software.amazon.kinesis.leases.Lease;
|
||||||
import software.amazon.kinesis.leases.LeaseRefresher;
|
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsScope;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An implementation of the {@code LeaderDecider} to elect leader(s) based on workerId.
|
* An implementation of the {@code LeaderDecider} to elect leader(s) based on workerId.
|
||||||
|
|
@ -46,7 +51,7 @@ import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||||
* This ensures redundancy for shard-sync during host failures.
|
* This ensures redundancy for shard-sync during host failures.
|
||||||
*/
|
*/
|
||||||
@Slf4j
|
@Slf4j
|
||||||
class DeterministicShuffleShardSyncLeaderDecider implements LeaderDecider {
|
public class DeterministicShuffleShardSyncLeaderDecider implements LeaderDecider {
|
||||||
// Fixed seed so that the shuffle order is preserved across workers
|
// Fixed seed so that the shuffle order is preserved across workers
|
||||||
static final int DETERMINISTIC_SHUFFLE_SEED = 1947;
|
static final int DETERMINISTIC_SHUFFLE_SEED = 1947;
|
||||||
|
|
||||||
|
|
@ -59,6 +64,7 @@ class DeterministicShuffleShardSyncLeaderDecider implements LeaderDecider {
|
||||||
private final LeaseRefresher leaseRefresher;
|
private final LeaseRefresher leaseRefresher;
|
||||||
private final int numPeriodicShardSyncWorkers;
|
private final int numPeriodicShardSyncWorkers;
|
||||||
private final ScheduledExecutorService leaderElectionThreadPool;
|
private final ScheduledExecutorService leaderElectionThreadPool;
|
||||||
|
private final MetricsFactory metricsFactory;
|
||||||
|
|
||||||
private volatile Set<String> leaders;
|
private volatile Set<String> leaders;
|
||||||
|
|
||||||
|
|
@ -67,11 +73,17 @@ class DeterministicShuffleShardSyncLeaderDecider implements LeaderDecider {
|
||||||
* @param leaderElectionThreadPool Thread-pool to be used for leaderElection.
|
* @param leaderElectionThreadPool Thread-pool to be used for leaderElection.
|
||||||
* @param numPeriodicShardSyncWorkers Number of leaders that will be elected to perform periodic shard syncs.
|
* @param numPeriodicShardSyncWorkers Number of leaders that will be elected to perform periodic shard syncs.
|
||||||
*/
|
*/
|
||||||
DeterministicShuffleShardSyncLeaderDecider(
|
public DeterministicShuffleShardSyncLeaderDecider(
|
||||||
LeaseRefresher leaseRefresher,
|
LeaseRefresher leaseRefresher,
|
||||||
ScheduledExecutorService leaderElectionThreadPool,
|
ScheduledExecutorService leaderElectionThreadPool,
|
||||||
int numPeriodicShardSyncWorkers) {
|
int numPeriodicShardSyncWorkers,
|
||||||
this(leaseRefresher, leaderElectionThreadPool, numPeriodicShardSyncWorkers, new ReentrantReadWriteLock());
|
MetricsFactory metricsFactory) {
|
||||||
|
this(
|
||||||
|
leaseRefresher,
|
||||||
|
leaderElectionThreadPool,
|
||||||
|
numPeriodicShardSyncWorkers,
|
||||||
|
new ReentrantReadWriteLock(),
|
||||||
|
metricsFactory);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -84,11 +96,13 @@ class DeterministicShuffleShardSyncLeaderDecider implements LeaderDecider {
|
||||||
LeaseRefresher leaseRefresher,
|
LeaseRefresher leaseRefresher,
|
||||||
ScheduledExecutorService leaderElectionThreadPool,
|
ScheduledExecutorService leaderElectionThreadPool,
|
||||||
int numPeriodicShardSyncWorkers,
|
int numPeriodicShardSyncWorkers,
|
||||||
ReadWriteLock readWriteLock) {
|
ReadWriteLock readWriteLock,
|
||||||
|
MetricsFactory metricsFactory) {
|
||||||
this.leaseRefresher = leaseRefresher;
|
this.leaseRefresher = leaseRefresher;
|
||||||
this.leaderElectionThreadPool = leaderElectionThreadPool;
|
this.leaderElectionThreadPool = leaderElectionThreadPool;
|
||||||
this.numPeriodicShardSyncWorkers = numPeriodicShardSyncWorkers;
|
this.numPeriodicShardSyncWorkers = numPeriodicShardSyncWorkers;
|
||||||
this.readWriteLock = readWriteLock;
|
this.readWriteLock = readWriteLock;
|
||||||
|
this.metricsFactory = metricsFactory;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
@ -146,8 +160,13 @@ class DeterministicShuffleShardSyncLeaderDecider implements LeaderDecider {
|
||||||
ELECTION_SCHEDULING_INTERVAL_MILLIS,
|
ELECTION_SCHEDULING_INTERVAL_MILLIS,
|
||||||
TimeUnit.MILLISECONDS);
|
TimeUnit.MILLISECONDS);
|
||||||
}
|
}
|
||||||
|
final boolean response = executeConditionCheckWithReadLock(() -> isWorkerLeaderForShardSync(workerId));
|
||||||
return executeConditionCheckWithReadLock(() -> isWorkerLeaderForShardSync(workerId));
|
final MetricsScope metricsScope =
|
||||||
|
MetricsUtil.createMetricsWithOperation(metricsFactory, METRIC_OPERATION_LEADER_DECIDER);
|
||||||
|
metricsScope.addData(
|
||||||
|
METRIC_OPERATION_LEADER_DECIDER_IS_LEADER, response ? 1 : 0, StandardUnit.COUNT, MetricsLevel.DETAILED);
|
||||||
|
MetricsUtil.endScope(metricsScope);
|
||||||
|
return response;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,403 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.coordinator;
|
||||||
|
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.concurrent.ScheduledExecutorService;
|
||||||
|
import java.util.concurrent.ScheduledFuture;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.function.BiFunction;
|
||||||
|
import java.util.function.Supplier;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.experimental.Accessors;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.coordinator.MigrationAdaptiveLeaseAssignmentModeProvider.LeaseAssignmentMode;
|
||||||
|
import software.amazon.kinesis.coordinator.assignment.LeaseAssignmentManager;
|
||||||
|
import software.amazon.kinesis.coordinator.migration.ClientVersion;
|
||||||
|
import software.amazon.kinesis.leader.DynamoDBLockBasedLeaderDecider;
|
||||||
|
import software.amazon.kinesis.leader.MigrationAdaptiveLeaderDecider;
|
||||||
|
import software.amazon.kinesis.leases.LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig;
|
||||||
|
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
|
import software.amazon.kinesis.worker.metricstats.WorkerMetricStatsDAO;
|
||||||
|
import software.amazon.kinesis.worker.metricstats.WorkerMetricStatsManager;
|
||||||
|
import software.amazon.kinesis.worker.metricstats.WorkerMetricStatsReporter;
|
||||||
|
|
||||||
|
import static software.amazon.kinesis.coordinator.MigrationAdaptiveLeaseAssignmentModeProvider.LeaseAssignmentMode.DEFAULT_LEASE_COUNT_BASED_ASSIGNMENT;
|
||||||
|
import static software.amazon.kinesis.coordinator.MigrationAdaptiveLeaseAssignmentModeProvider.LeaseAssignmentMode.WORKER_UTILIZATION_AWARE_ASSIGNMENT;
|
||||||
|
import static software.amazon.kinesis.coordinator.assignment.LeaseAssignmentManager.DEFAULT_NO_OF_SKIP_STAT_FOR_DEAD_WORKER_THRESHOLD;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class is responsible for initializing the KCL components that supports
|
||||||
|
* seamless upgrade from v2.x to v3.x.
|
||||||
|
* During specific versions, it also dynamically switches the functionality
|
||||||
|
* to be either vanilla 3.x or 2.x compatible.
|
||||||
|
*
|
||||||
|
* It is responsible for creating:
|
||||||
|
* 1. LeaderDecider
|
||||||
|
* 2. LAM
|
||||||
|
* 3. WorkerMetricStatsReporter
|
||||||
|
*
|
||||||
|
* It manages initializing the following components at initialization time
|
||||||
|
* 1. workerMetricsDAO and workerMetricsManager
|
||||||
|
* 2. leaderDecider
|
||||||
|
* 3. MigrationAdaptiveLeaseAssignmentModeProvider
|
||||||
|
*
|
||||||
|
* It updates the following components dynamically:
|
||||||
|
* 1. starts/stops LAM
|
||||||
|
* 2. starts/stops WorkerMetricStatsReporter
|
||||||
|
* 3. updates LeaseAssignmentMode to either DEFAULT_LEASE_COUNT_BASED_ASSIGNMENT or WORKER_UTILIZATION_AWARE_ASSIGNMENT
|
||||||
|
* 4. creates GSI (deletion is done by KclMigrationTool)
|
||||||
|
* 5. creates WorkerMetricStats table (deletion is done by KclMigrationTool)
|
||||||
|
* 6. updates LeaderDecider to either DeterministicShuffleShardSyncLeaderDecider or DynamoDBLockBasedLeaderDecider
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
@ThreadSafe
|
||||||
|
@Accessors(fluent = true)
|
||||||
|
public final class DynamicMigrationComponentsInitializer {
|
||||||
|
private static final long SCHEDULER_SHUTDOWN_TIMEOUT_SECONDS = 60L;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
private final MetricsFactory metricsFactory;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
private final LeaseRefresher leaseRefresher;
|
||||||
|
|
||||||
|
private final CoordinatorStateDAO coordinatorStateDAO;
|
||||||
|
private final ScheduledExecutorService workerMetricsThreadPool;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
private final WorkerMetricStatsDAO workerMetricsDAO;
|
||||||
|
|
||||||
|
private final WorkerMetricStatsManager workerMetricsManager;
|
||||||
|
private final ScheduledExecutorService lamThreadPool;
|
||||||
|
private final BiFunction<ScheduledExecutorService, LeaderDecider, LeaseAssignmentManager> lamCreator;
|
||||||
|
private final Supplier<MigrationAdaptiveLeaderDecider> adaptiveLeaderDeciderCreator;
|
||||||
|
private final Supplier<DeterministicShuffleShardSyncLeaderDecider> deterministicLeaderDeciderCreator;
|
||||||
|
private final Supplier<DynamoDBLockBasedLeaderDecider> ddbLockBasedLeaderDeciderCreator;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
private final String workerIdentifier;
|
||||||
|
|
||||||
|
private final WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
private final long workerMetricsExpirySeconds;
|
||||||
|
|
||||||
|
private final MigrationAdaptiveLeaseAssignmentModeProvider leaseModeChangeConsumer;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
private LeaderDecider leaderDecider;
|
||||||
|
|
||||||
|
private LeaseAssignmentManager leaseAssignmentManager;
|
||||||
|
private ScheduledFuture<?> workerMetricsReporterFuture;
|
||||||
|
private LeaseAssignmentMode currentAssignmentMode;
|
||||||
|
private boolean dualMode;
|
||||||
|
private boolean initialized;
|
||||||
|
|
||||||
|
@Builder(access = AccessLevel.PACKAGE)
|
||||||
|
DynamicMigrationComponentsInitializer(
|
||||||
|
final MetricsFactory metricsFactory,
|
||||||
|
final LeaseRefresher leaseRefresher,
|
||||||
|
final CoordinatorStateDAO coordinatorStateDAO,
|
||||||
|
final ScheduledExecutorService workerMetricsThreadPool,
|
||||||
|
final WorkerMetricStatsDAO workerMetricsDAO,
|
||||||
|
final WorkerMetricStatsManager workerMetricsManager,
|
||||||
|
final ScheduledExecutorService lamThreadPool,
|
||||||
|
final BiFunction<ScheduledExecutorService, LeaderDecider, LeaseAssignmentManager> lamCreator,
|
||||||
|
final Supplier<MigrationAdaptiveLeaderDecider> adaptiveLeaderDeciderCreator,
|
||||||
|
final Supplier<DeterministicShuffleShardSyncLeaderDecider> deterministicLeaderDeciderCreator,
|
||||||
|
final Supplier<DynamoDBLockBasedLeaderDecider> ddbLockBasedLeaderDeciderCreator,
|
||||||
|
final String workerIdentifier,
|
||||||
|
final WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig,
|
||||||
|
final MigrationAdaptiveLeaseAssignmentModeProvider leaseAssignmentModeProvider) {
|
||||||
|
this.metricsFactory = metricsFactory;
|
||||||
|
this.leaseRefresher = leaseRefresher;
|
||||||
|
this.coordinatorStateDAO = coordinatorStateDAO;
|
||||||
|
this.workerIdentifier = workerIdentifier;
|
||||||
|
this.workerUtilizationAwareAssignmentConfig = workerUtilizationAwareAssignmentConfig;
|
||||||
|
this.workerMetricsExpirySeconds = Duration.ofMillis(DEFAULT_NO_OF_SKIP_STAT_FOR_DEAD_WORKER_THRESHOLD
|
||||||
|
* workerUtilizationAwareAssignmentConfig.workerMetricsReporterFreqInMillis())
|
||||||
|
.getSeconds();
|
||||||
|
this.workerMetricsManager = workerMetricsManager;
|
||||||
|
this.workerMetricsDAO = workerMetricsDAO;
|
||||||
|
this.workerMetricsThreadPool = workerMetricsThreadPool;
|
||||||
|
this.lamThreadPool = lamThreadPool;
|
||||||
|
this.lamCreator = lamCreator;
|
||||||
|
this.adaptiveLeaderDeciderCreator = adaptiveLeaderDeciderCreator;
|
||||||
|
this.deterministicLeaderDeciderCreator = deterministicLeaderDeciderCreator;
|
||||||
|
this.ddbLockBasedLeaderDeciderCreator = ddbLockBasedLeaderDeciderCreator;
|
||||||
|
this.leaseModeChangeConsumer = leaseAssignmentModeProvider;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void initialize(final ClientVersion migrationStateMachineStartingClientVersion) throws DependencyException {
|
||||||
|
if (initialized) {
|
||||||
|
log.info("Already initialized, nothing to do");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// always collect metrics so that when we flip to start reporting we will have accurate historical data.
|
||||||
|
log.info("Start collection of WorkerMetricStats");
|
||||||
|
workerMetricsManager.startManager();
|
||||||
|
if (migrationStateMachineStartingClientVersion == ClientVersion.CLIENT_VERSION_3x) {
|
||||||
|
initializeComponentsFor3x();
|
||||||
|
} else {
|
||||||
|
initializeComponentsForMigration(migrationStateMachineStartingClientVersion);
|
||||||
|
}
|
||||||
|
log.info("Initialized dual mode {} current assignment mode {}", dualMode, currentAssignmentMode);
|
||||||
|
|
||||||
|
log.info("Creating LAM");
|
||||||
|
leaseAssignmentManager = lamCreator.apply(lamThreadPool, leaderDecider);
|
||||||
|
log.info("Initializing {}", leaseModeChangeConsumer.getClass().getSimpleName());
|
||||||
|
leaseModeChangeConsumer.initialize(dualMode, currentAssignmentMode);
|
||||||
|
initialized = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initializeComponentsFor3x() {
|
||||||
|
log.info("Initializing for 3x functionality");
|
||||||
|
dualMode = false;
|
||||||
|
currentAssignmentMode = WORKER_UTILIZATION_AWARE_ASSIGNMENT;
|
||||||
|
log.info("Initializing dualMode {} assignmentMode {}", dualMode, currentAssignmentMode);
|
||||||
|
leaderDecider = ddbLockBasedLeaderDeciderCreator.get();
|
||||||
|
log.info("Initializing {}", leaderDecider.getClass().getSimpleName());
|
||||||
|
leaderDecider.initialize();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initializeComponentsForMigration(final ClientVersion migrationStateMachineStartingClientVersion) {
|
||||||
|
log.info("Initializing for migration to 3x");
|
||||||
|
dualMode = true;
|
||||||
|
final LeaderDecider initialLeaderDecider;
|
||||||
|
if (migrationStateMachineStartingClientVersion == ClientVersion.CLIENT_VERSION_3x_WITH_ROLLBACK) {
|
||||||
|
currentAssignmentMode = WORKER_UTILIZATION_AWARE_ASSIGNMENT;
|
||||||
|
initialLeaderDecider = ddbLockBasedLeaderDeciderCreator.get();
|
||||||
|
} else {
|
||||||
|
currentAssignmentMode = DEFAULT_LEASE_COUNT_BASED_ASSIGNMENT;
|
||||||
|
initialLeaderDecider = deterministicLeaderDeciderCreator.get();
|
||||||
|
}
|
||||||
|
log.info("Initializing dualMode {} assignmentMode {}", dualMode, currentAssignmentMode);
|
||||||
|
|
||||||
|
final MigrationAdaptiveLeaderDecider adaptiveLeaderDecider = adaptiveLeaderDeciderCreator.get();
|
||||||
|
log.info(
|
||||||
|
"Initializing MigrationAdaptiveLeaderDecider with {}",
|
||||||
|
initialLeaderDecider.getClass().getSimpleName());
|
||||||
|
adaptiveLeaderDecider.updateLeaderDecider(initialLeaderDecider);
|
||||||
|
this.leaderDecider = adaptiveLeaderDecider;
|
||||||
|
}
|
||||||
|
|
||||||
|
void shutdown() {
|
||||||
|
log.info("Shutting down components");
|
||||||
|
if (initialized) {
|
||||||
|
log.info("Stopping LAM, LeaderDecider, workerMetrics reporting and collection");
|
||||||
|
leaseAssignmentManager.stop();
|
||||||
|
// leader decider is shut down later when scheduler is doing a final shutdown
|
||||||
|
// since scheduler still accesses the leader decider while shutting down
|
||||||
|
stopWorkerMetricsReporter();
|
||||||
|
workerMetricsManager.stopManager();
|
||||||
|
}
|
||||||
|
|
||||||
|
// lam does not manage lifecycle of its threadpool to easily stop/start dynamically.
|
||||||
|
// once migration code is obsolete (i.e. all 3x functionality is the baseline and no
|
||||||
|
// migration is needed), it can be moved inside lam
|
||||||
|
log.info("Shutting down lamThreadPool and workerMetrics reporter thread pool");
|
||||||
|
lamThreadPool.shutdown();
|
||||||
|
workerMetricsThreadPool.shutdown();
|
||||||
|
try {
|
||||||
|
if (!lamThreadPool.awaitTermination(SCHEDULER_SHUTDOWN_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
|
||||||
|
lamThreadPool.shutdownNow();
|
||||||
|
}
|
||||||
|
} catch (final InterruptedException e) {
|
||||||
|
log.warn("Interrupted while waiting for shutdown of LeaseAssignmentManager ThreadPool", e);
|
||||||
|
lamThreadPool.shutdownNow();
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (!workerMetricsThreadPool.awaitTermination(SCHEDULER_SHUTDOWN_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
|
||||||
|
workerMetricsThreadPool.shutdownNow();
|
||||||
|
}
|
||||||
|
} catch (final InterruptedException e) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
log.warn("Interrupted while waiting for shutdown of WorkerMetricStatsManager ThreadPool", e);
|
||||||
|
workerMetricsThreadPool.shutdownNow();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void startWorkerMetricsReporting() throws DependencyException {
|
||||||
|
if (workerMetricsReporterFuture != null) {
|
||||||
|
log.info("Worker metrics reporting is already running...");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
log.info("Initializing WorkerMetricStats");
|
||||||
|
this.workerMetricsDAO.initialize();
|
||||||
|
log.info("Starting worker metrics reporter");
|
||||||
|
// Start with a delay for workerStatsManager to capture some values and start reporting.
|
||||||
|
workerMetricsReporterFuture = workerMetricsThreadPool.scheduleAtFixedRate(
|
||||||
|
new WorkerMetricStatsReporter(metricsFactory, workerIdentifier, workerMetricsManager, workerMetricsDAO),
|
||||||
|
workerUtilizationAwareAssignmentConfig.inMemoryWorkerMetricsCaptureFrequencyMillis() * 2L,
|
||||||
|
workerUtilizationAwareAssignmentConfig.workerMetricsReporterFreqInMillis(),
|
||||||
|
TimeUnit.MILLISECONDS);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void stopWorkerMetricsReporter() {
|
||||||
|
log.info("Stopping worker metrics reporter");
|
||||||
|
if (workerMetricsReporterFuture != null) {
|
||||||
|
workerMetricsReporterFuture.cancel(false);
|
||||||
|
workerMetricsReporterFuture = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create LeaseOwnerToLeaseKey GSI for the lease table
|
||||||
|
* @param blockingWait whether to wait for the GSI creation or not, if false, the gsi creation will be initiated
|
||||||
|
* but this call will not block for its creation
|
||||||
|
* @throws DependencyException If DDB fails unexpectedly when creating the GSI
|
||||||
|
*/
|
||||||
|
private void createGsi(final boolean blockingWait) throws DependencyException {
|
||||||
|
log.info("Creating Lease table GSI if it does not exist");
|
||||||
|
// KCLv3.0 always starts with GSI available
|
||||||
|
leaseRefresher.createLeaseOwnerToLeaseKeyIndexIfNotExists();
|
||||||
|
|
||||||
|
if (blockingWait) {
|
||||||
|
log.info("Waiting for Lease table GSI creation");
|
||||||
|
final long secondsBetweenPolls = 10L;
|
||||||
|
final long timeoutSeconds = 600L;
|
||||||
|
final boolean isIndexActive =
|
||||||
|
leaseRefresher.waitUntilLeaseOwnerToLeaseKeyIndexExists(secondsBetweenPolls, timeoutSeconds);
|
||||||
|
|
||||||
|
if (!isIndexActive) {
|
||||||
|
throw new DependencyException(
|
||||||
|
new IllegalStateException("Creating LeaseOwnerToLeaseKeyIndex on Lease table timed out"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Initialize KCL with components and configuration to support upgrade from 2x. This can happen
 * at KCL Worker startup when MigrationStateMachine starts in ClientVersion.CLIENT_VERSION_UPGRADE_FROM_2x.
 * Or Dynamically during roll-forward from ClientVersion.CLIENT_VERSION_2x.
 *
 * @param fromClientVersion the client version the MigrationStateMachine is transitioning from
 *                          (logged for diagnostics only; it does not alter behavior here)
 * @throws DependencyException if DDB fails unexpectedly while initiating GSI creation or
 *                             initializing worker metrics reporting
 */
public synchronized void initializeClientVersionForUpgradeFrom2x(final ClientVersion fromClientVersion)
        throws DependencyException {
    log.info("Initializing KCL components for upgrade from 2x from {}", fromClientVersion);

    // Initiate (non-blocking) creation of the lease table's LeaseOwnerToLeaseKey GSI.
    createGsi(false);
    // Start publishing WorkerMetricStats now so a later flip to 3.x assignment has data.
    startWorkerMetricsReporting();
    // LAM is not started until the dynamic flip to 3xWithRollback
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize KCL with components and configuration to run vanilla 3x functionality. This can happen
|
||||||
|
* at KCL Worker startup when MigrationStateMachine starts in ClientVersion.CLIENT_VERSION_3x, or dynamically
|
||||||
|
* during a new deployment when existing worker are in ClientVersion.CLIENT_VERSION_3x_WITH_ROLLBACK
|
||||||
|
*/
|
||||||
|
public synchronized void initializeClientVersionFor3x(final ClientVersion fromClientVersion)
|
||||||
|
throws DependencyException {
|
||||||
|
log.info("Initializing KCL components for 3x from {}", fromClientVersion);
|
||||||
|
|
||||||
|
log.info("Initializing LeaseAssignmentManager, DDB-lock-based leader decider, WorkerMetricStats manager"
|
||||||
|
+ " and creating the Lease table GSI if it does not exist");
|
||||||
|
if (fromClientVersion == ClientVersion.CLIENT_VERSION_INIT) {
|
||||||
|
// gsi may already exist and be active for migrated application.
|
||||||
|
createGsi(true);
|
||||||
|
startWorkerMetricsReporting();
|
||||||
|
log.info("Starting LAM");
|
||||||
|
leaseAssignmentManager.start();
|
||||||
|
}
|
||||||
|
// nothing to do when transitioning from CLIENT_VERSION_3x_WITH_ROLLBACK.
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Initialize KCL with components and configuration to run 2x compatible functionality
 * while allowing roll-forward. This can happen at KCL Worker startup when MigrationStateMachine
 * starts in ClientVersion.CLIENT_VERSION_2x (after a rollback)
 * Or Dynamically during rollback from CLIENT_VERSION_UPGRADE_FROM_2x or CLIENT_VERSION_3x_WITH_ROLLBACK.
 *
 * @param fromClientVersion the client version the MigrationStateMachine is transitioning from;
 *                          determines which 3.x-only components must be torn down
 */
public synchronized void initializeClientVersionFor2x(final ClientVersion fromClientVersion) {
    log.info("Initializing KCL components for rollback to 2x from {}", fromClientVersion);

    if (fromClientVersion != ClientVersion.CLIENT_VERSION_INIT) {
        // dynamic rollback (not worker startup): stop publishing WorkerMetricStats.
        stopWorkerMetricsReporter();
        // Migration Tool will delete the lease table LeaseOwner GSI
        // and WorkerMetricStats table
    }

    if (fromClientVersion == ClientVersion.CLIENT_VERSION_3x_WITH_ROLLBACK) {
        // we are rolling back after flip: revert to lease-count-based assignment
        // and tear down the 3.x-only components.
        currentAssignmentMode = DEFAULT_LEASE_COUNT_BASED_ASSIGNMENT;
        notifyLeaseAssignmentModeChange();
        log.info("Stopping LAM");
        leaseAssignmentManager.stop();
        // Swap the active leader decider back to the deterministic (2.x) implementation.
        final LeaderDecider leaderDecider = deterministicLeaderDeciderCreator.get();
        if (this.leaderDecider instanceof MigrationAdaptiveLeaderDecider) {
            log.info(
                    "Updating LeaderDecider to {}", leaderDecider.getClass().getSimpleName());
            ((MigrationAdaptiveLeaderDecider) this.leaderDecider).updateLeaderDecider(leaderDecider);
        } else {
            // Rollback is only expected while the migration-adaptive decider is installed.
            throw new IllegalStateException(String.format("Unexpected leader decider %s", this.leaderDecider));
        }
    }
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize KCL with components and configuration to run vanilla 3x functionality
|
||||||
|
* while allowing roll-back to 2x functionality. This can happen at KCL Worker startup
|
||||||
|
* when MigrationStateMachine starts in ClientVersion.CLIENT_VERSION_3x_WITH_ROLLBACK (after the flip)
|
||||||
|
* Or Dynamically during flip from CLIENT_VERSION_UPGRADE_FROM_2x.
|
||||||
|
*/
|
||||||
|
public synchronized void initializeClientVersionFor3xWithRollback(final ClientVersion fromClientVersion)
|
||||||
|
throws DependencyException {
|
||||||
|
log.info("Initializing KCL components for 3x with rollback from {}", fromClientVersion);
|
||||||
|
|
||||||
|
if (fromClientVersion == ClientVersion.CLIENT_VERSION_UPGRADE_FROM_2x) {
|
||||||
|
// dynamic flip
|
||||||
|
currentAssignmentMode = WORKER_UTILIZATION_AWARE_ASSIGNMENT;
|
||||||
|
notifyLeaseAssignmentModeChange();
|
||||||
|
final LeaderDecider leaderDecider = ddbLockBasedLeaderDeciderCreator.get();
|
||||||
|
log.info("Updating LeaderDecider to {}", leaderDecider.getClass().getSimpleName());
|
||||||
|
((MigrationAdaptiveLeaderDecider) this.leaderDecider).updateLeaderDecider(leaderDecider);
|
||||||
|
} else {
|
||||||
|
startWorkerMetricsReporting();
|
||||||
|
}
|
||||||
|
|
||||||
|
log.info("Starting LAM");
|
||||||
|
leaseAssignmentManager.start();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Synchronously invoke the consumer to change the lease assignment mode.
|
||||||
|
*/
|
||||||
|
private void notifyLeaseAssignmentModeChange() {
|
||||||
|
if (dualMode) {
|
||||||
|
log.info("Notifying {} of {}", leaseModeChangeConsumer, currentAssignmentMode);
|
||||||
|
if (Objects.nonNull(leaseModeChangeConsumer)) {
|
||||||
|
try {
|
||||||
|
leaseModeChangeConsumer.updateLeaseAssignmentMode(currentAssignmentMode);
|
||||||
|
} catch (final Exception e) {
|
||||||
|
log.warn("LeaseAssignmentMode change consumer threw exception", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
throw new IllegalStateException("Unexpected assignment mode change");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,144 @@
|
||||||
|
package software.amazon.kinesis.coordinator;
|
||||||
|
|
||||||
|
import java.util.concurrent.CompletableFuture;
|
||||||
|
import java.util.concurrent.CompletionException;
|
||||||
|
import java.util.function.Supplier;
|
||||||
|
|
||||||
|
import software.amazon.awssdk.services.dynamodb.DynamoDbAsyncClient;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.DynamoDbClient;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.BatchGetItemRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.BatchGetItemResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.BatchWriteItemRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.BatchWriteItemResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.CreateTableRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.CreateTableResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.DeleteItemRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.DeleteItemResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.DeleteTableRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.DeleteTableResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.DescribeTableRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.DescribeTableResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.GetItemRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.GetItemResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.PutItemRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.PutItemResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.QueryRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.QueryResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ScanRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ScanResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.UpdateItemRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.UpdateItemResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.paginators.BatchGetItemIterable;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.paginators.QueryIterable;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.paginators.ScanIterable;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* DDB Lock client depends on DynamoDbClient and KCL only has DynamoDbAsyncClient configured.
|
||||||
|
* This wrapper delegates APIs from sync client to async client internally so that it can
|
||||||
|
* be used with the DDB Lock client.
|
||||||
|
*/
|
||||||
|
public class DynamoDbAsyncToSyncClientAdapter implements DynamoDbClient {
|
||||||
|
private final DynamoDbAsyncClient asyncClient;
|
||||||
|
|
||||||
|
public DynamoDbAsyncToSyncClientAdapter(final DynamoDbAsyncClient asyncClient) {
|
||||||
|
this.asyncClient = asyncClient;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String serviceName() {
|
||||||
|
return asyncClient.serviceName();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() {
|
||||||
|
asyncClient.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private <T> T handleException(final Supplier<CompletableFuture<T>> task) {
|
||||||
|
try {
|
||||||
|
return task.get().join();
|
||||||
|
} catch (final CompletionException e) {
|
||||||
|
rethrow(e.getCause());
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public CreateTableResponse createTable(final CreateTableRequest request) {
|
||||||
|
return handleException(() -> asyncClient.createTable(request));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DescribeTableResponse describeTable(final DescribeTableRequest request) {
|
||||||
|
return handleException(() -> asyncClient.describeTable(request));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DeleteTableResponse deleteTable(final DeleteTableRequest request) {
|
||||||
|
return handleException(() -> asyncClient.deleteTable(request));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DeleteItemResponse deleteItem(final DeleteItemRequest request) {
|
||||||
|
return handleException(() -> asyncClient.deleteItem(request));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public GetItemResponse getItem(final GetItemRequest request) {
|
||||||
|
return handleException(() -> asyncClient.getItem(request));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public PutItemResponse putItem(final PutItemRequest request) {
|
||||||
|
return handleException(() -> asyncClient.putItem(request));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateItemResponse updateItem(final UpdateItemRequest request) {
|
||||||
|
return handleException(() -> asyncClient.updateItem(request));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public QueryResponse query(final QueryRequest request) {
|
||||||
|
return handleException(() -> asyncClient.query(request));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ScanResponse scan(final ScanRequest request) {
|
||||||
|
return handleException(() -> asyncClient.scan(request));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public QueryIterable queryPaginator(final QueryRequest request) {
|
||||||
|
return new QueryIterable(this, request);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ScanIterable scanPaginator(final ScanRequest request) {
|
||||||
|
return new ScanIterable(this, request);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BatchGetItemResponse batchGetItem(final BatchGetItemRequest request) {
|
||||||
|
return handleException(() -> asyncClient.batchGetItem(request));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BatchWriteItemResponse batchWriteItem(final BatchWriteItemRequest request) {
|
||||||
|
return handleException(() -> asyncClient.batchWriteItem(request));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BatchGetItemIterable batchGetItemPaginator(final BatchGetItemRequest request) {
|
||||||
|
return new BatchGetItemIterable(this, request);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void rethrow(final Throwable e) {
|
||||||
|
castAndThrow(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
private static <T extends Throwable> void castAndThrow(final Throwable e) throws T {
|
||||||
|
throw (T) e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -21,6 +21,8 @@ package software.amazon.kinesis.coordinator;
|
||||||
* worker is one of the leaders designated to execute shard-sync and then acts accordingly.
|
* worker is one of the leaders designated to execute shard-sync and then acts accordingly.
|
||||||
*/
|
*/
|
||||||
public interface LeaderDecider {
|
public interface LeaderDecider {
|
||||||
|
String METRIC_OPERATION_LEADER_DECIDER = "LeaderDecider";
|
||||||
|
String METRIC_OPERATION_LEADER_DECIDER_IS_LEADER = METRIC_OPERATION_LEADER_DECIDER + ":IsLeader";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Method invoked to check the given workerId corresponds to one of the workers
|
* Method invoked to check the given workerId corresponds to one of the workers
|
||||||
|
|
@ -36,4 +38,32 @@ public interface LeaderDecider {
|
||||||
* being used in the LeaderDecider implementation.
|
* being used in the LeaderDecider implementation.
|
||||||
*/
|
*/
|
||||||
void shutdown();
|
void shutdown();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs initialization tasks for decider if any.
|
||||||
|
*/
|
||||||
|
default void initialize() {
|
||||||
|
// No-op by default
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns if any ACTIVE leader exists that is elected by the current implementation.
|
||||||
|
* Note: Some implementation (like DeterministicShuffleShardSyncLeaderDecider) will always have a leader and will
|
||||||
|
* return true always.
|
||||||
|
*/
|
||||||
|
default boolean isAnyLeaderElected() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If the current worker is the leader, then releases the leadership else does nothing.
|
||||||
|
* This might not be relevant for some implementations, for e.g. DeterministicShuffleShardSyncLeaderDecider does
|
||||||
|
* not have mechanism to release leadership.
|
||||||
|
*
|
||||||
|
* Current worker if leader releases leadership, it's possible that the current worker assume leadership sometime
|
||||||
|
* later again in future elections.
|
||||||
|
*/
|
||||||
|
default void releaseLeadershipIfHeld() {
|
||||||
|
// No-op by default
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,126 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.coordinator;
|
||||||
|
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provides the lease assignment mode KCL must operate in during migration
|
||||||
|
* from 2.x to 3.x.
|
||||||
|
* KCL v2.x lease assignment is based on distributed-worker-stealing algorithm
|
||||||
|
* which balances lease count across workers.
|
||||||
|
* KCL v3.x lease assignment is based on a centralized-lease-assignment algorithm
|
||||||
|
* which balances resource utilization metrics(e.g. CPU utilization) across workers.
|
||||||
|
*
|
||||||
|
* For a new application starting in KCL v3.x, there is no migration needed,
|
||||||
|
* so KCL will initialize with the lease assignment mode accordingly, and it will
|
||||||
|
* not change dynamically.
|
||||||
|
*
|
||||||
|
* During upgrade from 2.x to 3.x, KCL library needs an ability to
|
||||||
|
* start in v2.x assignment mode but dynamically change to v3.x assignment.
|
||||||
|
* In this case, both 2.x and 3.x lease assignment will be running but one
|
||||||
|
* of them will be a no-op based on the mode.
|
||||||
|
*
|
||||||
|
* The methods and internal state is guarded for concurrent access to allow
|
||||||
|
* both lease assignment algorithms to access the state concurrently while
|
||||||
|
* it could be dynamically updated.
|
||||||
|
*/
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
@Slf4j
|
||||||
|
@ThreadSafe
|
||||||
|
@NoArgsConstructor
|
||||||
|
public final class MigrationAdaptiveLeaseAssignmentModeProvider {
|
||||||
|
|
||||||
|
public enum LeaseAssignmentMode {
|
||||||
|
/**
|
||||||
|
* This is the 2.x assignment mode.
|
||||||
|
* This mode assigns leases based on the number of leases.
|
||||||
|
* This mode involves each worker independently determining how many leases to pick or how many leases to steal
|
||||||
|
* from other workers.
|
||||||
|
*/
|
||||||
|
DEFAULT_LEASE_COUNT_BASED_ASSIGNMENT,
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is the 3.x assigment mode.
|
||||||
|
* This mode uses each worker's resource utilization to perform lease assignment.
|
||||||
|
* Assignment is done by a single worker (elected leader), which looks at WorkerMetricStats for each worker to
|
||||||
|
* determine lease assignment.
|
||||||
|
*
|
||||||
|
* This mode primarily does
|
||||||
|
* 1. Starts WorkerMetricStatsManager on the worker which starts publishing WorkerMetricStats
|
||||||
|
* 2. Starts the LeaseDiscoverer
|
||||||
|
* 3. Creates if not already available the LeaseOwnerToLeaseKey GSI on the lease table and validate that is
|
||||||
|
* ACTIVE.
|
||||||
|
*/
|
||||||
|
WORKER_UTILIZATION_AWARE_ASSIGNMENT;
|
||||||
|
}
|
||||||
|
|
||||||
|
private LeaseAssignmentMode currentMode;
|
||||||
|
private boolean initialized = false;
|
||||||
|
private boolean dynamicModeChangeSupportNeeded;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Specify whether both lease assignment algorithms should be initialized to
|
||||||
|
* support dynamically changing lease mode.
|
||||||
|
* @return true if lease assignment mode can change dynamically
|
||||||
|
* false otherwise.
|
||||||
|
*/
|
||||||
|
public synchronized boolean dynamicModeChangeSupportNeeded() {
|
||||||
|
return dynamicModeChangeSupportNeeded;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provide the current lease assignment mode in which KCL should perform lease assignment
|
||||||
|
* @return the current lease assignment mode
|
||||||
|
*/
|
||||||
|
public synchronized LeaseAssignmentMode getLeaseAssignmentMode() {
|
||||||
|
if (!initialized) {
|
||||||
|
throw new IllegalStateException("AssignmentMode is not initialized");
|
||||||
|
}
|
||||||
|
return currentMode;
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized void initialize(final boolean dynamicModeChangeSupportNeeded, final LeaseAssignmentMode mode) {
|
||||||
|
if (!initialized) {
|
||||||
|
log.info("Initializing dynamicModeChangeSupportNeeded {} mode {}", dynamicModeChangeSupportNeeded, mode);
|
||||||
|
this.dynamicModeChangeSupportNeeded = dynamicModeChangeSupportNeeded;
|
||||||
|
this.currentMode = mode;
|
||||||
|
this.initialized = true;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
log.info(
|
||||||
|
"Already initialized dynamicModeChangeSupportNeeded {} mode {}. Ignoring new values {}, {}",
|
||||||
|
this.dynamicModeChangeSupportNeeded,
|
||||||
|
this.currentMode,
|
||||||
|
dynamicModeChangeSupportNeeded,
|
||||||
|
mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized void updateLeaseAssignmentMode(final LeaseAssignmentMode mode) {
|
||||||
|
if (!initialized) {
|
||||||
|
throw new IllegalStateException("Cannot change mode before initializing");
|
||||||
|
}
|
||||||
|
if (dynamicModeChangeSupportNeeded) {
|
||||||
|
log.info("Changing Lease assignment mode from {} to {}", currentMode, mode);
|
||||||
|
this.currentMode = mode;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
throw new IllegalStateException(String.format(
|
||||||
|
"Lease assignment mode already initialized to %s cannot" + " change to %s", this.currentMode, mode));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -87,7 +87,7 @@ class PeriodicShardSyncManager {
|
||||||
private final Map<StreamIdentifier, HashRangeHoleTracker> hashRangeHoleTrackerMap = new HashMap<>();
|
private final Map<StreamIdentifier, HashRangeHoleTracker> hashRangeHoleTrackerMap = new HashMap<>();
|
||||||
|
|
||||||
private final String workerId;
|
private final String workerId;
|
||||||
private final LeaderDecider leaderDecider;
|
private LeaderDecider leaderDecider;
|
||||||
private final LeaseRefresher leaseRefresher;
|
private final LeaseRefresher leaseRefresher;
|
||||||
private final Map<StreamIdentifier, StreamConfig> currentStreamConfigMap;
|
private final Map<StreamIdentifier, StreamConfig> currentStreamConfigMap;
|
||||||
private final Function<StreamConfig, ShardSyncTaskManager> shardSyncTaskManagerProvider;
|
private final Function<StreamConfig, ShardSyncTaskManager> shardSyncTaskManagerProvider;
|
||||||
|
|
@ -105,7 +105,6 @@ class PeriodicShardSyncManager {
|
||||||
|
|
||||||
PeriodicShardSyncManager(
|
PeriodicShardSyncManager(
|
||||||
String workerId,
|
String workerId,
|
||||||
LeaderDecider leaderDecider,
|
|
||||||
LeaseRefresher leaseRefresher,
|
LeaseRefresher leaseRefresher,
|
||||||
Map<StreamIdentifier, StreamConfig> currentStreamConfigMap,
|
Map<StreamIdentifier, StreamConfig> currentStreamConfigMap,
|
||||||
Function<StreamConfig, ShardSyncTaskManager> shardSyncTaskManagerProvider,
|
Function<StreamConfig, ShardSyncTaskManager> shardSyncTaskManagerProvider,
|
||||||
|
|
@ -117,7 +116,6 @@ class PeriodicShardSyncManager {
|
||||||
AtomicBoolean leaderSynced) {
|
AtomicBoolean leaderSynced) {
|
||||||
this(
|
this(
|
||||||
workerId,
|
workerId,
|
||||||
leaderDecider,
|
|
||||||
leaseRefresher,
|
leaseRefresher,
|
||||||
currentStreamConfigMap,
|
currentStreamConfigMap,
|
||||||
shardSyncTaskManagerProvider,
|
shardSyncTaskManagerProvider,
|
||||||
|
|
@ -132,7 +130,6 @@ class PeriodicShardSyncManager {
|
||||||
|
|
||||||
PeriodicShardSyncManager(
|
PeriodicShardSyncManager(
|
||||||
String workerId,
|
String workerId,
|
||||||
LeaderDecider leaderDecider,
|
|
||||||
LeaseRefresher leaseRefresher,
|
LeaseRefresher leaseRefresher,
|
||||||
Map<StreamIdentifier, StreamConfig> currentStreamConfigMap,
|
Map<StreamIdentifier, StreamConfig> currentStreamConfigMap,
|
||||||
Function<StreamConfig, ShardSyncTaskManager> shardSyncTaskManagerProvider,
|
Function<StreamConfig, ShardSyncTaskManager> shardSyncTaskManagerProvider,
|
||||||
|
|
@ -144,9 +141,7 @@ class PeriodicShardSyncManager {
|
||||||
int leasesRecoveryAuditorInconsistencyConfidenceThreshold,
|
int leasesRecoveryAuditorInconsistencyConfidenceThreshold,
|
||||||
AtomicBoolean leaderSynced) {
|
AtomicBoolean leaderSynced) {
|
||||||
Validate.notBlank(workerId, "WorkerID is required to initialize PeriodicShardSyncManager.");
|
Validate.notBlank(workerId, "WorkerID is required to initialize PeriodicShardSyncManager.");
|
||||||
Validate.notNull(leaderDecider, "LeaderDecider is required to initialize PeriodicShardSyncManager.");
|
|
||||||
this.workerId = workerId;
|
this.workerId = workerId;
|
||||||
this.leaderDecider = leaderDecider;
|
|
||||||
this.leaseRefresher = leaseRefresher;
|
this.leaseRefresher = leaseRefresher;
|
||||||
this.currentStreamConfigMap = currentStreamConfigMap;
|
this.currentStreamConfigMap = currentStreamConfigMap;
|
||||||
this.shardSyncTaskManagerProvider = shardSyncTaskManagerProvider;
|
this.shardSyncTaskManagerProvider = shardSyncTaskManagerProvider;
|
||||||
|
|
@ -160,7 +155,9 @@ class PeriodicShardSyncManager {
|
||||||
this.leaderSynced = leaderSynced;
|
this.leaderSynced = leaderSynced;
|
||||||
}
|
}
|
||||||
|
|
||||||
public synchronized TaskResult start() {
|
public synchronized TaskResult start(final LeaderDecider leaderDecider) {
|
||||||
|
Validate.notNull(leaderDecider, "LeaderDecider is required to start PeriodicShardSyncManager.");
|
||||||
|
this.leaderDecider = leaderDecider;
|
||||||
if (!isRunning) {
|
if (!isRunning) {
|
||||||
final Runnable periodicShardSyncer = () -> {
|
final Runnable periodicShardSyncer = () -> {
|
||||||
try {
|
try {
|
||||||
|
|
@ -435,7 +432,7 @@ class PeriodicShardSyncManager {
|
||||||
leaseRefresher.updateLeaseWithMetaInfo(lease, UpdateField.HASH_KEY_RANGE);
|
leaseRefresher.updateLeaseWithMetaInfo(lease, UpdateField.HASH_KEY_RANGE);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.warn(
|
log.warn(
|
||||||
"Unable to update hash range key information for lease {} of stream {}."
|
"Unable to update hash range key information for lease {} of stream {}. "
|
||||||
+ "This may result in explicit lease sync.",
|
+ "This may result in explicit lease sync.",
|
||||||
lease.leaseKey(),
|
lease.leaseKey(),
|
||||||
streamIdentifier);
|
streamIdentifier);
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,7 @@ import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
import java.util.Random;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.Callable;
|
import java.util.concurrent.Callable;
|
||||||
import java.util.concurrent.CompletableFuture;
|
import java.util.concurrent.CompletableFuture;
|
||||||
|
|
@ -44,6 +45,7 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import com.google.common.base.Stopwatch;
|
import com.google.common.base.Stopwatch;
|
||||||
|
import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
||||||
import io.reactivex.rxjava3.plugins.RxJavaPlugins;
|
import io.reactivex.rxjava3.plugins.RxJavaPlugins;
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
|
|
@ -55,15 +57,23 @@ import lombok.extern.slf4j.Slf4j;
|
||||||
import software.amazon.awssdk.arns.Arn;
|
import software.amazon.awssdk.arns.Arn;
|
||||||
import software.amazon.awssdk.regions.Region;
|
import software.amazon.awssdk.regions.Region;
|
||||||
import software.amazon.awssdk.utils.Validate;
|
import software.amazon.awssdk.utils.Validate;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
import software.amazon.kinesis.checkpoint.CheckpointConfig;
|
import software.amazon.kinesis.checkpoint.CheckpointConfig;
|
||||||
import software.amazon.kinesis.checkpoint.ShardRecordProcessorCheckpointer;
|
import software.amazon.kinesis.checkpoint.ShardRecordProcessorCheckpointer;
|
||||||
import software.amazon.kinesis.common.StreamConfig;
|
import software.amazon.kinesis.common.StreamConfig;
|
||||||
import software.amazon.kinesis.common.StreamIdentifier;
|
import software.amazon.kinesis.common.StreamIdentifier;
|
||||||
|
import software.amazon.kinesis.coordinator.assignment.LeaseAssignmentManager;
|
||||||
|
import software.amazon.kinesis.coordinator.migration.MigrationStateMachine;
|
||||||
|
import software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl;
|
||||||
|
import software.amazon.kinesis.leader.DynamoDBLockBasedLeaderDecider;
|
||||||
|
import software.amazon.kinesis.leader.MigrationAdaptiveLeaderDecider;
|
||||||
import software.amazon.kinesis.leases.HierarchicalShardSyncer;
|
import software.amazon.kinesis.leases.HierarchicalShardSyncer;
|
||||||
import software.amazon.kinesis.leases.Lease;
|
import software.amazon.kinesis.leases.Lease;
|
||||||
import software.amazon.kinesis.leases.LeaseCleanupManager;
|
import software.amazon.kinesis.leases.LeaseCleanupManager;
|
||||||
import software.amazon.kinesis.leases.LeaseCoordinator;
|
import software.amazon.kinesis.leases.LeaseCoordinator;
|
||||||
import software.amazon.kinesis.leases.LeaseManagementConfig;
|
import software.amazon.kinesis.leases.LeaseManagementConfig;
|
||||||
|
import software.amazon.kinesis.leases.LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig;
|
||||||
|
import software.amazon.kinesis.leases.LeaseManagementFactory;
|
||||||
import software.amazon.kinesis.leases.LeaseRefresher;
|
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||||
import software.amazon.kinesis.leases.LeaseSerializer;
|
import software.amazon.kinesis.leases.LeaseSerializer;
|
||||||
import software.amazon.kinesis.leases.MultiStreamLease;
|
import software.amazon.kinesis.leases.MultiStreamLease;
|
||||||
|
|
@ -98,6 +108,9 @@ import software.amazon.kinesis.retrieval.AggregatorUtil;
|
||||||
import software.amazon.kinesis.retrieval.RecordsPublisher;
|
import software.amazon.kinesis.retrieval.RecordsPublisher;
|
||||||
import software.amazon.kinesis.retrieval.RetrievalConfig;
|
import software.amazon.kinesis.retrieval.RetrievalConfig;
|
||||||
import software.amazon.kinesis.schemaregistry.SchemaRegistryDecoder;
|
import software.amazon.kinesis.schemaregistry.SchemaRegistryDecoder;
|
||||||
|
import software.amazon.kinesis.worker.WorkerMetricsSelector;
|
||||||
|
import software.amazon.kinesis.worker.metricstats.WorkerMetricStatsDAO;
|
||||||
|
import software.amazon.kinesis.worker.metricstats.WorkerMetricStatsManager;
|
||||||
|
|
||||||
import static software.amazon.kinesis.common.ArnUtil.constructStreamArn;
|
import static software.amazon.kinesis.common.ArnUtil.constructStreamArn;
|
||||||
import static software.amazon.kinesis.processor.FormerStreamsLeasesDeletionStrategy.StreamsLeasesDeletionType;
|
import static software.amazon.kinesis.processor.FormerStreamsLeasesDeletionStrategy.StreamsLeasesDeletionType;
|
||||||
|
|
@ -106,12 +119,14 @@ import static software.amazon.kinesis.processor.FormerStreamsLeasesDeletionStrat
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
@Getter
|
@Getter(AccessLevel.PRIVATE)
|
||||||
@Accessors(fluent = true)
|
@Accessors(fluent = true)
|
||||||
@Slf4j
|
@Slf4j
|
||||||
|
@KinesisClientInternalApi
|
||||||
public class Scheduler implements Runnable {
|
public class Scheduler implements Runnable {
|
||||||
|
|
||||||
private static final int PERIODIC_SHARD_SYNC_MAX_WORKERS_DEFAULT = 1;
|
private static final int PERIODIC_SHARD_SYNC_MAX_WORKERS_DEFAULT = 1;
|
||||||
|
|
||||||
private static final long LEASE_TABLE_CHECK_FREQUENCY_MILLIS = 3 * 1000L;
|
private static final long LEASE_TABLE_CHECK_FREQUENCY_MILLIS = 3 * 1000L;
|
||||||
private static final long MIN_WAIT_TIME_FOR_LEASE_TABLE_CHECK_MILLIS = 1000L;
|
private static final long MIN_WAIT_TIME_FOR_LEASE_TABLE_CHECK_MILLIS = 1000L;
|
||||||
private static final long MAX_WAIT_TIME_FOR_LEASE_TABLE_CHECK_MILLIS = 30 * 1000L;
|
private static final long MAX_WAIT_TIME_FOR_LEASE_TABLE_CHECK_MILLIS = 30 * 1000L;
|
||||||
|
|
@ -133,7 +148,9 @@ public class Scheduler implements Runnable {
|
||||||
private final ProcessorConfig processorConfig;
|
private final ProcessorConfig processorConfig;
|
||||||
private final RetrievalConfig retrievalConfig;
|
private final RetrievalConfig retrievalConfig;
|
||||||
|
|
||||||
|
@Getter(AccessLevel.PACKAGE)
|
||||||
private final String applicationName;
|
private final String applicationName;
|
||||||
|
|
||||||
private final int maxInitializationAttempts;
|
private final int maxInitializationAttempts;
|
||||||
private final Checkpointer checkpoint;
|
private final Checkpointer checkpoint;
|
||||||
private final long shardConsumerDispatchPollIntervalMillis;
|
private final long shardConsumerDispatchPollIntervalMillis;
|
||||||
|
|
@ -156,7 +173,10 @@ public class Scheduler implements Runnable {
|
||||||
private final long failoverTimeMillis;
|
private final long failoverTimeMillis;
|
||||||
private final long taskBackoffTimeMillis;
|
private final long taskBackoffTimeMillis;
|
||||||
private final boolean isMultiStreamMode;
|
private final boolean isMultiStreamMode;
|
||||||
|
|
||||||
|
@Getter(AccessLevel.PACKAGE)
|
||||||
private final Map<StreamIdentifier, StreamConfig> currentStreamConfigMap = new StreamConfigMap();
|
private final Map<StreamIdentifier, StreamConfig> currentStreamConfigMap = new StreamConfigMap();
|
||||||
|
|
||||||
private final StreamTracker streamTracker;
|
private final StreamTracker streamTracker;
|
||||||
private final FormerStreamsLeasesDeletionStrategy formerStreamsLeasesDeletionStrategy;
|
private final FormerStreamsLeasesDeletionStrategy formerStreamsLeasesDeletionStrategy;
|
||||||
private final long listShardsBackoffTimeMillis;
|
private final long listShardsBackoffTimeMillis;
|
||||||
|
|
@ -167,19 +187,30 @@ public class Scheduler implements Runnable {
|
||||||
private final AggregatorUtil aggregatorUtil;
|
private final AggregatorUtil aggregatorUtil;
|
||||||
private final Function<StreamConfig, HierarchicalShardSyncer> hierarchicalShardSyncerProvider;
|
private final Function<StreamConfig, HierarchicalShardSyncer> hierarchicalShardSyncerProvider;
|
||||||
private final long schedulerInitializationBackoffTimeMillis;
|
private final long schedulerInitializationBackoffTimeMillis;
|
||||||
private final LeaderDecider leaderDecider;
|
private LeaderDecider leaderDecider;
|
||||||
|
|
||||||
|
@Getter(AccessLevel.PACKAGE)
|
||||||
private final Map<StreamIdentifier, Instant> staleStreamDeletionMap = new HashMap<>();
|
private final Map<StreamIdentifier, Instant> staleStreamDeletionMap = new HashMap<>();
|
||||||
|
|
||||||
private final LeaseCleanupManager leaseCleanupManager;
|
private final LeaseCleanupManager leaseCleanupManager;
|
||||||
private final SchemaRegistryDecoder schemaRegistryDecoder;
|
private final SchemaRegistryDecoder schemaRegistryDecoder;
|
||||||
|
|
||||||
|
@Getter(AccessLevel.PACKAGE)
|
||||||
private final DeletedStreamListProvider deletedStreamListProvider;
|
private final DeletedStreamListProvider deletedStreamListProvider;
|
||||||
|
|
||||||
|
private final MigrationStateMachine migrationStateMachine;
|
||||||
|
private final DynamicMigrationComponentsInitializer migrationComponentsInitializer;
|
||||||
|
private final MigrationAdaptiveLeaseAssignmentModeProvider leaseAssignmentModeProvider;
|
||||||
|
|
||||||
// Holds consumers for shards the worker is currently tracking. Key is shard
|
// Holds consumers for shards the worker is currently tracking. Key is shard
|
||||||
// info, value is ShardConsumer.
|
// info, value is ShardConsumer.
|
||||||
|
@Getter(AccessLevel.PACKAGE)
|
||||||
private final ConcurrentMap<ShardInfo, ShardConsumer> shardInfoShardConsumerMap = new ConcurrentHashMap<>();
|
private final ConcurrentMap<ShardInfo, ShardConsumer> shardInfoShardConsumerMap = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
private volatile boolean shutdown;
|
private volatile boolean shutdown;
|
||||||
private volatile long shutdownStartTimeMillis;
|
private volatile long shutdownStartTimeMillis;
|
||||||
|
|
||||||
|
@Getter(AccessLevel.PACKAGE)
|
||||||
private volatile boolean shutdownComplete = false;
|
private volatile boolean shutdownComplete = false;
|
||||||
|
|
||||||
private final Object lock = new Object();
|
private final Object lock = new Object();
|
||||||
|
|
@ -187,8 +218,6 @@ public class Scheduler implements Runnable {
|
||||||
private final Stopwatch streamSyncWatch = Stopwatch.createUnstarted();
|
private final Stopwatch streamSyncWatch = Stopwatch.createUnstarted();
|
||||||
|
|
||||||
private boolean leasesSyncedOnAppInit = false;
|
private boolean leasesSyncedOnAppInit = false;
|
||||||
|
|
||||||
@Getter(AccessLevel.NONE)
|
|
||||||
private final AtomicBoolean leaderSynced = new AtomicBoolean(false);
|
private final AtomicBoolean leaderSynced = new AtomicBoolean(false);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -200,7 +229,6 @@ public class Scheduler implements Runnable {
|
||||||
* CountDownLatch used by the GracefulShutdownCoordinator. Reaching zero means that
|
* CountDownLatch used by the GracefulShutdownCoordinator. Reaching zero means that
|
||||||
* the scheduler's finalShutdown() call has completed.
|
* the scheduler's finalShutdown() call has completed.
|
||||||
*/
|
*/
|
||||||
@Getter(AccessLevel.NONE)
|
|
||||||
private final CountDownLatch finalShutdownLatch = new CountDownLatch(1);
|
private final CountDownLatch finalShutdownLatch = new CountDownLatch(1);
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
|
|
@ -259,11 +287,32 @@ public class Scheduler implements Runnable {
|
||||||
// Determine leaseSerializer based on availability of MultiStreamTracker.
|
// Determine leaseSerializer based on availability of MultiStreamTracker.
|
||||||
final LeaseSerializer leaseSerializer =
|
final LeaseSerializer leaseSerializer =
|
||||||
isMultiStreamMode ? new DynamoDBMultiStreamLeaseSerializer() : new DynamoDBLeaseSerializer();
|
isMultiStreamMode ? new DynamoDBMultiStreamLeaseSerializer() : new DynamoDBLeaseSerializer();
|
||||||
this.leaseCoordinator = this.leaseManagementConfig
|
|
||||||
.leaseManagementFactory(leaseSerializer, isMultiStreamMode)
|
final LeaseManagementFactory leaseManagementFactory =
|
||||||
.createLeaseCoordinator(this.metricsFactory);
|
this.leaseManagementConfig.leaseManagementFactory(leaseSerializer, isMultiStreamMode);
|
||||||
|
this.leaseCoordinator =
|
||||||
|
leaseManagementFactory.createLeaseCoordinator(this.metricsFactory, shardInfoShardConsumerMap);
|
||||||
this.leaseRefresher = this.leaseCoordinator.leaseRefresher();
|
this.leaseRefresher = this.leaseCoordinator.leaseRefresher();
|
||||||
|
|
||||||
|
final CoordinatorStateDAO coordinatorStateDAO = new CoordinatorStateDAO(
|
||||||
|
leaseManagementConfig.dynamoDBClient(), coordinatorConfig().coordinatorStateConfig());
|
||||||
|
this.leaseAssignmentModeProvider = new MigrationAdaptiveLeaseAssignmentModeProvider();
|
||||||
|
this.migrationComponentsInitializer = createDynamicMigrationComponentsInitializer(coordinatorStateDAO);
|
||||||
|
this.migrationStateMachine = new MigrationStateMachineImpl(
|
||||||
|
metricsFactory,
|
||||||
|
System::currentTimeMillis,
|
||||||
|
coordinatorStateDAO,
|
||||||
|
Executors.newScheduledThreadPool(
|
||||||
|
2,
|
||||||
|
new ThreadFactoryBuilder()
|
||||||
|
.setNameFormat("MigrationStateMachine-%04d")
|
||||||
|
.build()),
|
||||||
|
coordinatorConfig.clientVersionConfig(),
|
||||||
|
new Random(),
|
||||||
|
this.migrationComponentsInitializer,
|
||||||
|
leaseManagementConfig.workerIdentifier(),
|
||||||
|
Duration.ofMinutes(10).getSeconds());
|
||||||
|
|
||||||
//
|
//
|
||||||
// TODO: Figure out what to do with lease manage <=> checkpoint relationship
|
// TODO: Figure out what to do with lease manage <=> checkpoint relationship
|
||||||
//
|
//
|
||||||
|
|
@ -280,9 +329,8 @@ public class Scheduler implements Runnable {
|
||||||
this.diagnosticEventFactory = diagnosticEventFactory;
|
this.diagnosticEventFactory = diagnosticEventFactory;
|
||||||
this.diagnosticEventHandler = new DiagnosticEventLogger();
|
this.diagnosticEventHandler = new DiagnosticEventLogger();
|
||||||
this.deletedStreamListProvider = new DeletedStreamListProvider();
|
this.deletedStreamListProvider = new DeletedStreamListProvider();
|
||||||
this.shardSyncTaskManagerProvider = streamConfig -> this.leaseManagementConfig
|
this.shardSyncTaskManagerProvider = streamConfig -> leaseManagementFactory.createShardSyncTaskManager(
|
||||||
.leaseManagementFactory(leaseSerializer, isMultiStreamMode)
|
this.metricsFactory, streamConfig, this.deletedStreamListProvider);
|
||||||
.createShardSyncTaskManager(this.metricsFactory, streamConfig, this.deletedStreamListProvider);
|
|
||||||
this.shardPrioritization = this.coordinatorConfig.shardPrioritization();
|
this.shardPrioritization = this.coordinatorConfig.shardPrioritization();
|
||||||
this.cleanupLeasesUponShardCompletion = this.leaseManagementConfig.cleanupLeasesUponShardCompletion();
|
this.cleanupLeasesUponShardCompletion = this.leaseManagementConfig.cleanupLeasesUponShardCompletion();
|
||||||
this.skipShardSyncAtWorkerInitializationIfLeasesExist =
|
this.skipShardSyncAtWorkerInitializationIfLeasesExist =
|
||||||
|
|
@ -299,8 +347,6 @@ public class Scheduler implements Runnable {
|
||||||
this.workerStateChangeListener =
|
this.workerStateChangeListener =
|
||||||
this.coordinatorConfig.coordinatorFactory().createWorkerStateChangeListener();
|
this.coordinatorConfig.coordinatorFactory().createWorkerStateChangeListener();
|
||||||
}
|
}
|
||||||
this.leaderDecider = new DeterministicShuffleShardSyncLeaderDecider(
|
|
||||||
leaseRefresher, Executors.newSingleThreadScheduledExecutor(), PERIODIC_SHARD_SYNC_MAX_WORKERS_DEFAULT);
|
|
||||||
this.failoverTimeMillis = this.leaseManagementConfig.failoverTimeMillis();
|
this.failoverTimeMillis = this.leaseManagementConfig.failoverTimeMillis();
|
||||||
this.taskBackoffTimeMillis = this.lifecycleConfig.taskBackoffTimeMillis();
|
this.taskBackoffTimeMillis = this.lifecycleConfig.taskBackoffTimeMillis();
|
||||||
this.listShardsBackoffTimeMillis = this.retrievalConfig.listShardsBackoffTimeInMillis();
|
this.listShardsBackoffTimeMillis = this.retrievalConfig.listShardsBackoffTimeInMillis();
|
||||||
|
|
@ -315,7 +361,6 @@ public class Scheduler implements Runnable {
|
||||||
this.coordinatorConfig.schedulerInitializationBackoffTimeMillis();
|
this.coordinatorConfig.schedulerInitializationBackoffTimeMillis();
|
||||||
this.leaderElectedPeriodicShardSyncManager = new PeriodicShardSyncManager(
|
this.leaderElectedPeriodicShardSyncManager = new PeriodicShardSyncManager(
|
||||||
leaseManagementConfig.workerIdentifier(),
|
leaseManagementConfig.workerIdentifier(),
|
||||||
leaderDecider,
|
|
||||||
leaseRefresher,
|
leaseRefresher,
|
||||||
currentStreamConfigMap,
|
currentStreamConfigMap,
|
||||||
shardSyncTaskManagerProvider,
|
shardSyncTaskManagerProvider,
|
||||||
|
|
@ -325,14 +370,69 @@ public class Scheduler implements Runnable {
|
||||||
leaseManagementConfig.leasesRecoveryAuditorExecutionFrequencyMillis(),
|
leaseManagementConfig.leasesRecoveryAuditorExecutionFrequencyMillis(),
|
||||||
leaseManagementConfig.leasesRecoveryAuditorInconsistencyConfidenceThreshold(),
|
leaseManagementConfig.leasesRecoveryAuditorInconsistencyConfidenceThreshold(),
|
||||||
leaderSynced);
|
leaderSynced);
|
||||||
this.leaseCleanupManager = this.leaseManagementConfig
|
this.leaseCleanupManager = leaseManagementFactory.createLeaseCleanupManager(metricsFactory);
|
||||||
.leaseManagementFactory(leaseSerializer, isMultiStreamMode)
|
|
||||||
.createLeaseCleanupManager(metricsFactory);
|
|
||||||
this.schemaRegistryDecoder = this.retrievalConfig.glueSchemaRegistryDeserializer() == null
|
this.schemaRegistryDecoder = this.retrievalConfig.glueSchemaRegistryDeserializer() == null
|
||||||
? null
|
? null
|
||||||
: new SchemaRegistryDecoder(this.retrievalConfig.glueSchemaRegistryDeserializer());
|
: new SchemaRegistryDecoder(this.retrievalConfig.glueSchemaRegistryDeserializer());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Depends on LeaseCoordinator and LeaseRefresher to be created first
|
||||||
|
*/
|
||||||
|
private DynamicMigrationComponentsInitializer createDynamicMigrationComponentsInitializer(
|
||||||
|
final CoordinatorStateDAO coordinatorStateDAO) {
|
||||||
|
selectWorkerMetricsIfAvailable(leaseManagementConfig.workerUtilizationAwareAssignmentConfig());
|
||||||
|
|
||||||
|
final WorkerMetricStatsManager workerMetricsManager = new WorkerMetricStatsManager(
|
||||||
|
leaseManagementConfig.workerUtilizationAwareAssignmentConfig().noOfPersistedMetricsPerWorkerMetrics(),
|
||||||
|
leaseManagementConfig.workerUtilizationAwareAssignmentConfig().workerMetricList(),
|
||||||
|
metricsFactory,
|
||||||
|
leaseManagementConfig
|
||||||
|
.workerUtilizationAwareAssignmentConfig()
|
||||||
|
.inMemoryWorkerMetricsCaptureFrequencyMillis());
|
||||||
|
|
||||||
|
final WorkerMetricStatsDAO workerMetricsDAO = new WorkerMetricStatsDAO(
|
||||||
|
leaseManagementConfig.dynamoDBClient(),
|
||||||
|
leaseManagementConfig.workerUtilizationAwareAssignmentConfig().workerMetricsTableConfig(),
|
||||||
|
leaseManagementConfig.workerUtilizationAwareAssignmentConfig().workerMetricsReporterFreqInMillis());
|
||||||
|
|
||||||
|
return DynamicMigrationComponentsInitializer.builder()
|
||||||
|
.metricsFactory(metricsFactory)
|
||||||
|
.leaseRefresher(leaseRefresher)
|
||||||
|
.coordinatorStateDAO(coordinatorStateDAO)
|
||||||
|
.workerMetricsThreadPool(Executors.newScheduledThreadPool(
|
||||||
|
1,
|
||||||
|
new ThreadFactoryBuilder()
|
||||||
|
.setNameFormat("worker-metrics-reporter")
|
||||||
|
.build()))
|
||||||
|
.workerMetricsDAO(workerMetricsDAO)
|
||||||
|
.workerMetricsManager(workerMetricsManager)
|
||||||
|
.lamThreadPool(Executors.newScheduledThreadPool(
|
||||||
|
1,
|
||||||
|
new ThreadFactoryBuilder().setNameFormat("lam-thread").build()))
|
||||||
|
.lamCreator((lamThreadPool, leaderDecider) -> new LeaseAssignmentManager(
|
||||||
|
leaseRefresher,
|
||||||
|
workerMetricsDAO,
|
||||||
|
leaderDecider,
|
||||||
|
leaseManagementConfig.workerUtilizationAwareAssignmentConfig(),
|
||||||
|
leaseCoordinator.workerIdentifier(),
|
||||||
|
leaseManagementConfig.failoverTimeMillis(),
|
||||||
|
metricsFactory,
|
||||||
|
lamThreadPool,
|
||||||
|
System::nanoTime,
|
||||||
|
leaseManagementConfig.maxLeasesForWorker(),
|
||||||
|
leaseManagementConfig.gracefulLeaseHandoffConfig()))
|
||||||
|
.adaptiveLeaderDeciderCreator(() -> new MigrationAdaptiveLeaderDecider(metricsFactory))
|
||||||
|
.deterministicLeaderDeciderCreator(() -> new DeterministicShuffleShardSyncLeaderDecider(
|
||||||
|
leaseRefresher, Executors.newSingleThreadScheduledExecutor(), 1, metricsFactory))
|
||||||
|
.ddbLockBasedLeaderDeciderCreator(() -> DynamoDBLockBasedLeaderDecider.create(
|
||||||
|
coordinatorStateDAO, leaseCoordinator.workerIdentifier(), metricsFactory))
|
||||||
|
.workerIdentifier(leaseCoordinator.workerIdentifier())
|
||||||
|
.workerUtilizationAwareAssignmentConfig(leaseManagementConfig.workerUtilizationAwareAssignmentConfig())
|
||||||
|
.leaseAssignmentModeProvider(leaseAssignmentModeProvider)
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Start consuming data from the stream, and pass it to the application record processors.
|
* Start consuming data from the stream, and pass it to the application record processors.
|
||||||
*/
|
*/
|
||||||
|
|
@ -342,13 +442,19 @@ public class Scheduler implements Runnable {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final MetricsScope metricsScope =
|
||||||
|
MetricsUtil.createMetricsWithOperation(metricsFactory, "Scheduler:Initialize");
|
||||||
|
boolean success = false;
|
||||||
try {
|
try {
|
||||||
initialize();
|
initialize();
|
||||||
|
success = true;
|
||||||
log.info("Initialization complete. Starting worker loop.");
|
log.info("Initialization complete. Starting worker loop.");
|
||||||
} catch (RuntimeException e) {
|
} catch (RuntimeException e) {
|
||||||
log.error("Unable to initialize after {} attempts. Shutting down.", maxInitializationAttempts, e);
|
log.error("Unable to initialize after {} attempts. Shutting down.", maxInitializationAttempts, e);
|
||||||
workerStateChangeListener.onAllInitializationAttemptsFailed(e);
|
workerStateChangeListener.onAllInitializationAttemptsFailed(e);
|
||||||
shutdown();
|
shutdown();
|
||||||
|
} finally {
|
||||||
|
MetricsUtil.addSuccess(metricsScope, "Initialize", success, MetricsLevel.SUMMARY);
|
||||||
}
|
}
|
||||||
while (!shouldShutdown()) {
|
while (!shouldShutdown()) {
|
||||||
runProcessLoop();
|
runProcessLoop();
|
||||||
|
|
@ -363,14 +469,13 @@ public class Scheduler implements Runnable {
|
||||||
synchronized (lock) {
|
synchronized (lock) {
|
||||||
registerErrorHandlerForUndeliverableAsyncTaskExceptions();
|
registerErrorHandlerForUndeliverableAsyncTaskExceptions();
|
||||||
workerStateChangeListener.onWorkerStateChange(WorkerStateChangeListener.WorkerState.INITIALIZING);
|
workerStateChangeListener.onWorkerStateChange(WorkerStateChangeListener.WorkerState.INITIALIZING);
|
||||||
|
|
||||||
boolean isDone = false;
|
boolean isDone = false;
|
||||||
Exception lastException = null;
|
Exception lastException = null;
|
||||||
|
|
||||||
for (int i = 0; (!isDone) && (i < maxInitializationAttempts); i++) {
|
for (int i = 0; (!isDone) && (i < maxInitializationAttempts); i++) {
|
||||||
try {
|
try {
|
||||||
log.info("Initializing LeaseCoordinator attempt {}", (i + 1));
|
log.info("Initializing LeaseCoordinator attempt {}", (i + 1));
|
||||||
leaseCoordinator.initialize();
|
leaseCoordinator.initialize();
|
||||||
|
|
||||||
if (!skipShardSyncAtWorkerInitializationIfLeasesExist || leaseRefresher.isLeaseTableEmpty()) {
|
if (!skipShardSyncAtWorkerInitializationIfLeasesExist || leaseRefresher.isLeaseTableEmpty()) {
|
||||||
if (shouldInitiateLeaseSync()) {
|
if (shouldInitiateLeaseSync()) {
|
||||||
log.info(
|
log.info(
|
||||||
|
|
@ -382,21 +487,29 @@ public class Scheduler implements Runnable {
|
||||||
log.info("Skipping shard sync per configuration setting (and lease table is not empty)");
|
log.info("Skipping shard sync per configuration setting (and lease table is not empty)");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Initialize the state machine after lease table has been initialized
|
||||||
|
// Migration state machine creates and waits for GSI if necessary,
|
||||||
|
// it must be initialized before starting leaseCoordinator, which runs LeaseDiscoverer
|
||||||
|
// and that requires GSI to be present and active. (migrationStateMachine.initialize is idempotent)
|
||||||
|
migrationStateMachine.initialize();
|
||||||
|
leaderDecider = migrationComponentsInitializer.leaderDecider();
|
||||||
|
|
||||||
leaseCleanupManager.start();
|
leaseCleanupManager.start();
|
||||||
|
|
||||||
// If we reach this point, then we either skipped the lease sync or did not have any exception
|
// If we reach this point, then we either skipped the lease sync or did not have any exception
|
||||||
// for any of the shard sync in the previous attempt.
|
// for any of the shard sync in the previous attempt.
|
||||||
|
|
||||||
if (!leaseCoordinator.isRunning()) {
|
if (!leaseCoordinator.isRunning()) {
|
||||||
log.info("Starting LeaseCoordinator");
|
log.info("Starting LeaseCoordinator");
|
||||||
leaseCoordinator.start();
|
leaseCoordinator.start(leaseAssignmentModeProvider);
|
||||||
} else {
|
} else {
|
||||||
log.info("LeaseCoordinator is already running. No need to start it.");
|
log.info("LeaseCoordinator is already running. No need to start it.");
|
||||||
}
|
}
|
||||||
log.info("Scheduling periodicShardSync");
|
log.info("Scheduling periodicShardSync");
|
||||||
leaderElectedPeriodicShardSyncManager.start();
|
leaderElectedPeriodicShardSyncManager.start(leaderDecider);
|
||||||
streamSyncWatch.start();
|
streamSyncWatch.start();
|
||||||
isDone = true;
|
isDone = true;
|
||||||
} catch (Exception e) {
|
} catch (final Exception e) {
|
||||||
log.error("Caught exception when initializing LeaseCoordinator", e);
|
log.error("Caught exception when initializing LeaseCoordinator", e);
|
||||||
lastException = e;
|
lastException = e;
|
||||||
}
|
}
|
||||||
|
|
@ -863,7 +976,7 @@ public class Scheduler implements Runnable {
|
||||||
leaseCoordinator, lease, notificationCompleteLatch, shutdownCompleteLatch);
|
leaseCoordinator, lease, notificationCompleteLatch, shutdownCompleteLatch);
|
||||||
ShardInfo shardInfo = DynamoDBLeaseCoordinator.convertLeaseToAssignment(lease);
|
ShardInfo shardInfo = DynamoDBLeaseCoordinator.convertLeaseToAssignment(lease);
|
||||||
ShardConsumer consumer = shardInfoShardConsumerMap.get(shardInfo);
|
ShardConsumer consumer = shardInfoShardConsumerMap.get(shardInfo);
|
||||||
if (consumer != null) {
|
if (consumer != null && !consumer.isShutdown()) {
|
||||||
consumer.gracefulShutdown(shutdownNotification);
|
consumer.gracefulShutdown(shutdownNotification);
|
||||||
} else {
|
} else {
|
||||||
//
|
//
|
||||||
|
|
@ -912,6 +1025,8 @@ public class Scheduler implements Runnable {
|
||||||
shutdown = true;
|
shutdown = true;
|
||||||
shutdownStartTimeMillis = System.currentTimeMillis();
|
shutdownStartTimeMillis = System.currentTimeMillis();
|
||||||
|
|
||||||
|
migrationStateMachine.shutdown();
|
||||||
|
migrationComponentsInitializer.shutdown();
|
||||||
// Stop lease coordinator, so leases are not renewed or stolen from other workers.
|
// Stop lease coordinator, so leases are not renewed or stolen from other workers.
|
||||||
// Lost leases will force Worker to begin shutdown process for all shard consumers in
|
// Lost leases will force Worker to begin shutdown process for all shard consumers in
|
||||||
// Worker.run().
|
// Worker.run().
|
||||||
|
|
@ -1228,4 +1343,23 @@ public class Scheduler implements Runnable {
|
||||||
public Future<Void> requestShutdown() {
|
public Future<Void> requestShutdown() {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If WorkerMetricStats list is empty and the disable flag is false, select WorkerMetricStats automatically.
|
||||||
|
*/
|
||||||
|
private void selectWorkerMetricsIfAvailable(
|
||||||
|
final WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig) {
|
||||||
|
try {
|
||||||
|
if (workerUtilizationAwareAssignmentConfig.workerMetricList().isEmpty()
|
||||||
|
&& !workerUtilizationAwareAssignmentConfig.disableWorkerMetrics()) {
|
||||||
|
workerUtilizationAwareAssignmentConfig.workerMetricList(
|
||||||
|
WorkerMetricsSelector.create().getDefaultWorkerMetrics());
|
||||||
|
}
|
||||||
|
} catch (final Exception e) {
|
||||||
|
log.warn(
|
||||||
|
"Exception encountered during WorkerMetricStats selection. If this is persistent please try setting the "
|
||||||
|
+ "WorkerMetricStats explicitly.",
|
||||||
|
e);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,21 @@
|
||||||
|
package software.amazon.kinesis.coordinator.assignment;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import software.amazon.kinesis.leases.Lease;
|
||||||
|
|
||||||
|
public interface LeaseAssignmentDecider {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Assigns expiredOrUnAssignedLeases to the available workers.
|
||||||
|
*/
|
||||||
|
void assignExpiredOrUnassignedLeases(final List<Lease> expiredOrUnAssignedLeases);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Balances the leases between workers in the fleet.
|
||||||
|
* Implementation can choose to balance leases based on lease count or throughput or to bring the variance in
|
||||||
|
* resource utilization to a minimum.
|
||||||
|
* Check documentation on implementation class to see how it balances the leases.
|
||||||
|
*/
|
||||||
|
void balanceWorkerVariance();
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,719 @@
|
||||||
|
package software.amazon.kinesis.coordinator.assignment;
|
||||||
|
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.time.Instant;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.Callable;
|
||||||
|
import java.util.concurrent.CompletableFuture;
|
||||||
|
import java.util.concurrent.CompletionException;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.Future;
|
||||||
|
import java.util.concurrent.ScheduledExecutorService;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
import java.util.function.Function;
|
||||||
|
import java.util.function.Supplier;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.google.common.collect.ImmutableList;
|
||||||
|
import com.google.common.collect.ImmutableMap;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
|
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.coordinator.LeaderDecider;
|
||||||
|
import software.amazon.kinesis.leases.Lease;
|
||||||
|
import software.amazon.kinesis.leases.LeaseManagementConfig;
|
||||||
|
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsScope;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||||
|
import software.amazon.kinesis.metrics.NullMetricsScope;
|
||||||
|
import software.amazon.kinesis.worker.metricstats.WorkerMetricStats;
|
||||||
|
import software.amazon.kinesis.worker.metricstats.WorkerMetricStatsDAO;
|
||||||
|
|
||||||
|
import static java.util.Objects.isNull;
|
||||||
|
import static java.util.Objects.nonNull;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs the LeaseAssignment for the application. This starts by loading the leases and workerMetrics from the
|
||||||
|
* storage and then starts by assignment (in-memory) of expired and/or unassigned leases after which it tries to perform
|
||||||
|
* balancing of load among the workers by re-assign leases.
|
||||||
|
* In the end, performs actual assignment by writing to storage.
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
public final class LeaseAssignmentManager {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default number of continuous failure execution after which leadership is released.
|
||||||
|
*/
|
||||||
|
private static final int DEFAULT_FAILURE_COUNT_TO_SWITCH_LEADER = 3;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default multiplier for LAM frequency with respect to leaseDurationMillis (lease failover millis).
|
||||||
|
* If leaseDurationMillis is 10000 millis, default LAM frequency is 20000 millis.
|
||||||
|
*/
|
||||||
|
private static final int DEFAULT_LEASE_ASSIGNMENT_MANAGER_FREQ_MULTIPLIER = 2;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default parallelism factor for scaling lease table.
|
||||||
|
*/
|
||||||
|
private static final int DEFAULT_LEASE_TABLE_SCAN_PARALLELISM_FACTOR = 10;
|
||||||
|
|
||||||
|
private static final String FORCE_LEADER_RELEASE_METRIC_NAME = "ForceLeaderRelease";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default retry attempt for loading leases and workers before giving up.
|
||||||
|
*/
|
||||||
|
private static final int DDB_LOAD_RETRY_ATTEMPT = 1;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Internal threadpool used to parallely perform assignment operation by calling storage.
|
||||||
|
*/
|
||||||
|
private static final ExecutorService LEASE_ASSIGNMENT_CALL_THREAD_POOL =
|
||||||
|
Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
|
||||||
|
|
||||||
|
private static final String METRICS_LEASE_ASSIGNMENT_MANAGER = "LeaseAssignmentManager";
|
||||||
|
private static final String METRICS_INCOMPLETE_EXPIRED_LEASES_ASSIGNMENT =
|
||||||
|
"LeaseAssignmentManager.IncompleteExpiredLeasesAssignment";
|
||||||
|
public static final int DEFAULT_NO_OF_SKIP_STAT_FOR_DEAD_WORKER_THRESHOLD = 2;
|
||||||
|
|
||||||
|
private final LeaseRefresher leaseRefresher;
|
||||||
|
private final WorkerMetricStatsDAO workerMetricsDAO;
|
||||||
|
private final LeaderDecider leaderDecider;
|
||||||
|
private final LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig config;
|
||||||
|
private final String currentWorkerId;
|
||||||
|
private final Long leaseDurationMillis;
|
||||||
|
private final MetricsFactory metricsFactory;
|
||||||
|
private final ScheduledExecutorService executorService;
|
||||||
|
private final Supplier<Long> nanoTimeProvider;
|
||||||
|
private final int maxLeasesForWorker;
|
||||||
|
private final LeaseManagementConfig.GracefulLeaseHandoffConfig gracefulLeaseHandoffConfig;
|
||||||
|
private boolean tookOverLeadershipInThisRun = false;
|
||||||
|
private final Map<String, Lease> prevRunLeasesState = new HashMap<>();
|
||||||
|
|
||||||
|
private Future<?> managerFuture;
|
||||||
|
|
||||||
|
private int noOfContinuousFailedAttempts = 0;
|
||||||
|
private int lamRunCounter = 0;
|
||||||
|
|
||||||
|
public synchronized void start() {
|
||||||
|
if (isNull(managerFuture)) {
|
||||||
|
// LAM can be dynamically started/stopped and restarted during MigrationStateMachine execution
|
||||||
|
// so reset the flag to refresh the state before processing during a restart of LAM.
|
||||||
|
tookOverLeadershipInThisRun = false;
|
||||||
|
managerFuture = executorService.scheduleWithFixedDelay(
|
||||||
|
this::performAssignment,
|
||||||
|
0L,
|
||||||
|
leaseDurationMillis * DEFAULT_LEASE_ASSIGNMENT_MANAGER_FREQ_MULTIPLIER,
|
||||||
|
TimeUnit.MILLISECONDS);
|
||||||
|
log.info("Started LeaseAssignmentManager");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
log.info("LeaseAssignmentManager already running...");
|
||||||
|
}
|
||||||
|
|
||||||
|
public synchronized void stop() {
|
||||||
|
if (nonNull(managerFuture)) {
|
||||||
|
log.info("Completed shutdown of LeaseAssignmentManager");
|
||||||
|
managerFuture.cancel(true);
|
||||||
|
managerFuture = null;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
log.info("LeaseAssignmentManager is not running...");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates the MetricsScope for given {@param operation} by calling metricsFactory and falls back to
|
||||||
|
* NullMetricsScope if failed to create MetricsScope.
|
||||||
|
* @param operation Operation name for MetricsScope
|
||||||
|
* @return instance of MetricsScope
|
||||||
|
*/
|
||||||
|
private MetricsScope createMetricsScope(final String operation) {
|
||||||
|
try {
|
||||||
|
return MetricsUtil.createMetricsWithOperation(metricsFactory, operation);
|
||||||
|
} catch (final Exception e) {
|
||||||
|
log.error("Failed to create metrics scope defaulting to no metrics.", e);
|
||||||
|
return new NullMetricsScope();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * One scheduled run of the lease assignment algorithm. Executed only on the elected leader:
 * loads leases and worker metrics, assigns expired/unassigned leases, periodically runs
 * variance-based rebalancing, writes assignments in parallel, and cleans up stale
 * WorkerMetricStats entries. Repeated failures force release of leadership.
 */
private void performAssignment() {

    final MetricsScope metricsScope = createMetricsScope(METRICS_LEASE_ASSIGNMENT_MANAGER);
    final long startTime = System.currentTimeMillis();
    boolean success = false;

    try {

        // If the current worker is not leader, then do nothing as assignment is executed on leader.
        if (!leaderDecider.isLeader(currentWorkerId)) {
            log.info("Current worker {} is not a leader, ignore", currentWorkerId);
            this.tookOverLeadershipInThisRun = false;
            success = true;
            return;
        }

        if (!this.tookOverLeadershipInThisRun) {
            // This means that there was leader change, perform cleanup of state as this is leader switch.
            this.tookOverLeadershipInThisRun = true;
            this.lamRunCounter = 0;
            prepareAfterLeaderSwitch();
        }
        log.info("Current worker {} is a leader, performing assignment", currentWorkerId);

        // Fresh snapshot of lease table + worker metrics for this run.
        final InMemoryStorageView inMemoryStorageView = new InMemoryStorageView();

        final long loadStartTime = System.currentTimeMillis();
        inMemoryStorageView.loadInMemoryStorageView(metricsScope);
        MetricsUtil.addLatency(metricsScope, "LeaseAndWorkerMetricsLoad", loadStartTime, MetricsLevel.DETAILED);

        publishLeaseAndWorkerCountMetrics(metricsScope, inMemoryStorageView);
        final LeaseAssignmentDecider leaseAssignmentDecider = new VarianceBasedLeaseAssignmentDecider(
                inMemoryStorageView,
                config.dampeningPercentage(),
                config.reBalanceThresholdPercentage(),
                config.allowThroughputOvershoot());

        // Carry over counter/handoff bookkeeping from the previous run before deciding expiry.
        updateLeasesLastCounterIncrementNanosAndLeaseShutdownTimeout(
                inMemoryStorageView.getLeaseList(), inMemoryStorageView.getLeaseTableScanTime());

        // This does not include the leases from the worker that has expired (based on WorkerMetricStats's
        // lastUpdateTime)
        // but the lease is not expired (based on the leaseCounter on lease).
        // If a worker has died, the lease will be expired and assigned in next iteration.
        final List<Lease> expiredOrUnAssignedLeases = inMemoryStorageView.getLeaseList().stream()
                .filter(lease -> lease.isExpired(
                        TimeUnit.MILLISECONDS.toNanos(leaseDurationMillis),
                        inMemoryStorageView.getLeaseTableScanTime()))
                // marking them for direct reassignment.
                .map(l -> l.isExpiredOrUnassigned(true))
                .collect(Collectors.toList());

        log.info("Total expiredOrUnassignedLeases count : {}", expiredOrUnAssignedLeases.size());
        metricsScope.addData(
                "ExpiredLeases", expiredOrUnAssignedLeases.size(), StandardUnit.COUNT, MetricsLevel.SUMMARY);

        final long expiredAndUnassignedLeaseAssignmentStartTime = System.currentTimeMillis();
        // NOTE: the decider is expected to remove successfully assigned leases from this list,
        // since leftovers below are treated as spillover — TODO confirm against decider contract.
        leaseAssignmentDecider.assignExpiredOrUnassignedLeases(expiredOrUnAssignedLeases);
        MetricsUtil.addLatency(
                metricsScope,
                "AssignExpiredOrUnassignedLeases",
                expiredAndUnassignedLeaseAssignmentStartTime,
                MetricsLevel.DETAILED);

        if (!expiredOrUnAssignedLeases.isEmpty()) {
            // When expiredOrUnAssignedLeases is not empty, that means
            // that we were not able to assign all expired or unassigned leases and hit the maxThroughput
            // per worker for all workers.
            log.warn("Not able to assign all expiredOrUnAssignedLeases");
            metricsScope.addData(
                    "LeaseSpillover", expiredOrUnAssignedLeases.size(), StandardUnit.COUNT, MetricsLevel.SUMMARY);
        }

        // Variance balancing runs only every varianceBalancingFrequency-th run.
        if (shouldRunVarianceBalancing()) {
            final long balanceWorkerVarianceStartTime = System.currentTimeMillis();
            final int totalNewAssignmentBeforeWorkerVarianceBalancing =
                    inMemoryStorageView.leaseToNewAssignedWorkerMap.size();
            leaseAssignmentDecider.balanceWorkerVariance();
            MetricsUtil.addLatency(
                    metricsScope, "BalanceWorkerVariance", balanceWorkerVarianceStartTime, MetricsLevel.DETAILED);
            metricsScope.addData(
                    "NumOfLeasesReassignment",
                    inMemoryStorageView.leaseToNewAssignedWorkerMap.size()
                            - totalNewAssignmentBeforeWorkerVarianceBalancing,
                    StandardUnit.COUNT,
                    MetricsLevel.SUMMARY);
        }

        if (inMemoryStorageView.leaseToNewAssignedWorkerMap.isEmpty()) {
            log.info("No new lease assignment performed in this iteration");
        }

        // Persist the new assignments, log per-worker distribution, and prune dead metrics rows.
        parallelyAssignLeases(inMemoryStorageView, metricsScope);
        printPerWorkerLeases(inMemoryStorageView);
        deleteStaleWorkerMetricsEntries(inMemoryStorageView, metricsScope);
        success = true;
        noOfContinuousFailedAttempts = 0;
    } catch (final Exception e) {
        log.error("LeaseAssignmentManager failed to perform lease assignment.", e);
        noOfContinuousFailedAttempts++;
        if (noOfContinuousFailedAttempts >= DEFAULT_FAILURE_COUNT_TO_SWITCH_LEADER) {
            log.error(
                    "Failed to perform assignment {} times in a row, releasing leadership from worker : {}",
                    DEFAULT_FAILURE_COUNT_TO_SWITCH_LEADER,
                    currentWorkerId);
            MetricsUtil.addCount(metricsScope, FORCE_LEADER_RELEASE_METRIC_NAME, 1, MetricsLevel.SUMMARY);
            leaderDecider.releaseLeadershipIfHeld();
        }
    } finally {
        MetricsUtil.addSuccessAndLatency(metricsScope, success, startTime, MetricsLevel.SUMMARY);
        MetricsUtil.endScope(metricsScope);
    }
}
|
||||||
|
|
||||||
|
private boolean shouldRunVarianceBalancing() {
|
||||||
|
final boolean response = this.lamRunCounter == 0;
|
||||||
|
/*
|
||||||
|
To avoid lamRunCounter grow large, keep it within [0,varianceBalancingFrequency).
|
||||||
|
If varianceBalancingFrequency is 5 lamRunCounter value will be within 0 to 4 and method return true when
|
||||||
|
lamRunCounter is 0.
|
||||||
|
*/
|
||||||
|
this.lamRunCounter = (this.lamRunCounter + 1) % config.varianceBalancingFrequency();
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Deletes the WorkerMetricStats entries which are stale(not updated since long time, ref
|
||||||
|
* {@link LeaseAssignmentManager#isWorkerMetricsEntryStale} for the condition to evaluate staleness)
|
||||||
|
*/
|
||||||
|
private void deleteStaleWorkerMetricsEntries(
|
||||||
|
final InMemoryStorageView inMemoryStorageView, final MetricsScope metricsScope) {
|
||||||
|
final long startTime = System.currentTimeMillis();
|
||||||
|
try {
|
||||||
|
final List<WorkerMetricStats> staleWorkerMetricsList = inMemoryStorageView.getWorkerMetricsList().stream()
|
||||||
|
.filter(this::isWorkerMetricsEntryStale)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
MetricsUtil.addCount(
|
||||||
|
metricsScope, "TotalStaleWorkerMetricsEntry", staleWorkerMetricsList.size(), MetricsLevel.DETAILED);
|
||||||
|
log.info("Number of stale workerMetrics entries : {}", staleWorkerMetricsList.size());
|
||||||
|
log.info("Stale workerMetrics list : {}", staleWorkerMetricsList);
|
||||||
|
|
||||||
|
final List<CompletableFuture<Boolean>> completableFutures = staleWorkerMetricsList.stream()
|
||||||
|
.map(workerMetrics -> CompletableFuture.supplyAsync(
|
||||||
|
() -> workerMetricsDAO.deleteMetrics(workerMetrics), LEASE_ASSIGNMENT_CALL_THREAD_POOL))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
CompletableFuture.allOf(completableFutures.toArray(new CompletableFuture[0]))
|
||||||
|
.join();
|
||||||
|
} finally {
|
||||||
|
MetricsUtil.addLatency(metricsScope, "StaleWorkerMetricsCleanup", startTime, MetricsLevel.DETAILED);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * A WorkerMetricStats entry is considered stale when its lastUpdateTime (epoch seconds)
 * is older than {@code config.staleWorkerMetricsEntryCleanupDuration()} relative to the
 * current wall clock.
 */
private boolean isWorkerMetricsEntryStale(final WorkerMetricStats workerMetrics) {
    return Duration.between(Instant.ofEpochSecond(workerMetrics.getLastUpdateTime()), Instant.now())
                    .toMillis()
            > config.staleWorkerMetricsEntryCleanupDuration().toMillis();
}
|
||||||
|
|
||||||
|
private void printPerWorkerLeases(final InMemoryStorageView storageView) {
|
||||||
|
storageView.getActiveWorkerIdSet().forEach(activeWorkerId -> {
|
||||||
|
log.info(
|
||||||
|
"Worker : {} and total leases : {} and totalThroughput : {}",
|
||||||
|
activeWorkerId,
|
||||||
|
Optional.ofNullable(storageView.getWorkerToLeasesMap().get(activeWorkerId))
|
||||||
|
.orElse(Collections.EMPTY_SET)
|
||||||
|
.size(),
|
||||||
|
storageView.getWorkerToTotalAssignedThroughputMap().get(activeWorkerId));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Writes the run's new lease assignments to storage in parallel on the shared assignment
 * thread pool. Leases whose current owner is still heartbeating through a pending graceful
 * shutdown checkpoint are skipped this run. Failed writes are counted (not retried here)
 * and emitted as the FailedAssignmentCount metric; success/latency is always recorded.
 */
private void parallelyAssignLeases(final InMemoryStorageView inMemoryStorageView, final MetricsScope metricsScope) {
    final AtomicInteger failedAssignmentCounter = new AtomicInteger(0);
    final long startTime = System.currentTimeMillis();
    boolean success = false;
    try {
        CompletableFuture.allOf(inMemoryStorageView.getLeaseToNewAssignedWorkerMap().entrySet().stream()
                        // ignore leases that are heartbeating and pending graceful shutdown checkpoint.
                        .filter(entry -> !entry.getKey().blockedOnPendingCheckpoint(getNanoTimeMillis()))
                        .map(entry -> CompletableFuture.supplyAsync(
                                () -> {
                                    try {
                                        final Lease lease = entry.getKey();
                                        // Graceful handoff only when enabled AND this lease qualifies;
                                        // otherwise fall back to a direct ownership transfer.
                                        if (gracefulLeaseHandoffConfig.isGracefulLeaseHandoffEnabled()
                                                && lease.isEligibleForGracefulShutdown()) {
                                            return handleGracefulLeaseHandoff(
                                                    lease, entry.getValue(), failedAssignmentCounter);
                                        } else {
                                            return handleRegularLeaseAssignment(
                                                    lease, entry.getValue(), failedAssignmentCounter);
                                        }
                                    } catch (Exception e) {
                                        // Wrap checked exceptions so they propagate out of join().
                                        throw new CompletionException(e);
                                    }
                                },
                                LEASE_ASSIGNMENT_CALL_THREAD_POOL))
                        .toArray(CompletableFuture[]::new))
                .join();
        success = true;
    } finally {
        MetricsUtil.addCount(
                metricsScope, "FailedAssignmentCount", failedAssignmentCounter.get(), MetricsLevel.DETAILED);
        MetricsUtil.addSuccessAndLatency(
                metricsScope, "ParallelyAssignLeases", success, startTime, MetricsLevel.DETAILED);
    }
}
|
||||||
|
|
||||||
|
private boolean handleGracefulLeaseHandoff(Lease lease, String newOwner, AtomicInteger failedAssignmentCounter)
|
||||||
|
throws ProvisionedThroughputException, InvalidStateException, DependencyException {
|
||||||
|
final boolean response = leaseRefresher.initiateGracefulLeaseHandoff(lease, newOwner);
|
||||||
|
if (response) {
|
||||||
|
// new handoff assignment. add the timeout.
|
||||||
|
lease.checkpointOwnerTimeoutTimestampMillis(getCheckpointOwnerTimeoutTimestampMillis());
|
||||||
|
} else {
|
||||||
|
failedAssignmentCounter.incrementAndGet();
|
||||||
|
}
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean handleRegularLeaseAssignment(Lease lease, String newOwner, AtomicInteger failedAssignmentCounter)
|
||||||
|
throws ProvisionedThroughputException, InvalidStateException, DependencyException {
|
||||||
|
final boolean response = leaseRefresher.assignLease(lease, newOwner);
|
||||||
|
if (response) {
|
||||||
|
// Successful assignment updates the leaseCounter, update the nanoTime for counter update.
|
||||||
|
lease.lastCounterIncrementNanos(nanoTimeProvider.get());
|
||||||
|
} else {
|
||||||
|
failedAssignmentCounter.incrementAndGet();
|
||||||
|
}
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void publishLeaseAndWorkerCountMetrics(
|
||||||
|
final MetricsScope metricsScope, final InMemoryStorageView inMemoryStorageView) {
|
||||||
|
// Names of the metrics are kept in sync with what is published in LeaseTaker.
|
||||||
|
metricsScope.addData(
|
||||||
|
"TotalLeases", inMemoryStorageView.leaseList.size(), StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||||
|
metricsScope.addData(
|
||||||
|
"NumWorkers", inMemoryStorageView.activeWorkerMetrics.size(), StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Method updates all new leases with currentTime if the counter is updated since last run else keeps whatever
// was prev and update the prevRunLeasesState
/**
 * Reconciles each lease in {@code leaseList} against the snapshot from the previous run:
 * carries over (or newly stamps) the graceful-shutdown timeout for pending handoffs, and
 * sets lastCounterIncrementNanos so expiry detection works across runs. Finally replaces
 * prevRunLeasesState with the current list keyed by lease key.
 *
 * @param leaseList leases loaded in this run
 * @param scanTime  nanoTimeProvider timestamp of this run's lease table scan
 */
private void updateLeasesLastCounterIncrementNanosAndLeaseShutdownTimeout(
        final List<Lease> leaseList, final Long scanTime) {
    for (final Lease lease : leaseList) {
        final Lease prevLease = prevRunLeasesState.get(lease.leaseKey());

        // make sure lease shutdown timeouts are tracked.
        if (lease.shutdownRequested()) {
            // previous and current leases might have same next and checkpoint owners but there is no
            // guarantee that the latest shutdown is the same shutdown in the previous lease for example
            // some other leaders change the lease states while this worker waiting for it's LAM run.
            // This is the best effort to prevent marking the incorrect timeout.
            if (isNull(prevLease) || !prevLease.shutdownRequested() || !isSameOwners(lease, prevLease)) {
                // Add new value if previous is null, previous lease is not shutdown pending or the owners
                // don't match
                lease.checkpointOwnerTimeoutTimestampMillis(getCheckpointOwnerTimeoutTimestampMillis());
            } else {
                // Same in-flight shutdown as last run: keep the original deadline.
                lease.checkpointOwnerTimeoutTimestampMillis(prevLease.checkpointOwnerTimeoutTimestampMillis());
            }
        }

        if (isNull(prevLease)) {
            lease.lastCounterIncrementNanos(
                    isNull(lease.actualOwner())
                            // This is an unassigned lease, mark as 0L that puts this in first in assignment order
                            ? 0L
                            : scanTime);
        } else {
            // Counter moved since last run -> owner is alive as of this scan; otherwise keep the
            // older timestamp so the lease can eventually be detected as expired.
            lease.lastCounterIncrementNanos(
                    lease.leaseCounter() > prevLease.leaseCounter()
                            ? scanTime
                            : prevLease.lastCounterIncrementNanos());
        }
    }
    prevRunLeasesState.clear();
    prevRunLeasesState.putAll(leaseList.stream().collect(Collectors.toMap(Lease::leaseKey, Function.identity())));
}
|
||||||
|
|
||||||
|
private void prepareAfterLeaderSwitch() {
|
||||||
|
prevRunLeasesState.clear();
|
||||||
|
noOfContinuousFailedAttempts = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * In memory view of the leases and workerMetrics.
 * This class supports queries (e.g., leases assigned to worker or total throughout assigned to worker).
 */
@Getter
class InMemoryStorageView {

    // This is in-memory view of the workerToLeaseMapping, this is updated in-memory before actual
    // changes to storage.
    private final Map<String, Set<Lease>> workerToLeasesMap = new HashMap<>();
    /**
     * This is computed initially after the loading leases and then updated when the
     * {@link InMemoryStorageView#performLeaseAssignment} is called.
     */
    private final Map<String, Double> workerToTotalAssignedThroughputMap = new HashMap<>();
    /**
     * Captures the new assignment done during the lifecycle of single run.
     */
    private final Map<Lease, String> leaseToNewAssignedWorkerMap = new HashMap<>();

    /**
     * List of all leases in the application.
     */
    private List<Lease> leaseList;
    /**
     * List of workers which are active (i.e., updated metric stats before the threshold ref)
     * {@link this#computeWorkerExpiryThresholdInSecond})
     */
    private List<WorkerMetricStats> activeWorkerMetrics;
    /**
     * List of all workerMetrics entries from storage.
     */
    private List<WorkerMetricStats> workerMetricsList;
    /**
     * List of active workers ids.
     */
    private Set<String> activeWorkerIdSet;
    /**
     * Timestamp (from nanoTimeProvider, i.e. monotonic nanoseconds, not wall clock) when the
     * lease table scan was completed.
     */
    private long leaseTableScanTime = 0L;
    /**
     * Target average throughput per active worker (fleet throughput / active worker count).
     */
    private double targetAverageThroughput;

    /**
     * Update {@ref inMemoryWorkerToLeasesMapping} with the change in ownership and update newLeaseAssignmentMap
     *
     * @param lease lease changing assignment
     * @param newOwner new owner of the lease
     */
    public void performLeaseAssignment(final Lease lease, final String newOwner) {
        final String existingOwner = lease.actualOwner();
        workerToLeasesMap.get(existingOwner).remove(lease);
        workerToLeasesMap
                .computeIfAbsent(newOwner, owner -> new HashSet<>())
                .add(lease);
        updateWorkerThroughput(newOwner, lease.throughputKBps());
        // Remove the same lease throughput from oldOwner
        updateWorkerThroughput(existingOwner, -lease.throughputKBps());
        leaseToNewAssignedWorkerMap.put(lease, newOwner);
    }

    /**
     * Scans the LeaseTable and WorkerMetricStats in parallel and load the data and populate datastructures used
     * in lease assignment.
     */
    public void loadInMemoryStorageView(final MetricsScope metricsScope) throws Exception {
        // Kick off both scans before joining either so they run concurrently.
        final CompletableFuture<Map.Entry<List<Lease>, List<String>>> leaseListFuture = loadLeaseListAsync();

        final CompletableFuture<List<WorkerMetricStats>> workerMetricsFuture = loadWorkerMetricStats();

        final List<WorkerMetricStats> workerMetricsFromStorage = workerMetricsFuture.join();

        final List<String> listOfWorkerIdOfInvalidWorkerMetricsEntry = workerMetricsFromStorage.stream()
                .filter(workerMetrics -> !workerMetrics.isValidWorkerMetric())
                .map(WorkerMetricStats::getWorkerId)
                .collect(Collectors.toList());
        log.warn("List of workerIds with invalid entries : {}", listOfWorkerIdOfInvalidWorkerMetricsEntry);
        if (!listOfWorkerIdOfInvalidWorkerMetricsEntry.isEmpty()) {
            metricsScope.addData(
                    "NumWorkersWithInvalidEntry",
                    listOfWorkerIdOfInvalidWorkerMetricsEntry.size(),
                    StandardUnit.COUNT,
                    MetricsLevel.SUMMARY);
        }

        // Valid entries are considered further, for validity of entry refer WorkerMetricStats#isValidWorkerMetrics
        this.workerMetricsList = workerMetricsFromStorage.stream()
                .filter(WorkerMetricStats::isValidWorkerMetric)
                .collect(Collectors.toList());

        log.info("Total WorkerMetricStats available : {}", workerMetricsList.size());
        final long workerExpiryThreshold = computeWorkerExpiryThresholdInSecond();

        final long countOfWorkersWithFailingWorkerMetric = workerMetricsList.stream()
                .filter(WorkerMetricStats::isAnyWorkerMetricFailing)
                .count();
        if (countOfWorkersWithFailingWorkerMetric != 0) {
            metricsScope.addData(
                    "NumWorkersWithFailingWorkerMetric",
                    countOfWorkersWithFailingWorkerMetric,
                    StandardUnit.COUNT,
                    MetricsLevel.SUMMARY);
        }

        final Map.Entry<List<Lease>, List<String>> leaseListResponse = leaseListFuture.join();
        this.leaseList = leaseListResponse.getKey();
        log.warn("Leases that failed deserialization : {}", leaseListResponse.getValue());
        if (!leaseListResponse.getValue().isEmpty()) {
            MetricsUtil.addCount(
                    metricsScope,
                    "LeaseDeserializationFailureCount",
                    leaseListResponse.getValue().size(),
                    MetricsLevel.SUMMARY);
        }
        this.leaseTableScanTime = nanoTimeProvider.get();
        log.info("Total Leases available : {}", leaseList.size());

        final double averageLeaseThroughput = leaseList.stream()
                .filter(lease -> nonNull(lease.throughputKBps()))
                .mapToDouble(Lease::throughputKBps)
                .average()
                // If none of the leases has any value, that means its app
                // startup time and thus assigns 0 in that case to start with.
                .orElse(0D);
        /*
         * If a workerMetrics has a metric (i.e. has -1 value in last index which denotes failure),
         * skip it from activeWorkerMetrics and no new action on it will be done
         * (new assignment etc.) until the metric has non -1 value in last index. This is to avoid performing action
         * with the stale data on worker.
         */
        this.activeWorkerMetrics = workerMetricsList.stream()
                .filter(workerMetrics -> workerMetrics.getLastUpdateTime() >= workerExpiryThreshold
                        && !workerMetrics.isAnyWorkerMetricFailing())
                .collect(Collectors.toList());
        log.info("activeWorkerMetrics : {}", activeWorkerMetrics.size());
        targetAverageThroughput =
                averageLeaseThroughput * leaseList.size() / Math.max(1, activeWorkerMetrics.size());
        leaseList.forEach(lease -> {
            if (isNull(lease.throughputKBps())) {
                // If the lease is unassigned, it will not have any throughput value, use average throughput
                // as good enough value to start with.
                lease.throughputKBps(averageLeaseThroughput);
            }
            workerToLeasesMap
                    .computeIfAbsent(lease.actualOwner(), workerId -> new HashSet<>())
                    .add(lease);
            updateWorkerThroughput(lease.actualOwner(), lease.throughputKBps());
        });

        this.activeWorkerIdSet = new HashSet<>();
        // Calculate initial ratio
        this.activeWorkerMetrics.forEach(workerMetrics -> {
            activeWorkerIdSet.add(workerMetrics.getWorkerId());
            workerMetrics.setEmaAlpha(config.workerMetricsEMAAlpha());
            if (workerMetrics.isUsingDefaultWorkerMetric()) {
                setOperatingRangeAndWorkerMetricsDataForDefaultWorker(
                        workerMetrics,
                        getTotalAssignedThroughput(workerMetrics.getWorkerId()) / targetAverageThroughput);
            }
        });
    }

    // Accumulates (possibly negative) throughput delta for a worker, initializing at 0.
    private void updateWorkerThroughput(final String workerId, final double leaseThroughput) {
        double value = workerToTotalAssignedThroughputMap.computeIfAbsent(workerId, worker -> (double) 0L);
        workerToTotalAssignedThroughputMap.put(workerId, value + leaseThroughput);
    }

    private void setOperatingRangeAndWorkerMetricsDataForDefaultWorker(
            final WorkerMetricStats workerMetrics, final Double ratio) {
        // for workers with default WorkerMetricStats, the operating range ceiling of 100 represents the
        // target throughput. This way, with either heterogeneous or homogeneous fleets
        // of explicit WorkerMetricStats and default WorkerMetricStats applications, load will be evenly
        // distributed.
        log.info(
                "Worker [{}] is using default WorkerMetricStats, setting initial utilization ratio to [{}].",
                workerMetrics.getWorkerId(),
                ratio);
        workerMetrics.setOperatingRange(ImmutableMap.of("T", ImmutableList.of(100L)));
        workerMetrics.setMetricStats(ImmutableMap.of("T", ImmutableList.of(ratio * 100, ratio * 100)));
    }

    /**
     * Calculates the value threshold in seconds for a worker to be considered as active.
     * If a worker has not updated the WorkerMetricStats entry within this threshold, the worker is not considered
     * as active.
     *
     * @return wall time in seconds
     */
    private long computeWorkerExpiryThresholdInSecond() {
        final long timeInSeconds = Duration.ofMillis(System.currentTimeMillis()
                        - DEFAULT_NO_OF_SKIP_STAT_FOR_DEAD_WORKER_THRESHOLD
                                * config.workerMetricsReporterFreqInMillis())
                .getSeconds();
        log.info("WorkerMetricStats expiry time in seconds : {}", timeInSeconds);
        return timeInSeconds;
    }

    /**
     * Looks at inMemoryWorkerToLeasesMapping for lease assignment and figures out if there is room considering
     * any new assignment that would have happened.
     */
    public boolean isWorkerTotalThroughputLessThanMaxThroughput(final String workerId) {
        return getTotalAssignedThroughput(workerId) <= config.maxThroughputPerHostKBps();
    }

    /**
     * Looks at inMemoryWorkerToLeasesMapping for lease assignment of a worker and returns true if the worker has
     * no leases assigned or less than maxNumberOfLeasesPerHost else false.
     */
    public boolean isWorkerAssignedLeasesLessThanMaxLeases(final String workerId) {
        final Set<Lease> assignedLeases = workerToLeasesMap.get(workerId);
        if (CollectionUtils.isEmpty(assignedLeases)) {
            // There are no leases assigned to the worker, that means its less than maxNumberOfLeasesPerHost.
            return true;
        } else {
            return assignedLeases.size() < maxLeasesForWorker;
        }
    }

    // Total throughput currently attributed to the worker in this view; 0 if unknown worker.
    public Double getTotalAssignedThroughput(final String workerId) {
        return workerToTotalAssignedThroughputMap.getOrDefault(workerId, 0D);
    }

    // Async scan of the WorkerMetricStats table, with bounded retries.
    private CompletableFuture<List<WorkerMetricStats>> loadWorkerMetricStats() {
        return CompletableFuture.supplyAsync(() -> loadWithRetry(workerMetricsDAO::getAllWorkerMetricStats));
    }

    // Async parallel scan of the lease table; value = (deserialized leases, failed lease keys).
    private CompletableFuture<Map.Entry<List<Lease>, List<String>>> loadLeaseListAsync() {
        return CompletableFuture.supplyAsync(() -> loadWithRetry(() -> leaseRefresher.listLeasesParallely(
                LEASE_ASSIGNMENT_CALL_THREAD_POOL, DEFAULT_LEASE_TABLE_SCAN_PARALLELISM_FACTOR)));
    }

    /**
     * Runs {@code loadFunction}, retrying on any exception up to DDB_LOAD_RETRY_ATTEMPT times
     * before wrapping the last failure in a CompletionException.
     */
    private <T> T loadWithRetry(final Callable<T> loadFunction) {
        int retryAttempt = 0;
        while (true) {
            try {
                return loadFunction.call();
            } catch (final Exception e) {
                if (retryAttempt < DDB_LOAD_RETRY_ATTEMPT) {
                    log.warn(
                            "Failed to load : {}, retrying",
                            loadFunction.getClass().getName(),
                            e);
                    retryAttempt++;
                } else {
                    throw new CompletionException(e);
                }
            }
        }
    }
}
|
||||||
|
|
||||||
|
private long getCheckpointOwnerTimeoutTimestampMillis() {
|
||||||
|
// this is a future timestamp in millis that the graceful lease handoff shutdown can be considered
|
||||||
|
// expired. LeaseDurationMillis is used here to account for how long it might take for the
|
||||||
|
// lease owner to receive the shutdown signal before executing shutdown.
|
||||||
|
return getNanoTimeMillis()
|
||||||
|
+ gracefulLeaseHandoffConfig.gracefulLeaseHandoffTimeoutMillis()
|
||||||
|
+ leaseDurationMillis;
|
||||||
|
}
|
||||||
|
|
||||||
|
private long getNanoTimeMillis() {
|
||||||
|
// this is not a wall clock time. But if we stick with using this time provider for calculating the elapsed
|
||||||
|
// time it should be okay to use in checkpoint expiration calculation.
|
||||||
|
return TimeUnit.NANOSECONDS.toMillis(nanoTimeProvider.get());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean isSameOwners(Lease currentLease, Lease previousLease) {
|
||||||
|
return Objects.equals(currentLease.leaseOwner(), previousLease.leaseOwner())
|
||||||
|
&& Objects.equals(currentLease.checkpointOwner(), previousLease.checkpointOwner());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,348 @@
|
||||||
|
package software.amazon.kinesis.coordinator.assignment;
|
||||||
|
|
||||||
|
import java.util.AbstractMap.SimpleEntry;
|
||||||
|
import java.util.ArrayDeque;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.PriorityQueue;
|
||||||
|
import java.util.Queue;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.leases.Lease;
|
||||||
|
import software.amazon.kinesis.worker.metricstats.WorkerMetricStats;
|
||||||
|
|
||||||
|
import static java.util.Objects.isNull;
|
||||||
|
import static java.util.Objects.nonNull;
|
||||||
|
|
||||||
|
/**
 * VarianceBasedLeaseAssignmentDecider
 * This implementation of LeaseAssignmentDecider performs lease assignment by comparing each worker's
 * WorkerMetricStats values against the fleet-level average of those WorkerMetricStats.
 * Rebalanced leases are assigned to the workers with the most spare throughput capacity relative to the
 * fleet-level average. When multiple WorkerMetricStats are reported, the capacity to reach the fleet-level
 * average is determined by the outlier WorkerMetricStats.
 * To minimize variance, the algorithm picks the fleet-level average of the workers' WorkerMetricStats as a
 * pivot point and uses it to determine which workers to take leases from and which workers to assign them to.
 * The threshold for considering a worker for re-balance is configurable via
 * {@code reBalanceThreshold}. During reassignments the {@code dampeningPercentageValue} is used to achieve
 * critical dampening.
 */
|
||||||
|
@Slf4j
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
public final class VarianceBasedLeaseAssignmentDecider implements LeaseAssignmentDecider {
|
||||||
|
// Shared per-run snapshot of leases and worker metrics built by LeaseAssignmentManager.
private final LeaseAssignmentManager.InMemoryStorageView inMemoryStorageView;
// Dampening percentage applied during reassignment (see class javadoc: "critical dampening").
private final int dampeningPercentageValue;
// Percentage threshold used to decide whether a worker qualifies for re-balancing.
private final int reBalanceThreshold;
// NOTE(review): exact semantics not visible in this chunk — presumably permits assignments that
// overshoot a worker's throughput target; confirm against the decider's assignment logic.
private final boolean allowThroughputOvershoot;
// WorkerMetric name -> fleet-level average of that metric across active workers (filled by initialize()).
private final Map<String, Double> workerMetricsToFleetLevelAverageMap = new HashMap<>();
// Workers eligible to receive leases, ordered so the worker with the largest
// percentage-to-reach-average (most headroom) is polled first.
private final PriorityQueue<WorkerMetricStats> assignableWorkerSortedByAvailableCapacity;
// Target lease count per worker: max(totalLeases / activeWorkers, 1); set in initialize().
private int targetLeasePerWorker;
|
||||||
|
public VarianceBasedLeaseAssignmentDecider(
|
||||||
|
final LeaseAssignmentManager.InMemoryStorageView inMemoryStorageView,
|
||||||
|
final int dampeningPercentageValue,
|
||||||
|
final int reBalanceThreshold,
|
||||||
|
final boolean allowThroughputOvershoot) {
|
||||||
|
this.inMemoryStorageView = inMemoryStorageView;
|
||||||
|
this.dampeningPercentageValue = dampeningPercentageValue;
|
||||||
|
this.reBalanceThreshold = reBalanceThreshold;
|
||||||
|
this.allowThroughputOvershoot = allowThroughputOvershoot;
|
||||||
|
initialize();
|
||||||
|
final Comparator<WorkerMetricStats> comparator = Comparator.comparingDouble(
|
||||||
|
workerMetrics -> workerMetrics.computePercentageToReachAverage(workerMetricsToFleetLevelAverageMap));
|
||||||
|
this.assignableWorkerSortedByAvailableCapacity = new PriorityQueue<>(comparator.reversed());
|
||||||
|
this.assignableWorkerSortedByAvailableCapacity.addAll(
|
||||||
|
getAvailableWorkersForAssignment(inMemoryStorageView.getActiveWorkerMetrics()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initialize() {
|
||||||
|
final Map<String, Double> workerMetricsNameToAverage = inMemoryStorageView.getActiveWorkerMetrics().stream()
|
||||||
|
.flatMap(workerMetrics -> workerMetrics.getMetricStats().keySet().stream()
|
||||||
|
.map(workerMetricsName ->
|
||||||
|
new SimpleEntry<>(workerMetricsName, workerMetrics.getMetricStat(workerMetricsName))))
|
||||||
|
.collect(Collectors.groupingBy(
|
||||||
|
SimpleEntry::getKey, HashMap::new, Collectors.averagingDouble(SimpleEntry::getValue)));
|
||||||
|
|
||||||
|
workerMetricsToFleetLevelAverageMap.putAll(workerMetricsNameToAverage);
|
||||||
|
|
||||||
|
final int totalWorkers =
|
||||||
|
Math.max(inMemoryStorageView.getActiveWorkerMetrics().size(), 1);
|
||||||
|
this.targetLeasePerWorker = Math.max(inMemoryStorageView.getLeaseList().size() / totalWorkers, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<WorkerMetricStats> getAvailableWorkersForAssignment(final List<WorkerMetricStats> workerMetricsList) {
|
||||||
|
// Workers with WorkerMetricStats running hot are also available for assignment as the goal is to balance
|
||||||
|
// utilization
|
||||||
|
// always (e.g., if all workers have hot WorkerMetricStats, balance the variance between them too)
|
||||||
|
return workerMetricsList.stream()
|
||||||
|
.filter(workerMetrics -> inMemoryStorageView.isWorkerTotalThroughputLessThanMaxThroughput(
|
||||||
|
workerMetrics.getWorkerId())
|
||||||
|
&& inMemoryStorageView.isWorkerAssignedLeasesLessThanMaxLeases(workerMetrics.getWorkerId()))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void assignExpiredOrUnassignedLeases(final List<Lease> expiredOrUnAssignedLeases) {
|
||||||
|
// Sort the expiredOrUnAssignedLeases using lastCounterIncrementNanos such that leases expired first are
|
||||||
|
// picked first.
|
||||||
|
// Unassigned leases have lastCounterIncrementNanos as zero and thus assigned first.
|
||||||
|
Collections.sort(expiredOrUnAssignedLeases, Comparator.comparing(Lease::lastCounterIncrementNanos));
|
||||||
|
final Set<Lease> assignedLeases = new HashSet<>();
|
||||||
|
for (final Lease lease : expiredOrUnAssignedLeases) {
|
||||||
|
final WorkerMetricStats workerToAssignLease = assignableWorkerSortedByAvailableCapacity.poll();
|
||||||
|
if (nonNull(workerToAssignLease)) {
|
||||||
|
assignLease(lease, workerToAssignLease);
|
||||||
|
assignedLeases.add(lease);
|
||||||
|
} else {
|
||||||
|
log.info("No worker available to assign lease {}", lease.leaseKey());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
expiredOrUnAssignedLeases.removeAll(assignedLeases);
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<WorkerMetricStats> getWorkersToTakeLeasesFromIfRequired(
|
||||||
|
final List<WorkerMetricStats> currentWorkerMetrics,
|
||||||
|
final String workerMetricsName,
|
||||||
|
final double workerMetricsValueAvg) {
|
||||||
|
final List<WorkerMetricStats> workerIdsAboveAverage = new ArrayList<>();
|
||||||
|
|
||||||
|
final double upperLimit = workerMetricsValueAvg * (1.0D + (double) reBalanceThreshold / 100);
|
||||||
|
final double lowerLimit = workerMetricsValueAvg * (1.0D - (double) reBalanceThreshold / 100);
|
||||||
|
|
||||||
|
WorkerMetricStats mostLoadedWorker = null;
|
||||||
|
|
||||||
|
log.info("Range for re-balance upper threshold {} and lower threshold {}", upperLimit, lowerLimit);
|
||||||
|
|
||||||
|
boolean shouldTriggerReBalance = false;
|
||||||
|
for (final WorkerMetricStats workerMetrics : currentWorkerMetrics) {
|
||||||
|
final double currentWorkerMetricsValue = workerMetrics.getMetricStat(workerMetricsName);
|
||||||
|
final boolean isCurrentWorkerMetricsAboveOperatingRange =
|
||||||
|
workerMetrics.isWorkerMetricAboveOperatingRange(workerMetricsName);
|
||||||
|
/*
|
||||||
|
If there is any worker, whose WorkerMetricStats value is between +/- reBalanceThreshold % of workerMetricsValueAvg or if
|
||||||
|
worker's WorkerMetricStats value is above operating range trigger re-balance
|
||||||
|
*/
|
||||||
|
if (currentWorkerMetricsValue > upperLimit
|
||||||
|
|| currentWorkerMetricsValue < lowerLimit
|
||||||
|
|| isCurrentWorkerMetricsAboveOperatingRange) {
|
||||||
|
shouldTriggerReBalance = true;
|
||||||
|
}
|
||||||
|
// Perform re-balance on the worker if its above upperLimit or if current WorkerMetricStats is above
|
||||||
|
// operating range.
|
||||||
|
if (currentWorkerMetricsValue >= upperLimit || isCurrentWorkerMetricsAboveOperatingRange) {
|
||||||
|
workerIdsAboveAverage.add(workerMetrics);
|
||||||
|
}
|
||||||
|
if (mostLoadedWorker == null
|
||||||
|
|| mostLoadedWorker.getMetricStat(workerMetricsName) < currentWorkerMetricsValue) {
|
||||||
|
mostLoadedWorker = workerMetrics;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
If workerIdsAboveAverage is empty that means there is no worker with WorkerMetricStats value above upperLimit so pick
|
||||||
|
the worker with higher CPU. This can happen when there is worker with WorkerMetricStats value below lowerLimit but
|
||||||
|
all other workers are within upperLimit.
|
||||||
|
*/
|
||||||
|
if (workerIdsAboveAverage.isEmpty()) {
|
||||||
|
workerIdsAboveAverage.add(mostLoadedWorker);
|
||||||
|
}
|
||||||
|
|
||||||
|
return shouldTriggerReBalance ? workerIdsAboveAverage : Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs the balancing of the throughput assigned to workers based on the WorkerMetricsValues of worker with respect
|
||||||
|
* to fleet level average.
|
||||||
|
* Each WorkerMetricStats is treated independently to determine workers for re-balance computed (computed based on
|
||||||
|
* reBalanceThreshold) are determined.
|
||||||
|
* The magnitude of throughput to take is determined by how much worker is away from the average of that WorkerMetricStats
|
||||||
|
* across fleet and in case of multiple WorkerMetricStats, the one with maximum magnitude of throughput is considered.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public void balanceWorkerVariance() {
|
||||||
|
final List<WorkerMetricStats> activeWorkerMetrics = inMemoryStorageView.getActiveWorkerMetrics();
|
||||||
|
|
||||||
|
log.info("WorkerMetricStats to corresponding fleet level average : {}", workerMetricsToFleetLevelAverageMap);
|
||||||
|
log.info("Active WorkerMetricStats : {}", activeWorkerMetrics);
|
||||||
|
|
||||||
|
final Map<String, Double> workerIdToThroughputToTakeMap = new HashMap<>();
|
||||||
|
String largestOutlierWorkerMetricsName = "";
|
||||||
|
double maxThroughputTake = -1.0D;
|
||||||
|
|
||||||
|
for (final Map.Entry<String, Double> workerMetricsToFleetLevelAverageEntry :
|
||||||
|
workerMetricsToFleetLevelAverageMap.entrySet()) {
|
||||||
|
final String workerMetricsName = workerMetricsToFleetLevelAverageEntry.getKey();
|
||||||
|
|
||||||
|
// Filter workers that does not have current WorkerMetricStats. This is possible if application is adding a
|
||||||
|
// new WorkerMetricStats and currently in phase of deployment.
|
||||||
|
final List<WorkerMetricStats> currentWorkerMetrics = activeWorkerMetrics.stream()
|
||||||
|
.filter(workerMetrics -> workerMetrics.containsMetricStat(workerMetricsName))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
final double fleetAverageForWorkerMetrics = workerMetricsToFleetLevelAverageEntry.getValue();
|
||||||
|
|
||||||
|
final List<WorkerMetricStats> workerToTakeLeasesFrom = getWorkersToTakeLeasesFromIfRequired(
|
||||||
|
currentWorkerMetrics, workerMetricsName, fleetAverageForWorkerMetrics);
|
||||||
|
|
||||||
|
final Map<String, Double> workerIdToThroughputToTakeForCurrentWorkerMetrics = new HashMap<>();
|
||||||
|
double totalThroughputToTakeForCurrentWorkerMetrics = 0D;
|
||||||
|
for (final WorkerMetricStats workerToTakeLease : workerToTakeLeasesFrom) {
|
||||||
|
final double workerMetricsValueForWorker = workerToTakeLease.getMetricStat(workerMetricsName);
|
||||||
|
// Load to take based on the difference compared to the fleet level average
|
||||||
|
final double loadPercentageToTake =
|
||||||
|
(workerMetricsValueForWorker - fleetAverageForWorkerMetrics) / workerMetricsValueForWorker;
|
||||||
|
// Dampen the load based on dampeningPercentageValue
|
||||||
|
final double dampenedLoadPercentageToTake =
|
||||||
|
loadPercentageToTake * ((double) dampeningPercentageValue / 100);
|
||||||
|
final double throughputToTake =
|
||||||
|
inMemoryStorageView.getTotalAssignedThroughput(workerToTakeLease.getWorkerId())
|
||||||
|
* dampenedLoadPercentageToTake;
|
||||||
|
log.info(
|
||||||
|
"For worker : {} taking throughput : {} after dampening based on WorkerMetricStats : {}",
|
||||||
|
workerToTakeLease.getWorkerId(),
|
||||||
|
throughputToTake,
|
||||||
|
workerMetricsName);
|
||||||
|
totalThroughputToTakeForCurrentWorkerMetrics += throughputToTake;
|
||||||
|
workerIdToThroughputToTakeForCurrentWorkerMetrics.put(
|
||||||
|
workerToTakeLease.getWorkerId(), throughputToTake);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
If totalThroughputToTakeForCurrentWorkerMetrics is more than maxThroughputTake that means this WorkerMetricStats is more
|
||||||
|
outlier so consider this for reBalancing
|
||||||
|
*/
|
||||||
|
if (maxThroughputTake < totalThroughputToTakeForCurrentWorkerMetrics) {
|
||||||
|
largestOutlierWorkerMetricsName = workerMetricsName;
|
||||||
|
workerIdToThroughputToTakeMap.clear();
|
||||||
|
workerIdToThroughputToTakeMap.putAll(workerIdToThroughputToTakeForCurrentWorkerMetrics);
|
||||||
|
maxThroughputTake = totalThroughputToTakeForCurrentWorkerMetrics;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log.info(
|
||||||
|
"Largest outlier WorkerMetricStats is : {} and total of {} throughput will be rebalanced",
|
||||||
|
largestOutlierWorkerMetricsName,
|
||||||
|
maxThroughputTake);
|
||||||
|
log.info("Workers to throughput taken from them is : {}", workerIdToThroughputToTakeMap);
|
||||||
|
|
||||||
|
final List<Map.Entry<String, Double>> sortedWorkerIdToThroughputToTakeEntries =
|
||||||
|
new ArrayList<>(workerIdToThroughputToTakeMap.entrySet());
|
||||||
|
// sort entries by values.
|
||||||
|
Collections.sort(sortedWorkerIdToThroughputToTakeEntries, (e1, e2) -> e2.getValue()
|
||||||
|
.compareTo(e1.getValue()));
|
||||||
|
|
||||||
|
for (final Map.Entry<String, Double> workerIdToThroughputToTakeEntry :
|
||||||
|
sortedWorkerIdToThroughputToTakeEntries) {
|
||||||
|
final String workerId = workerIdToThroughputToTakeEntry.getKey();
|
||||||
|
|
||||||
|
final double throughputToTake = workerIdToThroughputToTakeEntry.getValue();
|
||||||
|
|
||||||
|
final Queue<Lease> leasesToTake = getLeasesToTake(workerId, throughputToTake);
|
||||||
|
|
||||||
|
log.info(
|
||||||
|
"Leases taken from worker : {} are : {}",
|
||||||
|
workerId,
|
||||||
|
leasesToTake.stream().map(Lease::leaseKey).collect(Collectors.toSet()));
|
||||||
|
|
||||||
|
for (final Lease lease : leasesToTake) {
|
||||||
|
final WorkerMetricStats workerToAssign = assignableWorkerSortedByAvailableCapacity.poll();
|
||||||
|
if (nonNull(workerToAssign)
|
||||||
|
&& workerToAssign.willAnyMetricStatsGoAboveAverageUtilizationOrOperatingRange(
|
||||||
|
workerMetricsToFleetLevelAverageMap,
|
||||||
|
inMemoryStorageView.getTargetAverageThroughput(),
|
||||||
|
lease.throughputKBps(),
|
||||||
|
targetLeasePerWorker)) {
|
||||||
|
log.info("No worker to assign anymore in this iteration due to hitting average values");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (nonNull(workerToAssign)) {
|
||||||
|
assignLease(lease, workerToAssign);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
printWorkerToUtilizationLog(inMemoryStorageView.getActiveWorkerMetrics());
|
||||||
|
}
|
||||||
|
|
||||||
|
private Queue<Lease> getLeasesToTake(final String workerId, final double throughputToTake) {
|
||||||
|
final Set<Lease> existingLeases =
|
||||||
|
inMemoryStorageView.getWorkerToLeasesMap().get(workerId);
|
||||||
|
|
||||||
|
if (isNull(existingLeases) || existingLeases.isEmpty()) {
|
||||||
|
return new ArrayDeque<>();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inMemoryStorageView.getTotalAssignedThroughput(workerId) == 0D) {
|
||||||
|
// This is the case where throughput of this worker is zero and have 1 or more leases assigned.
|
||||||
|
// Its not possible to determine leases to take based on throughput so simply take 1 lease and move on.
|
||||||
|
return new ArrayDeque<>(new ArrayList<>(existingLeases).subList(0, 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
return getLeasesCombiningToThroughput(workerId, throughputToTake);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assignLease(final Lease lease, final WorkerMetricStats workerMetrics) {
|
||||||
|
if (nonNull(lease.actualOwner()) && lease.actualOwner().equals(workerMetrics.getWorkerId())) {
|
||||||
|
// if a new owner and current owner are same then no assignment to do
|
||||||
|
// put back the worker as well as no assignment is done
|
||||||
|
assignableWorkerSortedByAvailableCapacity.add(workerMetrics);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
workerMetrics.extrapolateMetricStatValuesForAddedThroughput(
|
||||||
|
workerMetricsToFleetLevelAverageMap,
|
||||||
|
inMemoryStorageView.getTargetAverageThroughput(),
|
||||||
|
lease.throughputKBps(),
|
||||||
|
targetLeasePerWorker);
|
||||||
|
log.info("Assigning lease : {} to worker : {}", lease.leaseKey(), workerMetrics.getWorkerId());
|
||||||
|
inMemoryStorageView.performLeaseAssignment(lease, workerMetrics.getWorkerId());
|
||||||
|
if (inMemoryStorageView.isWorkerTotalThroughputLessThanMaxThroughput(workerMetrics.getWorkerId())
|
||||||
|
&& inMemoryStorageView.isWorkerAssignedLeasesLessThanMaxLeases(workerMetrics.getWorkerId())) {
|
||||||
|
assignableWorkerSortedByAvailableCapacity.add(workerMetrics);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void printWorkerToUtilizationLog(final List<WorkerMetricStats> activeWorkerMetrics) {
|
||||||
|
activeWorkerMetrics.forEach(workerMetrics -> log.info(
|
||||||
|
"WorkerId : {} and average WorkerMetricStats data : {}",
|
||||||
|
workerMetrics.getWorkerId(),
|
||||||
|
workerMetrics.getMetricStatsMap()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private Queue<Lease> getLeasesCombiningToThroughput(final String workerId, final double throughputToGet) {
|
||||||
|
final List<Lease> assignedLeases =
|
||||||
|
new ArrayList<>(inMemoryStorageView.getWorkerToLeasesMap().get(workerId));
|
||||||
|
if (assignedLeases.isEmpty()) {
|
||||||
|
// This is possible if the worker is having high utilization but does not have any leases assigned to it
|
||||||
|
return new ArrayDeque<>();
|
||||||
|
}
|
||||||
|
// Shuffle leases to randomize what leases gets picked.
|
||||||
|
Collections.shuffle(assignedLeases);
|
||||||
|
final Queue<Lease> response = new ArrayDeque<>();
|
||||||
|
double remainingThroughputToGet = throughputToGet;
|
||||||
|
for (final Lease lease : assignedLeases) {
|
||||||
|
// if adding this lease makes throughout to take go below zero avoid taking this lease.
|
||||||
|
if (remainingThroughputToGet - lease.throughputKBps() <= 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
remainingThroughputToGet -= lease.throughputKBps();
|
||||||
|
response.add(lease);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If allowThroughputOvershoot is set to true, take a minimum throughput lease
|
||||||
|
if (allowThroughputOvershoot && response.isEmpty()) {
|
||||||
|
assignedLeases.stream()
|
||||||
|
.min(Comparator.comparingDouble(Lease::throughputKBps))
|
||||||
|
.ifPresent(response::add);
|
||||||
|
}
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,58 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.coordinator.migration;
|
||||||
|
|
||||||
|
/**
 * ClientVersion support during upgrade from KCLv2.x to KCLv3.x
 *
 * This enum is persisted in storage, so any changes to it needs to be backward compatible.
 * Reorganizing the values is not backward compatible, also if versions are removed, the corresponding
 * enum value cannot be reused without backward compatibility considerations.
 */
public enum ClientVersion {
    /**
     * This is a transient start state version used during initialization of the Migration State Machine.
     */
    CLIENT_VERSION_INIT,
    /**
     * This version is used during the upgrade of an application from KCLv2.x to KCLv3.x, in this version
     * KCL workers will emit WorkerMetricStats and run KCLv2.x algorithms for leader election and lease
     * assignment. KCL will also monitor for upgrade to KCLv3.x readiness of the worker fleet.
     */
    CLIENT_VERSION_UPGRADE_FROM_2x,
    /**
     * This version is used during rollback from CLIENT_VERSION_UPGRADE_FROM_2x or CLIENT_VERSION_3x_WITH_ROLLBACK,
     * which can only be initiated using a KCL migration tool, when customer wants to revert to KCLv2.x functionality.
     * In this version, KCL will not emit WorkerMetricStats and run KCLv2.x algorithms for leader election
     * and lease assignment. In this version, KCL will monitor for roll-forward scenario where
     * client version is updated to CLIENT_VERSION_UPGRADE_FROM_2x using the migration tool.
     */
    CLIENT_VERSION_2x,
    /**
     * When workers are operating in CLIENT_VERSION_UPGRADE_FROM_2x and when worker fleet is determined to be
     * KCLv3.x ready (when lease table GSI is active and worker-metrics are being emitted by all lease owners)
     * then the leader will initiate the switch to KCLv3.x algorithms for leader election and lease assignment,
     * by using this version and persisting it in the {@link MigrationState} that allows all worker hosts
     * to also flip to KCLv3.x functionality. In this version, KCL will also monitor for rollback to detect when the
     * customer updates version to CLIENT_VERSION_2x using migration tool, so that it instantly flips back
     * to CLIENT_VERSION_2x.
     */
    CLIENT_VERSION_3x_WITH_ROLLBACK,
    /**
     * A new application starting KCLv3.x or an upgraded application from KCLv2.x after upgrade is successful
     * can use this version to default all KCLv3.x algorithms without any monitor to rollback.
     */
    CLIENT_VERSION_3x;
}
|
||||||
|
|
@ -0,0 +1,161 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.coordinator.migration;
|
||||||
|
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.util.Random;
|
||||||
|
import java.util.concurrent.ScheduledExecutorService;
|
||||||
|
import java.util.concurrent.ScheduledFuture;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||||
|
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||||
|
import software.amazon.kinesis.coordinator.CoordinatorStateDAO;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsScope;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||||
|
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.MigrationState.MIGRATION_HASH_KEY;
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.METRICS_OPERATION;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Change monitor for MigrationState.clientVersion to notify a callback if the value
|
||||||
|
* changes from a given value. This monitor will be run to monitor
|
||||||
|
* rollback, roll-forward and also upgrade to 3.x scenarios. Look at {@link ClientVersion}
|
||||||
|
* for more details.
|
||||||
|
*
|
||||||
|
* Since all KCL workers will be running the monitor, the monitor poll interval uses
|
||||||
|
* a random jitter to stagger the reads to ddb.
|
||||||
|
*
|
||||||
|
* The class is thread-safe and will invoke callback on a separate thread.
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@ThreadSafe
|
||||||
|
public class ClientVersionChangeMonitor implements Runnable {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Interface of a callback to invoke when monitor condition is true.
|
||||||
|
*/
|
||||||
|
public interface ClientVersionChangeCallback {
|
||||||
|
void accept(final MigrationState currentMigrationState) throws InvalidStateException, DependencyException;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final long MONITOR_INTERVAL_MILLIS = Duration.ofMinutes(1).toMillis();
|
||||||
|
private static final double JITTER_FACTOR = 0.1;
|
||||||
|
|
||||||
|
private final MetricsFactory metricsFactory;
|
||||||
|
private final CoordinatorStateDAO coordinatorStateDAO;
|
||||||
|
private final ScheduledExecutorService stateMachineThreadPool;
|
||||||
|
private final ClientVersionChangeCallback callback;
|
||||||
|
private final ClientVersion expectedVersion;
|
||||||
|
private final Random random;
|
||||||
|
private long monitorIntervalMillis;
|
||||||
|
|
||||||
|
private ScheduledFuture<?> scheduledFuture;
|
||||||
|
|
||||||
|
public synchronized void startMonitor() {
|
||||||
|
if (scheduledFuture == null) {
|
||||||
|
final long jitter = (long) (random.nextDouble() * MONITOR_INTERVAL_MILLIS * JITTER_FACTOR);
|
||||||
|
monitorIntervalMillis = MONITOR_INTERVAL_MILLIS + jitter;
|
||||||
|
log.info(
|
||||||
|
"Monitoring for MigrationState client version change from {} every {}ms",
|
||||||
|
expectedVersion,
|
||||||
|
monitorIntervalMillis);
|
||||||
|
scheduledFuture = stateMachineThreadPool.scheduleWithFixedDelay(
|
||||||
|
this, monitorIntervalMillis, monitorIntervalMillis, TimeUnit.MILLISECONDS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return new StringBuilder(getClass().getSimpleName())
|
||||||
|
.append("[")
|
||||||
|
.append(expectedVersion)
|
||||||
|
.append("]")
|
||||||
|
.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cancel the monitor explicity before the condition is met, e.g. when the worker is going down.
|
||||||
|
* Note on synchronization: callback of this monitor is invoked while holding the lock on this monitor object.
|
||||||
|
* If cancel is called from within the same lock context that callback uses, then it can lead to
|
||||||
|
* deadlock. Ensure synchronization context between callback the caller of cancel is not shared.
|
||||||
|
*/
|
||||||
|
public synchronized void cancel() {
|
||||||
|
if (scheduledFuture != null) {
|
||||||
|
log.info("Cancelling {}", this);
|
||||||
|
scheduledFuture.cancel(false);
|
||||||
|
} else {
|
||||||
|
log.info("Monitor {} is not running", this);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void run() {
|
||||||
|
try {
|
||||||
|
if (scheduledFuture == null) {
|
||||||
|
log.debug("Monitor has been cancelled, not running...");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
final MigrationState migrationState =
|
||||||
|
(MigrationState) coordinatorStateDAO.getCoordinatorState(MIGRATION_HASH_KEY);
|
||||||
|
if (migrationState != null) {
|
||||||
|
if (migrationState.getClientVersion() != expectedVersion) {
|
||||||
|
log.info("MigrationState client version has changed {}, invoking monitor callback", migrationState);
|
||||||
|
callback.accept(migrationState);
|
||||||
|
log.info("Callback successful, monitoring cancelling itself.");
|
||||||
|
// stop further monitoring
|
||||||
|
scheduledFuture.cancel(false);
|
||||||
|
scheduledFuture = null;
|
||||||
|
} else {
|
||||||
|
emitMetrics();
|
||||||
|
log.debug("No change detected {}", this);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (final Exception e) {
|
||||||
|
log.warn(
|
||||||
|
"Exception occurred when monitoring for client version change from {}, will retry in {}",
|
||||||
|
expectedVersion,
|
||||||
|
monitorIntervalMillis,
|
||||||
|
e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void emitMetrics() {
|
||||||
|
final MetricsScope scope = MetricsUtil.createMetricsWithOperation(metricsFactory, METRICS_OPERATION);
|
||||||
|
try {
|
||||||
|
switch (expectedVersion) {
|
||||||
|
case CLIENT_VERSION_3x_WITH_ROLLBACK:
|
||||||
|
scope.addData("CurrentState:3xWorker", 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||||
|
break;
|
||||||
|
case CLIENT_VERSION_2x:
|
||||||
|
case CLIENT_VERSION_UPGRADE_FROM_2x:
|
||||||
|
scope.addData("CurrentState:2xCompatibleWorker", 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw new IllegalStateException(String.format("Unexpected version %s", expectedVersion.name()));
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
MetricsUtil.endScope(scope);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,159 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.coordinator.migration;
|
||||||
|
|
||||||
|
import java.util.Random;
|
||||||
|
import java.util.concurrent.CompletableFuture;
|
||||||
|
import java.util.concurrent.ScheduledExecutorService;
|
||||||
|
|
||||||
|
import lombok.NonNull;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||||
|
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.coordinator.CoordinatorStateDAO;
|
||||||
|
import software.amazon.kinesis.coordinator.DynamicMigrationComponentsInitializer;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsScope;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||||
|
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_2x;
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_UPGRADE_FROM_2x;
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.FAULT_METRIC;
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.METRICS_OPERATION;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* State for CLIENT_VERSION_2x. In this state, the only allowed valid transition is
|
||||||
|
* the roll-forward scenario which can only be performed using the KCL Migration tool.
|
||||||
|
* So when the state machine enters this state, a monitor is started to detect the
|
||||||
|
* roll-forward scenario.
|
||||||
|
*/
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
|
@ThreadSafe
|
||||||
|
public class MigrationClientVersion2xState implements MigrationClientVersionState {
|
||||||
|
private final MigrationStateMachine stateMachine;
|
||||||
|
private final CoordinatorStateDAO coordinatorStateDAO;
|
||||||
|
private final ScheduledExecutorService stateMachineThreadPool;
|
||||||
|
private final DynamicMigrationComponentsInitializer initializer;
|
||||||
|
private final Random random;
|
||||||
|
|
||||||
|
private ClientVersionChangeMonitor rollForwardMonitor;
|
||||||
|
private boolean entered = false;
|
||||||
|
private boolean left = false;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ClientVersion clientVersion() {
|
||||||
|
return CLIENT_VERSION_2x;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void enter(final ClientVersion fromClientVersion) {
|
||||||
|
if (!entered) {
|
||||||
|
log.info("Entering {} from {}", this, fromClientVersion);
|
||||||
|
initializer.initializeClientVersionFor2x(fromClientVersion);
|
||||||
|
|
||||||
|
log.info("Starting roll-forward monitor");
|
||||||
|
rollForwardMonitor = new ClientVersionChangeMonitor(
|
||||||
|
initializer.metricsFactory(),
|
||||||
|
coordinatorStateDAO,
|
||||||
|
stateMachineThreadPool,
|
||||||
|
this::onClientVersionChange,
|
||||||
|
clientVersion(),
|
||||||
|
random);
|
||||||
|
rollForwardMonitor.startMonitor();
|
||||||
|
entered = true;
|
||||||
|
} else {
|
||||||
|
log.info("Not entering {}", left ? "already exited state" : "already entered state");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void leave() {
|
||||||
|
if (entered && !left) {
|
||||||
|
log.info("Leaving {}", this);
|
||||||
|
cancelRollForwardMonitor();
|
||||||
|
left = false;
|
||||||
|
} else {
|
||||||
|
log.info("Cannot leave {}", entered ? "already exited state" : "because state is not active");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return getClass().getSimpleName();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Callback handler to handle client version changes in MigrationState in DDB.
|
||||||
|
* @param newState current MigrationState read from DDB where client version is not CLIENT_VERSION_2x
|
||||||
|
* @throws InvalidStateException during transition to the next state based on the new ClientVersion
|
||||||
|
* or if the new state in DDB is unexpected.
|
||||||
|
*/
|
||||||
|
private synchronized void onClientVersionChange(@NonNull final MigrationState newState)
|
||||||
|
throws InvalidStateException, DependencyException {
|
||||||
|
if (!entered || left) {
|
||||||
|
log.warn("Received client version change notification on inactive state {}", this);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
final MetricsScope scope =
|
||||||
|
MetricsUtil.createMetricsWithOperation(initializer.metricsFactory(), METRICS_OPERATION);
|
||||||
|
try {
|
||||||
|
if (newState.getClientVersion() == CLIENT_VERSION_UPGRADE_FROM_2x) {
|
||||||
|
log.info(
|
||||||
|
"A roll-forward has been initiated for the application. Transition to {}",
|
||||||
|
CLIENT_VERSION_UPGRADE_FROM_2x);
|
||||||
|
// If this succeeds, the monitor will cancel itself.
|
||||||
|
stateMachine.transitionTo(CLIENT_VERSION_UPGRADE_FROM_2x, newState);
|
||||||
|
} else {
|
||||||
|
// This should not happen, so throw an exception that allows the monitor to continue monitoring
|
||||||
|
// changes, this allows KCL to operate in the current state and keep monitoring until a valid
|
||||||
|
// state transition is possible.
|
||||||
|
// However, there could be a split brain here, new workers will use DDB value as source of truth,
|
||||||
|
// so we could also write back CLIENT_VERSION_2x to DDB to ensure all workers have consistent
|
||||||
|
// behavior.
|
||||||
|
// Ideally we don't expect modifications to DDB table out of the KCL migration tool scope,
|
||||||
|
// so keeping it simple and not writing back to DDB, the error log below would help capture
|
||||||
|
// any strange behavior if this happens.
|
||||||
|
log.error(
|
||||||
|
"Migration state has invalid client version {}. Transition from {} is not supported",
|
||||||
|
newState,
|
||||||
|
CLIENT_VERSION_2x);
|
||||||
|
throw new InvalidStateException(String.format("Unexpected new state %s", newState));
|
||||||
|
}
|
||||||
|
} catch (final InvalidStateException | DependencyException e) {
|
||||||
|
scope.addData(FAULT_METRIC, 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||||
|
throw e;
|
||||||
|
} finally {
|
||||||
|
MetricsUtil.endScope(scope);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void cancelRollForwardMonitor() {
|
||||||
|
if (rollForwardMonitor != null) {
|
||||||
|
final ClientVersionChangeMonitor localRollForwardMonitor = rollForwardMonitor;
|
||||||
|
CompletableFuture.supplyAsync(() -> {
|
||||||
|
log.info("Cancelling roll-forward monitor");
|
||||||
|
localRollForwardMonitor.cancel();
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
rollForwardMonitor = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,70 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.coordinator.migration;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.coordinator.DynamicMigrationComponentsInitializer;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* State for CLIENT_VERSION_3x which enables KCL to run 3.x algorithms on new KCLv3.x application
|
||||||
|
* or successfully upgraded application which upgraded from v2.x. This is a terminal state of the
|
||||||
|
* state machine and no rollbacks are supported in this state.
|
||||||
|
*/
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
|
@ThreadSafe
|
||||||
|
public class MigrationClientVersion3xState implements MigrationClientVersionState {
|
||||||
|
private final MigrationStateMachine stateMachine;
|
||||||
|
private final DynamicMigrationComponentsInitializer initializer;
|
||||||
|
private boolean entered = false;
|
||||||
|
private boolean left = false;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ClientVersion clientVersion() {
|
||||||
|
return ClientVersion.CLIENT_VERSION_3x;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void enter(final ClientVersion fromClientVersion) throws DependencyException {
|
||||||
|
if (!entered) {
|
||||||
|
log.info("Entering {} from {}", this, fromClientVersion);
|
||||||
|
initializer.initializeClientVersionFor3x(fromClientVersion);
|
||||||
|
entered = true;
|
||||||
|
} else {
|
||||||
|
log.info("Not entering {}", left ? "already exited state" : "already entered state");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void leave() {
|
||||||
|
if (entered && !left) {
|
||||||
|
log.info("Leaving {}", this);
|
||||||
|
entered = false;
|
||||||
|
left = true;
|
||||||
|
} else {
|
||||||
|
log.info("Cannot leave {}", entered ? "already exited state" : "because state is not active");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return getClass().getSimpleName();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,156 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.coordinator.migration;
|
||||||
|
|
||||||
|
import java.util.Random;
|
||||||
|
import java.util.concurrent.CompletableFuture;
|
||||||
|
import java.util.concurrent.ScheduledExecutorService;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||||
|
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.coordinator.CoordinatorStateDAO;
|
||||||
|
import software.amazon.kinesis.coordinator.DynamicMigrationComponentsInitializer;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsScope;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||||
|
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_2x;
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_3x;
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.FAULT_METRIC;
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.METRICS_OPERATION;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* State for CLIENT_VERSION_3x_WITH_ROLLBACK which enables KCL to run its 3.x compliant algorithms
|
||||||
|
* during the upgrade process after all KCL workers in the fleet are 3.x complaint. Since this
|
||||||
|
* is an instant switch from CLIENT_VERSION_UPGRADE_FROM_2x, it also supports rollback if customers
|
||||||
|
* see regression to allow for instant rollbacks as well. This would be achieved by customers
|
||||||
|
* running a KCL migration tool to update MigrationState in DDB. So this state monitors for
|
||||||
|
* rollback triggers and performs state transitions accordingly.
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@ThreadSafe
|
||||||
|
public class MigrationClientVersion3xWithRollbackState implements MigrationClientVersionState {
|
||||||
|
|
||||||
|
private final MigrationStateMachine stateMachine;
|
||||||
|
private final CoordinatorStateDAO coordinatorStateDAO;
|
||||||
|
private final ScheduledExecutorService stateMachineThreadPool;
|
||||||
|
private final DynamicMigrationComponentsInitializer initializer;
|
||||||
|
private final Random random;
|
||||||
|
|
||||||
|
private ClientVersionChangeMonitor rollbackMonitor;
|
||||||
|
private boolean entered;
|
||||||
|
private boolean left;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ClientVersion clientVersion() {
|
||||||
|
return ClientVersion.CLIENT_VERSION_3x_WITH_ROLLBACK;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void enter(final ClientVersion fromClientVersion) throws DependencyException {
|
||||||
|
if (!entered) {
|
||||||
|
log.info("Entering {} from {}", this, fromClientVersion);
|
||||||
|
initializer.initializeClientVersionFor3xWithRollback(fromClientVersion);
|
||||||
|
// we need to run the rollback monitor
|
||||||
|
log.info("Starting rollback monitor");
|
||||||
|
rollbackMonitor = new ClientVersionChangeMonitor(
|
||||||
|
initializer.metricsFactory(),
|
||||||
|
coordinatorStateDAO,
|
||||||
|
stateMachineThreadPool,
|
||||||
|
this::onClientVersionChange,
|
||||||
|
clientVersion(),
|
||||||
|
random);
|
||||||
|
rollbackMonitor.startMonitor();
|
||||||
|
entered = true;
|
||||||
|
} else {
|
||||||
|
log.info("Not entering {}", left ? "already exited state" : "already entered state");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void leave() {
|
||||||
|
if (entered && !left) {
|
||||||
|
log.info("Leaving {}", this);
|
||||||
|
cancelRollbackMonitor();
|
||||||
|
entered = false;
|
||||||
|
left = true;
|
||||||
|
} else {
|
||||||
|
log.info("Cannot leave {}", entered ? "already exited state" : "because state is not active");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private synchronized void onClientVersionChange(final MigrationState newState)
|
||||||
|
throws InvalidStateException, DependencyException {
|
||||||
|
if (!entered || left) {
|
||||||
|
log.warn("Received client version change notification on inactive state {}", this);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
final MetricsScope scope =
|
||||||
|
MetricsUtil.createMetricsWithOperation(initializer.metricsFactory(), METRICS_OPERATION);
|
||||||
|
try {
|
||||||
|
switch (newState.getClientVersion()) {
|
||||||
|
case CLIENT_VERSION_2x:
|
||||||
|
log.info("A rollback has been initiated for the application. Transition to {}", CLIENT_VERSION_2x);
|
||||||
|
stateMachine.transitionTo(ClientVersion.CLIENT_VERSION_2x, newState);
|
||||||
|
break;
|
||||||
|
case CLIENT_VERSION_3x:
|
||||||
|
log.info("Customer has switched to 3.x after successful upgrade, state machine will move to a"
|
||||||
|
+ "terminal state and stop monitoring. Rollbacks will no longer be supported anymore");
|
||||||
|
stateMachine.transitionTo(CLIENT_VERSION_3x, newState);
|
||||||
|
// This worker will still be running the migrationAdaptive components in 3.x mode which will
|
||||||
|
// no longer dynamically switch back to 2.x mode, however to directly run 3.x component without
|
||||||
|
// adaption to migration (i.e. move to CLIENT_VERSION_3x state), it requires this worker to go
|
||||||
|
// through the current deployment which initiated the switch to 3.x mode.
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
// This should not happen, so throw an exception that allows the monitor to continue monitoring
|
||||||
|
// changes, this allows KCL to operate in the current state and keep monitoring until a valid
|
||||||
|
// state transition is possible.
|
||||||
|
// However, there could be a split brain here, new workers will use DDB value as source of truth,
|
||||||
|
// so we could also write back CLIENT_VERSION_3x_WITH_ROLLBACK to DDB to ensure all workers have
|
||||||
|
// consistent behavior.
|
||||||
|
// Ideally we don't expect modifications to DDB table out of the KCL migration tool scope,
|
||||||
|
// so keeping it simple and not writing back to DDB, the error log below would help capture
|
||||||
|
// any strange behavior if this happens.
|
||||||
|
log.error("Migration state has invalid client version {}", newState);
|
||||||
|
throw new InvalidStateException(String.format("Unexpected new state %s", newState));
|
||||||
|
}
|
||||||
|
} catch (final InvalidStateException | DependencyException e) {
|
||||||
|
scope.addData(FAULT_METRIC, 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||||
|
throw e;
|
||||||
|
} finally {
|
||||||
|
MetricsUtil.endScope(scope);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void cancelRollbackMonitor() {
|
||||||
|
if (rollbackMonitor != null) {
|
||||||
|
final ClientVersionChangeMonitor localRollbackMonitor = rollbackMonitor;
|
||||||
|
CompletableFuture.supplyAsync(() -> {
|
||||||
|
log.info("Cancelling rollback monitor");
|
||||||
|
localRollbackMonitor.cancel();
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
rollbackMonitor = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,47 @@
|
||||||
|
/*
 * Copyright 2024 Amazon.com, Inc. or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package software.amazon.kinesis.coordinator.migration;

import software.amazon.kinesis.leases.exceptions.DependencyException;

/**
 * Interface of a state implementation for the MigrationStateMachine.
 * Each implementation encapsulates the behavior of KCL while running
 * under one particular {@link ClientVersion}.
 */
public interface MigrationClientVersionState {

    /**
     * The associated clientVersion this state corresponds to.
     * @return ClientVersion that this state implements the logic for.
     */
    ClientVersion clientVersion();

    /**
     * Enter the state and perform the business logic of being in this state
     * which includes performing any monitoring that allows the next state
     * transition and also initializing the KCL based on the ClientVersion.
     * @param fromClientVersion from previous state if any specific action must
     *                          be taken based on the state from which this state
     *                          is being entered from.
     * @throws DependencyException if DDB fails in unexpected ways for those states
     *                             that create the GSI
     */
    void enter(ClientVersion fromClientVersion) throws DependencyException;

    /**
     * Invoked after the transition to another state has occurred
     * to allow printing any helpful logs or performing cleanup.
     */
    void leave();
}
|
||||||
|
|
@ -0,0 +1,263 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.coordinator.migration;
|
||||||
|
|
||||||
|
import java.util.AbstractMap.SimpleEntry;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Random;
|
||||||
|
import java.util.concurrent.Callable;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ExpectedAttributeValue;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.coordinator.CoordinatorConfig.ClientVersionConfig;
|
||||||
|
import software.amazon.kinesis.coordinator.CoordinatorState;
|
||||||
|
import software.amazon.kinesis.coordinator.CoordinatorStateDAO;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||||
|
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_2x;
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_3x;
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_3x_WITH_ROLLBACK;
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_UPGRADE_FROM_2x;
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.MigrationState.MIGRATION_HASH_KEY;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initializer to determine start state of the state machine which identifies the
|
||||||
|
* state to initialize KCL when it is starting up. The initial state is determined based on the
|
||||||
|
* customer configured {@link ClientVersionConfig} and the current {@link MigrationState} in DDB,
|
||||||
|
* as follows
|
||||||
|
* ClientVersionConfig | MigrationState (DDB) | initial client version
|
||||||
|
* --------------------+---------------------------------+--------------------------------
|
||||||
|
* COMPATIBLE_WITH_2x | Does not exist | CLIENT_VERSION_UPGRADE_FROM_2x
|
||||||
|
* 3x | Does not exist | CLIENT_VERSION_3x
|
||||||
|
* COMPATIBLE_WITH_2x | CLIENT_VERSION_3x_WITH_ROLLBACK | CLIENT_VERSION_3x_WITH_ROLLBACK
|
||||||
|
* 3x | CLIENT_VERSION_3x_WITH_ROLLBACK | CLIENT_VERSION_3x
|
||||||
|
* any | CLIENT_VERSION_2x | CLIENT_VERSION_2x
|
||||||
|
* any | CLIENT_VERSION_UPGRADE_FROM_2x | CLIENT_VERSION_UPGRADE_FROM_2x
|
||||||
|
* any | CLIENT_VERSION_3x | CLIENT_VERSION_3x
|
||||||
|
*/
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
|
@ThreadSafe
|
||||||
|
public class MigrationClientVersionStateInitializer {
|
||||||
|
private static final int MAX_INITIALIZATION_RETRY = 10;
|
||||||
|
private static final long INITIALIZATION_RETRY_DELAY_MILLIS = 1000L;
|
||||||
|
/**
|
||||||
|
* A jitter factor of 10% to stagger the retries.
|
||||||
|
*/
|
||||||
|
private static final double JITTER_FACTOR = 0.1;
|
||||||
|
|
||||||
|
private final Callable<Long> timeProvider;
|
||||||
|
private final CoordinatorStateDAO coordinatorStateDAO;
|
||||||
|
private final ClientVersionConfig clientVersionConfig;
|
||||||
|
private final Random random;
|
||||||
|
private final String workerIdentifier;
|
||||||
|
|
||||||
|
public SimpleEntry<ClientVersion, MigrationState> getInitialState() throws DependencyException {
|
||||||
|
log.info("Initializing migration state machine starting state, configured version {}", clientVersionConfig);
|
||||||
|
|
||||||
|
try {
|
||||||
|
MigrationState migrationState = getMigrationStateFromDynamo();
|
||||||
|
int retryCount = 0;
|
||||||
|
while (retryCount++ < MAX_INITIALIZATION_RETRY) {
|
||||||
|
final ClientVersion initialClientVersion = getClientVersionForInitialization(migrationState);
|
||||||
|
if (migrationState.getClientVersion() != initialClientVersion) {
|
||||||
|
// If update fails, the value represents current state in dynamo
|
||||||
|
migrationState = updateMigrationStateInDynamo(migrationState, initialClientVersion);
|
||||||
|
if (migrationState.getClientVersion() == initialClientVersion) {
|
||||||
|
// update succeeded. Transition to the state
|
||||||
|
return new SimpleEntry<>(initialClientVersion, migrationState);
|
||||||
|
}
|
||||||
|
final long delay = getInitializationRetryDelay();
|
||||||
|
log.warn(
|
||||||
|
"Failed to update migration state with {}, retry after delay {}",
|
||||||
|
initialClientVersion,
|
||||||
|
delay);
|
||||||
|
safeSleep(delay);
|
||||||
|
} else {
|
||||||
|
return new SimpleEntry<>(initialClientVersion, migrationState);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (final InvalidStateException e) {
|
||||||
|
log.error("Unable to initialize state machine", e);
|
||||||
|
}
|
||||||
|
throw new DependencyException(
|
||||||
|
new RuntimeException("Unable to determine initial state for migration state machine"));
|
||||||
|
}
|
||||||
|
|
||||||
|
public ClientVersion getClientVersionForInitialization(final MigrationState migrationState) {
|
||||||
|
final ClientVersion nextClientVersion;
|
||||||
|
switch (migrationState.getClientVersion()) {
|
||||||
|
case CLIENT_VERSION_INIT:
|
||||||
|
// There is no state in DDB, set state to config version and transition to configured version.
|
||||||
|
nextClientVersion = getNextClientVersionBasedOnConfigVersion();
|
||||||
|
log.info("Application is starting in {}", nextClientVersion);
|
||||||
|
break;
|
||||||
|
case CLIENT_VERSION_3x_WITH_ROLLBACK:
|
||||||
|
if (clientVersionConfig == ClientVersionConfig.CLIENT_VERSION_CONFIG_3x) {
|
||||||
|
// upgrade successful, allow transition to 3x.
|
||||||
|
log.info("Application has successfully upgraded, transitioning to {}", CLIENT_VERSION_3x);
|
||||||
|
nextClientVersion = CLIENT_VERSION_3x;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
log.info("Initialize with {}", CLIENT_VERSION_3x_WITH_ROLLBACK);
|
||||||
|
nextClientVersion = migrationState.getClientVersion();
|
||||||
|
break;
|
||||||
|
case CLIENT_VERSION_2x:
|
||||||
|
log.info("Application has rolled-back, initialize with {}", CLIENT_VERSION_2x);
|
||||||
|
nextClientVersion = migrationState.getClientVersion();
|
||||||
|
break;
|
||||||
|
case CLIENT_VERSION_UPGRADE_FROM_2x:
|
||||||
|
log.info("Application is upgrading, initialize with {}", CLIENT_VERSION_UPGRADE_FROM_2x);
|
||||||
|
nextClientVersion = migrationState.getClientVersion();
|
||||||
|
break;
|
||||||
|
case CLIENT_VERSION_3x:
|
||||||
|
log.info("Initialize with {}", CLIENT_VERSION_3x);
|
||||||
|
nextClientVersion = migrationState.getClientVersion();
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw new IllegalStateException(String.format("Unknown version in DDB %s", migrationState));
|
||||||
|
}
|
||||||
|
return nextClientVersion;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Update the migration state's client version in dynamo conditional on the current client version
|
||||||
|
* in dynamo. So that if another worker updates the value first, the update fails. If the update fails,
|
||||||
|
* the method will read the latest value and return so that initialization can be retried.
|
||||||
|
* If the value does not exist in dynamo, it will creat it.
|
||||||
|
*/
|
||||||
|
private MigrationState updateMigrationStateInDynamo(
|
||||||
|
final MigrationState migrationState, final ClientVersion nextClientVersion) throws InvalidStateException {
|
||||||
|
try {
|
||||||
|
if (migrationState.getClientVersion() == ClientVersion.CLIENT_VERSION_INIT) {
|
||||||
|
migrationState.update(nextClientVersion, workerIdentifier);
|
||||||
|
log.info("Creating {}", migrationState);
|
||||||
|
final boolean created = coordinatorStateDAO.createCoordinatorStateIfNotExists(migrationState);
|
||||||
|
if (!created) {
|
||||||
|
log.debug("Create {} did not succeed", migrationState);
|
||||||
|
return getMigrationStateFromDynamo();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
log.info("Updating {} with {}", migrationState, nextClientVersion);
|
||||||
|
final Map<String, ExpectedAttributeValue> expectations =
|
||||||
|
migrationState.getDynamoClientVersionExpectation();
|
||||||
|
migrationState.update(nextClientVersion, workerIdentifier);
|
||||||
|
final boolean updated =
|
||||||
|
coordinatorStateDAO.updateCoordinatorStateWithExpectation(migrationState, expectations);
|
||||||
|
if (!updated) {
|
||||||
|
log.debug("Update {} did not succeed", migrationState);
|
||||||
|
return getMigrationStateFromDynamo();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return migrationState;
|
||||||
|
} catch (final ProvisionedThroughputException | DependencyException e) {
|
||||||
|
log.debug(
|
||||||
|
"Failed to update migration state {} with {}, return previous value to trigger a retry",
|
||||||
|
migrationState,
|
||||||
|
nextClientVersion,
|
||||||
|
e);
|
||||||
|
return migrationState;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private ClientVersion getNextClientVersionBasedOnConfigVersion() {
|
||||||
|
switch (clientVersionConfig) {
|
||||||
|
case CLIENT_VERSION_CONFIG_COMPATIBLE_WITH_2x:
|
||||||
|
return CLIENT_VERSION_UPGRADE_FROM_2x;
|
||||||
|
case CLIENT_VERSION_CONFIG_3x:
|
||||||
|
return CLIENT_VERSION_3x;
|
||||||
|
}
|
||||||
|
throw new IllegalStateException(String.format("Unknown configured Client version %s", clientVersionConfig));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read the current {@link MigrationState} from DDB with retries.
|
||||||
|
* @return current Migration state from DDB, if none exists, an initial Migration State with CLIENT_VERSION_INIT
|
||||||
|
* will be returned
|
||||||
|
* @throws InvalidStateException, this occurs when dynamo table does not exist in which retrying is not useful.
|
||||||
|
*/
|
||||||
|
private MigrationState getMigrationStateFromDynamo() throws InvalidStateException {
|
||||||
|
return executeCallableWithRetryAndJitter(
|
||||||
|
() -> {
|
||||||
|
final CoordinatorState state = coordinatorStateDAO.getCoordinatorState(MIGRATION_HASH_KEY);
|
||||||
|
if (state == null) {
|
||||||
|
log.info("No Migration state available in DDB");
|
||||||
|
return new MigrationState(MIGRATION_HASH_KEY, workerIdentifier);
|
||||||
|
}
|
||||||
|
if (state instanceof MigrationState) {
|
||||||
|
log.info("Current migration state in DDB {}", state);
|
||||||
|
return (MigrationState) state;
|
||||||
|
}
|
||||||
|
throw new InvalidStateException(
|
||||||
|
String.format("Unexpected state found not confirming to MigrationState schema %s", state));
|
||||||
|
},
|
||||||
|
"get MigrationState from DDB");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper method to retry a given callable upto MAX_INITIALIZATION_RETRY times for all retryable exceptions.
|
||||||
|
* It considers InvalidStateException as non-retryable exception. During retry, it will compute a delay
|
||||||
|
* with jitter before retrying.
|
||||||
|
* @param callable callable to invoke either until it succeeds or max retry attempts exceed.
|
||||||
|
* @param description a meaningful description to log exceptions
|
||||||
|
* @return the value returned by the callable
|
||||||
|
* @param <T> Return type of the callable
|
||||||
|
* @throws InvalidStateException If the callable throws InvalidStateException, it will not be retried and will
|
||||||
|
* be thrown back.
|
||||||
|
*/
|
||||||
|
private <T> T executeCallableWithRetryAndJitter(final Callable<T> callable, final String description)
|
||||||
|
throws InvalidStateException {
|
||||||
|
int retryCount = 0;
|
||||||
|
while (retryCount++ < MAX_INITIALIZATION_RETRY) {
|
||||||
|
try {
|
||||||
|
return callable.call();
|
||||||
|
} catch (final Exception e) {
|
||||||
|
if (e instanceof InvalidStateException) {
|
||||||
|
// throw the non-retryable exception
|
||||||
|
throw (InvalidStateException) e;
|
||||||
|
}
|
||||||
|
final long delay = getInitializationRetryDelay();
|
||||||
|
log.warn("Failed to {}, retry after delay {}", description, delay, e);
|
||||||
|
|
||||||
|
safeSleep(delay);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
throw new RuntimeException(
|
||||||
|
String.format("Failed to %s after %d retries, giving up", description, MAX_INITIALIZATION_RETRY));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void safeSleep(final long delay) {
|
||||||
|
try {
|
||||||
|
Thread.sleep(delay);
|
||||||
|
} catch (final InterruptedException ie) {
|
||||||
|
log.debug("Interrupted sleep during state machine initialization retry");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate a delay with jitter that is factor of the interval.
|
||||||
|
* @return delay with jitter
|
||||||
|
*/
|
||||||
|
private long getInitializationRetryDelay() {
|
||||||
|
final long jitter = (long) (random.nextDouble() * JITTER_FACTOR * INITIALIZATION_RETRY_DELAY_MILLIS);
|
||||||
|
return INITIALIZATION_RETRY_DELAY_MILLIS + jitter;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,241 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.coordinator.migration;
|
||||||
|
|
||||||
|
import java.util.Random;
|
||||||
|
import java.util.concurrent.Callable;
|
||||||
|
import java.util.concurrent.CompletableFuture;
|
||||||
|
import java.util.concurrent.ScheduledExecutorService;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||||
|
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.coordinator.CoordinatorStateDAO;
|
||||||
|
import software.amazon.kinesis.coordinator.DynamicMigrationComponentsInitializer;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsScope;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||||
|
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_2x;
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.ClientVersion.CLIENT_VERSION_3x_WITH_ROLLBACK;
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.FAULT_METRIC;
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.METRICS_OPERATION;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* State for CLIENT_VERSION_UPGRADE_FROM_2x. When state machine enters this state,
|
||||||
|
* KCL is initialized to operate in dual mode for Lease assignment and Leader decider algorithms
|
||||||
|
* which initially start in 2.x compatible mode and when all the KCL workers are 3.x compliant,
|
||||||
|
* it dynamically switches to the 3.x algorithms. It also monitors for rollback
|
||||||
|
* initiated from customer via the KCL migration tool and instantly switches back to the 2.x
|
||||||
|
* complaint algorithms.
|
||||||
|
* The allowed state transitions are to CLIENT_VERSION_3x_WITH_ROLLBACK when KCL workers are
|
||||||
|
* 3.x complaint, and to CLIENT_VERSION_2x when customer has initiated a rollback.
|
||||||
|
* Only the leader KCL worker performs migration ready monitor and notifies all workers (including
|
||||||
|
* itself) via a MigrationState update. When all worker's monitor notice the MigrationState change
|
||||||
|
* (including itself), it will transition to CLIENT_VERSION_3x_WITH_ROLLBACK.
|
||||||
|
*/
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
|
@ThreadSafe
|
||||||
|
public class MigrationClientVersionUpgradeFrom2xState implements MigrationClientVersionState {
|
||||||
|
private final MigrationStateMachine stateMachine;
|
||||||
|
private final Callable<Long> timeProvider;
|
||||||
|
private final CoordinatorStateDAO coordinatorStateDAO;
|
||||||
|
private final ScheduledExecutorService stateMachineThreadPool;
|
||||||
|
private final DynamicMigrationComponentsInitializer initializer;
|
||||||
|
private final Random random;
|
||||||
|
private final MigrationState currentMigrationState;
|
||||||
|
private final long flipTo3XStabilizerTimeInSeconds;
|
||||||
|
|
||||||
|
private MigrationReadyMonitor migrationMonitor;
|
||||||
|
private ClientVersionChangeMonitor clientVersionChangeMonitor;
|
||||||
|
private boolean entered = false;
|
||||||
|
private boolean left = false;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ClientVersion clientVersion() {
|
||||||
|
return ClientVersion.CLIENT_VERSION_UPGRADE_FROM_2x;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void enter(final ClientVersion fromClientVersion) throws DependencyException {
|
||||||
|
if (!entered) {
|
||||||
|
log.info("Entering state {} from {}", this, fromClientVersion);
|
||||||
|
initializer.initializeClientVersionForUpgradeFrom2x(fromClientVersion);
|
||||||
|
|
||||||
|
log.info("Starting migration ready monitor to monitor 3.x compliance of the KCL workers");
|
||||||
|
migrationMonitor = new MigrationReadyMonitor(
|
||||||
|
initializer.metricsFactory(),
|
||||||
|
timeProvider,
|
||||||
|
initializer.leaderDecider(),
|
||||||
|
initializer.workerIdentifier(),
|
||||||
|
initializer.workerMetricsDAO(),
|
||||||
|
initializer.workerMetricsExpirySeconds(),
|
||||||
|
initializer.leaseRefresher(),
|
||||||
|
stateMachineThreadPool,
|
||||||
|
this::onMigrationReady,
|
||||||
|
flipTo3XStabilizerTimeInSeconds);
|
||||||
|
migrationMonitor.startMonitor();
|
||||||
|
|
||||||
|
log.info("Starting monitor for rollback and flip to 3.x");
|
||||||
|
clientVersionChangeMonitor = new ClientVersionChangeMonitor(
|
||||||
|
initializer.metricsFactory(),
|
||||||
|
coordinatorStateDAO,
|
||||||
|
stateMachineThreadPool,
|
||||||
|
this::onClientVersionChange,
|
||||||
|
clientVersion(),
|
||||||
|
random);
|
||||||
|
clientVersionChangeMonitor.startMonitor();
|
||||||
|
entered = true;
|
||||||
|
} else {
|
||||||
|
log.info("Not entering {}", left ? "already exited state" : "already entered state");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void leave() {
|
||||||
|
if (entered && !left) {
|
||||||
|
log.info("Leaving {}", this);
|
||||||
|
cancelMigrationReadyMonitor();
|
||||||
|
cancelClientChangeVersionMonitor();
|
||||||
|
entered = false;
|
||||||
|
} else {
|
||||||
|
log.info("Cannot leave {}", entered ? "already exited state" : "because state is not active");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return getClass().getSimpleName();
|
||||||
|
}
|
||||||
|
|
||||||
|
private synchronized void onMigrationReady() {
|
||||||
|
// this is invoked on the leader worker only
|
||||||
|
if (!entered || left || migrationMonitor == null) {
|
||||||
|
log.info("Ignoring migration ready monitor, state already transitioned");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// update dynamo with the state to toggle to 3.x
|
||||||
|
// and let the clientVersionChange kick in to do state transition
|
||||||
|
// this way both leader and non-leader worker all transition when
|
||||||
|
// it discovers the update from ddb.
|
||||||
|
if (updateDynamoStateForTransition()) {
|
||||||
|
// successfully toggled the state, now we can cancel the monitor
|
||||||
|
cancelMigrationReadyMonitor();
|
||||||
|
}
|
||||||
|
// else - either migration ready monitor will retry or
|
||||||
|
// client Version change callback will initiate the next state transition.
|
||||||
|
}
|
||||||
|
|
||||||
|
private void cancelMigrationReadyMonitor() {
|
||||||
|
if (migrationMonitor != null) {
|
||||||
|
final MigrationReadyMonitor localMigrationMonitor = migrationMonitor;
|
||||||
|
CompletableFuture.supplyAsync(() -> {
|
||||||
|
log.info("Cancelling migration ready monitor");
|
||||||
|
localMigrationMonitor.cancel();
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
migrationMonitor = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void cancelClientChangeVersionMonitor() {
|
||||||
|
if (clientVersionChangeMonitor != null) {
|
||||||
|
final ClientVersionChangeMonitor localClientVersionChangeMonitor = clientVersionChangeMonitor;
|
||||||
|
CompletableFuture.supplyAsync(() -> {
|
||||||
|
log.info("Cancelling client change version monitor");
|
||||||
|
localClientVersionChangeMonitor.cancel();
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
clientVersionChangeMonitor = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Callback handler to handle client version changes in MigrationState in DDB.
|
||||||
|
* @param newState current MigrationState read from DDB where client version is not CLIENT_VERSION_UPGRADE_FROM_2x
|
||||||
|
* @throws InvalidStateException during transition to the next state based on the new ClientVersion
|
||||||
|
* or if the new state in DDB is unexpected.
|
||||||
|
*/
|
||||||
|
private synchronized void onClientVersionChange(final MigrationState newState)
|
||||||
|
throws InvalidStateException, DependencyException {
|
||||||
|
if (!entered || left) {
|
||||||
|
log.warn("Received client version change notification on inactive state {}", this);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
final MetricsScope scope =
|
||||||
|
MetricsUtil.createMetricsWithOperation(initializer.metricsFactory(), METRICS_OPERATION);
|
||||||
|
try {
|
||||||
|
switch (newState.getClientVersion()) {
|
||||||
|
case CLIENT_VERSION_2x:
|
||||||
|
log.info("A rollback has been initiated for the application. Transition to {}", CLIENT_VERSION_2x);
|
||||||
|
// cancel monitor asynchronously
|
||||||
|
cancelMigrationReadyMonitor();
|
||||||
|
stateMachine.transitionTo(CLIENT_VERSION_2x, newState);
|
||||||
|
break;
|
||||||
|
case CLIENT_VERSION_3x_WITH_ROLLBACK:
|
||||||
|
log.info("KCL workers are v3.x compliant, transition to {}", CLIENT_VERSION_3x_WITH_ROLLBACK);
|
||||||
|
cancelMigrationReadyMonitor();
|
||||||
|
stateMachine.transitionTo(CLIENT_VERSION_3x_WITH_ROLLBACK, newState);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
// This should not happen, so throw an exception that allows the monitor to continue monitoring
|
||||||
|
// changes, this allows KCL to operate in the current state and keep monitoring until a valid
|
||||||
|
// state transition is possible.
|
||||||
|
// However, there could be a split brain here, new workers will use DDB value as source of truth,
|
||||||
|
// so we could also write back CLIENT_VERSION_UPGRADE_FROM_2x to DDB to ensure all workers have
|
||||||
|
// consistent behavior.
|
||||||
|
// Ideally we don't expect modifications to DDB table out of the KCL migration tool scope,
|
||||||
|
// so keeping it simple and not writing back to DDB, the error log below would help capture
|
||||||
|
// any strange behavior if this happens.
|
||||||
|
log.error("Migration state has invalid client version {}", newState);
|
||||||
|
throw new InvalidStateException(String.format("Unexpected new state %s", newState));
|
||||||
|
}
|
||||||
|
} catch (final DependencyException | InvalidStateException e) {
|
||||||
|
scope.addData(FAULT_METRIC, 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||||
|
throw e;
|
||||||
|
} finally {
|
||||||
|
MetricsUtil.endScope(scope);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean updateDynamoStateForTransition() {
|
||||||
|
final MetricsScope scope =
|
||||||
|
MetricsUtil.createMetricsWithOperation(initializer.metricsFactory(), METRICS_OPERATION);
|
||||||
|
try {
|
||||||
|
final MigrationState newMigrationState = currentMigrationState
|
||||||
|
.copy()
|
||||||
|
.update(CLIENT_VERSION_3x_WITH_ROLLBACK, initializer.workerIdentifier());
|
||||||
|
log.info("Updating Migration State in DDB with {} prev state {}", newMigrationState, currentMigrationState);
|
||||||
|
return coordinatorStateDAO.updateCoordinatorStateWithExpectation(
|
||||||
|
newMigrationState, currentMigrationState.getDynamoClientVersionExpectation());
|
||||||
|
} catch (final Exception e) {
|
||||||
|
log.warn(
|
||||||
|
"Exception occurred when toggling to {}, upgradeReadyMonitor will retry the update"
|
||||||
|
+ " if upgrade condition is still true",
|
||||||
|
CLIENT_VERSION_3x_WITH_ROLLBACK,
|
||||||
|
e);
|
||||||
|
scope.addData(FAULT_METRIC, 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||||
|
return false;
|
||||||
|
} finally {
|
||||||
|
MetricsUtil.endScope(scope);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,352 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.coordinator.migration;
|
||||||
|
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.Callable;
|
||||||
|
import java.util.concurrent.CompletableFuture;
|
||||||
|
import java.util.concurrent.CompletionException;
|
||||||
|
import java.util.concurrent.ScheduledExecutorService;
|
||||||
|
import java.util.concurrent.ScheduledFuture;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||||
|
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||||
|
import software.amazon.kinesis.coordinator.LeaderDecider;
|
||||||
|
import software.amazon.kinesis.leases.Lease;
|
||||||
|
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsScope;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||||
|
import software.amazon.kinesis.worker.metricstats.WorkerMetricStats;
|
||||||
|
import software.amazon.kinesis.worker.metricstats.WorkerMetricStatsDAO;
|
||||||
|
|
||||||
|
import static software.amazon.kinesis.coordinator.migration.MigrationStateMachineImpl.METRICS_OPERATION;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Monitor for KCL workers 3.x readiness. This monitor is started on all workers but only
|
||||||
|
* executed on the leader of the fleet. The leader determines 3.x readiness if GSI of the lease
|
||||||
|
* table is active and all lease owners are emitting WorkerMetricStats. The monitor performs this
|
||||||
|
* check periodically and will invoke callback if the readiness conditions are true. Monitor
|
||||||
|
* needs to be explicitly cancelled after the readiness trigger has successfully been handled.
|
||||||
|
*
|
||||||
|
* Thread safety - Guard for safety against public method invocation and internal runnable method.
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
@ThreadSafe
|
||||||
|
public class MigrationReadyMonitor implements Runnable {
|
||||||
|
private static final long MONITOR_INTERVAL_MILLIS = Duration.ofMinutes(1).toMillis();
|
||||||
|
private static final long LOG_INTERVAL_NANOS = Duration.ofMinutes(5).toNanos();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default retry attempt for loading leases and workers before giving up.
|
||||||
|
*/
|
||||||
|
private static final int DDB_LOAD_RETRY_ATTEMPT = 1;
|
||||||
|
|
||||||
|
private final MetricsFactory metricsFactory;
|
||||||
|
private final Callable<Long> timeProvider;
|
||||||
|
private final LeaderDecider leaderDecider;
|
||||||
|
private final String currentWorkerId;
|
||||||
|
private final WorkerMetricStatsDAO workerMetricStatsDAO;
|
||||||
|
private final long workerMetricStatsExpirySeconds;
|
||||||
|
private final LeaseRefresher leaseRefresher;
|
||||||
|
private final ScheduledExecutorService stateMachineThreadPool;
|
||||||
|
private final MonitorTriggerStabilizer triggerStabilizer;
|
||||||
|
|
||||||
|
private final LogRateLimiter rateLimitedStatusLogger = new LogRateLimiter(LOG_INTERVAL_NANOS);
|
||||||
|
private ScheduledFuture<?> scheduledFuture;
|
||||||
|
private boolean gsiStatusReady;
|
||||||
|
private boolean workerMetricsReady;
|
||||||
|
private Set<String> lastKnownUniqueLeaseOwners = new HashSet<>();
|
||||||
|
private Set<String> lastKnownWorkersWithActiveWorkerMetrics = new HashSet<>();
|
||||||
|
|
||||||
|
public MigrationReadyMonitor(
|
||||||
|
final MetricsFactory metricsFactory,
|
||||||
|
final Callable<Long> timeProvider,
|
||||||
|
final LeaderDecider leaderDecider,
|
||||||
|
final String currentWorkerId,
|
||||||
|
final WorkerMetricStatsDAO workerMetricStatsDAO,
|
||||||
|
final long workerMetricsExpirySeconds,
|
||||||
|
final LeaseRefresher leaseRefresher,
|
||||||
|
final ScheduledExecutorService stateMachineThreadPool,
|
||||||
|
final Runnable callback,
|
||||||
|
final long callbackStabilizationInSeconds) {
|
||||||
|
this.metricsFactory = metricsFactory;
|
||||||
|
this.timeProvider = timeProvider;
|
||||||
|
this.leaderDecider = leaderDecider;
|
||||||
|
this.currentWorkerId = currentWorkerId;
|
||||||
|
this.workerMetricStatsDAO = workerMetricStatsDAO;
|
||||||
|
this.workerMetricStatsExpirySeconds = workerMetricsExpirySeconds;
|
||||||
|
this.leaseRefresher = leaseRefresher;
|
||||||
|
this.stateMachineThreadPool = stateMachineThreadPool;
|
||||||
|
this.triggerStabilizer =
|
||||||
|
new MonitorTriggerStabilizer(timeProvider, callbackStabilizationInSeconds, callback, currentWorkerId);
|
||||||
|
}
|
||||||
|
|
||||||
|
public synchronized void startMonitor() {
|
||||||
|
if (Objects.isNull(scheduledFuture)) {
|
||||||
|
|
||||||
|
log.info("Starting migration ready monitor");
|
||||||
|
scheduledFuture = stateMachineThreadPool.scheduleWithFixedDelay(
|
||||||
|
this, MONITOR_INTERVAL_MILLIS, MONITOR_INTERVAL_MILLIS, TimeUnit.MILLISECONDS);
|
||||||
|
} else {
|
||||||
|
log.info("Ignoring monitor request, since it is already started");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cancel the monitor. Once the method returns callback will not be invoked,
|
||||||
|
* but callback can be invoked reentrantly before this method returns.
|
||||||
|
*/
|
||||||
|
public synchronized void cancel() {
|
||||||
|
if (Objects.nonNull(scheduledFuture)) {
|
||||||
|
log.info("Cancelled migration ready monitor");
|
||||||
|
scheduledFuture.cancel(true);
|
||||||
|
scheduledFuture = null;
|
||||||
|
} else {
|
||||||
|
log.info("{} is currently not active", this);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void run() {
|
||||||
|
try {
|
||||||
|
if (Thread.currentThread().isInterrupted()) {
|
||||||
|
log.info("{} cancelled, exiting...", this);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (!leaderDecider.isLeader(currentWorkerId)) {
|
||||||
|
log.debug("Not the leader, not performing migration ready check {}", this);
|
||||||
|
triggerStabilizer.reset();
|
||||||
|
lastKnownUniqueLeaseOwners.clear();
|
||||||
|
lastKnownWorkersWithActiveWorkerMetrics.clear();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
triggerStabilizer.call(isReadyForUpgradeTo3x());
|
||||||
|
rateLimitedStatusLogger.log(() -> log.info("Monitor ran successfully {}", this));
|
||||||
|
} catch (final Throwable t) {
|
||||||
|
log.warn("{} failed, will retry after {}", this, MONITOR_INTERVAL_MILLIS, t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return new StringBuilder("UpgradeReadyMonitor[")
|
||||||
|
.append("G=")
|
||||||
|
.append(gsiStatusReady)
|
||||||
|
.append(",W=")
|
||||||
|
.append(workerMetricsReady)
|
||||||
|
.append("]")
|
||||||
|
.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isReadyForUpgradeTo3x() throws DependencyException {
|
||||||
|
final MetricsScope scope = MetricsUtil.createMetricsWithOperation(metricsFactory, METRICS_OPERATION);
|
||||||
|
try {
|
||||||
|
// If GSI is not ready, optimize to not check if worker metrics are being emitted
|
||||||
|
final boolean localGsiReadyStatus = leaseRefresher.isLeaseOwnerToLeaseKeyIndexActive();
|
||||||
|
if (localGsiReadyStatus != gsiStatusReady) {
|
||||||
|
gsiStatusReady = localGsiReadyStatus;
|
||||||
|
log.info("Gsi ready status changed to {}", gsiStatusReady);
|
||||||
|
} else {
|
||||||
|
log.debug("GsiReady status {}", gsiStatusReady);
|
||||||
|
}
|
||||||
|
return gsiStatusReady && areLeaseOwnersEmittingWorkerMetrics();
|
||||||
|
} finally {
|
||||||
|
scope.addData("GsiReadyStatus", gsiStatusReady ? 1 : 0, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||||
|
scope.addData(
|
||||||
|
"WorkerMetricsReadyStatus", workerMetricsReady ? 1 : 0, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||||
|
MetricsUtil.endScope(scope);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean areLeaseOwnersEmittingWorkerMetrics() {
|
||||||
|
final CompletableFuture<List<Lease>> leaseListFuture = loadLeaseListAsync();
|
||||||
|
final CompletableFuture<List<WorkerMetricStats>> workerMetricsFuture = loadWorkerMetricStats();
|
||||||
|
|
||||||
|
final List<Lease> leaseList = leaseListFuture.join();
|
||||||
|
final Set<String> leaseOwners = getUniqueLeaseOwnersFromLeaseTable(leaseList);
|
||||||
|
final List<WorkerMetricStats> workerMetricStatsList = workerMetricsFuture.join();
|
||||||
|
final Set<String> workersWithActiveWorkerMetrics = getWorkersWithActiveWorkerMetricStats(workerMetricStatsList);
|
||||||
|
|
||||||
|
// Leases are not checked for expired condition because:
|
||||||
|
// If some worker has gone down and is not active, but has lease assigned to it, those leases
|
||||||
|
// maybe expired. Since the worker is down, it may not have worker-metrics, or worker-metrics may not be active,
|
||||||
|
// In that case, the migration condition is not considered to be met.
|
||||||
|
// However, those leases should be assigned to another worker and so the check in the next
|
||||||
|
// iteration could succeed. This is intentional to make sure all leases owners are accounted for
|
||||||
|
// and the old owner does not come back up without worker metrics and reacquires the lease.
|
||||||
|
final boolean localWorkerMetricsReady = leaseOwners.equals(workersWithActiveWorkerMetrics);
|
||||||
|
if (localWorkerMetricsReady != workerMetricsReady) {
|
||||||
|
workerMetricsReady = localWorkerMetricsReady;
|
||||||
|
log.info("WorkerMetricStats status changed to {}", workerMetricsReady);
|
||||||
|
log.info("Lease List {}", leaseList);
|
||||||
|
log.info("WorkerMetricStats {}", workerMetricStatsList);
|
||||||
|
} else {
|
||||||
|
log.debug("WorkerMetricStats ready status {}", workerMetricsReady);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lastKnownUniqueLeaseOwners == null) {
|
||||||
|
log.info("Unique lease owners {}", leaseOwners);
|
||||||
|
} else if (!lastKnownUniqueLeaseOwners.equals(leaseOwners)) {
|
||||||
|
log.info("Unique lease owners changed to {}", leaseOwners);
|
||||||
|
}
|
||||||
|
lastKnownUniqueLeaseOwners = leaseOwners;
|
||||||
|
|
||||||
|
if (lastKnownWorkersWithActiveWorkerMetrics == null) {
|
||||||
|
log.info("Workers with active worker metric stats {}", workersWithActiveWorkerMetrics);
|
||||||
|
} else if (!lastKnownWorkersWithActiveWorkerMetrics.equals(workersWithActiveWorkerMetrics)) {
|
||||||
|
log.info("Workers with active worker metric stats changed {}", workersWithActiveWorkerMetrics);
|
||||||
|
}
|
||||||
|
lastKnownWorkersWithActiveWorkerMetrics = workersWithActiveWorkerMetrics;
|
||||||
|
|
||||||
|
return workerMetricsReady;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Set<String> getUniqueLeaseOwnersFromLeaseTable(final List<Lease> leaseList) {
|
||||||
|
return leaseList.stream().map(Lease::leaseOwner).collect(Collectors.toSet());
|
||||||
|
}
|
||||||
|
|
||||||
|
private Set<String> getWorkersWithActiveWorkerMetricStats(final List<WorkerMetricStats> workerMetricStats) {
|
||||||
|
final long nowInSeconds = Duration.ofMillis(now(timeProvider)).getSeconds();
|
||||||
|
return workerMetricStats.stream()
|
||||||
|
.filter(metricStats -> isWorkerMetricStatsActive(metricStats, nowInSeconds))
|
||||||
|
.map(WorkerMetricStats::getWorkerId)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isWorkerMetricStatsActive(final WorkerMetricStats metricStats, final long nowInSeconds) {
|
||||||
|
return (metricStats.getLastUpdateTime() + workerMetricStatsExpirySeconds) > nowInSeconds;
|
||||||
|
}
|
||||||
|
|
||||||
|
private CompletableFuture<List<WorkerMetricStats>> loadWorkerMetricStats() {
|
||||||
|
return CompletableFuture.supplyAsync(() -> loadWithRetry(workerMetricStatsDAO::getAllWorkerMetricStats));
|
||||||
|
}
|
||||||
|
|
||||||
|
private CompletableFuture<List<Lease>> loadLeaseListAsync() {
|
||||||
|
return CompletableFuture.supplyAsync(() -> loadWithRetry(leaseRefresher::listLeases));
|
||||||
|
}
|
||||||
|
|
||||||
|
private <T> T loadWithRetry(final Callable<T> loadFunction) {
|
||||||
|
int retryAttempt = 0;
|
||||||
|
while (true) {
|
||||||
|
try {
|
||||||
|
return loadFunction.call();
|
||||||
|
} catch (final Exception e) {
|
||||||
|
if (retryAttempt < DDB_LOAD_RETRY_ATTEMPT) {
|
||||||
|
log.warn(
|
||||||
|
"Failed to load : {}, retrying",
|
||||||
|
loadFunction.getClass().getName(),
|
||||||
|
e);
|
||||||
|
retryAttempt++;
|
||||||
|
} else {
|
||||||
|
throw new CompletionException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static long now(final Callable<Long> timeProvider) {
|
||||||
|
try {
|
||||||
|
return timeProvider.call();
|
||||||
|
} catch (final Exception e) {
|
||||||
|
log.debug("Time provider threw exception, using System.currentTimeMillis", e);
|
||||||
|
return System.currentTimeMillis();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stabilize the monitor trigger before invoking the callback
|
||||||
|
* to ensure we are consistently seeing the trigger for a configured
|
||||||
|
* stabilizationDurationInMillis
|
||||||
|
*/
|
||||||
|
private static class MonitorTriggerStabilizer {
|
||||||
|
private final Callable<Long> timeProvider;
|
||||||
|
private final long stabilizationDurationInSeconds;
|
||||||
|
private final Runnable callback;
|
||||||
|
private final String currentWorkerId;
|
||||||
|
private final LogRateLimiter rateLimitedTriggerStatusLogger;
|
||||||
|
|
||||||
|
private long lastToggleTimeInMillis;
|
||||||
|
private boolean currentTriggerStatus;
|
||||||
|
|
||||||
|
public MonitorTriggerStabilizer(
|
||||||
|
final Callable<Long> timeProvider,
|
||||||
|
final long stabilizationDurationInSeconds,
|
||||||
|
final Runnable callback,
|
||||||
|
final String currentWorkerId) {
|
||||||
|
this.timeProvider = timeProvider;
|
||||||
|
this.stabilizationDurationInSeconds = stabilizationDurationInSeconds;
|
||||||
|
this.callback = callback;
|
||||||
|
this.currentWorkerId = currentWorkerId;
|
||||||
|
this.rateLimitedTriggerStatusLogger = new LogRateLimiter(LOG_INTERVAL_NANOS);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void call(final boolean isMonitorTriggered) {
|
||||||
|
final long now = now(timeProvider);
|
||||||
|
if (currentTriggerStatus != isMonitorTriggered) {
|
||||||
|
log.info("Trigger status has changed to {}", isMonitorTriggered);
|
||||||
|
currentTriggerStatus = isMonitorTriggered;
|
||||||
|
lastToggleTimeInMillis = now;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (currentTriggerStatus) {
|
||||||
|
final long deltaSeconds =
|
||||||
|
Duration.ofMillis(now - lastToggleTimeInMillis).getSeconds();
|
||||||
|
if (deltaSeconds >= stabilizationDurationInSeconds) {
|
||||||
|
log.info("Trigger has been consistently true for {}s, invoking callback", deltaSeconds);
|
||||||
|
callback.run();
|
||||||
|
} else {
|
||||||
|
rateLimitedTriggerStatusLogger.log(() -> log.info(
|
||||||
|
"Trigger has been true for {}s, waiting for stabilization time of {}s",
|
||||||
|
deltaSeconds,
|
||||||
|
stabilizationDurationInSeconds));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reset() {
|
||||||
|
if (currentTriggerStatus) {
|
||||||
|
log.info("This worker {} is no longer the leader, reset current status", currentWorkerId);
|
||||||
|
}
|
||||||
|
currentTriggerStatus = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
private static class LogRateLimiter {
|
||||||
|
private final long logIntervalInNanos;
|
||||||
|
|
||||||
|
private long nextLogTime = System.nanoTime();
|
||||||
|
|
||||||
|
public void log(final Runnable logger) {
|
||||||
|
final long now = System.nanoTime();
|
||||||
|
if (now >= nextLogTime) {
|
||||||
|
logger.run();
|
||||||
|
nextLogTime = now + logIntervalInNanos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,231 @@
|
||||||
|
package software.amazon.kinesis.coordinator.migration;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.ToString;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.AttributeAction;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.AttributeValue;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.AttributeValueUpdate;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ExpectedAttributeValue;
|
||||||
|
import software.amazon.kinesis.common.StackTraceUtils;
|
||||||
|
import software.amazon.kinesis.coordinator.CoordinatorState;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Data model of the Migration state. This is used to track the state related to migration
|
||||||
|
* from KCLv2.x to KCLv3.x.
|
||||||
|
*/
|
||||||
|
@Getter
|
||||||
|
@ToString(callSuper = true)
|
||||||
|
@Slf4j
|
||||||
|
public class MigrationState extends CoordinatorState {
|
||||||
|
/**
|
||||||
|
* Key value for the item in the CoordinatorState table
|
||||||
|
*/
|
||||||
|
public static final String MIGRATION_HASH_KEY = "Migration3.0";
|
||||||
|
/**
|
||||||
|
* Attribute name in migration state item, whose value is used during
|
||||||
|
* the KCL v3.x migration process to know whether the workers need to
|
||||||
|
* perform KCL v2.x compatible operations or can perform native KCL v3.x
|
||||||
|
* operations.
|
||||||
|
*/
|
||||||
|
public static final String CLIENT_VERSION_ATTRIBUTE_NAME = "cv";
|
||||||
|
|
||||||
|
public static final String MODIFIED_BY_ATTRIBUTE_NAME = "mb";
|
||||||
|
public static final String MODIFIED_TIMESTAMP_ATTRIBUTE_NAME = "mts";
|
||||||
|
public static final String HISTORY_ATTRIBUTE_NAME = "h";
|
||||||
|
private static final int MAX_HISTORY_ENTRIES = 10;
|
||||||
|
|
||||||
|
private ClientVersion clientVersion;
|
||||||
|
private String modifiedBy;
|
||||||
|
private long modifiedTimestamp;
|
||||||
|
private final List<HistoryEntry> history;
|
||||||
|
|
||||||
|
/**
 * Internal constructor used by deserialization, copy, and the public constructor.
 *
 * @param key hash key of this item in the coordinator state table
 * @param clientVersion current migration client version
 * @param modifiedBy identifier of the worker that last modified this state
 * @param modifiedTimestamp wall-clock millis of the last modification
 * @param historyEntries prior transitions of this state (most recent first)
 * @param others any additional, unrecognized attributes carried through unchanged
 */
private MigrationState(
        final String key,
        final ClientVersion clientVersion,
        final String modifiedBy,
        final long modifiedTimestamp,
        final List<HistoryEntry> historyEntries,
        final Map<String, AttributeValue> others) {
    // Populate the CoordinatorState base first (key + pass-through attributes),
    // then the migration-specific fields.
    setKey(key);
    setAttributes(others);
    this.history = historyEntries;
    this.modifiedTimestamp = modifiedTimestamp;
    this.modifiedBy = modifiedBy;
    this.clientVersion = clientVersion;
}
|
||||||
|
|
||||||
|
/**
 * Creates a fresh migration state at CLIENT_VERSION_INIT with an empty history
 * and no extra attributes, stamped with the current wall-clock time.
 *
 * @param key hash key of this item in the coordinator state table
 * @param modifiedBy identifier of the worker creating this state
 */
public MigrationState(final String key, final String modifiedBy) {
    this(
            key,
            ClientVersion.CLIENT_VERSION_INIT,
            modifiedBy,
            System.currentTimeMillis(),
            new ArrayList<>(),
            new HashMap<>());
}
|
||||||
|
|
||||||
|
public HashMap<String, AttributeValue> serialize() {
|
||||||
|
final HashMap<String, AttributeValue> result = new HashMap<>();
|
||||||
|
result.put(CLIENT_VERSION_ATTRIBUTE_NAME, AttributeValue.fromS(clientVersion.name()));
|
||||||
|
result.put(MODIFIED_BY_ATTRIBUTE_NAME, AttributeValue.fromS(modifiedBy));
|
||||||
|
result.put(MODIFIED_TIMESTAMP_ATTRIBUTE_NAME, AttributeValue.fromN(String.valueOf(modifiedTimestamp)));
|
||||||
|
|
||||||
|
if (!history.isEmpty()) {
|
||||||
|
final List<AttributeValue> historyList = new ArrayList<>();
|
||||||
|
for (final HistoryEntry entry : history) {
|
||||||
|
historyList.add(AttributeValue.builder().m(entry.serialize()).build());
|
||||||
|
}
|
||||||
|
result.put(
|
||||||
|
HISTORY_ATTRIBUTE_NAME,
|
||||||
|
AttributeValue.builder().l(historyList).build());
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static MigrationState deserialize(final String key, final HashMap<String, AttributeValue> attributes) {
|
||||||
|
if (!MIGRATION_HASH_KEY.equals(key)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
final HashMap<String, AttributeValue> mutableAttributes = new HashMap<>(attributes);
|
||||||
|
final ClientVersion clientVersion = ClientVersion.valueOf(
|
||||||
|
mutableAttributes.remove(CLIENT_VERSION_ATTRIBUTE_NAME).s());
|
||||||
|
final String modifiedBy =
|
||||||
|
mutableAttributes.remove(MODIFIED_BY_ATTRIBUTE_NAME).s();
|
||||||
|
final long modifiedTimestamp = Long.parseLong(
|
||||||
|
mutableAttributes.remove(MODIFIED_TIMESTAMP_ATTRIBUTE_NAME).n());
|
||||||
|
|
||||||
|
final List<HistoryEntry> historyList = new ArrayList<>();
|
||||||
|
if (attributes.containsKey(HISTORY_ATTRIBUTE_NAME)) {
|
||||||
|
mutableAttributes.remove(HISTORY_ATTRIBUTE_NAME).l().stream()
|
||||||
|
.map(historyEntry -> HistoryEntry.deserialize(historyEntry.m()))
|
||||||
|
.forEach(historyList::add);
|
||||||
|
}
|
||||||
|
final MigrationState migrationState = new MigrationState(
|
||||||
|
MIGRATION_HASH_KEY, clientVersion, modifiedBy, modifiedTimestamp, historyList, mutableAttributes);
|
||||||
|
|
||||||
|
if (!mutableAttributes.isEmpty()) {
|
||||||
|
log.info("Unknown attributes {} for state {}", mutableAttributes, migrationState);
|
||||||
|
}
|
||||||
|
return migrationState;
|
||||||
|
|
||||||
|
} catch (final Exception e) {
|
||||||
|
log.warn("Unable to deserialize state with key {} and attributes {}", key, attributes, e);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<String, ExpectedAttributeValue> getDynamoClientVersionExpectation() {
|
||||||
|
return new HashMap<String, ExpectedAttributeValue>() {
|
||||||
|
{
|
||||||
|
put(
|
||||||
|
CLIENT_VERSION_ATTRIBUTE_NAME,
|
||||||
|
ExpectedAttributeValue.builder()
|
||||||
|
.value(AttributeValue.fromS(clientVersion.name()))
|
||||||
|
.build());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
public MigrationState copy() {
|
||||||
|
return new MigrationState(
|
||||||
|
getKey(),
|
||||||
|
getClientVersion(),
|
||||||
|
getModifiedBy(),
|
||||||
|
getModifiedTimestamp(),
|
||||||
|
new ArrayList<>(getHistory()),
|
||||||
|
new HashMap<>(getAttributes()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Transitions this state to a new client version, pushing the previous
 * (version, modifier, timestamp) triple onto the bounded history.
 *
 * @param clientVersion the new client version to record
 * @param modifiedBy    worker id performing the update
 * @return this instance, for chaining
 */
public MigrationState update(final ClientVersion clientVersion, final String modifiedBy) {
    // Stack trace is logged deliberately: version flips are rare, operator-visible
    // events and knowing the caller aids migration debugging.
    log.info(
            "Migration state is being updated to {} current state {} caller {}",
            clientVersion,
            this,
            StackTraceUtils.getPrintableStackTrace(Thread.currentThread().getStackTrace()));
    // Order matters: capture the outgoing values into history BEFORE overwriting them.
    addHistoryEntry(this.clientVersion, this.modifiedBy, this.modifiedTimestamp);
    this.clientVersion = clientVersion;
    this.modifiedBy = modifiedBy;
    this.modifiedTimestamp = System.currentTimeMillis();
    return this;
}
|
||||||
|
|
||||||
|
public void addHistoryEntry(
|
||||||
|
final ClientVersion lastClientVersion, final String lastModifiedBy, final long lastModifiedTimestamp) {
|
||||||
|
history.add(0, new HistoryEntry(lastClientVersion, lastModifiedBy, lastModifiedTimestamp));
|
||||||
|
if (history.size() > MAX_HISTORY_ENTRIES) {
|
||||||
|
log.info("Limit {} reached, dropping history {}", MAX_HISTORY_ENTRIES, history.remove(history.size() - 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<String, AttributeValueUpdate> getDynamoUpdate() {
|
||||||
|
final HashMap<String, AttributeValueUpdate> updates = new HashMap<>();
|
||||||
|
updates.put(
|
||||||
|
CLIENT_VERSION_ATTRIBUTE_NAME,
|
||||||
|
AttributeValueUpdate.builder()
|
||||||
|
.value(AttributeValue.fromS(clientVersion.name()))
|
||||||
|
.action(AttributeAction.PUT)
|
||||||
|
.build());
|
||||||
|
updates.put(
|
||||||
|
MODIFIED_BY_ATTRIBUTE_NAME,
|
||||||
|
AttributeValueUpdate.builder()
|
||||||
|
.value(AttributeValue.fromS(modifiedBy))
|
||||||
|
.action(AttributeAction.PUT)
|
||||||
|
.build());
|
||||||
|
updates.put(
|
||||||
|
MODIFIED_TIMESTAMP_ATTRIBUTE_NAME,
|
||||||
|
AttributeValueUpdate.builder()
|
||||||
|
.value(AttributeValue.fromN(String.valueOf(modifiedTimestamp)))
|
||||||
|
.action(AttributeAction.PUT)
|
||||||
|
.build());
|
||||||
|
if (!history.isEmpty()) {
|
||||||
|
updates.put(
|
||||||
|
HISTORY_ATTRIBUTE_NAME,
|
||||||
|
AttributeValueUpdate.builder()
|
||||||
|
.value(AttributeValue.fromL(
|
||||||
|
history.stream().map(HistoryEntry::toAv).collect(Collectors.toList())))
|
||||||
|
.action(AttributeAction.PUT)
|
||||||
|
.build());
|
||||||
|
}
|
||||||
|
return updates;
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@ToString
|
||||||
|
public static class HistoryEntry {
|
||||||
|
private final ClientVersion lastClientVersion;
|
||||||
|
private final String lastModifiedBy;
|
||||||
|
private final long lastModifiedTimestamp;
|
||||||
|
|
||||||
|
public AttributeValue toAv() {
|
||||||
|
return AttributeValue.fromM(serialize());
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<String, AttributeValue> serialize() {
|
||||||
|
return new HashMap<String, AttributeValue>() {
|
||||||
|
{
|
||||||
|
put(CLIENT_VERSION_ATTRIBUTE_NAME, AttributeValue.fromS(lastClientVersion.name()));
|
||||||
|
put(MODIFIED_BY_ATTRIBUTE_NAME, AttributeValue.fromS(lastModifiedBy));
|
||||||
|
put(MODIFIED_TIMESTAMP_ATTRIBUTE_NAME, AttributeValue.fromN(String.valueOf(lastModifiedTimestamp)));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
public static HistoryEntry deserialize(final Map<String, AttributeValue> map) {
|
||||||
|
return new HistoryEntry(
|
||||||
|
ClientVersion.valueOf(map.get(CLIENT_VERSION_ATTRIBUTE_NAME).s()),
|
||||||
|
map.get(MODIFIED_BY_ATTRIBUTE_NAME).s(),
|
||||||
|
Long.parseLong(map.get(MODIFIED_TIMESTAMP_ATTRIBUTE_NAME).n()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,66 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.coordinator.migration;
|
||||||
|
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
|
|
||||||
|
/**
 * State machine that provides:
 * 1. Seamless upgrade from 2.x to 3.x - 3.x has introduced new algorithms that are not compatible with 2.x
 *    workers, so the state machine allows to seamlessly run the 2.x functionality to be compliant with any
 *    2.x worker in the fleet, and also seamlessly switch to 3.x functionality when all KCL workers are
 *    3.x compliant.
 * 2. Instant rollbacks - Rollbacks are supported using the KCL Migration tool to revert back to 2.x functionality
 *    if customer finds regressions in 3.x functionality.
 * 3. Instant roll-forwards - Once any issue has been mitigated, roll-forwards are supported instantly
 *    with the KCL Migration tool.
 */
public interface MigrationStateMachine {
    /**
     * Initialize the state machine by identifying the initial state when the KCL worker comes up for the first time.
     * @throws DependencyException When unable to identify the initial state.
     */
    void initialize() throws DependencyException;

    /**
     * Shutdown state machine and perform necessary cleanup for the worker to gracefully shutdown
     */
    void shutdown();

    /**
     * Terminate the state machine when it reaches a terminal state, which is a successful upgrade
     * to v3.x.
     */
    void terminate();

    /**
     * Perform transition from current state to the given new ClientVersion
     * @param nextClientVersion clientVersion of the new state the state machine must transition to
     * @param state the current MigrationState in dynamo
     * @throws InvalidStateException when transition fails, this allows the state machine to stay
     *     in the current state until a valid transition is possible
     * @throws DependencyException when transition fails due to dependency on DDB failing in
     *     unexpected ways.
     */
    void transitionTo(final ClientVersion nextClientVersion, final MigrationState state)
            throws InvalidStateException, DependencyException;

    /**
     * Get the ClientVersion of current state machine state.
     * @return ClientVersion of current state machine state
     */
    ClientVersion getCurrentClientVersion();
}
|
||||||
|
|
@ -0,0 +1,254 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.coordinator.migration;
|
||||||
|
|
||||||
|
import java.util.AbstractMap.SimpleEntry;
|
||||||
|
import java.util.Random;
|
||||||
|
import java.util.concurrent.Callable;
|
||||||
|
import java.util.concurrent.ScheduledExecutorService;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||||
|
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.coordinator.CoordinatorConfig.ClientVersionConfig;
|
||||||
|
import software.amazon.kinesis.coordinator.CoordinatorStateDAO;
|
||||||
|
import software.amazon.kinesis.coordinator.DynamicMigrationComponentsInitializer;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsScope;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Implementation of {@link MigrationStateMachine}
|
||||||
|
*/
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
@Getter
|
||||||
|
@Slf4j
|
||||||
|
@ThreadSafe
|
||||||
|
public class MigrationStateMachineImpl implements MigrationStateMachine {
|
||||||
|
public static final String FAULT_METRIC = "Fault";
|
||||||
|
public static final String METRICS_OPERATION = "Migration";
|
||||||
|
|
||||||
|
private static final long THREAD_POOL_SHUTDOWN_TIMEOUT_SECONDS = 5L;
|
||||||
|
|
||||||
|
private final MetricsFactory metricsFactory;
|
||||||
|
private final Callable<Long> timeProvider;
|
||||||
|
private final CoordinatorStateDAO coordinatorStateDAO;
|
||||||
|
private final ScheduledExecutorService stateMachineThreadPool;
|
||||||
|
private DynamicMigrationComponentsInitializer initializer;
|
||||||
|
private final ClientVersionConfig clientVersionConfig;
|
||||||
|
private final Random random;
|
||||||
|
private final String workerId;
|
||||||
|
private final long flipTo3XStabilizerTimeInSeconds;
|
||||||
|
private MigrationState startingMigrationState;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
private ClientVersion startingClientVersion;
|
||||||
|
|
||||||
|
private MigrationClientVersionState currentMigrationClientVersionState = new MigrationClientVersionState() {
|
||||||
|
@Override
|
||||||
|
public ClientVersion clientVersion() {
|
||||||
|
return ClientVersion.CLIENT_VERSION_INIT;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void enter(final ClientVersion fromClientVersion) {
|
||||||
|
log.info("Entered {}...", clientVersion());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void leave() {
|
||||||
|
log.info("Left {}...", clientVersion());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
private boolean terminated = false;
|
||||||
|
|
||||||
|
public MigrationStateMachineImpl(
|
||||||
|
final MetricsFactory metricsFactory,
|
||||||
|
final Callable<Long> timeProvider,
|
||||||
|
final CoordinatorStateDAO coordinatorStateDAO,
|
||||||
|
final ScheduledExecutorService stateMachineThreadPool,
|
||||||
|
final ClientVersionConfig clientVersionConfig,
|
||||||
|
final Random random,
|
||||||
|
final DynamicMigrationComponentsInitializer initializer,
|
||||||
|
final String workerId,
|
||||||
|
final long flipTo3XStabilizerTimeInSeconds) {
|
||||||
|
this.metricsFactory = metricsFactory;
|
||||||
|
this.timeProvider = timeProvider;
|
||||||
|
this.coordinatorStateDAO = coordinatorStateDAO;
|
||||||
|
this.stateMachineThreadPool = stateMachineThreadPool;
|
||||||
|
this.clientVersionConfig = clientVersionConfig;
|
||||||
|
this.random = random;
|
||||||
|
this.initializer = initializer;
|
||||||
|
this.workerId = workerId;
|
||||||
|
this.flipTo3XStabilizerTimeInSeconds = flipTo3XStabilizerTimeInSeconds;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void initialize() throws DependencyException {
|
||||||
|
if (startingClientVersion == null) {
|
||||||
|
log.info("Initializing MigrationStateMachine");
|
||||||
|
coordinatorStateDAO.initialize();
|
||||||
|
final MigrationClientVersionStateInitializer startingStateInitializer =
|
||||||
|
new MigrationClientVersionStateInitializer(
|
||||||
|
timeProvider, coordinatorStateDAO, clientVersionConfig, random, workerId);
|
||||||
|
final SimpleEntry<ClientVersion, MigrationState> dataForInitialization =
|
||||||
|
startingStateInitializer.getInitialState();
|
||||||
|
initializer.initialize(dataForInitialization.getKey());
|
||||||
|
transitionTo(dataForInitialization.getKey(), dataForInitialization.getValue());
|
||||||
|
startingClientVersion = dataForInitialization.getKey();
|
||||||
|
startingMigrationState = dataForInitialization.getValue();
|
||||||
|
log.info("MigrationStateMachine initial clientVersion {}", startingClientVersion);
|
||||||
|
} else {
|
||||||
|
log.info("MigrationStateMachine already initialized with clientVersion {}", startingClientVersion);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void shutdown() {
|
||||||
|
terminate();
|
||||||
|
if (!stateMachineThreadPool.isShutdown()) {
|
||||||
|
stateMachineThreadPool.shutdown();
|
||||||
|
try {
|
||||||
|
if (stateMachineThreadPool.awaitTermination(THREAD_POOL_SHUTDOWN_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
|
||||||
|
log.info(
|
||||||
|
"StateMachineThreadPool did not shutdown within {} seconds, forcefully shutting down",
|
||||||
|
THREAD_POOL_SHUTDOWN_TIMEOUT_SECONDS);
|
||||||
|
stateMachineThreadPool.shutdownNow();
|
||||||
|
}
|
||||||
|
} catch (final InterruptedException e) {
|
||||||
|
log.info("Interrupted when shutting down StateMachineThreadPool, forcefully shutting down");
|
||||||
|
stateMachineThreadPool.shutdownNow();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
log.info("Shutdown successfully");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void terminate() {
|
||||||
|
if (!terminated && currentMigrationClientVersionState != null) {
|
||||||
|
log.info("State machine is about to terminate");
|
||||||
|
currentMigrationClientVersionState.leave();
|
||||||
|
currentMigrationClientVersionState = null;
|
||||||
|
log.info("State machine reached a terminal state.");
|
||||||
|
terminated = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void transitionTo(final ClientVersion nextClientVersion, final MigrationState migrationState)
|
||||||
|
throws DependencyException {
|
||||||
|
if (terminated) {
|
||||||
|
throw new IllegalStateException(String.format(
|
||||||
|
"Cannot transition to %s after state machine is terminated, %s",
|
||||||
|
nextClientVersion.name(), migrationState));
|
||||||
|
}
|
||||||
|
|
||||||
|
final MigrationClientVersionState nextMigrationClientVersionState =
|
||||||
|
createMigrationClientVersionState(nextClientVersion, migrationState);
|
||||||
|
log.info(
|
||||||
|
"Attempting to transition from {} to {}",
|
||||||
|
currentMigrationClientVersionState.clientVersion(),
|
||||||
|
nextClientVersion);
|
||||||
|
currentMigrationClientVersionState.leave();
|
||||||
|
|
||||||
|
enter(nextMigrationClientVersionState);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enter with retry. When entering the state machine for the first time, the caller has retry so exceptions
|
||||||
|
* will be re-thrown. Once the state machine has initialized all transitions will be an indefinite retry.
|
||||||
|
* It is possible the DDB state has changed by the time enter succeeds but that will occur as a new
|
||||||
|
* state transition after entering the state. Usually the failures are due to unexpected issues with
|
||||||
|
* DDB which will be transitional and will recover on a retry.
|
||||||
|
* @param nextMigrationClientVersionState the state to transition to
|
||||||
|
* @throws DependencyException If entering fails during state machine initialization.
|
||||||
|
*/
|
||||||
|
private void enter(final MigrationClientVersionState nextMigrationClientVersionState) throws DependencyException {
|
||||||
|
boolean success = false;
|
||||||
|
while (!success) {
|
||||||
|
try {
|
||||||
|
// Enter should never fail unless it is the starting state and fails to create the GSI,
|
||||||
|
// in which case it is an unrecoverable error that is bubbled up and KCL start up will fail.
|
||||||
|
nextMigrationClientVersionState.enter(currentMigrationClientVersionState.clientVersion());
|
||||||
|
|
||||||
|
currentMigrationClientVersionState = nextMigrationClientVersionState;
|
||||||
|
log.info("Successfully transitioned to {}", nextMigrationClientVersionState.clientVersion());
|
||||||
|
if (currentMigrationClientVersionState.clientVersion() == ClientVersion.CLIENT_VERSION_3x) {
|
||||||
|
terminate();
|
||||||
|
}
|
||||||
|
success = true;
|
||||||
|
} catch (final DependencyException e) {
|
||||||
|
if (currentMigrationClientVersionState.clientVersion() == ClientVersion.CLIENT_VERSION_INIT) {
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
log.info(
|
||||||
|
"Transitioning from {} to {} failed, retrying after a minute",
|
||||||
|
currentMigrationClientVersionState.clientVersion(),
|
||||||
|
nextMigrationClientVersionState.clientVersion(),
|
||||||
|
e);
|
||||||
|
|
||||||
|
final MetricsScope scope = MetricsUtil.createMetricsWithOperation(metricsFactory, METRICS_OPERATION);
|
||||||
|
scope.addData(FAULT_METRIC, 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||||
|
MetricsUtil.endScope(scope);
|
||||||
|
|
||||||
|
try {
|
||||||
|
Thread.sleep(1000);
|
||||||
|
} catch (final InterruptedException ie) {
|
||||||
|
log.info("Interrupted while sleeping before retrying state machine transition", ie);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private MigrationClientVersionState createMigrationClientVersionState(
|
||||||
|
final ClientVersion clientVersion, final MigrationState migrationState) {
|
||||||
|
switch (clientVersion) {
|
||||||
|
case CLIENT_VERSION_2x:
|
||||||
|
return new MigrationClientVersion2xState(
|
||||||
|
this, coordinatorStateDAO, stateMachineThreadPool, initializer, random);
|
||||||
|
case CLIENT_VERSION_UPGRADE_FROM_2x:
|
||||||
|
return new MigrationClientVersionUpgradeFrom2xState(
|
||||||
|
this,
|
||||||
|
timeProvider,
|
||||||
|
coordinatorStateDAO,
|
||||||
|
stateMachineThreadPool,
|
||||||
|
initializer,
|
||||||
|
random,
|
||||||
|
migrationState,
|
||||||
|
flipTo3XStabilizerTimeInSeconds);
|
||||||
|
case CLIENT_VERSION_3x_WITH_ROLLBACK:
|
||||||
|
return new MigrationClientVersion3xWithRollbackState(
|
||||||
|
this, coordinatorStateDAO, stateMachineThreadPool, initializer, random);
|
||||||
|
case CLIENT_VERSION_3x:
|
||||||
|
return new MigrationClientVersion3xState(this, initializer);
|
||||||
|
}
|
||||||
|
throw new IllegalStateException(String.format("Unknown client version %s", clientVersion));
|
||||||
|
}
|
||||||
|
|
||||||
|
public ClientVersion getCurrentClientVersion() {
|
||||||
|
if (currentMigrationClientVersionState != null) {
|
||||||
|
return currentMigrationClientVersionState.clientVersion();
|
||||||
|
} else if (terminated) {
|
||||||
|
return ClientVersion.CLIENT_VERSION_3x;
|
||||||
|
}
|
||||||
|
throw new UnsupportedOperationException(
|
||||||
|
"No current state when state machine is either not initialized" + " or already terminated");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,270 @@
|
||||||
|
package software.amazon.kinesis.leader;
|
||||||
|
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.time.Instant;
|
||||||
|
import java.util.AbstractMap;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
|
|
||||||
|
import com.amazonaws.services.dynamodbv2.AcquireLockOptions;
|
||||||
|
import com.amazonaws.services.dynamodbv2.AmazonDynamoDBLockClient;
|
||||||
|
import com.amazonaws.services.dynamodbv2.GetLockOptions;
|
||||||
|
import com.amazonaws.services.dynamodbv2.LockItem;
|
||||||
|
import com.amazonaws.services.dynamodbv2.model.LockCurrentlyUnavailableException;
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ResourceNotFoundException;
|
||||||
|
import software.amazon.kinesis.coordinator.CoordinatorStateDAO;
|
||||||
|
import software.amazon.kinesis.coordinator.LeaderDecider;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsScope;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||||
|
|
||||||
|
import static java.util.Objects.isNull;
|
||||||
|
import static software.amazon.kinesis.coordinator.CoordinatorState.LEADER_HASH_KEY;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Implementation for LeaderDecider to elect leader using lock on dynamo db table. This class uses
|
||||||
|
* AmazonDynamoDBLockClient library to perform the leader election.
|
||||||
|
*/
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
|
public class DynamoDBLockBasedLeaderDecider implements LeaderDecider {
|
||||||
|
private static final Long DEFAULT_LEASE_DURATION_MILLIS =
|
||||||
|
Duration.ofMinutes(2).toMillis();
|
||||||
|
// Heartbeat frequency should be at-least 3 times smaller the lease duration according to LockClient documentation
|
||||||
|
private static final Long DEFAULT_HEARTBEAT_PERIOD_MILLIS =
|
||||||
|
Duration.ofSeconds(30).toMillis();
|
||||||
|
|
||||||
|
private final CoordinatorStateDAO coordinatorStateDao;
|
||||||
|
private final AmazonDynamoDBLockClient dynamoDBLockClient;
|
||||||
|
private final Long heartbeatPeriodMillis;
|
||||||
|
private final String workerId;
|
||||||
|
private final MetricsFactory metricsFactory;
|
||||||
|
|
||||||
|
private long lastCheckTimeInMillis = 0L;
|
||||||
|
private boolean lastIsLeaderResult = false;
|
||||||
|
private final AtomicBoolean isShutdown = new AtomicBoolean(false);
|
||||||
|
|
||||||
|
private long lastIsAnyLeaderElectedDDBReadTimeMillis = 0L;
|
||||||
|
private boolean lastIsAnyLeaderElectedResult = false;
|
||||||
|
/**
|
||||||
|
* Key value pair of LockItem to the time when it was first discovered.
|
||||||
|
* If a new LockItem fetched from ddb has different recordVersionNumber than the one in-memory,
|
||||||
|
* its considered as new LockItem, and the time when it was fetched is stored in memory to identify lockItem
|
||||||
|
* expiry. This is used only in the context of isAnyLeaderElected method.
|
||||||
|
*/
|
||||||
|
private AbstractMap.SimpleEntry<LockItem, Long> lastIsAnyLeaderCheckLockItemToFirstEncounterTime = null;
|
||||||
|
|
||||||
|
/**
 * Builds a decider with explicit lock-timing parameters; exposed for tests.
 *
 * @param coordinatorStateDao DAO providing the lock client options builder bound to
 *                            the coordinator state table
 * @param workerId            this worker's id, used as the lock owner name
 * @param leaseDuration       lock lease duration in millis
 * @param heartbeatPeriod     heartbeat period in millis; the lock client library
 *                            expects it to be at least 3x smaller than the lease duration
 * @param metricsFactory      factory for emitting leader-decider metrics
 */
@VisibleForTesting
static DynamoDBLockBasedLeaderDecider create(
        final CoordinatorStateDAO coordinatorStateDao,
        final String workerId,
        final Long leaseDuration,
        final Long heartbeatPeriod,
        final MetricsFactory metricsFactory) {
    // Background heartbeat thread keeps the lock renewed while this worker is leader.
    final AmazonDynamoDBLockClient dynamoDBLockClient = new AmazonDynamoDBLockClient(coordinatorStateDao
            .getDDBLockClientOptionsBuilder()
            .withTimeUnit(TimeUnit.MILLISECONDS)
            .withLeaseDuration(leaseDuration)
            .withHeartbeatPeriod(heartbeatPeriod)
            .withCreateHeartbeatBackgroundThread(true)
            .withOwnerName(workerId)
            .build());

    return new DynamoDBLockBasedLeaderDecider(
            coordinatorStateDao, dynamoDBLockClient, heartbeatPeriod, workerId, metricsFactory);
}
|
||||||
|
|
||||||
|
/**
 * Builds a decider with the default lease duration (2 minutes) and heartbeat
 * period (30 seconds).
 *
 * @param coordinatorStateDao DAO bound to the coordinator state table
 * @param workerId            this worker's id, used as the lock owner name
 * @param metricsFactory      factory for emitting leader-decider metrics
 */
public static DynamoDBLockBasedLeaderDecider create(
        final CoordinatorStateDAO coordinatorStateDao, final String workerId, final MetricsFactory metricsFactory) {
    return create(
            coordinatorStateDao,
            workerId,
            DEFAULT_LEASE_DURATION_MILLIS,
            DEFAULT_HEARTBEAT_PERIOD_MILLIS,
            metricsFactory);
}
|
||||||
|
|
||||||
|
/**
 * No-op beyond logging; the lock client and DAO are fully constructed in create().
 */
@Override
public void initialize() {
    log.info("Initializing DDB Lock based leader decider");
}
|
||||||
|
|
||||||
|
/**
 * Check the lockItem in storage and if the current worker is not leader worker, then tries to acquire lock and
 * returns true if it was able to acquire lock else false.
 * @param workerId ID of the worker
 * @return true if current worker is leader else false.
 */
@Override
public synchronized Boolean isLeader(final String workerId) {
    // if the decider has shutdown, then return false and don't try acquireLock anymore.
    if (isShutdown.get()) {
        publishIsLeaderMetrics(false);
        return false;
    }
    // If the last time we tried to take lock and didnt get lock, don't try to take again for heartbeatPeriodMillis
    // this is to avoid unnecessary calls to dynamoDB.
    // Different modules in KCL can request for isLeader check within heartbeatPeriodMillis, and this optimization
    // will help in those cases.
    // In case the last call returned true, we want to check the source always to ensure the correctness of leader.
    if (!lastIsLeaderResult && lastCheckTimeInMillis + heartbeatPeriodMillis > System.currentTimeMillis()) {
        publishIsLeaderMetrics(lastIsLeaderResult);
        return lastIsLeaderResult;
    }
    boolean response;
    // Get the lockItem from storage, if one exists.
    final Optional<LockItem> lockItem = dynamoDBLockClient.getLock(LEADER_HASH_KEY, Optional.empty());
    lockItem.ifPresent(item -> log.info("Worker : {} is the current leader.", item.getOwnerName()));

    // If no lock item exists, or the existing one has expired, no worker currently
    // holds leadership — this worker should attempt to acquire it.
    if (!lockItem.isPresent() || lockItem.get().isExpired()) {
        try {
            // Current worker does not hold the lock, try to acquire one.
            // withShouldSkipBlockingWait avoids blocking here when another worker
            // holds a live lock; the client throws LockCurrentlyUnavailableException instead.
            final Optional<LockItem> leaderLockItem =
                    dynamoDBLockClient.tryAcquireLock(AcquireLockOptions.builder(LEADER_HASH_KEY)
                            .withRefreshPeriod(heartbeatPeriodMillis)
                            .withTimeUnit(TimeUnit.MILLISECONDS)
                            .withShouldSkipBlockingWait(true)
                            .build());
            leaderLockItem.ifPresent(item -> log.info("Worker : {} is new leader", item.getOwnerName()));
            // if leaderLockItem optional is empty, that means the lock is not acquired by this worker.
            response = leaderLockItem.isPresent();
        } catch (final InterruptedException e) {
            // Something bad happened, don't assume leadership and also release lock just in case the
            // lock was granted and still interrupt happened.
            releaseLeadershipIfHeld();
            log.error("Acquiring lock was interrupted in between", e);
            response = false;

        } catch (final LockCurrentlyUnavailableException e) {
            // Another worker holds a live lock; this worker is not leader.
            response = false;
        }

    } else {
        // A live lock exists; this worker is leader only if it owns that lock.
        response = lockItem.get().getOwnerName().equals(workerId);
    }

    // Cache the result so repeated calls within heartbeatPeriodMillis can short-circuit.
    lastCheckTimeInMillis = System.currentTimeMillis();
    lastIsLeaderResult = response;
    publishIsLeaderMetrics(response);
    return response;
}
|
||||||
|
|
||||||
|
private void publishIsLeaderMetrics(final boolean response) {
|
||||||
|
final MetricsScope metricsScope =
|
||||||
|
MetricsUtil.createMetricsWithOperation(metricsFactory, METRIC_OPERATION_LEADER_DECIDER);
|
||||||
|
metricsScope.addData(
|
||||||
|
METRIC_OPERATION_LEADER_DECIDER_IS_LEADER, response ? 1 : 0, StandardUnit.COUNT, MetricsLevel.DETAILED);
|
||||||
|
MetricsUtil.endScope(metricsScope);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Releases the lock if held by current worker when this method is invoked.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public void shutdown() {
|
||||||
|
if (!isShutdown.getAndSet(true)) {
|
||||||
|
releaseLeadershipIfHeld();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void releaseLeadershipIfHeld() {
|
||||||
|
try {
|
||||||
|
final Optional<LockItem> lockItem = dynamoDBLockClient.getLock(LEADER_HASH_KEY, Optional.empty());
|
||||||
|
if (lockItem.isPresent()
|
||||||
|
&& !lockItem.get().isExpired()
|
||||||
|
&& lockItem.get().getOwnerName().equals(workerId)) {
|
||||||
|
|
||||||
|
log.info(
|
||||||
|
"Current worker : {} holds the lock, releasing it.",
|
||||||
|
lockItem.get().getOwnerName());
|
||||||
|
// LockItem.close() will release the lock if current worker owns it else this call is no op.
|
||||||
|
lockItem.get().close();
|
||||||
|
}
|
||||||
|
} catch (final Exception e) {
|
||||||
|
log.error("Failed to complete releaseLeadershipIfHeld call.", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns if any ACTIVE leader exists that is elected by the current implementation which can be outside the
|
||||||
|
* scope of this worker. That is leader elected by this implementation in any worker in fleet.
|
||||||
|
* DynamoDBLockClient does not provide an interface which can tell if an active lock exists or not, thus
|
||||||
|
* we need to put custom implementation.
|
||||||
|
* The implementation performs DDB get every heartbeatPeriodMillis to have low RCU consumption, which means that
|
||||||
|
* the leader could have been elected from the last time the check happened and before check happens again.
|
||||||
|
* The information returned from this method has eventual consistency (up to heartbeatPeriodMillis interval).
|
||||||
|
*
|
||||||
|
* @return true, if any leader is elected else false.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public synchronized boolean isAnyLeaderElected() {
|
||||||
|
// Avoid going to ddb for every call and do it once every heartbeatPeriod to have low RCU usage.
|
||||||
|
if (Duration.between(
|
||||||
|
Instant.ofEpochMilli(lastIsAnyLeaderElectedDDBReadTimeMillis),
|
||||||
|
Instant.ofEpochMilli(System.currentTimeMillis()))
|
||||||
|
.toMillis()
|
||||||
|
> heartbeatPeriodMillis) {
|
||||||
|
final MetricsScope metricsScope = MetricsUtil.createMetricsWithOperation(
|
||||||
|
metricsFactory, this.getClass().getSimpleName() + ":isAnyLeaderElected");
|
||||||
|
final long startTime = System.currentTimeMillis();
|
||||||
|
try {
|
||||||
|
lastIsAnyLeaderElectedDDBReadTimeMillis = System.currentTimeMillis();
|
||||||
|
final Optional<LockItem> lockItem = dynamoDBLockClient.getLockFromDynamoDB(
|
||||||
|
GetLockOptions.builder(LEADER_HASH_KEY).build());
|
||||||
|
|
||||||
|
if (!lockItem.isPresent()) {
|
||||||
|
// There is no LockItem in the ddb table, that means no one is holding lock.
|
||||||
|
lastIsAnyLeaderElectedResult = false;
|
||||||
|
log.info("LockItem present : {}", false);
|
||||||
|
} else {
|
||||||
|
final LockItem ddbLockItem = lockItem.get();
|
||||||
|
if (isNull(lastIsAnyLeaderCheckLockItemToFirstEncounterTime)
|
||||||
|
|| !ddbLockItem
|
||||||
|
.getRecordVersionNumber()
|
||||||
|
.equals(lastIsAnyLeaderCheckLockItemToFirstEncounterTime
|
||||||
|
.getKey()
|
||||||
|
.getRecordVersionNumber())) {
|
||||||
|
// This is the first isAnyLeaderElected call, so we can't evaluate if the LockItem has expired
|
||||||
|
// or not yet so consider LOCK as ACTIVE.
|
||||||
|
// OR LockItem in ddb and in-memory LockItem have different RecordVersionNumber
|
||||||
|
// and thus the LOCK is still ACTIVE
|
||||||
|
lastIsAnyLeaderElectedResult = true;
|
||||||
|
lastIsAnyLeaderCheckLockItemToFirstEncounterTime =
|
||||||
|
new AbstractMap.SimpleEntry<>(ddbLockItem, lastIsAnyLeaderElectedDDBReadTimeMillis);
|
||||||
|
log.info(
|
||||||
|
"LockItem present : {}, and this is either first call OR lockItem has had "
|
||||||
|
+ "a heartbeat",
|
||||||
|
true);
|
||||||
|
} else {
|
||||||
|
// There is no change in the ddb lock item, so if the last update time is more than
|
||||||
|
// lease duration, the lock is expired else it is still ACTIVE,
|
||||||
|
lastIsAnyLeaderElectedResult = lastIsAnyLeaderCheckLockItemToFirstEncounterTime.getValue()
|
||||||
|
+ ddbLockItem.getLeaseDuration()
|
||||||
|
> lastIsAnyLeaderElectedDDBReadTimeMillis;
|
||||||
|
log.info("LockItem present : {}, and lease expiry: {}", true, lastIsAnyLeaderElectedResult);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (final ResourceNotFoundException exception) {
|
||||||
|
log.info("Lock table does not exists...");
|
||||||
|
// If the table itself doesn't exist, there is no elected leader.
|
||||||
|
lastIsAnyLeaderElectedResult = false;
|
||||||
|
} finally {
|
||||||
|
metricsScope.addData(
|
||||||
|
"Latency",
|
||||||
|
System.currentTimeMillis() - startTime,
|
||||||
|
StandardUnit.MILLISECONDS,
|
||||||
|
MetricsLevel.DETAILED);
|
||||||
|
MetricsUtil.endScope(metricsScope);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return lastIsAnyLeaderElectedResult;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,79 @@
|
||||||
|
package software.amazon.kinesis.leader;
|
||||||
|
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||||
|
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.coordinator.LeaderDecider;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsScope;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||||
|
|
||||||
|
import static java.util.Objects.nonNull;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* MigrationAdaptiveLeaderDecider that wraps around the actual LeaderDecider which can dynamically
|
||||||
|
* change based on the MigrationStateMachine.
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
@ThreadSafe
|
||||||
|
public class MigrationAdaptiveLeaderDecider implements LeaderDecider {
|
||||||
|
|
||||||
|
private final MetricsFactory metricsFactory;
|
||||||
|
private LeaderDecider currentLeaderDecider;
|
||||||
|
|
||||||
|
public MigrationAdaptiveLeaderDecider(final MetricsFactory metricsFactory) {
|
||||||
|
this.metricsFactory = metricsFactory;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized Boolean isLeader(final String workerId) {
|
||||||
|
if (currentLeaderDecider == null) {
|
||||||
|
throw new IllegalStateException("LeaderDecider uninitialized");
|
||||||
|
}
|
||||||
|
|
||||||
|
final MetricsScope scope =
|
||||||
|
MetricsUtil.createMetricsWithOperation(metricsFactory, METRIC_OPERATION_LEADER_DECIDER);
|
||||||
|
try {
|
||||||
|
publishSelectedLeaderDeciderMetrics(scope, currentLeaderDecider);
|
||||||
|
return currentLeaderDecider.isLeader(workerId);
|
||||||
|
} finally {
|
||||||
|
MetricsUtil.endScope(scope);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void publishSelectedLeaderDeciderMetrics(
|
||||||
|
final MetricsScope scope, final LeaderDecider leaderDecider) {
|
||||||
|
scope.addData(
|
||||||
|
String.format(leaderDecider.getClass().getSimpleName()), 1D, StandardUnit.COUNT, MetricsLevel.DETAILED);
|
||||||
|
}
|
||||||
|
|
||||||
|
public synchronized void updateLeaderDecider(final LeaderDecider leaderDecider) {
|
||||||
|
if (currentLeaderDecider != null) {
|
||||||
|
currentLeaderDecider.shutdown();
|
||||||
|
log.info(
|
||||||
|
"Updating leader decider dynamically from {} to {}",
|
||||||
|
this.currentLeaderDecider.getClass().getSimpleName(),
|
||||||
|
leaderDecider.getClass().getSimpleName());
|
||||||
|
} else {
|
||||||
|
log.info(
|
||||||
|
"Initializing dynamic leader decider with {}",
|
||||||
|
leaderDecider.getClass().getSimpleName());
|
||||||
|
}
|
||||||
|
currentLeaderDecider = leaderDecider;
|
||||||
|
currentLeaderDecider.initialize();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void shutdown() {
|
||||||
|
if (nonNull(currentLeaderDecider)) {
|
||||||
|
log.info("Shutting down current {}", currentLeaderDecider.getClass().getSimpleName());
|
||||||
|
currentLeaderDecider.shutdown();
|
||||||
|
currentLeaderDecider = null;
|
||||||
|
} else {
|
||||||
|
log.info("LeaderDecider has already been shutdown");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -81,8 +81,20 @@ public class DynamoUtils {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static AttributeValue createAttributeValue(Double doubleValue) {
|
||||||
|
if (doubleValue == null) {
|
||||||
|
throw new IllegalArgumentException("Double attributeValues cannot be null.");
|
||||||
|
}
|
||||||
|
|
||||||
|
return AttributeValue.builder().n(doubleValue.toString()).build();
|
||||||
|
}
|
||||||
|
|
||||||
public static String safeGetString(Map<String, AttributeValue> dynamoRecord, String key) {
|
public static String safeGetString(Map<String, AttributeValue> dynamoRecord, String key) {
|
||||||
AttributeValue av = dynamoRecord.get(key);
|
AttributeValue av = dynamoRecord.get(key);
|
||||||
|
return safeGetString(av);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String safeGetString(AttributeValue av) {
|
||||||
if (av == null) {
|
if (av == null) {
|
||||||
return null;
|
return null;
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -99,4 +111,13 @@ public class DynamoUtils {
|
||||||
return av.ss();
|
return av.ss();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Double safeGetDouble(Map<String, AttributeValue> dynamoRecord, String key) {
|
||||||
|
AttributeValue av = dynamoRecord.get(key);
|
||||||
|
if (av == null) {
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
|
return new Double(av.n());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -103,26 +103,6 @@ public class KinesisShardDetector implements ShardDetector {
|
||||||
|
|
||||||
private static final Boolean THROW_RESOURCE_NOT_FOUND_EXCEPTION = true;
|
private static final Boolean THROW_RESOURCE_NOT_FOUND_EXCEPTION = true;
|
||||||
|
|
||||||
/**
 * Single-stream convenience constructor: wraps {@code streamName} into a
 * {@link StreamIdentifier} and delegates to the primary constructor with
 * {@code LeaseManagementConfig.DEFAULT_REQUEST_TIMEOUT}.
 *
 * @deprecated use the {@link StreamIdentifier}-based constructor instead
 */
@Deprecated
public KinesisShardDetector(
        KinesisAsyncClient kinesisClient,
        String streamName,
        long listShardsBackoffTimeInMillis,
        int maxListShardsRetryAttempts,
        long listShardsCacheAllowedAgeInSeconds,
        int maxCacheMissesBeforeReload,
        int cacheMissWarningModulus) {
    this(
            kinesisClient,
            StreamIdentifier.singleStreamInstance(streamName),
            listShardsBackoffTimeInMillis,
            maxListShardsRetryAttempts,
            listShardsCacheAllowedAgeInSeconds,
            maxCacheMissesBeforeReload,
            cacheMissWarningModulus,
            LeaseManagementConfig.DEFAULT_REQUEST_TIMEOUT);
}
|
|
||||||
|
|
||||||
public KinesisShardDetector(
|
public KinesisShardDetector(
|
||||||
KinesisAsyncClient kinesisClient,
|
KinesisAsyncClient kinesisClient,
|
||||||
StreamIdentifier streamIdentifier,
|
StreamIdentifier streamIdentifier,
|
||||||
|
|
|
||||||
|
|
@ -46,7 +46,11 @@ import software.amazon.kinesis.retrieval.kpl.ExtendedSequenceNumber;
|
||||||
"lastCounterIncrementNanos",
|
"lastCounterIncrementNanos",
|
||||||
"childShardIds",
|
"childShardIds",
|
||||||
"pendingCheckpointState",
|
"pendingCheckpointState",
|
||||||
"isMarkedForLeaseSteal"
|
"isMarkedForLeaseSteal",
|
||||||
|
"throughputKBps",
|
||||||
|
"checkpointOwner",
|
||||||
|
"checkpointOwnerTimeoutTimestampMillis",
|
||||||
|
"isExpiredOrUnassigned"
|
||||||
})
|
})
|
||||||
@ToString
|
@ToString
|
||||||
public class Lease {
|
public class Lease {
|
||||||
|
|
@ -104,6 +108,33 @@ public class Lease {
|
||||||
@Setter
|
@Setter
|
||||||
private boolean isMarkedForLeaseSteal;
|
private boolean isMarkedForLeaseSteal;
|
||||||
|
|
||||||
|
/**
 * If true, this indicates that lease is ready to be immediately reassigned.
 */
@Setter
private boolean isExpiredOrUnassigned;

/**
 * Throughput in KBps for the lease.
 */
private Double throughputKBps;

/**
 * Owner of the checkpoint. The attribute is used for graceful shutdowns to indicate the owner that
 * is allowed to write the checkpoint.
 */
@Setter
private String checkpointOwner;

/**
 * This field is used for tracking when the shutdown was requested on the lease so we can expire it. This is
 * deliberately not persisted in DynamoDB because leaseOwners are expected to transfer the lease from themselves to
 * the next owner during shutdown. If the worker dies before shutdown the lease will just become expired and we can
 * pick it up. If for some reason the worker is not able to shut down and continues holding onto the lease,
 * this timeout will kick in and force a lease transfer.
 */
@Setter
private Long checkpointOwnerTimeoutTimestampMillis;
|
||||||
/**
|
/**
|
||||||
* Count of distinct lease holders between checkpoints.
|
* Count of distinct lease holders between checkpoints.
|
||||||
*/
|
*/
|
||||||
|
|
@ -242,6 +273,54 @@ public class Lease {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * @return true if a checkpoint owner is set, indicating that a (graceful) shutdown
 *         has been requested for this lease.
 */
public boolean shutdownRequested() {
    return checkpointOwner != null;
}
|
||||||
|
|
||||||
|
/**
 * Check whether lease should be blocked on pending checkpoint. We DON'T block if
 * - lease is expired (Expired lease should be assigned right away) OR
 * ----- at this point we know lease is assigned -----
 * - lease is shardEnd (No more processing possible) OR
 * - lease is NOT requested for shutdown OR
 * - lease shutdown expired
 *
 * @param currentTimeMillis current time in milliseconds
 * @return true if lease is blocked on pending checkpoint
 */
public boolean blockedOnPendingCheckpoint(long currentTimeMillis) {
    // Written as a negated OR-chain: blocked only when NONE of the "don't block"
    // conditions above hold.
    return !(isExpiredOrUnassigned
            || ExtendedSequenceNumber.SHARD_END.equals(checkpoint)
            || !shutdownRequested()
            // if shutdown requested then checkpointOwnerTimeoutTimestampMillis should be present;
            // a non-negative delta means the graceful-shutdown window has already elapsed
            || currentTimeMillis - checkpointOwnerTimeoutTimestampMillis >= 0);
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check whether lease is eligible for graceful shutdown. It's eligible if
|
||||||
|
* - lease is still assigned (not expired) AND
|
||||||
|
* - lease is NOT shardEnd (No more processing possible AND
|
||||||
|
* - lease is NOT requested for shutdown
|
||||||
|
*
|
||||||
|
* @return true if lease is eligible for graceful shutdown
|
||||||
|
*/
|
||||||
|
public boolean isEligibleForGracefulShutdown() {
|
||||||
|
return !isExpiredOrUnassigned && !ExtendedSequenceNumber.SHARD_END.equals(checkpoint) && !shutdownRequested();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Need to handle the case during graceful shutdown where leaseOwner isn't the current owner
|
||||||
|
*
|
||||||
|
* @return the actual owner
|
||||||
|
*/
|
||||||
|
public String actualOwner() {
|
||||||
|
return checkpointOwner == null ? leaseOwner : checkpointOwner;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return true if lease is not currently owned
|
* @return true if lease is not currently owned
|
||||||
*/
|
*/
|
||||||
|
|
@ -343,6 +422,15 @@ public class Lease {
|
||||||
this.childShardIds.addAll(childShardIds);
|
this.childShardIds.addAll(childShardIds);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Sets throughputKBps.
 *
 * @param throughputKBps observed throughput for this lease, in KB per second
 */
public void throughputKBps(double throughputKBps) {
    this.throughputKBps = throughputKBps;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set the hash range key for this shard.
|
* Set the hash range key for this shard.
|
||||||
* @param hashKeyRangeForLease
|
* @param hashKeyRangeForLease
|
||||||
|
|
@ -370,6 +458,8 @@ public class Lease {
|
||||||
* @return A deep copy of this object.
|
* @return A deep copy of this object.
|
||||||
*/
|
*/
|
||||||
public Lease copy() {
    final Lease lease = new Lease(this);
    // checkpointOwner is propagated explicitly; presumably the copy constructor does not
    // carry it over — NOTE(review): confirm against Lease(Lease), not visible here.
    lease.checkpointOwner(this.checkpointOwner);
    return lease;
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,7 @@ import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
|
||||||
|
import software.amazon.kinesis.coordinator.MigrationAdaptiveLeaseAssignmentModeProvider;
|
||||||
import software.amazon.kinesis.leases.dynamodb.DynamoDBLeaseCoordinator;
|
import software.amazon.kinesis.leases.dynamodb.DynamoDBLeaseCoordinator;
|
||||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
|
|
@ -38,11 +39,14 @@ public interface LeaseCoordinator {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Start background LeaseHolder and LeaseTaker threads.
|
* Start background LeaseHolder and LeaseTaker threads.
|
||||||
|
* @param leaseAssignmentModeProvider provider of Lease Assignment mode to determine whether to start components
|
||||||
|
* for both V2 and V3 functionality or only V3 functionality
|
||||||
* @throws ProvisionedThroughputException If we can't talk to DynamoDB due to insufficient capacity.
|
* @throws ProvisionedThroughputException If we can't talk to DynamoDB due to insufficient capacity.
|
||||||
* @throws InvalidStateException If the lease table doesn't exist
|
* @throws InvalidStateException If the lease table doesn't exist
|
||||||
* @throws DependencyException If we encountered exception taking to DynamoDB
|
* @throws DependencyException If we encountered exception taking to DynamoDB
|
||||||
*/
|
*/
|
||||||
void start() throws DependencyException, InvalidStateException, ProvisionedThroughputException;
|
void start(final MigrationAdaptiveLeaseAssignmentModeProvider leaseAssignmentModeProvider)
|
||||||
|
throws DependencyException, InvalidStateException, ProvisionedThroughputException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Runs a single iteration of the lease taker - used by integration tests.
|
* Runs a single iteration of the lease taker - used by integration tests.
|
||||||
|
|
@ -152,4 +156,9 @@ public interface LeaseCoordinator {
|
||||||
* @return LeaseCoordinator
|
* @return LeaseCoordinator
|
||||||
*/
|
*/
|
||||||
DynamoDBLeaseCoordinator initialLeaseTableReadCapacity(long readCapacity);
|
DynamoDBLeaseCoordinator initialLeaseTableReadCapacity(long readCapacity);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return instance of {@link LeaseStatsRecorder}
|
||||||
|
*/
|
||||||
|
LeaseStatsRecorder leaseStatsRecorder();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,20 @@
|
||||||
|
package software.amazon.kinesis.leases;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||||
|
|
||||||
|
public interface LeaseDiscoverer {
    /**
     * Identifies the leases that are assigned to the current worker but are not yet being tracked
     * and processed by the current worker.
     *
     * @return list of leases assigned to this worker that are not in its currently held set
     * @throws DependencyException if DynamoDB scan fails in an unexpected way
     * @throws InvalidStateException if lease table does not exist
     * @throws ProvisionedThroughputException if DynamoDB scan fails due to lack of capacity
     */
    List<Lease> discoverNewLeases() throws ProvisionedThroughputException, InvalidStateException, DependencyException;
}
|
||||||
|
|
@ -16,7 +16,9 @@
|
||||||
package software.amazon.kinesis.leases;
|
package software.amazon.kinesis.leases;
|
||||||
|
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.SynchronousQueue;
|
import java.util.concurrent.SynchronousQueue;
|
||||||
import java.util.concurrent.ThreadFactory;
|
import java.util.concurrent.ThreadFactory;
|
||||||
|
|
@ -25,7 +27,9 @@ import java.util.concurrent.TimeUnit;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
||||||
|
import lombok.Builder;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
import lombok.Getter;
|
||||||
import lombok.NonNull;
|
import lombok.NonNull;
|
||||||
import lombok.experimental.Accessors;
|
import lombok.experimental.Accessors;
|
||||||
import org.apache.commons.lang3.Validate;
|
import org.apache.commons.lang3.Validate;
|
||||||
|
|
@ -34,6 +38,7 @@ import software.amazon.awssdk.services.dynamodb.DynamoDbAsyncClient;
|
||||||
import software.amazon.awssdk.services.dynamodb.model.BillingMode;
|
import software.amazon.awssdk.services.dynamodb.model.BillingMode;
|
||||||
import software.amazon.awssdk.services.dynamodb.model.Tag;
|
import software.amazon.awssdk.services.dynamodb.model.Tag;
|
||||||
import software.amazon.awssdk.services.kinesis.KinesisAsyncClient;
|
import software.amazon.awssdk.services.kinesis.KinesisAsyncClient;
|
||||||
|
import software.amazon.kinesis.common.DdbTableConfig;
|
||||||
import software.amazon.kinesis.common.InitialPositionInStream;
|
import software.amazon.kinesis.common.InitialPositionInStream;
|
||||||
import software.amazon.kinesis.common.InitialPositionInStreamExtended;
|
import software.amazon.kinesis.common.InitialPositionInStreamExtended;
|
||||||
import software.amazon.kinesis.common.LeaseCleanupConfig;
|
import software.amazon.kinesis.common.LeaseCleanupConfig;
|
||||||
|
|
@ -42,6 +47,7 @@ import software.amazon.kinesis.leases.dynamodb.DynamoDBLeaseManagementFactory;
|
||||||
import software.amazon.kinesis.leases.dynamodb.TableCreatorCallback;
|
import software.amazon.kinesis.leases.dynamodb.TableCreatorCallback;
|
||||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
import software.amazon.kinesis.metrics.NullMetricsFactory;
|
import software.amazon.kinesis.metrics.NullMetricsFactory;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Used by the KCL to configure lease management.
|
* Used by the KCL to configure lease management.
|
||||||
|
|
@ -209,6 +215,9 @@ public class LeaseManagementConfig {
|
||||||
|
|
||||||
private BillingMode billingMode = BillingMode.PAY_PER_REQUEST;
|
private BillingMode billingMode = BillingMode.PAY_PER_REQUEST;
|
||||||
|
|
||||||
|
private WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig =
|
||||||
|
new WorkerUtilizationAwareAssignmentConfig();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Whether to enable deletion protection on the DynamoDB lease table created by KCL. This does not update
|
* Whether to enable deletion protection on the DynamoDB lease table created by KCL. This does not update
|
||||||
* already existing tables.
|
* already existing tables.
|
||||||
|
|
@ -276,14 +285,17 @@ public class LeaseManagementConfig {
|
||||||
}
|
}
|
||||||
|
|
||||||
public LeaseManagementConfig(
|
public LeaseManagementConfig(
|
||||||
String tableName,
|
final String tableName,
|
||||||
DynamoDbAsyncClient dynamoDBClient,
|
final String applicationName,
|
||||||
KinesisAsyncClient kinesisClient,
|
final DynamoDbAsyncClient dynamoDBClient,
|
||||||
String workerIdentifier) {
|
final KinesisAsyncClient kinesisClient,
|
||||||
|
final String workerIdentifier) {
|
||||||
this.tableName = tableName;
|
this.tableName = tableName;
|
||||||
this.dynamoDBClient = dynamoDBClient;
|
this.dynamoDBClient = dynamoDBClient;
|
||||||
this.kinesisClient = kinesisClient;
|
this.kinesisClient = kinesisClient;
|
||||||
this.workerIdentifier = workerIdentifier;
|
this.workerIdentifier = workerIdentifier;
|
||||||
|
this.workerUtilizationAwareAssignmentConfig.workerMetricsTableConfig =
|
||||||
|
new WorkerMetricsTableConfig(applicationName);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -361,6 +373,53 @@ public class LeaseManagementConfig {
|
||||||
return hierarchicalShardSyncer;
|
return hierarchicalShardSyncer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Configuration class for controlling the graceful handoff of leases.
 * This configuration allows tuning of the shutdown behavior during lease transfers.
 * <p>
 * It provides settings to control the timeout period for waiting on the record processor
 * to shut down, and an option to enable or disable graceful lease handoff.
 * </p>
 */
@Builder
@Getter
@Accessors(fluent = true)
public static class GracefulLeaseHandoffConfig {
    /**
     * The minimum amount of time (in milliseconds) to wait for the current shard's RecordProcessor
     * to gracefully shut down before forcefully transferring the lease to the next owner.
     * <p>
     * If each call to {@code processRecords} is expected to run longer than the default value,
     * it makes sense to set this to a higher value to ensure the RecordProcessor has enough
     * time to complete its processing.
     * </p>
     * <p>
     * Default value is 30,000 milliseconds (30 seconds).
     * </p>
     */
    @Builder.Default
    private long gracefulLeaseHandoffTimeoutMillis = 30_000L;

    /**
     * Flag to enable or disable the graceful lease handoff mechanism.
     * <p>
     * When set to {@code true}, the KCL will attempt to gracefully transfer leases by
     * allowing the shard's RecordProcessor sufficient time to complete processing before
     * handing off the lease to another worker. When {@code false}, the lease will be
     * handed off without waiting for the RecordProcessor to shut down gracefully. Note
     * that checkpointing is expected to be implemented inside {@code shutdownRequested}
     * for this feature to work end to end.
     * </p>
     * <p>
     * Default value is {@code true}.
     * </p>
     */
    @Builder.Default
    private boolean isGracefulLeaseHandoffEnabled = true;
}
|
||||||
|
|
||||||
|
private GracefulLeaseHandoffConfig gracefulLeaseHandoffConfig =
|
||||||
|
GracefulLeaseHandoffConfig.builder().build();
|
||||||
|
|
||||||
@Deprecated
|
@Deprecated
|
||||||
public LeaseManagementFactory leaseManagementFactory() {
|
public LeaseManagementFactory leaseManagementFactory() {
|
||||||
if (leaseManagementFactory == null) {
|
if (leaseManagementFactory == null) {
|
||||||
|
|
@ -440,7 +499,9 @@ public class LeaseManagementConfig {
|
||||||
leaseSerializer,
|
leaseSerializer,
|
||||||
customShardDetectorProvider(),
|
customShardDetectorProvider(),
|
||||||
isMultiStreamingMode,
|
isMultiStreamingMode,
|
||||||
leaseCleanupConfig());
|
leaseCleanupConfig(),
|
||||||
|
workerUtilizationAwareAssignmentConfig(),
|
||||||
|
gracefulLeaseHandoffConfig);
|
||||||
}
|
}
|
||||||
return leaseManagementFactory;
|
return leaseManagementFactory;
|
||||||
}
|
}
|
||||||
|
|
@ -454,4 +515,89 @@ public class LeaseManagementConfig {
|
||||||
this.leaseManagementFactory = leaseManagementFactory;
|
this.leaseManagementFactory = leaseManagementFactory;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Data
@Accessors(fluent = true)
public static class WorkerUtilizationAwareAssignmentConfig {
    /**
     * This defines the frequency of capturing worker metric stats in memory. Default is 1s.
     */
    private long inMemoryWorkerMetricsCaptureFrequencyMillis =
            Duration.ofSeconds(1L).toMillis();
    /**
     * This defines the frequency of reporting worker metric stats to storage. Default is 30s.
     */
    private long workerMetricsReporterFreqInMillis = Duration.ofSeconds(30).toMillis();
    /**
     * Number of metric values that are persisted per worker metric in the WorkerMetricStats ddb table.
     */
    private int noOfPersistedMetricsPerWorkerMetrics = 10;
    /**
     * Option to disable workerMetrics use in lease balancing.
     */
    private boolean disableWorkerMetrics = false;
    /**
     * List of workerMetrics for the application.
     */
    private List<WorkerMetric> workerMetricList = new ArrayList<>();
    /**
     * Max throughput per host in KBps; default is unlimited.
     */
    private double maxThroughputPerHostKBps = Double.MAX_VALUE;
    /**
     * Percentage of value to achieve critical dampening during this case.
     */
    private int dampeningPercentage = 60;
    /**
     * Percentage value used to trigger reBalance. If the fleet has workers whose metric values differ
     * from the fleet-level average by more than this percentage, a reBalance is triggered.
     * Leases are taken from workers with metric values above the fleet-level average; the load to take
     * from these workers is determined by how far they are from the fleet-level average.
     */
    private int reBalanceThresholdPercentage = 10;

    /**
     * The allowThroughputOvershoot flag determines whether leases should still be taken even if
     * it causes the total assigned throughput to exceed the desired throughput to take for re-balance.
     * Enabling this flag provides more flexibility for the LeaseAssignmentManager to explore additional
     * assignment possibilities, which can lead to faster throughput convergence.
     */
    private boolean allowThroughputOvershoot = true;

    /**
     * Duration after which a workerMetrics entry from the WorkerMetricStats table will be cleaned up. When
     * an entry's lastUpdateTime is older than staleWorkerMetricsEntryCleanupDuration from current time,
     * the entry will be removed from the table.
     */
    private Duration staleWorkerMetricsEntryCleanupDuration = Duration.ofDays(1);

    /**
     * Configuration for how to create the WorkerMetricStats table, such as table name,
     * billing mode, provisioned capacity. If no table name is specified, the table name will
     * default to applicationName-WorkerMetricStats. If no billing mode is chosen, default is
     * On-Demand.
     */
    private WorkerMetricsTableConfig workerMetricsTableConfig;

    /**
     * Frequency of worker variance balancing relative to LAM iterations: by default every third
     * iteration of LAM performs worker variance balancing. Setting it to 1 makes varianceBalancing
     * run on every iteration of LAM, 2 on every 2nd iteration, and so on.
     */
    private int varianceBalancingFrequency = 3;

    /**
     * Alpha value used for calculating the exponential moving average of a worker's metric values.
     * A higher alpha gives more weight to the recent value (less smoothing); a smaller alpha gives
     * more weight to past values (more smoothing).
     */
    private double workerMetricsEMAAlpha = 0.5;
}
|
||||||
|
|
||||||
|
public static class WorkerMetricsTableConfig extends DdbTableConfig {
|
||||||
|
public WorkerMetricsTableConfig(final String applicationName) {
|
||||||
|
super(applicationName, "WorkerMetricStats");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -15,9 +15,12 @@
|
||||||
|
|
||||||
package software.amazon.kinesis.leases;
|
package software.amazon.kinesis.leases;
|
||||||
|
|
||||||
|
import java.util.concurrent.ConcurrentMap;
|
||||||
|
|
||||||
import software.amazon.kinesis.common.StreamConfig;
|
import software.amazon.kinesis.common.StreamConfig;
|
||||||
import software.amazon.kinesis.coordinator.DeletedStreamListProvider;
|
import software.amazon.kinesis.coordinator.DeletedStreamListProvider;
|
||||||
import software.amazon.kinesis.leases.dynamodb.DynamoDBLeaseRefresher;
|
import software.amazon.kinesis.leases.dynamodb.DynamoDBLeaseRefresher;
|
||||||
|
import software.amazon.kinesis.lifecycle.ShardConsumer;
|
||||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -26,6 +29,11 @@ import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
public interface LeaseManagementFactory {
|
public interface LeaseManagementFactory {
|
||||||
LeaseCoordinator createLeaseCoordinator(MetricsFactory metricsFactory);
|
LeaseCoordinator createLeaseCoordinator(MetricsFactory metricsFactory);
|
||||||
|
|
||||||
|
default LeaseCoordinator createLeaseCoordinator(
|
||||||
|
MetricsFactory metricsFactory, ConcurrentMap<ShardInfo, ShardConsumer> shardInfoShardConsumerMap) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
ShardSyncTaskManager createShardSyncTaskManager(MetricsFactory metricsFactory);
|
ShardSyncTaskManager createShardSyncTaskManager(MetricsFactory metricsFactory);
|
||||||
|
|
||||||
default ShardSyncTaskManager createShardSyncTaskManager(MetricsFactory metricsFactory, StreamConfig streamConfig) {
|
default ShardSyncTaskManager createShardSyncTaskManager(MetricsFactory metricsFactory, StreamConfig streamConfig) {
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,9 @@
|
||||||
package software.amazon.kinesis.leases;
|
package software.amazon.kinesis.leases;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import software.amazon.kinesis.common.StreamIdentifier;
|
import software.amazon.kinesis.common.StreamIdentifier;
|
||||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
|
@ -75,6 +78,37 @@ public interface LeaseRefresher {
|
||||||
*/
|
*/
|
||||||
boolean waitUntilLeaseTableExists(long secondsBetweenPolls, long timeoutSeconds) throws DependencyException;
|
boolean waitUntilLeaseTableExists(long secondsBetweenPolls, long timeoutSeconds) throws DependencyException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates the LeaseOwnerToLeaseKey index on the lease table if it doesn't exist and returns the status of index.
|
||||||
|
*
|
||||||
|
* @return indexStatus status of the index.
|
||||||
|
* @throws DependencyException if storage's describe API fails in an unexpected way
|
||||||
|
*/
|
||||||
|
default String createLeaseOwnerToLeaseKeyIndexIfNotExists() throws DependencyException {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Blocks until the index exists by polling storage till either the index is ACTIVE or else timeout has
|
||||||
|
* happened.
|
||||||
|
*
|
||||||
|
* @param secondsBetweenPolls time to wait between polls in seconds
|
||||||
|
* @param timeoutSeconds total time to wait in seconds
|
||||||
|
*
|
||||||
|
* @return true if index on the table exists and is ACTIVE, false if timeout was reached
|
||||||
|
*/
|
||||||
|
default boolean waitUntilLeaseOwnerToLeaseKeyIndexExists(
|
||||||
|
final long secondsBetweenPolls, final long timeoutSeconds) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if leaseOwner GSI is ACTIVE
|
||||||
|
* @return true if index is active, false otherwise
|
||||||
|
* @throws DependencyException if storage's describe API fails in an unexpected way
|
||||||
|
*/
|
||||||
|
boolean isLeaseOwnerToLeaseKeyIndexActive() throws DependencyException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* List all leases for a given stream synchronously.
|
* List all leases for a given stream synchronously.
|
||||||
*
|
*
|
||||||
|
|
@ -87,6 +121,24 @@ public interface LeaseRefresher {
|
||||||
List<Lease> listLeasesForStream(StreamIdentifier streamIdentifier)
|
List<Lease> listLeasesForStream(StreamIdentifier streamIdentifier)
|
||||||
throws DependencyException, InvalidStateException, ProvisionedThroughputException;
|
throws DependencyException, InvalidStateException, ProvisionedThroughputException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* List all leases for a given workerIdentifier synchronously.
|
||||||
|
* Default implementation calls listLeases() and filters the results.
|
||||||
|
*
|
||||||
|
* @throws DependencyException if DynamoDB scan fails in an unexpected way
|
||||||
|
* @throws InvalidStateException if lease table does not exist
|
||||||
|
* @throws ProvisionedThroughputException if DynamoDB scan fails due to lack of capacity
|
||||||
|
*
|
||||||
|
* @return list of leases
|
||||||
|
*/
|
||||||
|
default List<String> listLeaseKeysForWorker(final String workerIdentifier)
|
||||||
|
throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
||||||
|
return listLeases().stream()
|
||||||
|
.filter(lease -> lease.leaseOwner().equals(workerIdentifier))
|
||||||
|
.map(Lease::leaseKey)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* List all objects in table synchronously.
|
* List all objects in table synchronously.
|
||||||
*
|
*
|
||||||
|
|
@ -98,6 +150,23 @@ public interface LeaseRefresher {
|
||||||
*/
|
*/
|
||||||
List<Lease> listLeases() throws DependencyException, InvalidStateException, ProvisionedThroughputException;
|
List<Lease> listLeases() throws DependencyException, InvalidStateException, ProvisionedThroughputException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* List all leases from the storage parallely and deserialize into Lease objects. Returns the list of leaseKey
|
||||||
|
* that failed deserialize separately.
|
||||||
|
*
|
||||||
|
* @param threadPool threadpool to use for parallel scan
|
||||||
|
* @param parallelismFactor no. of parallel scans
|
||||||
|
* @return Pair of List of leases from the storage and List of items failed to deserialize
|
||||||
|
* @throws DependencyException if DynamoDB scan fails in an unexpected way
|
||||||
|
* @throws InvalidStateException if lease table does not exist
|
||||||
|
* @throws ProvisionedThroughputException if DynamoDB scan fails due to lack of capacity
|
||||||
|
*/
|
||||||
|
default Map.Entry<List<Lease>, List<String>> listLeasesParallely(
|
||||||
|
final ExecutorService threadPool, final int parallelismFactor)
|
||||||
|
throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
||||||
|
throw new UnsupportedOperationException("listLeasesParallely is not implemented");
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new lease. Conditional on a lease not already existing with this shardId.
|
* Create a new lease. Conditional on a lease not already existing with this shardId.
|
||||||
*
|
*
|
||||||
|
|
@ -154,6 +223,47 @@ public interface LeaseRefresher {
|
||||||
boolean takeLease(Lease lease, String owner)
|
boolean takeLease(Lease lease, String owner)
|
||||||
throws DependencyException, InvalidStateException, ProvisionedThroughputException;
|
throws DependencyException, InvalidStateException, ProvisionedThroughputException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Assigns given lease to newOwner owner by incrementing its leaseCounter and setting its owner field. Conditional
|
||||||
|
* on the leaseOwner in DynamoDB matching the leaseOwner of the input lease. Mutates the leaseCounter and owner of
|
||||||
|
* the passed-in lease object after updating DynamoDB.
|
||||||
|
*
|
||||||
|
* @param lease the lease to be assigned
|
||||||
|
* @param newOwner the new owner
|
||||||
|
*
|
||||||
|
* @return true if lease was successfully assigned, false otherwise
|
||||||
|
*
|
||||||
|
* @throws InvalidStateException if lease table does not exist
|
||||||
|
* @throws ProvisionedThroughputException if DynamoDB update fails due to lack of capacity
|
||||||
|
* @throws DependencyException if DynamoDB update fails in an unexpected way
|
||||||
|
*/
|
||||||
|
default boolean assignLease(final Lease lease, final String newOwner)
|
||||||
|
throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
||||||
|
|
||||||
|
throw new UnsupportedOperationException("assignLease is not implemented");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initiates a graceful handoff of the given lease to the specified new owner, allowing the current owner
|
||||||
|
* to complete its processing before transferring ownership.
|
||||||
|
* <p>
|
||||||
|
* This method updates the lease with the new owner information but ensures that the current owner
|
||||||
|
* is given time to gracefully finish its work (e.g., processing records) before the lease is reassigned.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param lease the lease to be assigned
|
||||||
|
* @param newOwner the new owner
|
||||||
|
* @return true if a graceful handoff was successfully initiated
|
||||||
|
* @throws InvalidStateException if lease table does not exist
|
||||||
|
* @throws ProvisionedThroughputException if DynamoDB update fails due to lack of capacity
|
||||||
|
* @throws DependencyException if DynamoDB update fails in an unexpected way
|
||||||
|
*/
|
||||||
|
default boolean initiateGracefulLeaseHandoff(final Lease lease, final String newOwner)
|
||||||
|
throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
||||||
|
|
||||||
|
throw new UnsupportedOperationException("assignLeaseWithWait is not implemented");
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Evict the current owner of lease by setting owner to null. Conditional on the owner in DynamoDB matching the owner of
|
* Evict the current owner of lease by setting owner to null. Conditional on the owner in DynamoDB matching the owner of
|
||||||
* the input. Mutates the lease counter and owner of the passed-in lease object after updating the record in DynamoDB.
|
* the input. Mutates the lease counter and owner of the passed-in lease object after updating the record in DynamoDB.
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@
|
||||||
package software.amazon.kinesis.leases;
|
package software.amazon.kinesis.leases;
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import software.amazon.awssdk.services.dynamodb.model.AttributeDefinition;
|
import software.amazon.awssdk.services.dynamodb.model.AttributeDefinition;
|
||||||
|
|
@ -100,6 +101,15 @@ public interface LeaseSerializer {
|
||||||
*/
|
*/
|
||||||
Map<String, AttributeValueUpdate> getDynamoTakeLeaseUpdate(Lease lease, String newOwner);
|
Map<String, AttributeValueUpdate> getDynamoTakeLeaseUpdate(Lease lease, String newOwner);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param lease lease that needs to be assigned
|
||||||
|
* @param newOwner newLeaseOwner
|
||||||
|
* @return the attribute value map that takes a lease for a new owner
|
||||||
|
*/
|
||||||
|
default Map<String, AttributeValueUpdate> getDynamoAssignLeaseUpdate(Lease lease, String newOwner) {
|
||||||
|
throw new UnsupportedOperationException("getDynamoAssignLeaseUpdate is not implemented");
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param lease
|
* @param lease
|
||||||
* @return the attribute value map that voids a lease
|
* @return the attribute value map that voids a lease
|
||||||
|
|
@ -127,8 +137,22 @@ public interface LeaseSerializer {
|
||||||
*/
|
*/
|
||||||
Collection<KeySchemaElement> getKeySchema();
|
Collection<KeySchemaElement> getKeySchema();
|
||||||
|
|
||||||
|
default Collection<KeySchemaElement> getWorkerIdToLeaseKeyIndexKeySchema() {
|
||||||
|
return Collections.EMPTY_LIST;
|
||||||
|
}
|
||||||
|
|
||||||
|
default Collection<AttributeDefinition> getWorkerIdToLeaseKeyIndexAttributeDefinitions() {
|
||||||
|
return Collections.EMPTY_LIST;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return attribute definitions for creating a DynamoDB table to store leases
|
* @return attribute definitions for creating a DynamoDB table to store leases
|
||||||
*/
|
*/
|
||||||
Collection<AttributeDefinition> getAttributeDefinitions();
|
Collection<AttributeDefinition> getAttributeDefinitions();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param lease
|
||||||
|
* @return the attribute value map that includes lease throughput
|
||||||
|
*/
|
||||||
|
Map<String, AttributeValueUpdate> getDynamoLeaseThroughputKbpsUpdate(Lease lease);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,158 @@
|
||||||
|
package software.amazon.kinesis.leases;
|
||||||
|
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Queue;
|
||||||
|
import java.util.concurrent.Callable;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||||
|
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NonNull;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.ToString;
|
||||||
|
import software.amazon.awssdk.annotations.ThreadSafe;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.utils.ExponentialMovingAverage;
|
||||||
|
|
||||||
|
import static java.util.Objects.isNull;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class records the stats for the leases.
|
||||||
|
* The stats are recorded in a thread safe queue, and the throughput is calculated by summing up the bytes and dividing
|
||||||
|
* by interval in seconds.
|
||||||
|
* This class is thread safe and backed by thread safe data structures.
|
||||||
|
*/
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
@ThreadSafe
|
||||||
|
public class LeaseStatsRecorder {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This default alpha is chosen based on the testing so far between simple average and moving average with 0.5.
|
||||||
|
* In the future, if one value does not fit all use cases, inject this via config.
|
||||||
|
*/
|
||||||
|
private static final double DEFAULT_ALPHA = 0.5;
|
||||||
|
|
||||||
|
public static final int BYTES_PER_KB = 1024;
|
||||||
|
|
||||||
|
private final Long renewerFrequencyInMillis;
|
||||||
|
private final Map<String, Queue<LeaseStats>> leaseStatsMap = new ConcurrentHashMap<>();
|
||||||
|
private final Map<String, ExponentialMovingAverage> leaseKeyToExponentialMovingAverageMap =
|
||||||
|
new ConcurrentHashMap<>();
|
||||||
|
private final Callable<Long> timeProviderInMillis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This method provides happens-before semantics (i.e., the action (access or removal) from a thread happens
|
||||||
|
* before the action from subsequent thread) for the stats recording in multithreaded environment.
|
||||||
|
*/
|
||||||
|
public void recordStats(@NonNull final LeaseStats leaseStats) {
|
||||||
|
final Queue<LeaseStats> leaseStatsQueue =
|
||||||
|
leaseStatsMap.computeIfAbsent(leaseStats.getLeaseKey(), lease -> new ConcurrentLinkedQueue<>());
|
||||||
|
leaseStatsQueue.add(leaseStats);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the throughput in KBps for the given leaseKey.
|
||||||
|
* Method first clears the items that are older than {@link #renewerFrequencyInMillis} from the queue and then
|
||||||
|
* calculates the throughput per second during {@link #renewerFrequencyInMillis} interval and then returns the
|
||||||
|
* ExponentialMovingAverage of the throughput. If method is called in quick succession with or without new stats
|
||||||
|
* the result can be different as ExponentialMovingAverage decays old values on every new call.
|
||||||
|
* This method is thread safe.
|
||||||
|
* @param leaseKey leaseKey for which stats are required
|
||||||
|
* @return throughput in Kbps, returns null if there is no stats available for the leaseKey.
|
||||||
|
*/
|
||||||
|
public Double getThroughputKBps(final String leaseKey) {
|
||||||
|
final Queue<LeaseStats> leaseStatsQueue = leaseStatsMap.get(leaseKey);
|
||||||
|
|
||||||
|
if (isNull(leaseStatsQueue)) {
|
||||||
|
// This means there is no entry for this leaseKey yet
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
filterExpiredEntries(leaseStatsQueue);
|
||||||
|
|
||||||
|
// Convert bytes into KB and divide by interval in second to get throughput per second.
|
||||||
|
final ExponentialMovingAverage exponentialMovingAverage = leaseKeyToExponentialMovingAverageMap.computeIfAbsent(
|
||||||
|
leaseKey, leaseId -> new ExponentialMovingAverage(DEFAULT_ALPHA));
|
||||||
|
|
||||||
|
// Specifically dividing by 1000.0 rather than using Duration class to get seconds, because Duration class
|
||||||
|
// implementation rounds off to seconds and precision is lost.
|
||||||
|
final double frequency = renewerFrequencyInMillis / 1000.0;
|
||||||
|
final double throughput = readQueue(leaseStatsQueue).stream()
|
||||||
|
.mapToDouble(LeaseStats::getBytes)
|
||||||
|
.sum()
|
||||||
|
/ BYTES_PER_KB
|
||||||
|
/ frequency;
|
||||||
|
exponentialMovingAverage.add(throughput);
|
||||||
|
return exponentialMovingAverage.getValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the currentTimeMillis and then iterates over the queue to get the stats with creation time less than
|
||||||
|
* currentTimeMillis.
|
||||||
|
* This is specifically done to avoid potential race between with high-frequency put thread blocking get thread.
|
||||||
|
*/
|
||||||
|
private Queue<LeaseStats> readQueue(final Queue<LeaseStats> leaseStatsQueue) {
|
||||||
|
final long currentTimeMillis = getCurrenTimeInMillis();
|
||||||
|
final Queue<LeaseStats> response = new LinkedList<>();
|
||||||
|
for (LeaseStats leaseStats : leaseStatsQueue) {
|
||||||
|
if (leaseStats.creationTimeMillis > currentTimeMillis) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
response.add(leaseStats);
|
||||||
|
}
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
|
||||||
|
private long getCurrenTimeInMillis() {
|
||||||
|
try {
|
||||||
|
return timeProviderInMillis.call();
|
||||||
|
} catch (final Exception e) {
|
||||||
|
// Fallback to using the System.currentTimeMillis if failed.
|
||||||
|
return System.currentTimeMillis();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void filterExpiredEntries(final Queue<LeaseStats> leaseStatsQueue) {
|
||||||
|
final long currentTime = getCurrenTimeInMillis();
|
||||||
|
while (!leaseStatsQueue.isEmpty()) {
|
||||||
|
final LeaseStats leaseStats = leaseStatsQueue.peek();
|
||||||
|
if (isNull(leaseStats) || currentTime - leaseStats.getCreationTimeMillis() < renewerFrequencyInMillis) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
leaseStatsQueue.poll();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clear the in-memory stats for the lease when a lease is reassigned (due to shut down or lease stealing)
|
||||||
|
* @param leaseKey leaseKey, for which stats are supposed to be clear.
|
||||||
|
*/
|
||||||
|
public void dropLeaseStats(final String leaseKey) {
|
||||||
|
leaseStatsMap.remove(leaseKey);
|
||||||
|
leaseKeyToExponentialMovingAverageMap.remove(leaseKey);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Builder
|
||||||
|
@Getter
|
||||||
|
@ToString
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
public static final class LeaseStats {
|
||||||
|
/**
|
||||||
|
* Lease key for which this leaseStats object is created.
|
||||||
|
*/
|
||||||
|
private final String leaseKey;
|
||||||
|
/**
|
||||||
|
* Bytes that are processed for a lease
|
||||||
|
*/
|
||||||
|
private final long bytes;
|
||||||
|
/**
|
||||||
|
* Wall time in epoch millis at which this leaseStats object was created. This time is used to determine the
|
||||||
|
* expiry of the lease stats.
|
||||||
|
*/
|
||||||
|
@Builder.Default
|
||||||
|
private final long creationTimeMillis = System.currentTimeMillis();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -19,6 +19,7 @@ import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
import java.util.concurrent.ConcurrentMap;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
import java.util.concurrent.LinkedTransferQueue;
|
import java.util.concurrent.LinkedTransferQueue;
|
||||||
|
|
@ -30,13 +31,17 @@ import java.util.concurrent.TimeUnit;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.coordinator.MigrationAdaptiveLeaseAssignmentModeProvider;
|
||||||
import software.amazon.kinesis.leases.Lease;
|
import software.amazon.kinesis.leases.Lease;
|
||||||
import software.amazon.kinesis.leases.LeaseCoordinator;
|
import software.amazon.kinesis.leases.LeaseCoordinator;
|
||||||
|
import software.amazon.kinesis.leases.LeaseDiscoverer;
|
||||||
import software.amazon.kinesis.leases.LeaseManagementConfig;
|
import software.amazon.kinesis.leases.LeaseManagementConfig;
|
||||||
import software.amazon.kinesis.leases.LeaseRefresher;
|
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||||
import software.amazon.kinesis.leases.LeaseRenewer;
|
import software.amazon.kinesis.leases.LeaseRenewer;
|
||||||
|
import software.amazon.kinesis.leases.LeaseStatsRecorder;
|
||||||
import software.amazon.kinesis.leases.LeaseTaker;
|
import software.amazon.kinesis.leases.LeaseTaker;
|
||||||
import software.amazon.kinesis.leases.MultiStreamLease;
|
import software.amazon.kinesis.leases.MultiStreamLease;
|
||||||
import software.amazon.kinesis.leases.ShardInfo;
|
import software.amazon.kinesis.leases.ShardInfo;
|
||||||
|
|
@ -44,6 +49,8 @@ import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
import software.amazon.kinesis.leases.exceptions.LeasingException;
|
import software.amazon.kinesis.leases.exceptions.LeasingException;
|
||||||
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||||
|
import software.amazon.kinesis.lifecycle.LeaseGracefulShutdownHandler;
|
||||||
|
import software.amazon.kinesis.lifecycle.ShardConsumer;
|
||||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
import software.amazon.kinesis.metrics.MetricsLevel;
|
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||||
import software.amazon.kinesis.metrics.MetricsScope;
|
import software.amazon.kinesis.metrics.MetricsScope;
|
||||||
|
|
@ -70,115 +77,34 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
||||||
.setNameFormat("LeaseRenewer-%04d")
|
.setNameFormat("LeaseRenewer-%04d")
|
||||||
.setDaemon(true)
|
.setDaemon(true)
|
||||||
.build();
|
.build();
|
||||||
|
private static final ThreadFactory LEASE_DISCOVERY_THREAD_FACTORY = new ThreadFactoryBuilder()
|
||||||
|
.setNameFormat("LeaseDiscovery-%04d")
|
||||||
|
.setDaemon(true)
|
||||||
|
.build();
|
||||||
|
|
||||||
private final LeaseRenewer leaseRenewer;
|
private final LeaseRenewer leaseRenewer;
|
||||||
private final LeaseTaker leaseTaker;
|
private final LeaseTaker leaseTaker;
|
||||||
|
private final LeaseDiscoverer leaseDiscoverer;
|
||||||
private final long renewerIntervalMillis;
|
private final long renewerIntervalMillis;
|
||||||
private final long takerIntervalMillis;
|
private final long takerIntervalMillis;
|
||||||
|
private final long leaseDiscovererIntervalMillis;
|
||||||
private final ExecutorService leaseRenewalThreadpool;
|
private final ExecutorService leaseRenewalThreadpool;
|
||||||
|
private final ExecutorService leaseDiscoveryThreadPool;
|
||||||
private final LeaseRefresher leaseRefresher;
|
private final LeaseRefresher leaseRefresher;
|
||||||
|
private final LeaseStatsRecorder leaseStatsRecorder;
|
||||||
|
private final LeaseGracefulShutdownHandler leaseGracefulShutdownHandler;
|
||||||
private long initialLeaseTableReadCapacity;
|
private long initialLeaseTableReadCapacity;
|
||||||
private long initialLeaseTableWriteCapacity;
|
private long initialLeaseTableWriteCapacity;
|
||||||
protected final MetricsFactory metricsFactory;
|
protected final MetricsFactory metricsFactory;
|
||||||
|
|
||||||
private final Object shutdownLock = new Object();
|
private final Object shutdownLock = new Object();
|
||||||
|
private final LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig;
|
||||||
private ScheduledExecutorService leaseCoordinatorThreadPool;
|
private ScheduledExecutorService leaseCoordinatorThreadPool;
|
||||||
|
private ScheduledFuture<?> leaseDiscoveryFuture;
|
||||||
private ScheduledFuture<?> takerFuture;
|
private ScheduledFuture<?> takerFuture;
|
||||||
|
|
||||||
private volatile boolean running = false;
|
private volatile boolean running = false;
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructor.
|
|
||||||
*
|
|
||||||
* <p>NOTE: This constructor is deprecated and will be removed in a future release.</p>
|
|
||||||
*
|
|
||||||
* @param leaseRefresher
|
|
||||||
* LeaseRefresher instance to use
|
|
||||||
* @param workerIdentifier
|
|
||||||
* Identifies the worker (e.g. useful to track lease ownership)
|
|
||||||
* @param leaseDurationMillis
|
|
||||||
* Duration of a lease
|
|
||||||
* @param epsilonMillis
|
|
||||||
* Allow for some variance when calculating lease expirations
|
|
||||||
* @param maxLeasesForWorker
|
|
||||||
* Max leases this Worker can handle at a time
|
|
||||||
* @param maxLeasesToStealAtOneTime
|
|
||||||
* Steal up to these many leases at a time (for load balancing)
|
|
||||||
* @param metricsFactory
|
|
||||||
* Used to publish metrics about lease operations
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public DynamoDBLeaseCoordinator(
|
|
||||||
final LeaseRefresher leaseRefresher,
|
|
||||||
final String workerIdentifier,
|
|
||||||
final long leaseDurationMillis,
|
|
||||||
final long epsilonMillis,
|
|
||||||
final int maxLeasesForWorker,
|
|
||||||
final int maxLeasesToStealAtOneTime,
|
|
||||||
final int maxLeaseRenewerThreadCount,
|
|
||||||
final MetricsFactory metricsFactory) {
|
|
||||||
this(
|
|
||||||
leaseRefresher,
|
|
||||||
workerIdentifier,
|
|
||||||
leaseDurationMillis,
|
|
||||||
epsilonMillis,
|
|
||||||
maxLeasesForWorker,
|
|
||||||
maxLeasesToStealAtOneTime,
|
|
||||||
maxLeaseRenewerThreadCount,
|
|
||||||
TableConstants.DEFAULT_INITIAL_LEASE_TABLE_READ_CAPACITY,
|
|
||||||
TableConstants.DEFAULT_INITIAL_LEASE_TABLE_WRITE_CAPACITY,
|
|
||||||
metricsFactory);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructor.
|
|
||||||
*
|
|
||||||
* @param leaseRefresher
|
|
||||||
* LeaseRefresher instance to use
|
|
||||||
* @param workerIdentifier
|
|
||||||
* Identifies the worker (e.g. useful to track lease ownership)
|
|
||||||
* @param leaseDurationMillis
|
|
||||||
* Duration of a lease
|
|
||||||
* @param epsilonMillis
|
|
||||||
* Allow for some variance when calculating lease expirations
|
|
||||||
* @param maxLeasesForWorker
|
|
||||||
* Max leases this Worker can handle at a time
|
|
||||||
* @param maxLeasesToStealAtOneTime
|
|
||||||
* Steal up to these many leases at a time (for load balancing)
|
|
||||||
* @param initialLeaseTableReadCapacity
|
|
||||||
* Initial dynamodb lease table read iops if creating the lease table
|
|
||||||
* @param initialLeaseTableWriteCapacity
|
|
||||||
* Initial dynamodb lease table write iops if creating the lease table
|
|
||||||
* @param metricsFactory
|
|
||||||
* Used to publish metrics about lease operations
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public DynamoDBLeaseCoordinator(
|
|
||||||
final LeaseRefresher leaseRefresher,
|
|
||||||
final String workerIdentifier,
|
|
||||||
final long leaseDurationMillis,
|
|
||||||
final long epsilonMillis,
|
|
||||||
final int maxLeasesForWorker,
|
|
||||||
final int maxLeasesToStealAtOneTime,
|
|
||||||
final int maxLeaseRenewerThreadCount,
|
|
||||||
final long initialLeaseTableReadCapacity,
|
|
||||||
final long initialLeaseTableWriteCapacity,
|
|
||||||
final MetricsFactory metricsFactory) {
|
|
||||||
this(
|
|
||||||
leaseRefresher,
|
|
||||||
workerIdentifier,
|
|
||||||
leaseDurationMillis,
|
|
||||||
LeaseManagementConfig.DEFAULT_ENABLE_PRIORITY_LEASE_ASSIGNMENT,
|
|
||||||
epsilonMillis,
|
|
||||||
maxLeasesForWorker,
|
|
||||||
maxLeasesToStealAtOneTime,
|
|
||||||
maxLeaseRenewerThreadCount,
|
|
||||||
TableConstants.DEFAULT_INITIAL_LEASE_TABLE_READ_CAPACITY,
|
|
||||||
TableConstants.DEFAULT_INITIAL_LEASE_TABLE_WRITE_CAPACITY,
|
|
||||||
metricsFactory);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructor.
|
* Constructor.
|
||||||
*
|
*
|
||||||
|
|
@ -214,17 +140,35 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
||||||
final int maxLeaseRenewerThreadCount,
|
final int maxLeaseRenewerThreadCount,
|
||||||
final long initialLeaseTableReadCapacity,
|
final long initialLeaseTableReadCapacity,
|
||||||
final long initialLeaseTableWriteCapacity,
|
final long initialLeaseTableWriteCapacity,
|
||||||
final MetricsFactory metricsFactory) {
|
final MetricsFactory metricsFactory,
|
||||||
|
final LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig,
|
||||||
|
final LeaseManagementConfig.GracefulLeaseHandoffConfig gracefulLeaseHandoffConfig,
|
||||||
|
final ConcurrentMap<ShardInfo, ShardConsumer> shardInfoShardConsumerMap) {
|
||||||
this.leaseRefresher = leaseRefresher;
|
this.leaseRefresher = leaseRefresher;
|
||||||
this.leaseRenewalThreadpool = getLeaseRenewalExecutorService(maxLeaseRenewerThreadCount);
|
this.leaseRenewalThreadpool = createExecutorService(maxLeaseRenewerThreadCount, LEASE_RENEWAL_THREAD_FACTORY);
|
||||||
this.leaseTaker = new DynamoDBLeaseTaker(leaseRefresher, workerIdentifier, leaseDurationMillis, metricsFactory)
|
this.leaseTaker = new DynamoDBLeaseTaker(leaseRefresher, workerIdentifier, leaseDurationMillis, metricsFactory)
|
||||||
.withMaxLeasesForWorker(maxLeasesForWorker)
|
.withMaxLeasesForWorker(maxLeasesForWorker)
|
||||||
.withMaxLeasesToStealAtOneTime(maxLeasesToStealAtOneTime)
|
.withMaxLeasesToStealAtOneTime(maxLeasesToStealAtOneTime)
|
||||||
.withEnablePriorityLeaseAssignment(enablePriorityLeaseAssignment);
|
.withEnablePriorityLeaseAssignment(enablePriorityLeaseAssignment);
|
||||||
this.leaseRenewer = new DynamoDBLeaseRenewer(
|
|
||||||
leaseRefresher, workerIdentifier, leaseDurationMillis, leaseRenewalThreadpool, metricsFactory);
|
|
||||||
this.renewerIntervalMillis = getRenewerTakerIntervalMillis(leaseDurationMillis, epsilonMillis);
|
this.renewerIntervalMillis = getRenewerTakerIntervalMillis(leaseDurationMillis, epsilonMillis);
|
||||||
this.takerIntervalMillis = (leaseDurationMillis + epsilonMillis) * 2;
|
this.takerIntervalMillis = (leaseDurationMillis + epsilonMillis) * 2;
|
||||||
|
// Should run once every leaseDurationMillis to identify new leases before expiry.
|
||||||
|
this.leaseDiscovererIntervalMillis = leaseDurationMillis - epsilonMillis;
|
||||||
|
this.leaseStatsRecorder = new LeaseStatsRecorder(renewerIntervalMillis, System::currentTimeMillis);
|
||||||
|
this.leaseGracefulShutdownHandler = LeaseGracefulShutdownHandler.create(
|
||||||
|
gracefulLeaseHandoffConfig.gracefulLeaseHandoffTimeoutMillis(), shardInfoShardConsumerMap, this);
|
||||||
|
this.leaseRenewer = new DynamoDBLeaseRenewer(
|
||||||
|
leaseRefresher,
|
||||||
|
workerIdentifier,
|
||||||
|
leaseDurationMillis,
|
||||||
|
leaseRenewalThreadpool,
|
||||||
|
metricsFactory,
|
||||||
|
leaseStatsRecorder,
|
||||||
|
leaseGracefulShutdownHandler::enqueueShutdown);
|
||||||
|
this.leaseDiscoveryThreadPool =
|
||||||
|
createExecutorService(maxLeaseRenewerThreadCount, LEASE_DISCOVERY_THREAD_FACTORY);
|
||||||
|
this.leaseDiscoverer = new DynamoDBLeaseDiscoverer(
|
||||||
|
this.leaseRefresher, this.leaseRenewer, metricsFactory, workerIdentifier, leaseDiscoveryThreadPool);
|
||||||
if (initialLeaseTableReadCapacity <= 0) {
|
if (initialLeaseTableReadCapacity <= 0) {
|
||||||
throw new IllegalArgumentException("readCapacity should be >= 1");
|
throw new IllegalArgumentException("readCapacity should be >= 1");
|
||||||
}
|
}
|
||||||
|
|
@ -234,6 +178,7 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
||||||
}
|
}
|
||||||
this.initialLeaseTableWriteCapacity = initialLeaseTableWriteCapacity;
|
this.initialLeaseTableWriteCapacity = initialLeaseTableWriteCapacity;
|
||||||
this.metricsFactory = metricsFactory;
|
this.metricsFactory = metricsFactory;
|
||||||
|
this.workerUtilizationAwareAssignmentConfig = workerUtilizationAwareAssignmentConfig;
|
||||||
|
|
||||||
log.info(
|
log.info(
|
||||||
"With failover time {} ms and epsilon {} ms, LeaseCoordinator will renew leases every {} ms, take"
|
"With failover time {} ms and epsilon {} ms, LeaseCoordinator will renew leases every {} ms, take"
|
||||||
|
|
@ -246,11 +191,49 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
||||||
maxLeasesToStealAtOneTime);
|
maxLeasesToStealAtOneTime);
|
||||||
}
|
}
|
||||||
|
|
||||||
private class TakerRunnable implements Runnable {
|
@RequiredArgsConstructor
|
||||||
|
private class LeaseDiscoveryRunnable implements Runnable {
|
||||||
|
private final MigrationAdaptiveLeaseAssignmentModeProvider leaseAssignmentModeProvider;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
try {
|
try {
|
||||||
|
// LeaseDiscoverer is run in WORKER_UTILIZATION_AWARE_ASSIGNMENT mode only
|
||||||
|
synchronized (shutdownLock) {
|
||||||
|
if (!leaseAssignmentModeProvider
|
||||||
|
.getLeaseAssignmentMode()
|
||||||
|
.equals(
|
||||||
|
MigrationAdaptiveLeaseAssignmentModeProvider.LeaseAssignmentMode
|
||||||
|
.WORKER_UTILIZATION_AWARE_ASSIGNMENT)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (running) {
|
||||||
|
leaseRenewer.addLeasesToRenew(leaseDiscoverer.discoverNewLeases());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error("Failed to execute lease discovery", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
private class TakerRunnable implements Runnable {
|
||||||
|
private final MigrationAdaptiveLeaseAssignmentModeProvider leaseAssignmentModeProvider;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
try {
|
||||||
|
// LeaseTaker is run in DEFAULT_LEASE_COUNT_BASED_ASSIGNMENT mode only
|
||||||
|
synchronized (shutdownLock) {
|
||||||
|
if (!leaseAssignmentModeProvider
|
||||||
|
.getLeaseAssignmentMode()
|
||||||
|
.equals(
|
||||||
|
MigrationAdaptiveLeaseAssignmentModeProvider.LeaseAssignmentMode
|
||||||
|
.DEFAULT_LEASE_COUNT_BASED_ASSIGNMENT)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
runLeaseTaker();
|
runLeaseTaker();
|
||||||
} catch (LeasingException e) {
|
} catch (LeasingException e) {
|
||||||
log.error("LeasingException encountered in lease taking thread", e);
|
log.error("LeasingException encountered in lease taking thread", e);
|
||||||
|
|
@ -290,18 +273,35 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void start() throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
public void start(final MigrationAdaptiveLeaseAssignmentModeProvider leaseAssignmentModeProvider)
|
||||||
|
throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
||||||
leaseRenewer.initialize();
|
leaseRenewer.initialize();
|
||||||
|
// At max, we need 3 threads - lease renewer, lease taker, lease discoverer - to run without contention.
|
||||||
|
leaseCoordinatorThreadPool = Executors.newScheduledThreadPool(3, LEASE_COORDINATOR_THREAD_FACTORY);
|
||||||
|
|
||||||
// 2 because we know we'll have at most 2 concurrent tasks at a time.
|
// During migration to KCLv3.x from KCLv2.x, lease assignment mode can change dynamically, so
|
||||||
leaseCoordinatorThreadPool = Executors.newScheduledThreadPool(2, LEASE_COORDINATOR_THREAD_FACTORY);
|
// both lease assignment algorithms will be started but only one will execute based on
|
||||||
|
// leaseAssignmentModeProvider.getLeaseAssignmentMode(). However for new applications starting in
|
||||||
|
// KCLv3.x or applications successfully migrated to KCLv3.x, lease assignment mode will not
|
||||||
|
// change dynamically and will always be WORKER_UTILIZATION_AWARE_ASSIGNMENT, therefore
|
||||||
|
// don't initialize KCLv2.x lease assignment algorithm components that are not needed.
|
||||||
|
if (leaseAssignmentModeProvider.dynamicModeChangeSupportNeeded()) {
|
||||||
|
// Taker runs with fixed DELAY because we want it to run slower in the event of performance degradation.
|
||||||
|
takerFuture = leaseCoordinatorThreadPool.scheduleWithFixedDelay(
|
||||||
|
new TakerRunnable(leaseAssignmentModeProvider), 0L, takerIntervalMillis, TimeUnit.MILLISECONDS);
|
||||||
|
}
|
||||||
|
|
||||||
// Taker runs with fixed DELAY because we want it to run slower in the event of performance degredation.
|
leaseDiscoveryFuture = leaseCoordinatorThreadPool.scheduleAtFixedRate(
|
||||||
takerFuture = leaseCoordinatorThreadPool.scheduleWithFixedDelay(
|
new LeaseDiscoveryRunnable(leaseAssignmentModeProvider),
|
||||||
new TakerRunnable(), 0L, takerIntervalMillis, TimeUnit.MILLISECONDS);
|
0L,
|
||||||
// Renewer runs at fixed INTERVAL because we want it to run at the same rate in the event of degredation.
|
leaseDiscovererIntervalMillis,
|
||||||
|
TimeUnit.MILLISECONDS);
|
||||||
|
|
||||||
|
// Renewer runs at fixed INTERVAL because we want it to run at the same rate in the event of degradation.
|
||||||
leaseCoordinatorThreadPool.scheduleAtFixedRate(
|
leaseCoordinatorThreadPool.scheduleAtFixedRate(
|
||||||
new RenewerRunnable(), 0L, renewerIntervalMillis, TimeUnit.MILLISECONDS);
|
new RenewerRunnable(), 0L, renewerIntervalMillis, TimeUnit.MILLISECONDS);
|
||||||
|
|
||||||
|
leaseGracefulShutdownHandler.start();
|
||||||
running = true;
|
running = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -383,6 +383,8 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
||||||
}
|
}
|
||||||
|
|
||||||
leaseRenewalThreadpool.shutdownNow();
|
leaseRenewalThreadpool.shutdownNow();
|
||||||
|
leaseCoordinatorThreadPool.shutdownNow();
|
||||||
|
leaseGracefulShutdownHandler.stop();
|
||||||
synchronized (shutdownLock) {
|
synchronized (shutdownLock) {
|
||||||
leaseRenewer.clearCurrentlyHeldLeases();
|
leaseRenewer.clearCurrentlyHeldLeases();
|
||||||
running = false;
|
running = false;
|
||||||
|
|
@ -393,6 +395,10 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
||||||
public void stopLeaseTaker() {
|
public void stopLeaseTaker() {
|
||||||
if (takerFuture != null) {
|
if (takerFuture != null) {
|
||||||
takerFuture.cancel(false);
|
takerFuture.cancel(false);
|
||||||
|
leaseDiscoveryFuture.cancel(false);
|
||||||
|
// the method is called in worker graceful shutdown. We want to stop any further lease shutdown
|
||||||
|
// so we don't interrupt worker shutdown.
|
||||||
|
leaseGracefulShutdownHandler.stop();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -418,20 +424,15 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns executor service that should be used for lease renewal.
|
* Returns executor service for given ThreadFactory.
|
||||||
* @param maximumPoolSize Maximum allowed thread pool size
|
* @param maximumPoolSize Maximum allowed thread pool size
|
||||||
* @return Executor service that should be used for lease renewal.
|
* @return Executor service
|
||||||
*/
|
*/
|
||||||
private static ExecutorService getLeaseRenewalExecutorService(int maximumPoolSize) {
|
private static ExecutorService createExecutorService(final int maximumPoolSize, final ThreadFactory threadFactory) {
|
||||||
int coreLeaseCount = Math.max(maximumPoolSize / 4, 2);
|
int coreLeaseCount = Math.max(maximumPoolSize / 4, 2);
|
||||||
|
|
||||||
return new ThreadPoolExecutor(
|
return new ThreadPoolExecutor(
|
||||||
coreLeaseCount,
|
coreLeaseCount, maximumPoolSize, 60, TimeUnit.SECONDS, new LinkedTransferQueue<>(), threadFactory);
|
||||||
maximumPoolSize,
|
|
||||||
60,
|
|
||||||
TimeUnit.SECONDS,
|
|
||||||
new LinkedTransferQueue<>(),
|
|
||||||
LEASE_RENEWAL_THREAD_FACTORY);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
@ -472,6 +473,8 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
||||||
* {@inheritDoc}
|
* {@inheritDoc}
|
||||||
*
|
*
|
||||||
* <p>NOTE: This method is deprecated. Please set the initial capacity through the constructor.</p>
|
* <p>NOTE: This method is deprecated. Please set the initial capacity through the constructor.</p>
|
||||||
|
*
|
||||||
|
* This is a method of the public lease coordinator interface.
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
@Deprecated
|
@Deprecated
|
||||||
|
|
@ -487,6 +490,8 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
||||||
* {@inheritDoc}
|
* {@inheritDoc}
|
||||||
*
|
*
|
||||||
* <p>NOTE: This method is deprecated. Please set the initial capacity through the constructor.</p>
|
* <p>NOTE: This method is deprecated. Please set the initial capacity through the constructor.</p>
|
||||||
|
*
|
||||||
|
* This is a method of the public lease coordinator interface.
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
@Deprecated
|
@Deprecated
|
||||||
|
|
@ -497,4 +502,9 @@ public class DynamoDBLeaseCoordinator implements LeaseCoordinator {
|
||||||
initialLeaseTableWriteCapacity = writeCapacity;
|
initialLeaseTableWriteCapacity = writeCapacity;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public LeaseStatsRecorder leaseStatsRecorder() {
|
||||||
|
return leaseStatsRecorder;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,120 @@
|
||||||
|
package software.amazon.kinesis.leases.dynamodb;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.CompletableFuture;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.kinesis.leases.Lease;
|
||||||
|
import software.amazon.kinesis.leases.LeaseDiscoverer;
|
||||||
|
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||||
|
import software.amazon.kinesis.leases.LeaseRenewer;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsScope;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||||
|
|
||||||
|
import static java.util.Objects.isNull;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An implementation of {@link LeaseDiscoverer}, it uses {@link LeaseRefresher} to query
|
||||||
|
* {@link DynamoDBLeaseRefresher#LEASE_OWNER_TO_LEASE_KEY_INDEX_NAME } and find the leases assigned
|
||||||
|
* to current worker and then filter and returns the leases that have not started processing (looks at
|
||||||
|
* {@link LeaseRenewer#getCurrentlyHeldLeases()} to find out which leases are currently held leases).
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class DynamoDBLeaseDiscoverer implements LeaseDiscoverer {
|
||||||
|
|
||||||
|
private final LeaseRefresher leaseRefresher;
|
||||||
|
private final LeaseRenewer leaseRenewer;
|
||||||
|
private final MetricsFactory metricsFactory;
|
||||||
|
private final String workerIdentifier;
|
||||||
|
private final ExecutorService executorService;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Lease> discoverNewLeases()
|
||||||
|
throws ProvisionedThroughputException, InvalidStateException, DependencyException {
|
||||||
|
final MetricsScope metricsScope = MetricsUtil.createMetricsWithOperation(metricsFactory, "LeaseDiscovery");
|
||||||
|
long startTime = System.currentTimeMillis();
|
||||||
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
final Set<String> currentHeldLeaseKeys =
|
||||||
|
leaseRenewer.getCurrentlyHeldLeases().keySet();
|
||||||
|
|
||||||
|
final long listLeaseKeysForWorkerStartTime = System.currentTimeMillis();
|
||||||
|
final List<String> leaseKeys = leaseRefresher.listLeaseKeysForWorker(workerIdentifier);
|
||||||
|
MetricsUtil.addLatency(
|
||||||
|
metricsScope, "ListLeaseKeysForWorker", listLeaseKeysForWorkerStartTime, MetricsLevel.DETAILED);
|
||||||
|
|
||||||
|
final List<String> newLeaseKeys = leaseKeys.stream()
|
||||||
|
.filter(leaseKey -> !currentHeldLeaseKeys.contains(leaseKey))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
final long fetchNewLeasesStartTime = System.currentTimeMillis();
|
||||||
|
final List<CompletableFuture<Lease>> completableFutures = newLeaseKeys.stream()
|
||||||
|
.map(leaseKey ->
|
||||||
|
CompletableFuture.supplyAsync(() -> fetchLease(leaseKey, metricsScope), executorService))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
final List<Lease> newLeases = completableFutures.stream()
|
||||||
|
.map(CompletableFuture::join)
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
log.info(
|
||||||
|
"New leases assigned to worker : {}, count : {}, leases : {}",
|
||||||
|
workerIdentifier,
|
||||||
|
newLeases.size(),
|
||||||
|
newLeases.stream().map(Lease::leaseKey).collect(Collectors.toList()));
|
||||||
|
|
||||||
|
MetricsUtil.addLatency(metricsScope, "FetchNewLeases", fetchNewLeasesStartTime, MetricsLevel.DETAILED);
|
||||||
|
|
||||||
|
success = true;
|
||||||
|
MetricsUtil.addCount(metricsScope, "NewLeasesDiscovered", newLeases.size(), MetricsLevel.DETAILED);
|
||||||
|
return newLeases;
|
||||||
|
} finally {
|
||||||
|
MetricsUtil.addWorkerIdentifier(metricsScope, workerIdentifier);
|
||||||
|
MetricsUtil.addSuccessAndLatency(metricsScope, success, startTime, MetricsLevel.SUMMARY);
|
||||||
|
MetricsUtil.endScope(metricsScope);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Lease fetchLease(final String leaseKey, final MetricsScope metricsScope) {
|
||||||
|
try {
|
||||||
|
final Lease lease = leaseRefresher.getLease(leaseKey);
|
||||||
|
if (isNull(lease)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
// GSI is eventually consistent thus, validate that the fetched lease is indeed assigned to this
|
||||||
|
// worker, if not just pass in this run.
|
||||||
|
if (!lease.leaseOwner().equals(workerIdentifier)) {
|
||||||
|
MetricsUtil.addCount(metricsScope, "OwnerMismatch", 1, MetricsLevel.DETAILED);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
// if checkpointOwner is not null, it means that the lease is still pending shutdown for the last owner.
|
||||||
|
// Don't add the lease to the in-memory map yet.
|
||||||
|
if (lease.checkpointOwner() != null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
// when a new lease is discovered, set the lastCounterIncrementNanos to current time as the time
|
||||||
|
// when it has become visible, on next renewer interval this will be updated by LeaseRenewer to
|
||||||
|
// correct time.
|
||||||
|
lease.lastCounterIncrementNanos(System.nanoTime());
|
||||||
|
return lease;
|
||||||
|
} catch (final Exception e) {
|
||||||
|
// if getLease on some lease key fail, continue and fetch other leases, the one failed will
|
||||||
|
// be fetched in the next iteration or will be reassigned if stayed idle for long.
|
||||||
|
MetricsUtil.addCount(metricsScope, "GetLease:Error", 1, MetricsLevel.SUMMARY);
|
||||||
|
log.error("GetLease failed for leaseKey : {}", leaseKey, e);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -17,18 +17,21 @@ package software.amazon.kinesis.leases.dynamodb;
|
||||||
|
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
import java.util.concurrent.ConcurrentMap;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.NonNull;
|
import lombok.NonNull;
|
||||||
import software.amazon.awssdk.core.util.DefaultSdkAutoConstructList;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import software.amazon.awssdk.services.dynamodb.DynamoDbAsyncClient;
|
import software.amazon.awssdk.services.dynamodb.DynamoDbAsyncClient;
|
||||||
import software.amazon.awssdk.services.dynamodb.model.BillingMode;
|
import software.amazon.awssdk.services.dynamodb.model.BillingMode;
|
||||||
import software.amazon.awssdk.services.dynamodb.model.Tag;
|
import software.amazon.awssdk.services.dynamodb.model.Tag;
|
||||||
import software.amazon.awssdk.services.kinesis.KinesisAsyncClient;
|
import software.amazon.awssdk.services.kinesis.KinesisAsyncClient;
|
||||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.common.DdbTableConfig;
|
||||||
import software.amazon.kinesis.common.InitialPositionInStreamExtended;
|
import software.amazon.kinesis.common.InitialPositionInStreamExtended;
|
||||||
import software.amazon.kinesis.common.LeaseCleanupConfig;
|
import software.amazon.kinesis.common.LeaseCleanupConfig;
|
||||||
import software.amazon.kinesis.common.StreamConfig;
|
import software.amazon.kinesis.common.StreamConfig;
|
||||||
|
|
@ -42,12 +45,15 @@ import software.amazon.kinesis.leases.LeaseManagementConfig;
|
||||||
import software.amazon.kinesis.leases.LeaseManagementFactory;
|
import software.amazon.kinesis.leases.LeaseManagementFactory;
|
||||||
import software.amazon.kinesis.leases.LeaseSerializer;
|
import software.amazon.kinesis.leases.LeaseSerializer;
|
||||||
import software.amazon.kinesis.leases.ShardDetector;
|
import software.amazon.kinesis.leases.ShardDetector;
|
||||||
|
import software.amazon.kinesis.leases.ShardInfo;
|
||||||
import software.amazon.kinesis.leases.ShardSyncTaskManager;
|
import software.amazon.kinesis.leases.ShardSyncTaskManager;
|
||||||
|
import software.amazon.kinesis.lifecycle.ShardConsumer;
|
||||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
@Slf4j
|
||||||
@Data
|
@Data
|
||||||
@KinesisClientInternalApi
|
@KinesisClientInternalApi
|
||||||
public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
||||||
|
|
@ -73,6 +79,8 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
||||||
@NonNull
|
@NonNull
|
||||||
private final LeaseSerializer leaseSerializer;
|
private final LeaseSerializer leaseSerializer;
|
||||||
|
|
||||||
|
private final LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig;
|
||||||
|
|
||||||
@NonNull
|
@NonNull
|
||||||
private StreamConfig streamConfig;
|
private StreamConfig streamConfig;
|
||||||
|
|
||||||
|
|
@ -103,434 +111,11 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
||||||
private final Collection<Tag> tags;
|
private final Collection<Tag> tags;
|
||||||
private final boolean isMultiStreamMode;
|
private final boolean isMultiStreamMode;
|
||||||
private final LeaseCleanupConfig leaseCleanupConfig;
|
private final LeaseCleanupConfig leaseCleanupConfig;
|
||||||
|
private final LeaseManagementConfig.GracefulLeaseHandoffConfig gracefulLeaseHandoffConfig;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructor.
|
* Constructor.
|
||||||
*
|
* @deprecated this is used by the deprecated method in LeaseManagementConfig to construct the LeaseManagement factory
|
||||||
* <p>NOTE: This constructor is deprecated and will be removed in a future release.</p>
|
|
||||||
*
|
|
||||||
* @param kinesisClient
|
|
||||||
* @param streamName
|
|
||||||
* @param dynamoDBClient
|
|
||||||
* @param tableName
|
|
||||||
* @param workerIdentifier
|
|
||||||
* @param executorService
|
|
||||||
* @param initialPositionInStream
|
|
||||||
* @param failoverTimeMillis
|
|
||||||
* @param epsilonMillis
|
|
||||||
* @param maxLeasesForWorker
|
|
||||||
* @param maxLeasesToStealAtOneTime
|
|
||||||
* @param maxLeaseRenewalThreads
|
|
||||||
* @param cleanupLeasesUponShardCompletion
|
|
||||||
* @param ignoreUnexpectedChildShards
|
|
||||||
* @param shardSyncIntervalMillis
|
|
||||||
* @param consistentReads
|
|
||||||
* @param listShardsBackoffTimeMillis
|
|
||||||
* @param maxListShardsRetryAttempts
|
|
||||||
* @param maxCacheMissesBeforeReload
|
|
||||||
* @param listShardsCacheAllowedAgeInSeconds
|
|
||||||
* @param cacheMissWarningModulus
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public DynamoDBLeaseManagementFactory(
|
|
||||||
final KinesisAsyncClient kinesisClient,
|
|
||||||
final String streamName,
|
|
||||||
final DynamoDbAsyncClient dynamoDBClient,
|
|
||||||
final String tableName,
|
|
||||||
final String workerIdentifier,
|
|
||||||
final ExecutorService executorService,
|
|
||||||
final InitialPositionInStreamExtended initialPositionInStream,
|
|
||||||
final long failoverTimeMillis,
|
|
||||||
final long epsilonMillis,
|
|
||||||
final int maxLeasesForWorker,
|
|
||||||
final int maxLeasesToStealAtOneTime,
|
|
||||||
final int maxLeaseRenewalThreads,
|
|
||||||
final boolean cleanupLeasesUponShardCompletion,
|
|
||||||
final boolean ignoreUnexpectedChildShards,
|
|
||||||
final long shardSyncIntervalMillis,
|
|
||||||
final boolean consistentReads,
|
|
||||||
final long listShardsBackoffTimeMillis,
|
|
||||||
final int maxListShardsRetryAttempts,
|
|
||||||
final int maxCacheMissesBeforeReload,
|
|
||||||
final long listShardsCacheAllowedAgeInSeconds,
|
|
||||||
final int cacheMissWarningModulus) {
|
|
||||||
this(
|
|
||||||
kinesisClient,
|
|
||||||
streamName,
|
|
||||||
dynamoDBClient,
|
|
||||||
tableName,
|
|
||||||
workerIdentifier,
|
|
||||||
executorService,
|
|
||||||
initialPositionInStream,
|
|
||||||
failoverTimeMillis,
|
|
||||||
epsilonMillis,
|
|
||||||
maxLeasesForWorker,
|
|
||||||
maxLeasesToStealAtOneTime,
|
|
||||||
maxLeaseRenewalThreads,
|
|
||||||
cleanupLeasesUponShardCompletion,
|
|
||||||
ignoreUnexpectedChildShards,
|
|
||||||
shardSyncIntervalMillis,
|
|
||||||
consistentReads,
|
|
||||||
listShardsBackoffTimeMillis,
|
|
||||||
maxListShardsRetryAttempts,
|
|
||||||
maxCacheMissesBeforeReload,
|
|
||||||
listShardsCacheAllowedAgeInSeconds,
|
|
||||||
cacheMissWarningModulus,
|
|
||||||
TableConstants.DEFAULT_INITIAL_LEASE_TABLE_READ_CAPACITY,
|
|
||||||
TableConstants.DEFAULT_INITIAL_LEASE_TABLE_WRITE_CAPACITY);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructor.
|
|
||||||
*
|
|
||||||
* <p>
|
|
||||||
* NOTE: This constructor is deprecated and will be removed in a future release.
|
|
||||||
* </p>
|
|
||||||
*
|
|
||||||
* @param kinesisClient
|
|
||||||
* @param streamName
|
|
||||||
* @param dynamoDBClient
|
|
||||||
* @param tableName
|
|
||||||
* @param workerIdentifier
|
|
||||||
* @param executorService
|
|
||||||
* @param initialPositionInStream
|
|
||||||
* @param failoverTimeMillis
|
|
||||||
* @param epsilonMillis
|
|
||||||
* @param maxLeasesForWorker
|
|
||||||
* @param maxLeasesToStealAtOneTime
|
|
||||||
* @param maxLeaseRenewalThreads
|
|
||||||
* @param cleanupLeasesUponShardCompletion
|
|
||||||
* @param ignoreUnexpectedChildShards
|
|
||||||
* @param shardSyncIntervalMillis
|
|
||||||
* @param consistentReads
|
|
||||||
* @param listShardsBackoffTimeMillis
|
|
||||||
* @param maxListShardsRetryAttempts
|
|
||||||
* @param maxCacheMissesBeforeReload
|
|
||||||
* @param listShardsCacheAllowedAgeInSeconds
|
|
||||||
* @param cacheMissWarningModulus
|
|
||||||
* @param initialLeaseTableReadCapacity
|
|
||||||
* @param initialLeaseTableWriteCapacity
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public DynamoDBLeaseManagementFactory(
|
|
||||||
final KinesisAsyncClient kinesisClient,
|
|
||||||
final String streamName,
|
|
||||||
final DynamoDbAsyncClient dynamoDBClient,
|
|
||||||
final String tableName,
|
|
||||||
final String workerIdentifier,
|
|
||||||
final ExecutorService executorService,
|
|
||||||
final InitialPositionInStreamExtended initialPositionInStream,
|
|
||||||
final long failoverTimeMillis,
|
|
||||||
final long epsilonMillis,
|
|
||||||
final int maxLeasesForWorker,
|
|
||||||
final int maxLeasesToStealAtOneTime,
|
|
||||||
final int maxLeaseRenewalThreads,
|
|
||||||
final boolean cleanupLeasesUponShardCompletion,
|
|
||||||
final boolean ignoreUnexpectedChildShards,
|
|
||||||
final long shardSyncIntervalMillis,
|
|
||||||
final boolean consistentReads,
|
|
||||||
final long listShardsBackoffTimeMillis,
|
|
||||||
final int maxListShardsRetryAttempts,
|
|
||||||
final int maxCacheMissesBeforeReload,
|
|
||||||
final long listShardsCacheAllowedAgeInSeconds,
|
|
||||||
final int cacheMissWarningModulus,
|
|
||||||
final long initialLeaseTableReadCapacity,
|
|
||||||
final long initialLeaseTableWriteCapacity) {
|
|
||||||
this(
|
|
||||||
kinesisClient,
|
|
||||||
streamName,
|
|
||||||
dynamoDBClient,
|
|
||||||
tableName,
|
|
||||||
workerIdentifier,
|
|
||||||
executorService,
|
|
||||||
initialPositionInStream,
|
|
||||||
failoverTimeMillis,
|
|
||||||
epsilonMillis,
|
|
||||||
maxLeasesForWorker,
|
|
||||||
maxLeasesToStealAtOneTime,
|
|
||||||
maxLeaseRenewalThreads,
|
|
||||||
cleanupLeasesUponShardCompletion,
|
|
||||||
ignoreUnexpectedChildShards,
|
|
||||||
shardSyncIntervalMillis,
|
|
||||||
consistentReads,
|
|
||||||
listShardsBackoffTimeMillis,
|
|
||||||
maxListShardsRetryAttempts,
|
|
||||||
maxCacheMissesBeforeReload,
|
|
||||||
listShardsCacheAllowedAgeInSeconds,
|
|
||||||
cacheMissWarningModulus,
|
|
||||||
initialLeaseTableReadCapacity,
|
|
||||||
initialLeaseTableWriteCapacity,
|
|
||||||
new HierarchicalShardSyncer(),
|
|
||||||
TableCreatorCallback.NOOP_TABLE_CREATOR_CALLBACK,
|
|
||||||
LeaseManagementConfig.DEFAULT_REQUEST_TIMEOUT);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructor.
|
|
||||||
*
|
|
||||||
* @param kinesisClient
|
|
||||||
* @param streamName
|
|
||||||
* @param dynamoDBClient
|
|
||||||
* @param tableName
|
|
||||||
* @param workerIdentifier
|
|
||||||
* @param executorService
|
|
||||||
* @param initialPositionInStream
|
|
||||||
* @param failoverTimeMillis
|
|
||||||
* @param epsilonMillis
|
|
||||||
* @param maxLeasesForWorker
|
|
||||||
* @param maxLeasesToStealAtOneTime
|
|
||||||
* @param maxLeaseRenewalThreads
|
|
||||||
* @param cleanupLeasesUponShardCompletion
|
|
||||||
* @param ignoreUnexpectedChildShards
|
|
||||||
* @param shardSyncIntervalMillis
|
|
||||||
* @param consistentReads
|
|
||||||
* @param listShardsBackoffTimeMillis
|
|
||||||
* @param maxListShardsRetryAttempts
|
|
||||||
* @param maxCacheMissesBeforeReload
|
|
||||||
* @param listShardsCacheAllowedAgeInSeconds
|
|
||||||
* @param cacheMissWarningModulus
|
|
||||||
* @param initialLeaseTableReadCapacity
|
|
||||||
* @param initialLeaseTableWriteCapacity
|
|
||||||
* @param hierarchicalShardSyncer
|
|
||||||
* @param tableCreatorCallback
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public DynamoDBLeaseManagementFactory(
|
|
||||||
final KinesisAsyncClient kinesisClient,
|
|
||||||
final String streamName,
|
|
||||||
final DynamoDbAsyncClient dynamoDBClient,
|
|
||||||
final String tableName,
|
|
||||||
final String workerIdentifier,
|
|
||||||
final ExecutorService executorService,
|
|
||||||
final InitialPositionInStreamExtended initialPositionInStream,
|
|
||||||
final long failoverTimeMillis,
|
|
||||||
final long epsilonMillis,
|
|
||||||
final int maxLeasesForWorker,
|
|
||||||
final int maxLeasesToStealAtOneTime,
|
|
||||||
final int maxLeaseRenewalThreads,
|
|
||||||
final boolean cleanupLeasesUponShardCompletion,
|
|
||||||
final boolean ignoreUnexpectedChildShards,
|
|
||||||
final long shardSyncIntervalMillis,
|
|
||||||
final boolean consistentReads,
|
|
||||||
final long listShardsBackoffTimeMillis,
|
|
||||||
final int maxListShardsRetryAttempts,
|
|
||||||
final int maxCacheMissesBeforeReload,
|
|
||||||
final long listShardsCacheAllowedAgeInSeconds,
|
|
||||||
final int cacheMissWarningModulus,
|
|
||||||
final long initialLeaseTableReadCapacity,
|
|
||||||
final long initialLeaseTableWriteCapacity,
|
|
||||||
final HierarchicalShardSyncer hierarchicalShardSyncer,
|
|
||||||
final TableCreatorCallback tableCreatorCallback) {
|
|
||||||
this(
|
|
||||||
kinesisClient,
|
|
||||||
streamName,
|
|
||||||
dynamoDBClient,
|
|
||||||
tableName,
|
|
||||||
workerIdentifier,
|
|
||||||
executorService,
|
|
||||||
initialPositionInStream,
|
|
||||||
failoverTimeMillis,
|
|
||||||
epsilonMillis,
|
|
||||||
maxLeasesForWorker,
|
|
||||||
maxLeasesToStealAtOneTime,
|
|
||||||
maxLeaseRenewalThreads,
|
|
||||||
cleanupLeasesUponShardCompletion,
|
|
||||||
ignoreUnexpectedChildShards,
|
|
||||||
shardSyncIntervalMillis,
|
|
||||||
consistentReads,
|
|
||||||
listShardsBackoffTimeMillis,
|
|
||||||
maxListShardsRetryAttempts,
|
|
||||||
maxCacheMissesBeforeReload,
|
|
||||||
listShardsCacheAllowedAgeInSeconds,
|
|
||||||
cacheMissWarningModulus,
|
|
||||||
initialLeaseTableReadCapacity,
|
|
||||||
initialLeaseTableWriteCapacity,
|
|
||||||
hierarchicalShardSyncer,
|
|
||||||
tableCreatorCallback,
|
|
||||||
LeaseManagementConfig.DEFAULT_REQUEST_TIMEOUT);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructor.
|
|
||||||
*
|
|
||||||
* @param kinesisClient
|
|
||||||
* @param streamName
|
|
||||||
* @param dynamoDBClient
|
|
||||||
* @param tableName
|
|
||||||
* @param workerIdentifier
|
|
||||||
* @param executorService
|
|
||||||
* @param initialPositionInStream
|
|
||||||
* @param failoverTimeMillis
|
|
||||||
* @param epsilonMillis
|
|
||||||
* @param maxLeasesForWorker
|
|
||||||
* @param maxLeasesToStealAtOneTime
|
|
||||||
* @param maxLeaseRenewalThreads
|
|
||||||
* @param cleanupLeasesUponShardCompletion
|
|
||||||
* @param ignoreUnexpectedChildShards
|
|
||||||
* @param shardSyncIntervalMillis
|
|
||||||
* @param consistentReads
|
|
||||||
* @param listShardsBackoffTimeMillis
|
|
||||||
* @param maxListShardsRetryAttempts
|
|
||||||
* @param maxCacheMissesBeforeReload
|
|
||||||
* @param listShardsCacheAllowedAgeInSeconds
|
|
||||||
* @param cacheMissWarningModulus
|
|
||||||
* @param initialLeaseTableReadCapacity
|
|
||||||
* @param initialLeaseTableWriteCapacity
|
|
||||||
* @param hierarchicalShardSyncer
|
|
||||||
* @param tableCreatorCallback
|
|
||||||
* @param dynamoDbRequestTimeout
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public DynamoDBLeaseManagementFactory(
|
|
||||||
final KinesisAsyncClient kinesisClient,
|
|
||||||
final String streamName,
|
|
||||||
final DynamoDbAsyncClient dynamoDBClient,
|
|
||||||
final String tableName,
|
|
||||||
final String workerIdentifier,
|
|
||||||
final ExecutorService executorService,
|
|
||||||
final InitialPositionInStreamExtended initialPositionInStream,
|
|
||||||
final long failoverTimeMillis,
|
|
||||||
final long epsilonMillis,
|
|
||||||
final int maxLeasesForWorker,
|
|
||||||
final int maxLeasesToStealAtOneTime,
|
|
||||||
final int maxLeaseRenewalThreads,
|
|
||||||
final boolean cleanupLeasesUponShardCompletion,
|
|
||||||
final boolean ignoreUnexpectedChildShards,
|
|
||||||
final long shardSyncIntervalMillis,
|
|
||||||
final boolean consistentReads,
|
|
||||||
final long listShardsBackoffTimeMillis,
|
|
||||||
final int maxListShardsRetryAttempts,
|
|
||||||
final int maxCacheMissesBeforeReload,
|
|
||||||
final long listShardsCacheAllowedAgeInSeconds,
|
|
||||||
final int cacheMissWarningModulus,
|
|
||||||
final long initialLeaseTableReadCapacity,
|
|
||||||
final long initialLeaseTableWriteCapacity,
|
|
||||||
final HierarchicalShardSyncer hierarchicalShardSyncer,
|
|
||||||
final TableCreatorCallback tableCreatorCallback,
|
|
||||||
Duration dynamoDbRequestTimeout) {
|
|
||||||
this(
|
|
||||||
kinesisClient,
|
|
||||||
streamName,
|
|
||||||
dynamoDBClient,
|
|
||||||
tableName,
|
|
||||||
workerIdentifier,
|
|
||||||
executorService,
|
|
||||||
initialPositionInStream,
|
|
||||||
failoverTimeMillis,
|
|
||||||
epsilonMillis,
|
|
||||||
maxLeasesForWorker,
|
|
||||||
maxLeasesToStealAtOneTime,
|
|
||||||
maxLeaseRenewalThreads,
|
|
||||||
cleanupLeasesUponShardCompletion,
|
|
||||||
ignoreUnexpectedChildShards,
|
|
||||||
shardSyncIntervalMillis,
|
|
||||||
consistentReads,
|
|
||||||
listShardsBackoffTimeMillis,
|
|
||||||
maxListShardsRetryAttempts,
|
|
||||||
maxCacheMissesBeforeReload,
|
|
||||||
listShardsCacheAllowedAgeInSeconds,
|
|
||||||
cacheMissWarningModulus,
|
|
||||||
initialLeaseTableReadCapacity,
|
|
||||||
initialLeaseTableWriteCapacity,
|
|
||||||
hierarchicalShardSyncer,
|
|
||||||
tableCreatorCallback,
|
|
||||||
dynamoDbRequestTimeout,
|
|
||||||
BillingMode.PAY_PER_REQUEST);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructor.
|
|
||||||
*
|
|
||||||
* @param kinesisClient
|
|
||||||
* @param streamName
|
|
||||||
* @param dynamoDBClient
|
|
||||||
* @param tableName
|
|
||||||
* @param workerIdentifier
|
|
||||||
* @param executorService
|
|
||||||
* @param initialPositionInStream
|
|
||||||
* @param failoverTimeMillis
|
|
||||||
* @param epsilonMillis
|
|
||||||
* @param maxLeasesForWorker
|
|
||||||
* @param maxLeasesToStealAtOneTime
|
|
||||||
* @param maxLeaseRenewalThreads
|
|
||||||
* @param cleanupLeasesUponShardCompletion
|
|
||||||
* @param ignoreUnexpectedChildShards
|
|
||||||
* @param shardSyncIntervalMillis
|
|
||||||
* @param consistentReads
|
|
||||||
* @param listShardsBackoffTimeMillis
|
|
||||||
* @param maxListShardsRetryAttempts
|
|
||||||
* @param maxCacheMissesBeforeReload
|
|
||||||
* @param listShardsCacheAllowedAgeInSeconds
|
|
||||||
* @param cacheMissWarningModulus
|
|
||||||
* @param initialLeaseTableReadCapacity
|
|
||||||
* @param initialLeaseTableWriteCapacity
|
|
||||||
* @param hierarchicalShardSyncer
|
|
||||||
* @param tableCreatorCallback
|
|
||||||
* @param dynamoDbRequestTimeout
|
|
||||||
* @param billingMode
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public DynamoDBLeaseManagementFactory(
|
|
||||||
final KinesisAsyncClient kinesisClient,
|
|
||||||
final String streamName,
|
|
||||||
final DynamoDbAsyncClient dynamoDBClient,
|
|
||||||
final String tableName,
|
|
||||||
final String workerIdentifier,
|
|
||||||
final ExecutorService executorService,
|
|
||||||
final InitialPositionInStreamExtended initialPositionInStream,
|
|
||||||
final long failoverTimeMillis,
|
|
||||||
final long epsilonMillis,
|
|
||||||
final int maxLeasesForWorker,
|
|
||||||
final int maxLeasesToStealAtOneTime,
|
|
||||||
final int maxLeaseRenewalThreads,
|
|
||||||
final boolean cleanupLeasesUponShardCompletion,
|
|
||||||
final boolean ignoreUnexpectedChildShards,
|
|
||||||
final long shardSyncIntervalMillis,
|
|
||||||
final boolean consistentReads,
|
|
||||||
final long listShardsBackoffTimeMillis,
|
|
||||||
final int maxListShardsRetryAttempts,
|
|
||||||
final int maxCacheMissesBeforeReload,
|
|
||||||
final long listShardsCacheAllowedAgeInSeconds,
|
|
||||||
final int cacheMissWarningModulus,
|
|
||||||
final long initialLeaseTableReadCapacity,
|
|
||||||
final long initialLeaseTableWriteCapacity,
|
|
||||||
final HierarchicalShardSyncer hierarchicalShardSyncer,
|
|
||||||
final TableCreatorCallback tableCreatorCallback,
|
|
||||||
Duration dynamoDbRequestTimeout,
|
|
||||||
BillingMode billingMode) {
|
|
||||||
|
|
||||||
this(
|
|
||||||
kinesisClient,
|
|
||||||
new StreamConfig(StreamIdentifier.singleStreamInstance(streamName), initialPositionInStream),
|
|
||||||
dynamoDBClient,
|
|
||||||
tableName,
|
|
||||||
workerIdentifier,
|
|
||||||
executorService,
|
|
||||||
failoverTimeMillis,
|
|
||||||
epsilonMillis,
|
|
||||||
maxLeasesForWorker,
|
|
||||||
maxLeasesToStealAtOneTime,
|
|
||||||
maxLeaseRenewalThreads,
|
|
||||||
cleanupLeasesUponShardCompletion,
|
|
||||||
ignoreUnexpectedChildShards,
|
|
||||||
shardSyncIntervalMillis,
|
|
||||||
consistentReads,
|
|
||||||
listShardsBackoffTimeMillis,
|
|
||||||
maxListShardsRetryAttempts,
|
|
||||||
maxCacheMissesBeforeReload,
|
|
||||||
listShardsCacheAllowedAgeInSeconds,
|
|
||||||
cacheMissWarningModulus,
|
|
||||||
initialLeaseTableReadCapacity,
|
|
||||||
initialLeaseTableWriteCapacity,
|
|
||||||
hierarchicalShardSyncer,
|
|
||||||
tableCreatorCallback,
|
|
||||||
dynamoDbRequestTimeout,
|
|
||||||
billingMode,
|
|
||||||
new DynamoDBLeaseSerializer());
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructor.
|
|
||||||
*
|
*
|
||||||
* @param kinesisClient
|
* @param kinesisClient
|
||||||
* @param streamName
|
* @param streamName
|
||||||
|
|
@ -592,291 +177,6 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
||||||
BillingMode billingMode,
|
BillingMode billingMode,
|
||||||
Collection<Tag> tags) {
|
Collection<Tag> tags) {
|
||||||
|
|
||||||
this(
|
|
||||||
kinesisClient,
|
|
||||||
new StreamConfig(StreamIdentifier.singleStreamInstance(streamName), initialPositionInStream),
|
|
||||||
dynamoDBClient,
|
|
||||||
tableName,
|
|
||||||
workerIdentifier,
|
|
||||||
executorService,
|
|
||||||
failoverTimeMillis,
|
|
||||||
epsilonMillis,
|
|
||||||
maxLeasesForWorker,
|
|
||||||
maxLeasesToStealAtOneTime,
|
|
||||||
maxLeaseRenewalThreads,
|
|
||||||
cleanupLeasesUponShardCompletion,
|
|
||||||
ignoreUnexpectedChildShards,
|
|
||||||
shardSyncIntervalMillis,
|
|
||||||
consistentReads,
|
|
||||||
listShardsBackoffTimeMillis,
|
|
||||||
maxListShardsRetryAttempts,
|
|
||||||
maxCacheMissesBeforeReload,
|
|
||||||
listShardsCacheAllowedAgeInSeconds,
|
|
||||||
cacheMissWarningModulus,
|
|
||||||
initialLeaseTableReadCapacity,
|
|
||||||
initialLeaseTableWriteCapacity,
|
|
||||||
hierarchicalShardSyncer,
|
|
||||||
tableCreatorCallback,
|
|
||||||
dynamoDbRequestTimeout,
|
|
||||||
billingMode,
|
|
||||||
new DynamoDBLeaseSerializer());
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructor.
|
|
||||||
*
|
|
||||||
* @param kinesisClient
|
|
||||||
* @param streamConfig
|
|
||||||
* @param dynamoDBClient
|
|
||||||
* @param tableName
|
|
||||||
* @param workerIdentifier
|
|
||||||
* @param executorService
|
|
||||||
* @param failoverTimeMillis
|
|
||||||
* @param epsilonMillis
|
|
||||||
* @param maxLeasesForWorker
|
|
||||||
* @param maxLeasesToStealAtOneTime
|
|
||||||
* @param maxLeaseRenewalThreads
|
|
||||||
* @param cleanupLeasesUponShardCompletion
|
|
||||||
* @param ignoreUnexpectedChildShards
|
|
||||||
* @param shardSyncIntervalMillis
|
|
||||||
* @param consistentReads
|
|
||||||
* @param listShardsBackoffTimeMillis
|
|
||||||
* @param maxListShardsRetryAttempts
|
|
||||||
* @param maxCacheMissesBeforeReload
|
|
||||||
* @param listShardsCacheAllowedAgeInSeconds
|
|
||||||
* @param cacheMissWarningModulus
|
|
||||||
* @param initialLeaseTableReadCapacity
|
|
||||||
* @param initialLeaseTableWriteCapacity
|
|
||||||
* @param deprecatedHierarchicalShardSyncer
|
|
||||||
* @param tableCreatorCallback
|
|
||||||
* @param dynamoDbRequestTimeout
|
|
||||||
* @param billingMode
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
private DynamoDBLeaseManagementFactory(
|
|
||||||
final KinesisAsyncClient kinesisClient,
|
|
||||||
final StreamConfig streamConfig,
|
|
||||||
final DynamoDbAsyncClient dynamoDBClient,
|
|
||||||
final String tableName,
|
|
||||||
final String workerIdentifier,
|
|
||||||
final ExecutorService executorService,
|
|
||||||
final long failoverTimeMillis,
|
|
||||||
final long epsilonMillis,
|
|
||||||
final int maxLeasesForWorker,
|
|
||||||
final int maxLeasesToStealAtOneTime,
|
|
||||||
final int maxLeaseRenewalThreads,
|
|
||||||
final boolean cleanupLeasesUponShardCompletion,
|
|
||||||
final boolean ignoreUnexpectedChildShards,
|
|
||||||
final long shardSyncIntervalMillis,
|
|
||||||
final boolean consistentReads,
|
|
||||||
final long listShardsBackoffTimeMillis,
|
|
||||||
final int maxListShardsRetryAttempts,
|
|
||||||
final int maxCacheMissesBeforeReload,
|
|
||||||
final long listShardsCacheAllowedAgeInSeconds,
|
|
||||||
final int cacheMissWarningModulus,
|
|
||||||
final long initialLeaseTableReadCapacity,
|
|
||||||
final long initialLeaseTableWriteCapacity,
|
|
||||||
final HierarchicalShardSyncer deprecatedHierarchicalShardSyncer,
|
|
||||||
final TableCreatorCallback tableCreatorCallback,
|
|
||||||
Duration dynamoDbRequestTimeout,
|
|
||||||
BillingMode billingMode,
|
|
||||||
LeaseSerializer leaseSerializer) {
|
|
||||||
this(
|
|
||||||
kinesisClient,
|
|
||||||
streamConfig,
|
|
||||||
dynamoDBClient,
|
|
||||||
tableName,
|
|
||||||
workerIdentifier,
|
|
||||||
executorService,
|
|
||||||
failoverTimeMillis,
|
|
||||||
epsilonMillis,
|
|
||||||
maxLeasesForWorker,
|
|
||||||
maxLeasesToStealAtOneTime,
|
|
||||||
maxLeaseRenewalThreads,
|
|
||||||
cleanupLeasesUponShardCompletion,
|
|
||||||
ignoreUnexpectedChildShards,
|
|
||||||
shardSyncIntervalMillis,
|
|
||||||
consistentReads,
|
|
||||||
listShardsBackoffTimeMillis,
|
|
||||||
maxListShardsRetryAttempts,
|
|
||||||
maxCacheMissesBeforeReload,
|
|
||||||
listShardsCacheAllowedAgeInSeconds,
|
|
||||||
cacheMissWarningModulus,
|
|
||||||
initialLeaseTableReadCapacity,
|
|
||||||
initialLeaseTableWriteCapacity,
|
|
||||||
deprecatedHierarchicalShardSyncer,
|
|
||||||
tableCreatorCallback,
|
|
||||||
dynamoDbRequestTimeout,
|
|
||||||
billingMode,
|
|
||||||
LeaseManagementConfig.DEFAULT_LEASE_TABLE_DELETION_PROTECTION_ENABLED,
|
|
||||||
DefaultSdkAutoConstructList.getInstance(),
|
|
||||||
leaseSerializer);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructor.
|
|
||||||
*
|
|
||||||
* @param kinesisClient
|
|
||||||
* @param streamConfig
|
|
||||||
* @param dynamoDBClient
|
|
||||||
* @param tableName
|
|
||||||
* @param workerIdentifier
|
|
||||||
* @param executorService
|
|
||||||
* @param failoverTimeMillis
|
|
||||||
* @param epsilonMillis
|
|
||||||
* @param maxLeasesForWorker
|
|
||||||
* @param maxLeasesToStealAtOneTime
|
|
||||||
* @param maxLeaseRenewalThreads
|
|
||||||
* @param cleanupLeasesUponShardCompletion
|
|
||||||
* @param ignoreUnexpectedChildShards
|
|
||||||
* @param shardSyncIntervalMillis
|
|
||||||
* @param consistentReads
|
|
||||||
* @param listShardsBackoffTimeMillis
|
|
||||||
* @param maxListShardsRetryAttempts
|
|
||||||
* @param maxCacheMissesBeforeReload
|
|
||||||
* @param listShardsCacheAllowedAgeInSeconds
|
|
||||||
* @param cacheMissWarningModulus
|
|
||||||
* @param initialLeaseTableReadCapacity
|
|
||||||
* @param initialLeaseTableWriteCapacity
|
|
||||||
* @param deprecatedHierarchicalShardSyncer
|
|
||||||
* @param tableCreatorCallback
|
|
||||||
* @param dynamoDbRequestTimeout
|
|
||||||
* @param billingMode
|
|
||||||
* @param leaseTableDeletionProtectionEnabled
|
|
||||||
* @param tags
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
private DynamoDBLeaseManagementFactory(
|
|
||||||
final KinesisAsyncClient kinesisClient,
|
|
||||||
final StreamConfig streamConfig,
|
|
||||||
final DynamoDbAsyncClient dynamoDBClient,
|
|
||||||
final String tableName,
|
|
||||||
final String workerIdentifier,
|
|
||||||
final ExecutorService executorService,
|
|
||||||
final long failoverTimeMillis,
|
|
||||||
final long epsilonMillis,
|
|
||||||
final int maxLeasesForWorker,
|
|
||||||
final int maxLeasesToStealAtOneTime,
|
|
||||||
final int maxLeaseRenewalThreads,
|
|
||||||
final boolean cleanupLeasesUponShardCompletion,
|
|
||||||
final boolean ignoreUnexpectedChildShards,
|
|
||||||
final long shardSyncIntervalMillis,
|
|
||||||
final boolean consistentReads,
|
|
||||||
final long listShardsBackoffTimeMillis,
|
|
||||||
final int maxListShardsRetryAttempts,
|
|
||||||
final int maxCacheMissesBeforeReload,
|
|
||||||
final long listShardsCacheAllowedAgeInSeconds,
|
|
||||||
final int cacheMissWarningModulus,
|
|
||||||
final long initialLeaseTableReadCapacity,
|
|
||||||
final long initialLeaseTableWriteCapacity,
|
|
||||||
final HierarchicalShardSyncer deprecatedHierarchicalShardSyncer,
|
|
||||||
final TableCreatorCallback tableCreatorCallback,
|
|
||||||
Duration dynamoDbRequestTimeout,
|
|
||||||
BillingMode billingMode,
|
|
||||||
final boolean leaseTableDeletionProtectionEnabled,
|
|
||||||
Collection<Tag> tags,
|
|
||||||
LeaseSerializer leaseSerializer) {
|
|
||||||
this(
|
|
||||||
kinesisClient,
|
|
||||||
dynamoDBClient,
|
|
||||||
tableName,
|
|
||||||
workerIdentifier,
|
|
||||||
executorService,
|
|
||||||
failoverTimeMillis,
|
|
||||||
epsilonMillis,
|
|
||||||
maxLeasesForWorker,
|
|
||||||
maxLeasesToStealAtOneTime,
|
|
||||||
maxLeaseRenewalThreads,
|
|
||||||
cleanupLeasesUponShardCompletion,
|
|
||||||
ignoreUnexpectedChildShards,
|
|
||||||
shardSyncIntervalMillis,
|
|
||||||
consistentReads,
|
|
||||||
listShardsBackoffTimeMillis,
|
|
||||||
maxListShardsRetryAttempts,
|
|
||||||
maxCacheMissesBeforeReload,
|
|
||||||
listShardsCacheAllowedAgeInSeconds,
|
|
||||||
cacheMissWarningModulus,
|
|
||||||
initialLeaseTableReadCapacity,
|
|
||||||
initialLeaseTableWriteCapacity,
|
|
||||||
deprecatedHierarchicalShardSyncer,
|
|
||||||
tableCreatorCallback,
|
|
||||||
dynamoDbRequestTimeout,
|
|
||||||
billingMode,
|
|
||||||
leaseTableDeletionProtectionEnabled,
|
|
||||||
tags,
|
|
||||||
leaseSerializer,
|
|
||||||
null,
|
|
||||||
false,
|
|
||||||
LeaseManagementConfig.DEFAULT_LEASE_CLEANUP_CONFIG);
|
|
||||||
this.streamConfig = streamConfig;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructor.
|
|
||||||
* @param kinesisClient
|
|
||||||
* @param dynamoDBClient
|
|
||||||
* @param tableName
|
|
||||||
* @param workerIdentifier
|
|
||||||
* @param executorService
|
|
||||||
* @param failoverTimeMillis
|
|
||||||
* @param epsilonMillis
|
|
||||||
* @param maxLeasesForWorker
|
|
||||||
* @param maxLeasesToStealAtOneTime
|
|
||||||
* @param maxLeaseRenewalThreads
|
|
||||||
* @param cleanupLeasesUponShardCompletion
|
|
||||||
* @param ignoreUnexpectedChildShards
|
|
||||||
* @param shardSyncIntervalMillis
|
|
||||||
* @param consistentReads
|
|
||||||
* @param listShardsBackoffTimeMillis
|
|
||||||
* @param maxListShardsRetryAttempts
|
|
||||||
* @param maxCacheMissesBeforeReload
|
|
||||||
* @param listShardsCacheAllowedAgeInSeconds
|
|
||||||
* @param cacheMissWarningModulus
|
|
||||||
* @param initialLeaseTableReadCapacity
|
|
||||||
* @param initialLeaseTableWriteCapacity
|
|
||||||
* @param deprecatedHierarchicalShardSyncer
|
|
||||||
* @param tableCreatorCallback
|
|
||||||
* @param dynamoDbRequestTimeout
|
|
||||||
* @param billingMode
|
|
||||||
* @param leaseTableDeletionProtectionEnabled
|
|
||||||
* @param leaseSerializer
|
|
||||||
* @param customShardDetectorProvider
|
|
||||||
* @param isMultiStreamMode
|
|
||||||
* @param leaseCleanupConfig
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public DynamoDBLeaseManagementFactory(
|
|
||||||
final KinesisAsyncClient kinesisClient,
|
|
||||||
final DynamoDbAsyncClient dynamoDBClient,
|
|
||||||
final String tableName,
|
|
||||||
final String workerIdentifier,
|
|
||||||
final ExecutorService executorService,
|
|
||||||
final long failoverTimeMillis,
|
|
||||||
final long epsilonMillis,
|
|
||||||
final int maxLeasesForWorker,
|
|
||||||
final int maxLeasesToStealAtOneTime,
|
|
||||||
final int maxLeaseRenewalThreads,
|
|
||||||
final boolean cleanupLeasesUponShardCompletion,
|
|
||||||
final boolean ignoreUnexpectedChildShards,
|
|
||||||
final long shardSyncIntervalMillis,
|
|
||||||
final boolean consistentReads,
|
|
||||||
final long listShardsBackoffTimeMillis,
|
|
||||||
final int maxListShardsRetryAttempts,
|
|
||||||
final int maxCacheMissesBeforeReload,
|
|
||||||
final long listShardsCacheAllowedAgeInSeconds,
|
|
||||||
final int cacheMissWarningModulus,
|
|
||||||
final long initialLeaseTableReadCapacity,
|
|
||||||
final long initialLeaseTableWriteCapacity,
|
|
||||||
final HierarchicalShardSyncer deprecatedHierarchicalShardSyncer,
|
|
||||||
final TableCreatorCallback tableCreatorCallback,
|
|
||||||
Duration dynamoDbRequestTimeout,
|
|
||||||
BillingMode billingMode,
|
|
||||||
final boolean leaseTableDeletionProtectionEnabled,
|
|
||||||
Collection<Tag> tags,
|
|
||||||
LeaseSerializer leaseSerializer,
|
|
||||||
Function<StreamConfig, ShardDetector> customShardDetectorProvider,
|
|
||||||
boolean isMultiStreamMode,
|
|
||||||
LeaseCleanupConfig leaseCleanupConfig) {
|
|
||||||
this(
|
this(
|
||||||
kinesisClient,
|
kinesisClient,
|
||||||
dynamoDBClient,
|
dynamoDBClient,
|
||||||
|
|
@ -900,16 +200,21 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
||||||
cacheMissWarningModulus,
|
cacheMissWarningModulus,
|
||||||
initialLeaseTableReadCapacity,
|
initialLeaseTableReadCapacity,
|
||||||
initialLeaseTableWriteCapacity,
|
initialLeaseTableWriteCapacity,
|
||||||
deprecatedHierarchicalShardSyncer,
|
hierarchicalShardSyncer,
|
||||||
tableCreatorCallback,
|
tableCreatorCallback,
|
||||||
dynamoDbRequestTimeout,
|
dynamoDbRequestTimeout,
|
||||||
billingMode,
|
billingMode,
|
||||||
leaseTableDeletionProtectionEnabled,
|
LeaseManagementConfig.DEFAULT_LEASE_TABLE_DELETION_PROTECTION_ENABLED,
|
||||||
|
LeaseManagementConfig.DEFAULT_LEASE_TABLE_PITR_ENABLED,
|
||||||
tags,
|
tags,
|
||||||
leaseSerializer,
|
new DynamoDBLeaseSerializer(),
|
||||||
customShardDetectorProvider,
|
null,
|
||||||
isMultiStreamMode,
|
false,
|
||||||
leaseCleanupConfig);
|
LeaseManagementConfig.DEFAULT_LEASE_CLEANUP_CONFIG,
|
||||||
|
new LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig(),
|
||||||
|
LeaseManagementConfig.GracefulLeaseHandoffConfig.builder().build());
|
||||||
|
this.streamConfig =
|
||||||
|
new StreamConfig(StreamIdentifier.singleStreamInstance(streamName), initialPositionInStream);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -947,75 +252,6 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
||||||
* @param leaseCleanupConfig
|
* @param leaseCleanupConfig
|
||||||
*/
|
*/
|
||||||
@Deprecated
|
@Deprecated
|
||||||
public DynamoDBLeaseManagementFactory(
|
|
||||||
final KinesisAsyncClient kinesisClient,
|
|
||||||
final DynamoDbAsyncClient dynamoDBClient,
|
|
||||||
final String tableName,
|
|
||||||
final String workerIdentifier,
|
|
||||||
final ExecutorService executorService,
|
|
||||||
final long failoverTimeMillis,
|
|
||||||
final boolean enablePriorityLeaseAssignment,
|
|
||||||
final long epsilonMillis,
|
|
||||||
final int maxLeasesForWorker,
|
|
||||||
final int maxLeasesToStealAtOneTime,
|
|
||||||
final int maxLeaseRenewalThreads,
|
|
||||||
final boolean cleanupLeasesUponShardCompletion,
|
|
||||||
final boolean ignoreUnexpectedChildShards,
|
|
||||||
final long shardSyncIntervalMillis,
|
|
||||||
final boolean consistentReads,
|
|
||||||
final long listShardsBackoffTimeMillis,
|
|
||||||
final int maxListShardsRetryAttempts,
|
|
||||||
final int maxCacheMissesBeforeReload,
|
|
||||||
final long listShardsCacheAllowedAgeInSeconds,
|
|
||||||
final int cacheMissWarningModulus,
|
|
||||||
final long initialLeaseTableReadCapacity,
|
|
||||||
final long initialLeaseTableWriteCapacity,
|
|
||||||
final HierarchicalShardSyncer deprecatedHierarchicalShardSyncer,
|
|
||||||
final TableCreatorCallback tableCreatorCallback,
|
|
||||||
Duration dynamoDbRequestTimeout,
|
|
||||||
BillingMode billingMode,
|
|
||||||
final boolean leaseTableDeletionProtectionEnabled,
|
|
||||||
Collection<Tag> tags,
|
|
||||||
LeaseSerializer leaseSerializer,
|
|
||||||
Function<StreamConfig, ShardDetector> customShardDetectorProvider,
|
|
||||||
boolean isMultiStreamMode,
|
|
||||||
LeaseCleanupConfig leaseCleanupConfig) {
|
|
||||||
this(
|
|
||||||
kinesisClient,
|
|
||||||
dynamoDBClient,
|
|
||||||
tableName,
|
|
||||||
workerIdentifier,
|
|
||||||
executorService,
|
|
||||||
failoverTimeMillis,
|
|
||||||
enablePriorityLeaseAssignment,
|
|
||||||
epsilonMillis,
|
|
||||||
maxLeasesForWorker,
|
|
||||||
maxLeasesToStealAtOneTime,
|
|
||||||
maxLeaseRenewalThreads,
|
|
||||||
cleanupLeasesUponShardCompletion,
|
|
||||||
ignoreUnexpectedChildShards,
|
|
||||||
shardSyncIntervalMillis,
|
|
||||||
consistentReads,
|
|
||||||
listShardsBackoffTimeMillis,
|
|
||||||
maxListShardsRetryAttempts,
|
|
||||||
maxCacheMissesBeforeReload,
|
|
||||||
listShardsCacheAllowedAgeInSeconds,
|
|
||||||
cacheMissWarningModulus,
|
|
||||||
initialLeaseTableReadCapacity,
|
|
||||||
initialLeaseTableWriteCapacity,
|
|
||||||
deprecatedHierarchicalShardSyncer,
|
|
||||||
tableCreatorCallback,
|
|
||||||
dynamoDbRequestTimeout,
|
|
||||||
billingMode,
|
|
||||||
leaseTableDeletionProtectionEnabled,
|
|
||||||
LeaseManagementConfig.DEFAULT_LEASE_TABLE_PITR_ENABLED,
|
|
||||||
tags,
|
|
||||||
leaseSerializer,
|
|
||||||
customShardDetectorProvider,
|
|
||||||
isMultiStreamMode,
|
|
||||||
leaseCleanupConfig);
|
|
||||||
}
|
|
||||||
|
|
||||||
public DynamoDBLeaseManagementFactory(
|
public DynamoDBLeaseManagementFactory(
|
||||||
final KinesisAsyncClient kinesisClient,
|
final KinesisAsyncClient kinesisClient,
|
||||||
final DynamoDbAsyncClient dynamoDBClient,
|
final DynamoDbAsyncClient dynamoDBClient,
|
||||||
|
|
@ -1049,7 +285,9 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
||||||
LeaseSerializer leaseSerializer,
|
LeaseSerializer leaseSerializer,
|
||||||
Function<StreamConfig, ShardDetector> customShardDetectorProvider,
|
Function<StreamConfig, ShardDetector> customShardDetectorProvider,
|
||||||
boolean isMultiStreamMode,
|
boolean isMultiStreamMode,
|
||||||
LeaseCleanupConfig leaseCleanupConfig) {
|
LeaseCleanupConfig leaseCleanupConfig,
|
||||||
|
final LeaseManagementConfig.WorkerUtilizationAwareAssignmentConfig workerUtilizationAwareAssignmentConfig,
|
||||||
|
final LeaseManagementConfig.GracefulLeaseHandoffConfig gracefulLeaseHandoffConfig) {
|
||||||
this.kinesisClient = kinesisClient;
|
this.kinesisClient = kinesisClient;
|
||||||
this.dynamoDBClient = dynamoDBClient;
|
this.dynamoDBClient = dynamoDBClient;
|
||||||
this.tableName = tableName;
|
this.tableName = tableName;
|
||||||
|
|
@ -1083,10 +321,19 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
||||||
this.isMultiStreamMode = isMultiStreamMode;
|
this.isMultiStreamMode = isMultiStreamMode;
|
||||||
this.leaseCleanupConfig = leaseCleanupConfig;
|
this.leaseCleanupConfig = leaseCleanupConfig;
|
||||||
this.tags = tags;
|
this.tags = tags;
|
||||||
|
this.workerUtilizationAwareAssignmentConfig = workerUtilizationAwareAssignmentConfig;
|
||||||
|
this.gracefulLeaseHandoffConfig = gracefulLeaseHandoffConfig;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public LeaseCoordinator createLeaseCoordinator(@NonNull final MetricsFactory metricsFactory) {
|
public LeaseCoordinator createLeaseCoordinator(@NonNull final MetricsFactory metricsFactory) {
|
||||||
|
return createLeaseCoordinator(metricsFactory, new ConcurrentHashMap<>());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public LeaseCoordinator createLeaseCoordinator(
|
||||||
|
@NonNull final MetricsFactory metricsFactory,
|
||||||
|
@NonNull final ConcurrentMap<ShardInfo, ShardConsumer> shardInfoShardConsumerMap) {
|
||||||
return new DynamoDBLeaseCoordinator(
|
return new DynamoDBLeaseCoordinator(
|
||||||
this.createLeaseRefresher(),
|
this.createLeaseRefresher(),
|
||||||
workerIdentifier,
|
workerIdentifier,
|
||||||
|
|
@ -1098,9 +345,15 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
||||||
maxLeaseRenewalThreads,
|
maxLeaseRenewalThreads,
|
||||||
initialLeaseTableReadCapacity,
|
initialLeaseTableReadCapacity,
|
||||||
initialLeaseTableWriteCapacity,
|
initialLeaseTableWriteCapacity,
|
||||||
metricsFactory);
|
metricsFactory,
|
||||||
|
workerUtilizationAwareAssignmentConfig,
|
||||||
|
gracefulLeaseHandoffConfig,
|
||||||
|
shardInfoShardConsumerMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Even though this is deprecated, this is a method part of the public interface in LeaseManagementFactory
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
@Deprecated
|
@Deprecated
|
||||||
public ShardSyncTaskManager createShardSyncTaskManager(@NonNull final MetricsFactory metricsFactory) {
|
public ShardSyncTaskManager createShardSyncTaskManager(@NonNull final MetricsFactory metricsFactory) {
|
||||||
|
|
@ -1155,6 +408,10 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public DynamoDBLeaseRefresher createLeaseRefresher() {
|
public DynamoDBLeaseRefresher createLeaseRefresher() {
|
||||||
|
final DdbTableConfig ddbTableConfig = new DdbTableConfig();
|
||||||
|
ddbTableConfig.billingMode(billingMode);
|
||||||
|
ddbTableConfig.readCapacity(initialLeaseTableReadCapacity);
|
||||||
|
ddbTableConfig.writeCapacity(initialLeaseTableWriteCapacity);
|
||||||
return new DynamoDBLeaseRefresher(
|
return new DynamoDBLeaseRefresher(
|
||||||
tableName,
|
tableName,
|
||||||
dynamoDBClient,
|
dynamoDBClient,
|
||||||
|
|
@ -1162,12 +419,15 @@ public class DynamoDBLeaseManagementFactory implements LeaseManagementFactory {
|
||||||
consistentReads,
|
consistentReads,
|
||||||
tableCreatorCallback,
|
tableCreatorCallback,
|
||||||
dynamoDbRequestTimeout,
|
dynamoDbRequestTimeout,
|
||||||
billingMode,
|
ddbTableConfig,
|
||||||
leaseTableDeletionProtectionEnabled,
|
leaseTableDeletionProtectionEnabled,
|
||||||
leaseTablePitrEnabled,
|
leaseTablePitrEnabled,
|
||||||
tags);
|
tags);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Even though this is deprecated, this is a method part of the public interface in LeaseManagementFactory
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
@Deprecated
|
@Deprecated
|
||||||
public ShardDetector createShardDetector() {
|
public ShardDetector createShardDetector() {
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load diff
|
|
@ -14,6 +14,8 @@
|
||||||
*/
|
*/
|
||||||
package software.amazon.kinesis.leases.dynamodb;
|
package software.amazon.kinesis.leases.dynamodb;
|
||||||
|
|
||||||
|
import java.math.BigDecimal;
|
||||||
|
import java.math.RoundingMode;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
|
@ -26,8 +28,10 @@ import java.util.concurrent.ConcurrentNavigableMap;
|
||||||
import java.util.concurrent.ConcurrentSkipListMap;
|
import java.util.concurrent.ConcurrentSkipListMap;
|
||||||
import java.util.concurrent.ExecutionException;
|
import java.util.concurrent.ExecutionException;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.function.Consumer;
|
||||||
|
|
||||||
import lombok.NonNull;
|
import lombok.NonNull;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
@ -39,6 +43,7 @@ import software.amazon.kinesis.common.StreamIdentifier;
|
||||||
import software.amazon.kinesis.leases.Lease;
|
import software.amazon.kinesis.leases.Lease;
|
||||||
import software.amazon.kinesis.leases.LeaseRefresher;
|
import software.amazon.kinesis.leases.LeaseRefresher;
|
||||||
import software.amazon.kinesis.leases.LeaseRenewer;
|
import software.amazon.kinesis.leases.LeaseRenewer;
|
||||||
|
import software.amazon.kinesis.leases.LeaseStatsRecorder;
|
||||||
import software.amazon.kinesis.leases.MultiStreamLease;
|
import software.amazon.kinesis.leases.MultiStreamLease;
|
||||||
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
|
|
@ -48,21 +53,32 @@ import software.amazon.kinesis.metrics.MetricsLevel;
|
||||||
import software.amazon.kinesis.metrics.MetricsScope;
|
import software.amazon.kinesis.metrics.MetricsScope;
|
||||||
import software.amazon.kinesis.metrics.MetricsUtil;
|
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||||
|
|
||||||
|
import static java.util.Objects.nonNull;
|
||||||
|
import static software.amazon.kinesis.leases.LeaseStatsRecorder.BYTES_PER_KB;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An implementation of {@link LeaseRenewer} that uses DynamoDB via {@link LeaseRefresher}.
|
* An implementation of {@link LeaseRenewer} that uses DynamoDB via {@link LeaseRefresher}.
|
||||||
*/
|
*/
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@KinesisClientInternalApi
|
@KinesisClientInternalApi
|
||||||
public class DynamoDBLeaseRenewer implements LeaseRenewer {
|
public class DynamoDBLeaseRenewer implements LeaseRenewer {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 6 digit after decimal gives the granularity of 0.001 byte per second.
|
||||||
|
*/
|
||||||
|
private static final int DEFAULT_THROUGHPUT_DIGIT_AFTER_DECIMAL = 6;
|
||||||
|
|
||||||
private static final int RENEWAL_RETRIES = 2;
|
private static final int RENEWAL_RETRIES = 2;
|
||||||
private static final String RENEW_ALL_LEASES_DIMENSION = "RenewAllLeases";
|
private static final String RENEW_ALL_LEASES_DIMENSION = "RenewAllLeases";
|
||||||
|
private static final String LEASE_RENEWER_INITIALIZE = "LeaseRenewerInitialize";
|
||||||
|
|
||||||
private final LeaseRefresher leaseRefresher;
|
private final LeaseRefresher leaseRefresher;
|
||||||
private final String workerIdentifier;
|
private final String workerIdentifier;
|
||||||
private final long leaseDurationNanos;
|
private final long leaseDurationNanos;
|
||||||
private final ExecutorService executorService;
|
private final ExecutorService executorService;
|
||||||
private final MetricsFactory metricsFactory;
|
private final MetricsFactory metricsFactory;
|
||||||
|
private final LeaseStatsRecorder leaseStatsRecorder;
|
||||||
|
private final Consumer<Lease> leaseGracefulShutdownCallback;
|
||||||
private final ConcurrentNavigableMap<String, Lease> ownedLeases = new ConcurrentSkipListMap<>();
|
private final ConcurrentNavigableMap<String, Lease> ownedLeases = new ConcurrentSkipListMap<>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -82,12 +98,16 @@ public class DynamoDBLeaseRenewer implements LeaseRenewer {
|
||||||
final String workerIdentifier,
|
final String workerIdentifier,
|
||||||
final long leaseDurationMillis,
|
final long leaseDurationMillis,
|
||||||
final ExecutorService executorService,
|
final ExecutorService executorService,
|
||||||
final MetricsFactory metricsFactory) {
|
final MetricsFactory metricsFactory,
|
||||||
|
final LeaseStatsRecorder leaseStatsRecorder,
|
||||||
|
final Consumer<Lease> leaseGracefulShutdownCallback) {
|
||||||
this.leaseRefresher = leaseRefresher;
|
this.leaseRefresher = leaseRefresher;
|
||||||
this.workerIdentifier = workerIdentifier;
|
this.workerIdentifier = workerIdentifier;
|
||||||
this.leaseDurationNanos = TimeUnit.MILLISECONDS.toNanos(leaseDurationMillis);
|
this.leaseDurationNanos = TimeUnit.MILLISECONDS.toNanos(leaseDurationMillis);
|
||||||
this.executorService = executorService;
|
this.executorService = executorService;
|
||||||
this.metricsFactory = metricsFactory;
|
this.metricsFactory = metricsFactory;
|
||||||
|
this.leaseStatsRecorder = leaseStatsRecorder;
|
||||||
|
this.leaseGracefulShutdownCallback = leaseGracefulShutdownCallback;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -187,11 +207,21 @@ public class DynamoDBLeaseRenewer implements LeaseRenewer {
|
||||||
// ShutdownException).
|
// ShutdownException).
|
||||||
boolean isLeaseExpired = lease.isExpired(leaseDurationNanos, System.nanoTime());
|
boolean isLeaseExpired = lease.isExpired(leaseDurationNanos, System.nanoTime());
|
||||||
if (renewEvenIfExpired || !isLeaseExpired) {
|
if (renewEvenIfExpired || !isLeaseExpired) {
|
||||||
|
final Double throughputPerKBps = this.leaseStatsRecorder.getThroughputKBps(leaseKey);
|
||||||
|
if (nonNull(throughputPerKBps)) {
|
||||||
|
lease.throughputKBps(BigDecimal.valueOf(throughputPerKBps)
|
||||||
|
.setScale(DEFAULT_THROUGHPUT_DIGIT_AFTER_DECIMAL, RoundingMode.HALF_UP)
|
||||||
|
.doubleValue());
|
||||||
|
}
|
||||||
renewedLease = leaseRefresher.renewLease(lease);
|
renewedLease = leaseRefresher.renewLease(lease);
|
||||||
}
|
}
|
||||||
if (renewedLease) {
|
if (renewedLease) {
|
||||||
lease.lastCounterIncrementNanos(System.nanoTime());
|
lease.lastCounterIncrementNanos(System.nanoTime());
|
||||||
}
|
}
|
||||||
|
if (lease.shutdownRequested()) {
|
||||||
|
// the underlying function will dedup
|
||||||
|
leaseGracefulShutdownCallback.accept(lease.copy());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (renewedLease) {
|
if (renewedLease) {
|
||||||
|
|
@ -391,6 +421,12 @@ public class DynamoDBLeaseRenewer implements LeaseRenewer {
|
||||||
* every time we acquire a lease, it gets a new concurrency token.
|
* every time we acquire a lease, it gets a new concurrency token.
|
||||||
*/
|
*/
|
||||||
authoritativeLease.concurrencyToken(UUID.randomUUID());
|
authoritativeLease.concurrencyToken(UUID.randomUUID());
|
||||||
|
if (nonNull(lease.throughputKBps())) {
|
||||||
|
leaseStatsRecorder.recordStats(LeaseStatsRecorder.LeaseStats.builder()
|
||||||
|
.leaseKey(lease.leaseKey())
|
||||||
|
.bytes(Math.round(lease.throughputKBps() * BYTES_PER_KB)) // Convert KB to Bytes
|
||||||
|
.build());
|
||||||
|
}
|
||||||
ownedLeases.put(authoritativeLease.leaseKey(), authoritativeLease);
|
ownedLeases.put(authoritativeLease.leaseKey(), authoritativeLease);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -409,6 +445,7 @@ public class DynamoDBLeaseRenewer implements LeaseRenewer {
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public void dropLease(Lease lease) {
|
public void dropLease(Lease lease) {
|
||||||
|
leaseStatsRecorder.dropLeaseStats(lease.leaseKey());
|
||||||
ownedLeases.remove(lease.leaseKey());
|
ownedLeases.remove(lease.leaseKey());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -417,26 +454,48 @@ public class DynamoDBLeaseRenewer implements LeaseRenewer {
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public void initialize() throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
public void initialize() throws DependencyException, InvalidStateException, ProvisionedThroughputException {
|
||||||
Collection<Lease> leases = leaseRefresher.listLeases();
|
final MetricsScope scope = MetricsUtil.createMetricsWithOperation(metricsFactory, LEASE_RENEWER_INITIALIZE);
|
||||||
List<Lease> myLeases = new LinkedList<>();
|
final ExecutorService singleThreadExecutorService = Executors.newSingleThreadExecutor();
|
||||||
boolean renewEvenIfExpired = true;
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
final Map.Entry<List<Lease>, List<String>> response =
|
||||||
|
leaseRefresher.listLeasesParallely(singleThreadExecutorService, 1);
|
||||||
|
|
||||||
for (Lease lease : leases) {
|
if (!response.getValue().isEmpty()) {
|
||||||
if (workerIdentifier.equals(lease.leaseOwner())) {
|
log.warn("List of leaseKeys failed to deserialize : {} ", response.getValue());
|
||||||
log.info(" Worker {} found lease {}", workerIdentifier, lease);
|
|
||||||
// Okay to renew even if lease is expired, because we start with an empty list and we add the lease to
|
|
||||||
// our list only after a successful renew. So we don't need to worry about the edge case where we could
|
|
||||||
// continue renewing a lease after signaling a lease loss to the application.
|
|
||||||
|
|
||||||
if (renewLease(lease, renewEvenIfExpired)) {
|
|
||||||
myLeases.add(lease);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
log.debug("Worker {} ignoring lease {} ", workerIdentifier, lease);
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
addLeasesToRenew(myLeases);
|
final List<Lease> myLeases = new LinkedList<>();
|
||||||
|
boolean renewEvenIfExpired = true;
|
||||||
|
|
||||||
|
for (Lease lease : response.getKey()) {
|
||||||
|
if (workerIdentifier.equals(lease.leaseOwner())) {
|
||||||
|
log.info(" Worker {} found lease {}", workerIdentifier, lease);
|
||||||
|
// Okay to renew even if lease is expired, because we start with an empty list and we add the lease
|
||||||
|
// to
|
||||||
|
// our list only after a successful renew. So we don't need to worry about the edge case where we
|
||||||
|
// could
|
||||||
|
// continue renewing a lease after signaling a lease loss to the application.
|
||||||
|
|
||||||
|
if (renewLease(lease, renewEvenIfExpired)) {
|
||||||
|
myLeases.add(lease);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
log.debug("Worker {} ignoring lease {} ", workerIdentifier, lease);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
addLeasesToRenew(myLeases);
|
||||||
|
success = true;
|
||||||
|
} catch (final Exception e) {
|
||||||
|
// It's ok to swollow exception here fail to discover all leases here, as the assignment logic takes
|
||||||
|
// care of reassignment if some lease is expired.
|
||||||
|
log.warn("LeaseRefresher failed in initialization during renewing of pre assigned leases", e);
|
||||||
|
} finally {
|
||||||
|
singleThreadExecutorService.shutdown();
|
||||||
|
MetricsUtil.addCount(scope, "Fault", success ? 0 : 1, MetricsLevel.DETAILED);
|
||||||
|
MetricsUtil.endScope(scope);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void verifyNotNull(Object object, String message) {
|
private void verifyNotNull(Object object, String message) {
|
||||||
|
|
|
||||||
|
|
@ -44,11 +44,8 @@ import software.amazon.kinesis.retrieval.kpl.ExtendedSequenceNumber;
|
||||||
*/
|
*/
|
||||||
@KinesisClientInternalApi
|
@KinesisClientInternalApi
|
||||||
public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
||||||
private static final String LEASE_KEY_KEY = "leaseKey";
|
|
||||||
private static final String LEASE_OWNER_KEY = "leaseOwner";
|
|
||||||
private static final String LEASE_COUNTER_KEY = "leaseCounter";
|
private static final String LEASE_COUNTER_KEY = "leaseCounter";
|
||||||
private static final String OWNER_SWITCHES_KEY = "ownerSwitchesSinceCheckpoint";
|
private static final String OWNER_SWITCHES_KEY = "ownerSwitchesSinceCheckpoint";
|
||||||
private static final String CHECKPOINT_SEQUENCE_NUMBER_KEY = "checkpoint";
|
|
||||||
private static final String CHECKPOINT_SUBSEQUENCE_NUMBER_KEY = "checkpointSubSequenceNumber";
|
private static final String CHECKPOINT_SUBSEQUENCE_NUMBER_KEY = "checkpointSubSequenceNumber";
|
||||||
private static final String PENDING_CHECKPOINT_SEQUENCE_KEY = "pendingCheckpoint";
|
private static final String PENDING_CHECKPOINT_SEQUENCE_KEY = "pendingCheckpoint";
|
||||||
private static final String PENDING_CHECKPOINT_SUBSEQUENCE_KEY = "pendingCheckpointSubSequenceNumber";
|
private static final String PENDING_CHECKPOINT_SUBSEQUENCE_KEY = "pendingCheckpointSubSequenceNumber";
|
||||||
|
|
@ -57,6 +54,11 @@ public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
||||||
private static final String CHILD_SHARD_IDS_KEY = "childShardIds";
|
private static final String CHILD_SHARD_IDS_KEY = "childShardIds";
|
||||||
private static final String STARTING_HASH_KEY = "startingHashKey";
|
private static final String STARTING_HASH_KEY = "startingHashKey";
|
||||||
private static final String ENDING_HASH_KEY = "endingHashKey";
|
private static final String ENDING_HASH_KEY = "endingHashKey";
|
||||||
|
private static final String THROUGHOUT_PUT_KBPS = "throughputKBps";
|
||||||
|
private static final String CHECKPOINT_SEQUENCE_NUMBER_KEY = "checkpoint";
|
||||||
|
static final String CHECKPOINT_OWNER = "checkpointOwner";
|
||||||
|
static final String LEASE_OWNER_KEY = "leaseOwner";
|
||||||
|
static final String LEASE_KEY_KEY = "leaseKey";
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Map<String, AttributeValue> toDynamoRecord(final Lease lease) {
|
public Map<String, AttributeValue> toDynamoRecord(final Lease lease) {
|
||||||
|
|
@ -110,6 +112,13 @@ public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
||||||
lease.hashKeyRangeForLease().serializedEndingHashKey()));
|
lease.hashKeyRangeForLease().serializedEndingHashKey()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (lease.throughputKBps() != null) {
|
||||||
|
result.put(THROUGHOUT_PUT_KBPS, DynamoUtils.createAttributeValue(lease.throughputKBps()));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lease.checkpointOwner() != null) {
|
||||||
|
result.put(CHECKPOINT_OWNER, DynamoUtils.createAttributeValue(lease.checkpointOwner()));
|
||||||
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -146,6 +155,14 @@ public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
||||||
leaseToUpdate.hashKeyRange(HashKeyRangeForLease.deserialize(startingHashKey, endingHashKey));
|
leaseToUpdate.hashKeyRange(HashKeyRangeForLease.deserialize(startingHashKey, endingHashKey));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (DynamoUtils.safeGetDouble(dynamoRecord, THROUGHOUT_PUT_KBPS) != null) {
|
||||||
|
leaseToUpdate.throughputKBps(DynamoUtils.safeGetDouble(dynamoRecord, THROUGHOUT_PUT_KBPS));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (DynamoUtils.safeGetString(dynamoRecord, CHECKPOINT_OWNER) != null) {
|
||||||
|
leaseToUpdate.checkpointOwner(DynamoUtils.safeGetString(dynamoRecord, CHECKPOINT_OWNER));
|
||||||
|
}
|
||||||
|
|
||||||
return leaseToUpdate;
|
return leaseToUpdate;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -181,18 +198,9 @@ public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Map<String, ExpectedAttributeValue> getDynamoLeaseOwnerExpectation(final Lease lease) {
|
public Map<String, ExpectedAttributeValue> getDynamoLeaseOwnerExpectation(final Lease lease) {
|
||||||
Map<String, ExpectedAttributeValue> result = new HashMap<>();
|
final Map<String, ExpectedAttributeValue> result = new HashMap<>();
|
||||||
|
result.put(LEASE_OWNER_KEY, buildExpectedAttributeValueIfExistsOrValue(lease.leaseOwner()));
|
||||||
ExpectedAttributeValue.Builder eavBuilder = ExpectedAttributeValue.builder();
|
result.put(CHECKPOINT_OWNER, buildExpectedAttributeValueIfExistsOrValue(lease.checkpointOwner()));
|
||||||
|
|
||||||
if (lease.leaseOwner() == null) {
|
|
||||||
eavBuilder = eavBuilder.exists(false);
|
|
||||||
} else {
|
|
||||||
eavBuilder = eavBuilder.value(DynamoUtils.createAttributeValue(lease.leaseOwner()));
|
|
||||||
}
|
|
||||||
|
|
||||||
result.put(LEASE_OWNER_KEY, eavBuilder.build());
|
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -247,9 +255,17 @@ public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
||||||
.value(DynamoUtils.createAttributeValue(owner))
|
.value(DynamoUtils.createAttributeValue(owner))
|
||||||
.action(AttributeAction.PUT)
|
.action(AttributeAction.PUT)
|
||||||
.build());
|
.build());
|
||||||
|
// this method is currently used by assignLease and takeLease. In both case we want the checkpoint owner to be
|
||||||
|
// deleted as this is a fresh assignment
|
||||||
|
result.put(
|
||||||
|
CHECKPOINT_OWNER,
|
||||||
|
AttributeValueUpdate.builder().action(AttributeAction.DELETE).build());
|
||||||
|
|
||||||
String oldOwner = lease.leaseOwner();
|
String oldOwner = lease.leaseOwner();
|
||||||
if (oldOwner != null && !oldOwner.equals(owner)) {
|
String checkpointOwner = lease.checkpointOwner();
|
||||||
|
// if checkpoint owner is not null, this update is supposed to remove the checkpoint owner
|
||||||
|
// and transfer the lease ownership to the leaseOwner so incrementing the owner switch key
|
||||||
|
if (oldOwner != null && !oldOwner.equals(owner) || (checkpointOwner != null && checkpointOwner.equals(owner))) {
|
||||||
result.put(
|
result.put(
|
||||||
OWNER_SWITCHES_KEY,
|
OWNER_SWITCHES_KEY,
|
||||||
AttributeValueUpdate.builder()
|
AttributeValueUpdate.builder()
|
||||||
|
|
@ -261,18 +277,38 @@ public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* AssignLease performs the PUT action on the LeaseOwner and ADD (1) action on the leaseCounter.
|
||||||
|
* @param lease lease that needs to be assigned
|
||||||
|
* @param newOwner newLeaseOwner
|
||||||
|
* @return Map of AttributeName to update operation
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Map<String, AttributeValueUpdate> getDynamoAssignLeaseUpdate(final Lease lease, final String newOwner) {
|
||||||
|
Map<String, AttributeValueUpdate> result = getDynamoTakeLeaseUpdate(lease, newOwner);
|
||||||
|
|
||||||
|
result.put(LEASE_COUNTER_KEY, getAttributeValueUpdateForAdd());
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Map<String, AttributeValueUpdate> getDynamoEvictLeaseUpdate(final Lease lease) {
|
public Map<String, AttributeValueUpdate> getDynamoEvictLeaseUpdate(final Lease lease) {
|
||||||
Map<String, AttributeValueUpdate> result = new HashMap<>();
|
final Map<String, AttributeValueUpdate> result = new HashMap<>();
|
||||||
AttributeValue value = null;
|
// if checkpointOwner is not null, it means lease handoff is initiated. In this case we just remove the
|
||||||
|
// checkpoint owner so the next owner (leaseOwner) can pick up the lease without waiting for assignment.
|
||||||
|
// Otherwise, remove the leaseOwner
|
||||||
|
if (lease.checkpointOwner() == null) {
|
||||||
|
result.put(
|
||||||
|
LEASE_OWNER_KEY,
|
||||||
|
AttributeValueUpdate.builder()
|
||||||
|
.action(AttributeAction.DELETE)
|
||||||
|
.build());
|
||||||
|
}
|
||||||
|
// We always want to remove checkpointOwner, it's ok even if it's null
|
||||||
result.put(
|
result.put(
|
||||||
LEASE_OWNER_KEY,
|
CHECKPOINT_OWNER,
|
||||||
AttributeValueUpdate.builder()
|
AttributeValueUpdate.builder().action(AttributeAction.DELETE).build());
|
||||||
.value(value)
|
result.put(LEASE_COUNTER_KEY, getAttributeValueUpdateForAdd());
|
||||||
.action(AttributeAction.DELETE)
|
|
||||||
.build());
|
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -394,4 +430,58 @@ public class DynamoDBLeaseSerializer implements LeaseSerializer {
|
||||||
|
|
||||||
return definitions;
|
return definitions;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Collection<KeySchemaElement> getWorkerIdToLeaseKeyIndexKeySchema() {
|
||||||
|
final List<KeySchemaElement> keySchema = new ArrayList<>();
|
||||||
|
keySchema.add(KeySchemaElement.builder()
|
||||||
|
.attributeName(LEASE_OWNER_KEY)
|
||||||
|
.keyType(KeyType.HASH)
|
||||||
|
.build());
|
||||||
|
keySchema.add(KeySchemaElement.builder()
|
||||||
|
.attributeName(LEASE_KEY_KEY)
|
||||||
|
.keyType(KeyType.RANGE)
|
||||||
|
.build());
|
||||||
|
return keySchema;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Collection<AttributeDefinition> getWorkerIdToLeaseKeyIndexAttributeDefinitions() {
|
||||||
|
final List<AttributeDefinition> definitions = new ArrayList<>();
|
||||||
|
definitions.add(AttributeDefinition.builder()
|
||||||
|
.attributeName(LEASE_OWNER_KEY)
|
||||||
|
.attributeType(ScalarAttributeType.S)
|
||||||
|
.build());
|
||||||
|
definitions.add(AttributeDefinition.builder()
|
||||||
|
.attributeName(LEASE_KEY_KEY)
|
||||||
|
.attributeType(ScalarAttributeType.S)
|
||||||
|
.build());
|
||||||
|
return definitions;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<String, AttributeValueUpdate> getDynamoLeaseThroughputKbpsUpdate(Lease lease) {
|
||||||
|
final Map<String, AttributeValueUpdate> result = new HashMap<>();
|
||||||
|
final AttributeValueUpdate avu = AttributeValueUpdate.builder()
|
||||||
|
.value(DynamoUtils.createAttributeValue(lease.throughputKBps()))
|
||||||
|
.action(AttributeAction.PUT)
|
||||||
|
.build();
|
||||||
|
result.put(THROUGHOUT_PUT_KBPS, avu);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static ExpectedAttributeValue buildExpectedAttributeValueIfExistsOrValue(String value) {
|
||||||
|
return value == null
|
||||||
|
? ExpectedAttributeValue.builder().exists(false).build()
|
||||||
|
: ExpectedAttributeValue.builder()
|
||||||
|
.value(DynamoUtils.createAttributeValue(value))
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static AttributeValueUpdate getAttributeValueUpdateForAdd() {
|
||||||
|
return AttributeValueUpdate.builder()
|
||||||
|
.value(DynamoUtils.createAttributeValue(1L))
|
||||||
|
.action(AttributeAction.ADD)
|
||||||
|
.build();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -106,15 +106,6 @@ public class DynamoDBLeaseTaker implements LeaseTaker {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @deprecated Misspelled method, use {@link DynamoDBLeaseTaker#withVeryOldLeaseDurationNanosMultiplier(int)}
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public DynamoDBLeaseTaker withVeryOldLeaseDurationNanosMultipler(long veryOldLeaseDurationNanosMultipler) {
|
|
||||||
this.veryOldLeaseDurationNanosMultiplier = (int) veryOldLeaseDurationNanosMultipler;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Overrides the default very old lease duration nanos multiplier to increase the threshold for taking very old leases.
|
* Overrides the default very old lease duration nanos multiplier to increase the threshold for taking very old leases.
|
||||||
* Setting this to a higher value than 3 will increase the threshold for very old lease taking.
|
* Setting this to a higher value than 3 will increase the threshold for very old lease taking.
|
||||||
|
|
|
||||||
|
|
@ -266,7 +266,8 @@ class ConsumerStates {
|
||||||
argument.idleTimeInMilliseconds(),
|
argument.idleTimeInMilliseconds(),
|
||||||
argument.aggregatorUtil(),
|
argument.aggregatorUtil(),
|
||||||
argument.metricsFactory(),
|
argument.metricsFactory(),
|
||||||
argument.schemaRegistryDecoder());
|
argument.schemaRegistryDecoder(),
|
||||||
|
argument.leaseCoordinator().leaseStatsRecorder());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
@ -336,7 +337,8 @@ class ConsumerStates {
|
||||||
argument.shardRecordProcessor(),
|
argument.shardRecordProcessor(),
|
||||||
argument.recordProcessorCheckpointer(),
|
argument.recordProcessorCheckpointer(),
|
||||||
consumer.shutdownNotification(),
|
consumer.shutdownNotification(),
|
||||||
argument.shardInfo());
|
argument.shardInfo(),
|
||||||
|
consumer.shardConsumerArgument().leaseCoordinator());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,213 @@
|
||||||
|
package software.amazon.kinesis.lifecycle;
|
||||||
|
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
import java.util.concurrent.ConcurrentMap;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.ScheduledExecutorService;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.function.Supplier;
|
||||||
|
|
||||||
|
import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.leases.Lease;
|
||||||
|
import software.amazon.kinesis.leases.LeaseCoordinator;
|
||||||
|
import software.amazon.kinesis.leases.ShardInfo;
|
||||||
|
import software.amazon.kinesis.leases.dynamodb.DynamoDBLeaseCoordinator;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class handles the graceful shutdown of shard consumers. When a lease is requested for shutdown, it will be
|
||||||
|
* enqueued from the lease renewal thread which will call the shard consumer of the lease to enqueue a shutdown request.
|
||||||
|
* The class monitors those leases and check if the shutdown is properly completed.
|
||||||
|
* If the shard consumer doesn't shut down within the given timeout, it will trigger a lease transfer.
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
public class LeaseGracefulShutdownHandler {
|
||||||
|
|
||||||
|
// Arbitrary number to run a similar frequency as the scheduler based on shardConsumerDispatchPollIntervalMillis
|
||||||
|
// which is how fast scheduler triggers state change. It's ok to add few extra second delay to call shutdown since
|
||||||
|
// the leases should still be processing by the current owner so there should not be processing delay due to this.
|
||||||
|
private static final long SHUTDOWN_CHECK_INTERVAL_MILLIS = 2000;
|
||||||
|
|
||||||
|
private final long shutdownTimeoutMillis;
|
||||||
|
private final ConcurrentMap<ShardInfo, ShardConsumer> shardInfoShardConsumerMap;
|
||||||
|
private final LeaseCoordinator leaseCoordinator;
|
||||||
|
private final Supplier<Long> currentTimeSupplier;
|
||||||
|
private final ConcurrentMap<ShardInfo, LeasePendingShutdown> shardInfoLeasePendingShutdownMap =
|
||||||
|
new ConcurrentHashMap<>();
|
||||||
|
private final ScheduledExecutorService executorService;
|
||||||
|
|
||||||
|
private volatile boolean isRunning = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory method to create a new instance of LeaseGracefulShutdownHandler.
|
||||||
|
*
|
||||||
|
* @param shutdownTimeoutMillis Timeout for graceful shutdown of shard consumers.
|
||||||
|
* @param shardInfoShardConsumerMap Map of shard info to shard consumer instances.
|
||||||
|
* @param leaseCoordinator Lease coordinator instance to access lease information.
|
||||||
|
* @return A new instance of LeaseGracefulShutdownHandler.
|
||||||
|
*/
|
||||||
|
public static LeaseGracefulShutdownHandler create(
|
||||||
|
long shutdownTimeoutMillis,
|
||||||
|
ConcurrentMap<ShardInfo, ShardConsumer> shardInfoShardConsumerMap,
|
||||||
|
LeaseCoordinator leaseCoordinator) {
|
||||||
|
return new LeaseGracefulShutdownHandler(
|
||||||
|
shutdownTimeoutMillis,
|
||||||
|
shardInfoShardConsumerMap,
|
||||||
|
leaseCoordinator,
|
||||||
|
System::currentTimeMillis,
|
||||||
|
Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder()
|
||||||
|
.setNameFormat("LeaseGracefulShutdown-%04d")
|
||||||
|
.setDaemon(true)
|
||||||
|
.build()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Starts the shard consumer shutdown handler thread.
|
||||||
|
*/
|
||||||
|
public void start() {
|
||||||
|
if (!isRunning) {
|
||||||
|
log.info("Starting graceful lease handoff thread.");
|
||||||
|
executorService.scheduleAtFixedRate(
|
||||||
|
this::monitorGracefulShutdownLeases, 0, SHUTDOWN_CHECK_INTERVAL_MILLIS, TimeUnit.MILLISECONDS);
|
||||||
|
isRunning = true;
|
||||||
|
} else {
|
||||||
|
log.info("Graceful lease handoff thread already running, no need to start.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stops the shard consumer shutdown handler thread.
|
||||||
|
*/
|
||||||
|
public void stop() {
|
||||||
|
if (isRunning) {
|
||||||
|
log.info("Stopping graceful lease handoff thread.");
|
||||||
|
executorService.shutdown();
|
||||||
|
isRunning = false;
|
||||||
|
} else {
|
||||||
|
log.info("Graceful lease handoff thread already stopped.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enqueue a shutdown request for the given lease if the lease has requested shutdown and the shard consumer
|
||||||
|
* is not already shutdown.
|
||||||
|
*
|
||||||
|
* @param lease The lease to enqueue a shutdown request for.
|
||||||
|
*/
|
||||||
|
public void enqueueShutdown(Lease lease) {
|
||||||
|
if (lease == null || !lease.shutdownRequested() || !isRunning) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
final ShardInfo shardInfo = DynamoDBLeaseCoordinator.convertLeaseToAssignment(lease);
|
||||||
|
final ShardConsumer consumer = shardInfoShardConsumerMap.get(shardInfo);
|
||||||
|
if (consumer == null || consumer.isShutdown()) {
|
||||||
|
shardInfoLeasePendingShutdownMap.remove(shardInfo);
|
||||||
|
} else {
|
||||||
|
// there could be change shard get enqueued after getting removed. This should be okay because
|
||||||
|
// this enqueue will be no-op and will be removed again because the shardConsumer associated with the
|
||||||
|
// shardInfo is shutdown by then.
|
||||||
|
shardInfoLeasePendingShutdownMap.computeIfAbsent(shardInfo, key -> {
|
||||||
|
log.info("Calling graceful shutdown for lease {}", lease.leaseKey());
|
||||||
|
LeasePendingShutdown leasePendingShutdown = new LeasePendingShutdown(lease, consumer);
|
||||||
|
initiateShutdown(leasePendingShutdown);
|
||||||
|
return leasePendingShutdown;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wait for shutdown to complete or transfer ownership of lease to the next owner if timeout is met.
|
||||||
|
*/
|
||||||
|
private void monitorGracefulShutdownLeases() {
|
||||||
|
String leaseKey = null;
|
||||||
|
try {
|
||||||
|
for (ConcurrentMap.Entry<ShardInfo, LeasePendingShutdown> entry :
|
||||||
|
shardInfoLeasePendingShutdownMap.entrySet()) {
|
||||||
|
final LeasePendingShutdown leasePendingShutdown = entry.getValue();
|
||||||
|
final ShardInfo shardInfo = entry.getKey();
|
||||||
|
leaseKey = leasePendingShutdown.lease.leaseKey();
|
||||||
|
|
||||||
|
if (leasePendingShutdown.shardConsumer.isShutdown()
|
||||||
|
|| shardInfoShardConsumerMap.get(shardInfo) == null
|
||||||
|
|| leaseCoordinator.getCurrentlyHeldLease(leaseKey) == null) {
|
||||||
|
logTimeoutMessage(leasePendingShutdown);
|
||||||
|
shardInfoLeasePendingShutdownMap.remove(shardInfo);
|
||||||
|
} else if (getCurrentTimeMillis() >= leasePendingShutdown.timeoutTimestampMillis
|
||||||
|
&& !leasePendingShutdown.leaseTransferCalled) {
|
||||||
|
try {
|
||||||
|
log.info(
|
||||||
|
"Timeout {} millisecond reached waiting for lease {} to graceful handoff."
|
||||||
|
+ " Attempting to transfer the lease to {}",
|
||||||
|
shutdownTimeoutMillis,
|
||||||
|
leaseKey,
|
||||||
|
leasePendingShutdown.lease.leaseOwner());
|
||||||
|
transferLeaseIfOwner(leasePendingShutdown);
|
||||||
|
} catch (DependencyException | InvalidStateException | ProvisionedThroughputException e) {
|
||||||
|
log.warn("Failed to transfer lease for key {}. Will retry", leaseKey, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error("Error in graceful shutdown for lease {}", leaseKey, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initiateShutdown(LeasePendingShutdown tracker) {
|
||||||
|
tracker.shardConsumer.gracefulShutdown(null);
|
||||||
|
tracker.shutdownRequested = true;
|
||||||
|
tracker.timeoutTimestampMillis = getCurrentTimeMillis() + shutdownTimeoutMillis;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void logTimeoutMessage(LeasePendingShutdown leasePendingShutdown) {
|
||||||
|
if (leasePendingShutdown.leaseTransferCalled) {
|
||||||
|
final long timeElapsedSinceShutdownInitiated =
|
||||||
|
getCurrentTimeMillis() - leasePendingShutdown.timeoutTimestampMillis + shutdownTimeoutMillis;
|
||||||
|
log.info(
|
||||||
|
"Lease {} took {} milliseconds to complete the shutdown. "
|
||||||
|
+ "Consider tuning the GracefulLeaseHandoffTimeoutMillis to prevent timeouts, "
|
||||||
|
+ "if necessary.",
|
||||||
|
leasePendingShutdown.lease.leaseKey(),
|
||||||
|
timeElapsedSinceShutdownInitiated);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void transferLeaseIfOwner(LeasePendingShutdown leasePendingShutdown)
|
||||||
|
throws ProvisionedThroughputException, InvalidStateException, DependencyException {
|
||||||
|
final Lease lease = leasePendingShutdown.lease;
|
||||||
|
if (leaseCoordinator.workerIdentifier().equals(lease.checkpointOwner())) {
|
||||||
|
// assignLease will increment the leaseCounter which will cause the heartbeat to stop on the current owner
|
||||||
|
// for the lease
|
||||||
|
leaseCoordinator.leaseRefresher().assignLease(lease, lease.leaseOwner());
|
||||||
|
} else {
|
||||||
|
// the worker ID check is just for sanity. We don't expect it to be different from the current worker.
|
||||||
|
log.error(
|
||||||
|
"Lease {} checkpoint owner mismatch found {} but it should be {}",
|
||||||
|
lease.leaseKey(),
|
||||||
|
lease.checkpointOwner(),
|
||||||
|
leaseCoordinator.workerIdentifier());
|
||||||
|
}
|
||||||
|
// mark it true because we don't want to enter the method again because update is not possible anymore.
|
||||||
|
leasePendingShutdown.leaseTransferCalled = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private long getCurrentTimeMillis() {
|
||||||
|
return currentTimeSupplier.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Data
|
||||||
|
private static class LeasePendingShutdown {
|
||||||
|
final Lease lease;
|
||||||
|
final ShardConsumer shardConsumer;
|
||||||
|
long timeoutTimestampMillis;
|
||||||
|
boolean shutdownRequested = false;
|
||||||
|
boolean leaseTransferCalled = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -24,6 +24,7 @@ import software.amazon.awssdk.services.kinesis.model.Shard;
|
||||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
import software.amazon.kinesis.checkpoint.ShardRecordProcessorCheckpointer;
|
import software.amazon.kinesis.checkpoint.ShardRecordProcessorCheckpointer;
|
||||||
import software.amazon.kinesis.common.StreamIdentifier;
|
import software.amazon.kinesis.common.StreamIdentifier;
|
||||||
|
import software.amazon.kinesis.leases.LeaseStatsRecorder;
|
||||||
import software.amazon.kinesis.leases.ShardDetector;
|
import software.amazon.kinesis.leases.ShardDetector;
|
||||||
import software.amazon.kinesis.leases.ShardInfo;
|
import software.amazon.kinesis.leases.ShardInfo;
|
||||||
import software.amazon.kinesis.lifecycle.events.ProcessRecordsInput;
|
import software.amazon.kinesis.lifecycle.events.ProcessRecordsInput;
|
||||||
|
|
@ -65,6 +66,7 @@ public class ProcessTask implements ConsumerTask {
|
||||||
private final AggregatorUtil aggregatorUtil;
|
private final AggregatorUtil aggregatorUtil;
|
||||||
private final String shardInfoId;
|
private final String shardInfoId;
|
||||||
private final SchemaRegistryDecoder schemaRegistryDecoder;
|
private final SchemaRegistryDecoder schemaRegistryDecoder;
|
||||||
|
private final LeaseStatsRecorder leaseStatsRecorder;
|
||||||
|
|
||||||
public ProcessTask(
|
public ProcessTask(
|
||||||
@NonNull ShardInfo shardInfo,
|
@NonNull ShardInfo shardInfo,
|
||||||
|
|
@ -79,7 +81,8 @@ public class ProcessTask implements ConsumerTask {
|
||||||
long idleTimeInMilliseconds,
|
long idleTimeInMilliseconds,
|
||||||
@NonNull AggregatorUtil aggregatorUtil,
|
@NonNull AggregatorUtil aggregatorUtil,
|
||||||
@NonNull MetricsFactory metricsFactory,
|
@NonNull MetricsFactory metricsFactory,
|
||||||
SchemaRegistryDecoder schemaRegistryDecoder) {
|
SchemaRegistryDecoder schemaRegistryDecoder,
|
||||||
|
@NonNull LeaseStatsRecorder leaseStatsRecorder) {
|
||||||
this.shardInfo = shardInfo;
|
this.shardInfo = shardInfo;
|
||||||
this.shardInfoId = ShardInfo.getLeaseKey(shardInfo);
|
this.shardInfoId = ShardInfo.getLeaseKey(shardInfo);
|
||||||
this.shardRecordProcessor = shardRecordProcessor;
|
this.shardRecordProcessor = shardRecordProcessor;
|
||||||
|
|
@ -91,6 +94,7 @@ public class ProcessTask implements ConsumerTask {
|
||||||
this.idleTimeInMilliseconds = idleTimeInMilliseconds;
|
this.idleTimeInMilliseconds = idleTimeInMilliseconds;
|
||||||
this.metricsFactory = metricsFactory;
|
this.metricsFactory = metricsFactory;
|
||||||
this.schemaRegistryDecoder = schemaRegistryDecoder;
|
this.schemaRegistryDecoder = schemaRegistryDecoder;
|
||||||
|
this.leaseStatsRecorder = leaseStatsRecorder;
|
||||||
|
|
||||||
if (!skipShardSyncAtWorkerInitializationIfLeasesExist) {
|
if (!skipShardSyncAtWorkerInitializationIfLeasesExist) {
|
||||||
this.shard = shardDetector.shard(shardInfo.shardId());
|
this.shard = shardDetector.shard(shardInfo.shardId());
|
||||||
|
|
@ -173,6 +177,7 @@ public class ProcessTask implements ConsumerTask {
|
||||||
recordProcessorCheckpointer.largestPermittedCheckpointValue()));
|
recordProcessorCheckpointer.largestPermittedCheckpointValue()));
|
||||||
|
|
||||||
if (shouldCallProcessRecords(records)) {
|
if (shouldCallProcessRecords(records)) {
|
||||||
|
publishLeaseStats(records);
|
||||||
callProcessRecords(processRecordsInput, records);
|
callProcessRecords(processRecordsInput, records);
|
||||||
}
|
}
|
||||||
success = true;
|
success = true;
|
||||||
|
|
@ -197,6 +202,15 @@ public class ProcessTask implements ConsumerTask {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void publishLeaseStats(final List<KinesisClientRecord> records) {
|
||||||
|
leaseStatsRecorder.recordStats(LeaseStatsRecorder.LeaseStats.builder()
|
||||||
|
.bytes(records.stream()
|
||||||
|
.mapToInt(record -> record.data().limit())
|
||||||
|
.sum())
|
||||||
|
.leaseKey(ShardInfo.getLeaseKey(shardInfo))
|
||||||
|
.build());
|
||||||
|
}
|
||||||
|
|
||||||
private List<KinesisClientRecord> deaggregateAnyKplRecords(List<KinesisClientRecord> records) {
|
private List<KinesisClientRecord> deaggregateAnyKplRecords(List<KinesisClientRecord> records) {
|
||||||
if (shard == null) {
|
if (shard == null) {
|
||||||
return aggregatorUtil.deaggregate(records);
|
return aggregatorUtil.deaggregate(records);
|
||||||
|
|
|
||||||
|
|
@ -21,7 +21,6 @@ import java.util.concurrent.CompletableFuture;
|
||||||
import java.util.concurrent.ExecutionException;
|
import java.util.concurrent.ExecutionException;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.RejectedExecutionException;
|
import java.util.concurrent.RejectedExecutionException;
|
||||||
import java.util.function.Function;
|
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
|
|
@ -35,8 +34,6 @@ import software.amazon.kinesis.exceptions.internal.BlockedOnParentShardException
|
||||||
import software.amazon.kinesis.leases.ShardInfo;
|
import software.amazon.kinesis.leases.ShardInfo;
|
||||||
import software.amazon.kinesis.lifecycle.events.ProcessRecordsInput;
|
import software.amazon.kinesis.lifecycle.events.ProcessRecordsInput;
|
||||||
import software.amazon.kinesis.lifecycle.events.TaskExecutionListenerInput;
|
import software.amazon.kinesis.lifecycle.events.TaskExecutionListenerInput;
|
||||||
import software.amazon.kinesis.metrics.MetricsCollectingTaskDecorator;
|
|
||||||
import software.amazon.kinesis.metrics.MetricsFactory;
|
|
||||||
import software.amazon.kinesis.retrieval.RecordsPublisher;
|
import software.amazon.kinesis.retrieval.RecordsPublisher;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -59,12 +56,6 @@ public class ShardConsumer {
|
||||||
@NonNull
|
@NonNull
|
||||||
private final Optional<Long> logWarningForTaskAfterMillis;
|
private final Optional<Long> logWarningForTaskAfterMillis;
|
||||||
|
|
||||||
/**
|
|
||||||
* @deprecated unused; to be removed in a "major" version bump
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
private final Function<ConsumerTask, ConsumerTask> taskMetricsDecorator;
|
|
||||||
|
|
||||||
private final int bufferSize;
|
private final int bufferSize;
|
||||||
private final TaskExecutionListener taskExecutionListener;
|
private final TaskExecutionListener taskExecutionListener;
|
||||||
private final String streamIdentifier;
|
private final String streamIdentifier;
|
||||||
|
|
@ -95,27 +86,6 @@ public class ShardConsumer {
|
||||||
|
|
||||||
private ProcessRecordsInput shardEndProcessRecordsInput;
|
private ProcessRecordsInput shardEndProcessRecordsInput;
|
||||||
|
|
||||||
@Deprecated
|
|
||||||
public ShardConsumer(
|
|
||||||
RecordsPublisher recordsPublisher,
|
|
||||||
ExecutorService executorService,
|
|
||||||
ShardInfo shardInfo,
|
|
||||||
Optional<Long> logWarningForTaskAfterMillis,
|
|
||||||
ShardConsumerArgument shardConsumerArgument,
|
|
||||||
TaskExecutionListener taskExecutionListener) {
|
|
||||||
this(
|
|
||||||
recordsPublisher,
|
|
||||||
executorService,
|
|
||||||
shardInfo,
|
|
||||||
logWarningForTaskAfterMillis,
|
|
||||||
shardConsumerArgument,
|
|
||||||
ConsumerStates.INITIAL_STATE,
|
|
||||||
ShardConsumer.metricsWrappingFunction(shardConsumerArgument.metricsFactory()),
|
|
||||||
8,
|
|
||||||
taskExecutionListener,
|
|
||||||
LifecycleConfig.DEFAULT_READ_TIMEOUTS_TO_IGNORE);
|
|
||||||
}
|
|
||||||
|
|
||||||
public ShardConsumer(
|
public ShardConsumer(
|
||||||
RecordsPublisher recordsPublisher,
|
RecordsPublisher recordsPublisher,
|
||||||
ExecutorService executorService,
|
ExecutorService executorService,
|
||||||
|
|
@ -131,36 +101,11 @@ public class ShardConsumer {
|
||||||
logWarningForTaskAfterMillis,
|
logWarningForTaskAfterMillis,
|
||||||
shardConsumerArgument,
|
shardConsumerArgument,
|
||||||
ConsumerStates.INITIAL_STATE,
|
ConsumerStates.INITIAL_STATE,
|
||||||
ShardConsumer.metricsWrappingFunction(shardConsumerArgument.metricsFactory()),
|
|
||||||
8,
|
8,
|
||||||
taskExecutionListener,
|
taskExecutionListener,
|
||||||
readTimeoutsToIgnoreBeforeWarning);
|
readTimeoutsToIgnoreBeforeWarning);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Deprecated
|
|
||||||
public ShardConsumer(
|
|
||||||
RecordsPublisher recordsPublisher,
|
|
||||||
ExecutorService executorService,
|
|
||||||
ShardInfo shardInfo,
|
|
||||||
Optional<Long> logWarningForTaskAfterMillis,
|
|
||||||
ShardConsumerArgument shardConsumerArgument,
|
|
||||||
ConsumerState initialState,
|
|
||||||
Function<ConsumerTask, ConsumerTask> taskMetricsDecorator,
|
|
||||||
int bufferSize,
|
|
||||||
TaskExecutionListener taskExecutionListener) {
|
|
||||||
this(
|
|
||||||
recordsPublisher,
|
|
||||||
executorService,
|
|
||||||
shardInfo,
|
|
||||||
logWarningForTaskAfterMillis,
|
|
||||||
shardConsumerArgument,
|
|
||||||
initialState,
|
|
||||||
taskMetricsDecorator,
|
|
||||||
bufferSize,
|
|
||||||
taskExecutionListener,
|
|
||||||
LifecycleConfig.DEFAULT_READ_TIMEOUTS_TO_IGNORE);
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// TODO: Make bufferSize configurable
|
// TODO: Make bufferSize configurable
|
||||||
//
|
//
|
||||||
|
|
@ -171,7 +116,6 @@ public class ShardConsumer {
|
||||||
Optional<Long> logWarningForTaskAfterMillis,
|
Optional<Long> logWarningForTaskAfterMillis,
|
||||||
ShardConsumerArgument shardConsumerArgument,
|
ShardConsumerArgument shardConsumerArgument,
|
||||||
ConsumerState initialState,
|
ConsumerState initialState,
|
||||||
Function<ConsumerTask, ConsumerTask> taskMetricsDecorator,
|
|
||||||
int bufferSize,
|
int bufferSize,
|
||||||
TaskExecutionListener taskExecutionListener,
|
TaskExecutionListener taskExecutionListener,
|
||||||
int readTimeoutsToIgnoreBeforeWarning) {
|
int readTimeoutsToIgnoreBeforeWarning) {
|
||||||
|
|
@ -183,7 +127,6 @@ public class ShardConsumer {
|
||||||
this.logWarningForTaskAfterMillis = logWarningForTaskAfterMillis;
|
this.logWarningForTaskAfterMillis = logWarningForTaskAfterMillis;
|
||||||
this.taskExecutionListener = taskExecutionListener;
|
this.taskExecutionListener = taskExecutionListener;
|
||||||
this.currentState = initialState;
|
this.currentState = initialState;
|
||||||
this.taskMetricsDecorator = taskMetricsDecorator;
|
|
||||||
subscriber = new ShardConsumerSubscriber(
|
subscriber = new ShardConsumerSubscriber(
|
||||||
recordsPublisher, executorService, bufferSize, this, readTimeoutsToIgnoreBeforeWarning);
|
recordsPublisher, executorService, bufferSize, this, readTimeoutsToIgnoreBeforeWarning);
|
||||||
this.bufferSize = bufferSize;
|
this.bufferSize = bufferSize;
|
||||||
|
|
@ -484,17 +427,18 @@ public class ShardConsumer {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Requests the shutdown of the this ShardConsumer. This should give the record processor a chance to checkpoint
|
* Requests the shutdown of the ShardConsumer. This should give the record processor a chance to checkpoint
|
||||||
* before being shutdown.
|
* before being shutdown.
|
||||||
*
|
*
|
||||||
* @param shutdownNotification
|
* @param shutdownNotification used to signal that the record processor has been given the chance to shut down.
|
||||||
* used to signal that the record processor has been given the chance to shutdown.
|
|
||||||
*/
|
*/
|
||||||
public void gracefulShutdown(ShutdownNotification shutdownNotification) {
|
public void gracefulShutdown(ShutdownNotification shutdownNotification) {
|
||||||
if (subscriber != null) {
|
if (subscriber != null) {
|
||||||
subscriber.cancel();
|
subscriber.cancel();
|
||||||
}
|
}
|
||||||
this.shutdownNotification = shutdownNotification;
|
if (shutdownNotification != null) {
|
||||||
|
this.shutdownNotification = shutdownNotification;
|
||||||
|
}
|
||||||
markForShutdown(ShutdownReason.REQUESTED);
|
markForShutdown(ShutdownReason.REQUESTED);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -542,21 +486,4 @@ public class ShardConsumer {
|
||||||
return shutdownReason != null;
|
return shutdownReason != null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Default task wrapping function for metrics
|
|
||||||
*
|
|
||||||
* @param metricsFactory
|
|
||||||
* the factory used for reporting metrics
|
|
||||||
* @return a function that will wrap the task with a metrics reporter
|
|
||||||
*/
|
|
||||||
private static Function<ConsumerTask, ConsumerTask> metricsWrappingFunction(MetricsFactory metricsFactory) {
|
|
||||||
return (task) -> {
|
|
||||||
if (task == null) {
|
|
||||||
return null;
|
|
||||||
} else {
|
|
||||||
return new MetricsCollectingTaskDecorator(task, metricsFactory);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,12 @@ import lombok.AccessLevel;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.leases.Lease;
|
||||||
|
import software.amazon.kinesis.leases.LeaseCoordinator;
|
||||||
import software.amazon.kinesis.leases.ShardInfo;
|
import software.amazon.kinesis.leases.ShardInfo;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.InvalidStateException;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.ProvisionedThroughputException;
|
||||||
import software.amazon.kinesis.lifecycle.events.ShutdownRequestedInput;
|
import software.amazon.kinesis.lifecycle.events.ShutdownRequestedInput;
|
||||||
import software.amazon.kinesis.processor.RecordProcessorCheckpointer;
|
import software.amazon.kinesis.processor.RecordProcessorCheckpointer;
|
||||||
import software.amazon.kinesis.processor.ShardRecordProcessor;
|
import software.amazon.kinesis.processor.ShardRecordProcessor;
|
||||||
|
|
@ -33,23 +38,41 @@ public class ShutdownNotificationTask implements ConsumerTask {
|
||||||
private final ShardRecordProcessor shardRecordProcessor;
|
private final ShardRecordProcessor shardRecordProcessor;
|
||||||
private final RecordProcessorCheckpointer recordProcessorCheckpointer;
|
private final RecordProcessorCheckpointer recordProcessorCheckpointer;
|
||||||
private final ShutdownNotification shutdownNotification;
|
private final ShutdownNotification shutdownNotification;
|
||||||
// TODO: remove if not used
|
|
||||||
private final ShardInfo shardInfo;
|
private final ShardInfo shardInfo;
|
||||||
|
private final LeaseCoordinator leaseCoordinator;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TaskResult call() {
|
public TaskResult call() {
|
||||||
|
final String leaseKey = ShardInfo.getLeaseKey(shardInfo);
|
||||||
|
final Lease currentShardLease = leaseCoordinator.getCurrentlyHeldLease(leaseKey);
|
||||||
try {
|
try {
|
||||||
try {
|
try {
|
||||||
shardRecordProcessor.shutdownRequested(ShutdownRequestedInput.builder()
|
shardRecordProcessor.shutdownRequested(ShutdownRequestedInput.builder()
|
||||||
.checkpointer(recordProcessorCheckpointer)
|
.checkpointer(recordProcessorCheckpointer)
|
||||||
.build());
|
.build());
|
||||||
|
attemptLeaseTransfer(currentShardLease);
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
return new TaskResult(ex);
|
return new TaskResult(ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
return new TaskResult(null);
|
return new TaskResult(null);
|
||||||
} finally {
|
} finally {
|
||||||
shutdownNotification.shutdownNotificationComplete();
|
if (shutdownNotification != null) {
|
||||||
|
shutdownNotification.shutdownNotificationComplete();
|
||||||
|
} else {
|
||||||
|
// shutdownNotification is null if this is a shard level graceful shutdown instead of a worker level
|
||||||
|
// one. We need to drop lease like what's done in the shutdownNotificationComplete so we can
|
||||||
|
// transition to next state.
|
||||||
|
leaseCoordinator.dropLease(currentShardLease);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void attemptLeaseTransfer(Lease lease)
|
||||||
|
throws ProvisionedThroughputException, InvalidStateException, DependencyException {
|
||||||
|
if (lease != null && lease.shutdownRequested()) {
|
||||||
|
if (leaseCoordinator.workerIdentifier().equals(lease.checkpointOwner())) {
|
||||||
|
leaseCoordinator.leaseRefresher().assignLease(lease, lease.leaseOwner());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -164,7 +164,6 @@ public class ShutdownTask implements ConsumerTask {
|
||||||
} else {
|
} else {
|
||||||
throwOnApplicationException(leaseKey, leaseLostAction, scope, startTime);
|
throwOnApplicationException(leaseKey, leaseLostAction, scope, startTime);
|
||||||
}
|
}
|
||||||
|
|
||||||
log.debug("Shutting down retrieval strategy for shard {}.", leaseKey);
|
log.debug("Shutting down retrieval strategy for shard {}.", leaseKey);
|
||||||
recordsPublisher.shutdown();
|
recordsPublisher.shutdown();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -49,7 +49,7 @@ public class RetrievalConfig {
|
||||||
*/
|
*/
|
||||||
public static final String KINESIS_CLIENT_LIB_USER_AGENT = "amazon-kinesis-client-library-java";
|
public static final String KINESIS_CLIENT_LIB_USER_AGENT = "amazon-kinesis-client-library-java";
|
||||||
|
|
||||||
public static final String KINESIS_CLIENT_LIB_USER_AGENT_VERSION = "2.6.1-SNAPSHOT";
|
public static final String KINESIS_CLIENT_LIB_USER_AGENT_VERSION = "3.0.0";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Client used to make calls to Kinesis for records retrieval
|
* Client used to make calls to Kinesis for records retrieval
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,7 @@ import lombok.NonNull;
|
||||||
import lombok.Setter;
|
import lombok.Setter;
|
||||||
import lombok.ToString;
|
import lombok.ToString;
|
||||||
import lombok.experimental.Accessors;
|
import lombok.experimental.Accessors;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import software.amazon.awssdk.services.kinesis.KinesisAsyncClient;
|
import software.amazon.awssdk.services.kinesis.KinesisAsyncClient;
|
||||||
import software.amazon.awssdk.services.kinesis.model.GetRecordsRequest;
|
import software.amazon.awssdk.services.kinesis.model.GetRecordsRequest;
|
||||||
import software.amazon.kinesis.retrieval.DataFetcherProviderConfig;
|
import software.amazon.kinesis.retrieval.DataFetcherProviderConfig;
|
||||||
|
|
@ -38,12 +39,15 @@ import software.amazon.kinesis.retrieval.RetrievalSpecificConfig;
|
||||||
@Setter
|
@Setter
|
||||||
@ToString
|
@ToString
|
||||||
@EqualsAndHashCode
|
@EqualsAndHashCode
|
||||||
|
@Slf4j
|
||||||
public class PollingConfig implements RetrievalSpecificConfig {
|
public class PollingConfig implements RetrievalSpecificConfig {
|
||||||
|
|
||||||
public static final Duration DEFAULT_REQUEST_TIMEOUT = Duration.ofSeconds(30);
|
public static final Duration DEFAULT_REQUEST_TIMEOUT = Duration.ofSeconds(30);
|
||||||
|
|
||||||
public static final int DEFAULT_MAX_RECORDS = 10000;
|
public static final int DEFAULT_MAX_RECORDS = 10000;
|
||||||
|
|
||||||
|
public static final long MIN_IDLE_MILLIS_BETWEEN_READS = 200L;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Configurable functional interface to override the existing DataFetcher.
|
* Configurable functional interface to override the existing DataFetcher.
|
||||||
*/
|
*/
|
||||||
|
|
@ -138,9 +142,18 @@ public class PollingConfig implements RetrievalSpecificConfig {
|
||||||
/**
|
/**
|
||||||
* Set the value for how long the ShardConsumer should sleep in between calls to
|
* Set the value for how long the ShardConsumer should sleep in between calls to
|
||||||
* {@link KinesisAsyncClient#getRecords(GetRecordsRequest)}. If this is not specified here the value provided in
|
* {@link KinesisAsyncClient#getRecords(GetRecordsRequest)}. If this is not specified here the value provided in
|
||||||
* {@link RecordsFetcherFactory} will be used.
|
* {@link RecordsFetcherFactory} will be used. Cannot set value below MIN_IDLE_MILLIS_BETWEEN_READS.
|
||||||
*/
|
*/
|
||||||
public PollingConfig idleTimeBetweenReadsInMillis(long idleTimeBetweenReadsInMillis) {
|
public PollingConfig idleTimeBetweenReadsInMillis(long idleTimeBetweenReadsInMillis) {
|
||||||
|
if (idleTimeBetweenReadsInMillis < MIN_IDLE_MILLIS_BETWEEN_READS) {
|
||||||
|
log.warn(
|
||||||
|
"idleTimeBetweenReadsInMillis must be greater than or equal to {} but current value is {}."
|
||||||
|
+ " Defaulting to minimum {}.",
|
||||||
|
MIN_IDLE_MILLIS_BETWEEN_READS,
|
||||||
|
idleTimeBetweenReadsInMillis,
|
||||||
|
MIN_IDLE_MILLIS_BETWEEN_READS);
|
||||||
|
idleTimeBetweenReadsInMillis = MIN_IDLE_MILLIS_BETWEEN_READS;
|
||||||
|
}
|
||||||
usePollingConfigIdleTimeValue = true;
|
usePollingConfigIdleTimeValue = true;
|
||||||
this.idleTimeBetweenReadsInMillis = idleTimeBetweenReadsInMillis;
|
this.idleTimeBetweenReadsInMillis = idleTimeBetweenReadsInMillis;
|
||||||
return this;
|
return this;
|
||||||
|
|
|
||||||
|
|
@ -61,6 +61,7 @@ import software.amazon.kinesis.retrieval.RecordsDeliveryAck;
|
||||||
import software.amazon.kinesis.retrieval.RecordsPublisher;
|
import software.amazon.kinesis.retrieval.RecordsPublisher;
|
||||||
import software.amazon.kinesis.retrieval.RecordsRetrieved;
|
import software.amazon.kinesis.retrieval.RecordsRetrieved;
|
||||||
import software.amazon.kinesis.retrieval.RetryableRetrievalException;
|
import software.amazon.kinesis.retrieval.RetryableRetrievalException;
|
||||||
|
import software.amazon.kinesis.retrieval.ThrottlingReporter;
|
||||||
import software.amazon.kinesis.retrieval.kpl.ExtendedSequenceNumber;
|
import software.amazon.kinesis.retrieval.kpl.ExtendedSequenceNumber;
|
||||||
|
|
||||||
import static software.amazon.kinesis.common.DiagnosticUtils.takeDelayedDeliveryActionIfRequired;
|
import static software.amazon.kinesis.common.DiagnosticUtils.takeDelayedDeliveryActionIfRequired;
|
||||||
|
|
@ -109,6 +110,7 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
|
||||||
private boolean wasReset = false;
|
private boolean wasReset = false;
|
||||||
private Instant lastEventDeliveryTime = Instant.EPOCH;
|
private Instant lastEventDeliveryTime = Instant.EPOCH;
|
||||||
private final RequestDetails lastSuccessfulRequestDetails = new RequestDetails();
|
private final RequestDetails lastSuccessfulRequestDetails = new RequestDetails();
|
||||||
|
private final ThrottlingReporter throttlingReporter;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@Accessors(fluent = true)
|
@Accessors(fluent = true)
|
||||||
|
|
@ -233,6 +235,7 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
|
||||||
@NonNull final MetricsFactory metricsFactory,
|
@NonNull final MetricsFactory metricsFactory,
|
||||||
@NonNull final String operation,
|
@NonNull final String operation,
|
||||||
@NonNull final String shardId,
|
@NonNull final String shardId,
|
||||||
|
final ThrottlingReporter throttlingReporter,
|
||||||
final long awaitTerminationTimeoutMillis) {
|
final long awaitTerminationTimeoutMillis) {
|
||||||
this.getRecordsRetrievalStrategy = getRecordsRetrievalStrategy;
|
this.getRecordsRetrievalStrategy = getRecordsRetrievalStrategy;
|
||||||
this.maxRecordsPerCall = maxRecordsPerCall;
|
this.maxRecordsPerCall = maxRecordsPerCall;
|
||||||
|
|
@ -248,6 +251,7 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
|
||||||
this.idleMillisBetweenCalls = idleMillisBetweenCalls;
|
this.idleMillisBetweenCalls = idleMillisBetweenCalls;
|
||||||
this.defaultGetRecordsCacheDaemon = new DefaultGetRecordsCacheDaemon();
|
this.defaultGetRecordsCacheDaemon = new DefaultGetRecordsCacheDaemon();
|
||||||
Validate.notEmpty(operation, "Operation cannot be empty");
|
Validate.notEmpty(operation, "Operation cannot be empty");
|
||||||
|
this.throttlingReporter = throttlingReporter;
|
||||||
this.operation = operation;
|
this.operation = operation;
|
||||||
this.streamId = this.getRecordsRetrievalStrategy.dataFetcher().getStreamIdentifier();
|
this.streamId = this.getRecordsRetrievalStrategy.dataFetcher().getStreamIdentifier();
|
||||||
this.streamAndShardId = this.streamId.serialize() + ":" + shardId;
|
this.streamAndShardId = this.streamId.serialize() + ":" + shardId;
|
||||||
|
|
@ -279,7 +283,8 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
|
||||||
final long idleMillisBetweenCalls,
|
final long idleMillisBetweenCalls,
|
||||||
final MetricsFactory metricsFactory,
|
final MetricsFactory metricsFactory,
|
||||||
final String operation,
|
final String operation,
|
||||||
final String shardId) {
|
final String shardId,
|
||||||
|
final ThrottlingReporter throttlingReporter) {
|
||||||
this(
|
this(
|
||||||
maxPendingProcessRecordsInput,
|
maxPendingProcessRecordsInput,
|
||||||
maxByteSize,
|
maxByteSize,
|
||||||
|
|
@ -291,6 +296,7 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
|
||||||
metricsFactory,
|
metricsFactory,
|
||||||
operation,
|
operation,
|
||||||
shardId,
|
shardId,
|
||||||
|
throttlingReporter,
|
||||||
DEFAULT_AWAIT_TERMINATION_TIMEOUT_MILLIS);
|
DEFAULT_AWAIT_TERMINATION_TIMEOUT_MILLIS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -555,6 +561,7 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
|
||||||
recordsRetrieved.lastBatchSequenceNumber);
|
recordsRetrieved.lastBatchSequenceNumber);
|
||||||
addArrivedRecordsInput(recordsRetrieved);
|
addArrivedRecordsInput(recordsRetrieved);
|
||||||
drainQueueForRequests();
|
drainQueueForRequests();
|
||||||
|
throttlingReporter.success();
|
||||||
} catch (PositionResetException pse) {
|
} catch (PositionResetException pse) {
|
||||||
throw pse;
|
throw pse;
|
||||||
} catch (RetryableRetrievalException rre) {
|
} catch (RetryableRetrievalException rre) {
|
||||||
|
|
@ -584,10 +591,11 @@ public class PrefetchRecordsPublisher implements RecordsPublisher {
|
||||||
|
|
||||||
publisherSession.dataFetcher().restartIterator();
|
publisherSession.dataFetcher().restartIterator();
|
||||||
} catch (ProvisionedThroughputExceededException e) {
|
} catch (ProvisionedThroughputExceededException e) {
|
||||||
// Update the lastSuccessfulCall if we get a throttling exception so that we back off idleMillis
|
log.error(
|
||||||
// for the next call
|
"{} : ProvisionedThroughputExceededException thrown while fetching records from Kinesis",
|
||||||
lastSuccessfulCall = Instant.now();
|
streamAndShardId,
|
||||||
log.error("{} : Exception thrown while fetching records from Kinesis", streamAndShardId, e);
|
e);
|
||||||
|
throttlingReporter.throttled();
|
||||||
} catch (SdkException e) {
|
} catch (SdkException e) {
|
||||||
log.error("{} : Exception thrown while fetching records from Kinesis", streamAndShardId, e);
|
log.error("{} : Exception thrown while fetching records from Kinesis", streamAndShardId, e);
|
||||||
} finally {
|
} finally {
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,7 @@ import software.amazon.kinesis.retrieval.DataFetchingStrategy;
|
||||||
import software.amazon.kinesis.retrieval.GetRecordsRetrievalStrategy;
|
import software.amazon.kinesis.retrieval.GetRecordsRetrievalStrategy;
|
||||||
import software.amazon.kinesis.retrieval.RecordsFetcherFactory;
|
import software.amazon.kinesis.retrieval.RecordsFetcherFactory;
|
||||||
import software.amazon.kinesis.retrieval.RecordsPublisher;
|
import software.amazon.kinesis.retrieval.RecordsPublisher;
|
||||||
|
import software.amazon.kinesis.retrieval.ThrottlingReporter;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@KinesisClientInternalApi
|
@KinesisClientInternalApi
|
||||||
|
|
@ -32,6 +33,7 @@ public class SimpleRecordsFetcherFactory implements RecordsFetcherFactory {
|
||||||
private int maxByteSize = 8 * 1024 * 1024;
|
private int maxByteSize = 8 * 1024 * 1024;
|
||||||
private int maxRecordsCount = 30000;
|
private int maxRecordsCount = 30000;
|
||||||
private long idleMillisBetweenCalls = 1500L;
|
private long idleMillisBetweenCalls = 1500L;
|
||||||
|
private int maxConsecutiveThrottles = 5;
|
||||||
private DataFetchingStrategy dataFetchingStrategy = DataFetchingStrategy.DEFAULT;
|
private DataFetchingStrategy dataFetchingStrategy = DataFetchingStrategy.DEFAULT;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
@ -56,7 +58,8 @@ public class SimpleRecordsFetcherFactory implements RecordsFetcherFactory {
|
||||||
idleMillisBetweenCalls,
|
idleMillisBetweenCalls,
|
||||||
metricsFactory,
|
metricsFactory,
|
||||||
"ProcessTask",
|
"ProcessTask",
|
||||||
shardId);
|
shardId,
|
||||||
|
new ThrottlingReporter(maxConsecutiveThrottles, shardId));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,6 @@ import java.util.List;
|
||||||
import com.amazonaws.services.schemaregistry.common.Schema;
|
import com.amazonaws.services.schemaregistry.common.Schema;
|
||||||
import com.amazonaws.services.schemaregistry.deserializers.GlueSchemaRegistryDeserializer;
|
import com.amazonaws.services.schemaregistry.deserializers.GlueSchemaRegistryDeserializer;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import software.amazon.kinesis.common.KinesisClientLibraryPackage;
|
|
||||||
import software.amazon.kinesis.retrieval.KinesisClientRecord;
|
import software.amazon.kinesis.retrieval.KinesisClientRecord;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -15,7 +14,7 @@ import software.amazon.kinesis.retrieval.KinesisClientRecord;
|
||||||
*/
|
*/
|
||||||
@Slf4j
|
@Slf4j
|
||||||
public class SchemaRegistryDecoder {
|
public class SchemaRegistryDecoder {
|
||||||
private static final String USER_AGENT_APP_NAME = "kcl" + "-" + KinesisClientLibraryPackage.VERSION;
|
private static final String USER_AGENT_APP_NAME = "kcl" + "-" + "3.0.0";
|
||||||
private final GlueSchemaRegistryDeserializer glueSchemaRegistryDeserializer;
|
private final GlueSchemaRegistryDeserializer glueSchemaRegistryDeserializer;
|
||||||
|
|
||||||
public SchemaRegistryDecoder(GlueSchemaRegistryDeserializer glueSchemaRegistryDeserializer) {
|
public SchemaRegistryDecoder(GlueSchemaRegistryDeserializer glueSchemaRegistryDeserializer) {
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,61 @@
|
||||||
|
package software.amazon.kinesis.utils;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileReader;
|
||||||
|
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
public class Cgroup {
|
||||||
|
|
||||||
|
public static String readSingleLineFile(String path) {
|
||||||
|
BufferedReader bufferedReader = null;
|
||||||
|
try {
|
||||||
|
final File file = new File(path);
|
||||||
|
if (file.exists()) {
|
||||||
|
bufferedReader = new BufferedReader(new FileReader(file));
|
||||||
|
return bufferedReader.readLine();
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException(String.format("Failed to read file. %s does not exist", path));
|
||||||
|
}
|
||||||
|
} catch (final Throwable t) {
|
||||||
|
if (t instanceof IllegalArgumentException) {
|
||||||
|
throw (IllegalArgumentException) t;
|
||||||
|
}
|
||||||
|
throw new IllegalArgumentException("Failed to read file.", t);
|
||||||
|
} finally {
|
||||||
|
try {
|
||||||
|
if (bufferedReader != null) {
|
||||||
|
bufferedReader.close();
|
||||||
|
}
|
||||||
|
} catch (Throwable x) {
|
||||||
|
log.warn("Failed to close bufferedReader ", x);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the number of available cpus from the cpuset
|
||||||
|
* See https://docs.kernel.org/admin-guide/cgroup-v2.html#cpuset for more information
|
||||||
|
* "0-7" represents 8 cores
|
||||||
|
* "0-4,6,8-10" represents 9 cores (cores 0,1,2,3,4 and core 6 and core 8,9,10)
|
||||||
|
* @param cpuSet a single line from the cgroup cpuset file
|
||||||
|
* @return the number of available cpus
|
||||||
|
*/
|
||||||
|
public static int getAvailableCpusFromEffectiveCpuSet(final String cpuSet) {
|
||||||
|
final String[] cpuSetArr = cpuSet.split(",");
|
||||||
|
|
||||||
|
int sumCpus = 0;
|
||||||
|
for (String cpuSetGroup : cpuSetArr) {
|
||||||
|
if (cpuSetGroup.contains("-")) {
|
||||||
|
final String[] cpuSetGroupSplit = cpuSetGroup.split("-");
|
||||||
|
// Values are inclusive
|
||||||
|
sumCpus += Integer.parseInt(cpuSetGroupSplit[1]) - Integer.parseInt(cpuSetGroupSplit[0]) + 1;
|
||||||
|
} else {
|
||||||
|
sumCpus += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sumCpus;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,31 @@
|
||||||
|
package software.amazon.kinesis.utils;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Uses the formula mentioned below for simple ExponentialMovingAverage
|
||||||
|
* <a href="https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average"/>
|
||||||
|
*
|
||||||
|
* Values of alpha close to 1 have less of a smoothing effect and give greater weight to recent changes in the data,
|
||||||
|
* while values of alpha closer to 0 have a greater smoothing effect and are less responsive to recent changes.
|
||||||
|
*/
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class ExponentialMovingAverage {
|
||||||
|
|
||||||
|
private final double alpha;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
private double value;
|
||||||
|
|
||||||
|
private boolean initialized = false;
|
||||||
|
|
||||||
|
public void add(final double newValue) {
|
||||||
|
if (!initialized) {
|
||||||
|
this.value = newValue;
|
||||||
|
initialized = true;
|
||||||
|
} else {
|
||||||
|
this.value = alpha * newValue + (1 - alpha) * this.value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
package software.amazon.kinesis.utils;
|
||||||
|
|
||||||
|
import java.util.AbstractMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class Statistics {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the simple mean of the given values
|
||||||
|
* @param values list of values (double)
|
||||||
|
* @return mean of the given values, if the {@param values} is empty then returns 0;
|
||||||
|
*/
|
||||||
|
public static double calculateSimpleMean(final List<Double> values) {
|
||||||
|
if (values.isEmpty()) {
|
||||||
|
return 0D;
|
||||||
|
}
|
||||||
|
double sum = 0.0;
|
||||||
|
for (final double i : values) {
|
||||||
|
sum += i;
|
||||||
|
}
|
||||||
|
return sum / values.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For the given values find the standard deviation (SD).
|
||||||
|
* For details of SD calculation ref : <a href="https://en.wikipedia.org/wiki/Standard_deviation"/>
|
||||||
|
* @param values list of values (double)
|
||||||
|
* @return Map.Entry of mean to standard deviation for {@param values}, if {@param values} is empty then return
|
||||||
|
* Map.Entry with 0 as mean and 0 as SD.
|
||||||
|
*/
|
||||||
|
public static Map.Entry<Double, Double> calculateStandardDeviationAndMean(final List<Double> values) {
|
||||||
|
if (values.isEmpty()) {
|
||||||
|
return new AbstractMap.SimpleEntry<>(0D, 0D);
|
||||||
|
}
|
||||||
|
final double mean = calculateSimpleMean(values);
|
||||||
|
// calculate the standard deviation
|
||||||
|
double standardDeviation = 0.0;
|
||||||
|
for (final double num : values) {
|
||||||
|
standardDeviation += Math.pow(num - mean, 2);
|
||||||
|
}
|
||||||
|
return new AbstractMap.SimpleEntry<>(mean, Math.sqrt(standardDeviation / values.size()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,92 @@
|
||||||
|
package software.amazon.kinesis.worker;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||||
|
import software.amazon.kinesis.worker.metric.impl.container.Cgroupv1CpuWorkerMetric;
|
||||||
|
import software.amazon.kinesis.worker.metric.impl.container.Cgroupv2CpuWorkerMetric;
|
||||||
|
import software.amazon.kinesis.worker.metric.impl.container.EcsCpuWorkerMetric;
|
||||||
|
import software.amazon.kinesis.worker.metric.impl.linux.LinuxCpuWorkerMetric;
|
||||||
|
import software.amazon.kinesis.worker.platform.Ec2Resource;
|
||||||
|
import software.amazon.kinesis.worker.platform.EcsResource;
|
||||||
|
import software.amazon.kinesis.worker.platform.EksResource;
|
||||||
|
import software.amazon.kinesis.worker.platform.OperatingRangeDataProvider;
|
||||||
|
import software.amazon.kinesis.worker.platform.ResourceMetadataProvider;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class to select appropriate WorkerMetricStats based on the operating range provider that is available on the instance.
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
public class WorkerMetricsSelector {
|
||||||
|
|
||||||
|
private static final OperatingRange DEFAULT_100_PERC_UTILIZED_OPERATING_RANGE =
|
||||||
|
OperatingRange.builder().maxUtilization(100).build();
|
||||||
|
|
||||||
|
private final List<ResourceMetadataProvider> workerComputePlatforms;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory method to create an instance of WorkerMetricsSelector.
|
||||||
|
*
|
||||||
|
* @return WorkerMetricsSelector instance
|
||||||
|
*/
|
||||||
|
public static WorkerMetricsSelector create() {
|
||||||
|
final List<ResourceMetadataProvider> resourceMetadataProviders = new ArrayList<>();
|
||||||
|
resourceMetadataProviders.add(EcsResource.create());
|
||||||
|
resourceMetadataProviders.add(EksResource.create());
|
||||||
|
// ec2 has to be the last one to check
|
||||||
|
resourceMetadataProviders.add(Ec2Resource.create());
|
||||||
|
return new WorkerMetricsSelector(resourceMetadataProviders);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Optional<OperatingRangeDataProvider> getOperatingRangeDataProvider() {
|
||||||
|
for (ResourceMetadataProvider platform : workerComputePlatforms) {
|
||||||
|
if (platform.isOnPlatform()) {
|
||||||
|
final ResourceMetadataProvider.ComputePlatform computePlatform = platform.getPlatform();
|
||||||
|
log.info("Worker is running on {}", computePlatform);
|
||||||
|
return platform.getOperatingRangeDataProvider();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a list of WorkerMetricStats based on the operating range provider the worker uses.
|
||||||
|
*
|
||||||
|
* @return List of WorkerMetricStats
|
||||||
|
*/
|
||||||
|
public List<WorkerMetric> getDefaultWorkerMetrics() {
|
||||||
|
final List<WorkerMetric> workerMetrics = new ArrayList<>();
|
||||||
|
final Optional<OperatingRangeDataProvider> optionalProvider = getOperatingRangeDataProvider();
|
||||||
|
if (!optionalProvider.isPresent()) {
|
||||||
|
log.warn("Did not find an operating range metadata provider.");
|
||||||
|
return workerMetrics;
|
||||||
|
}
|
||||||
|
final OperatingRangeDataProvider dataProvider = optionalProvider.get();
|
||||||
|
log.info("Worker has operating range metadata provider {} ", dataProvider);
|
||||||
|
switch (dataProvider) {
|
||||||
|
case LINUX_PROC:
|
||||||
|
workerMetrics.add(new LinuxCpuWorkerMetric(DEFAULT_100_PERC_UTILIZED_OPERATING_RANGE));
|
||||||
|
break;
|
||||||
|
case LINUX_ECS_METADATA_KEY_V4:
|
||||||
|
workerMetrics.add(new EcsCpuWorkerMetric(DEFAULT_100_PERC_UTILIZED_OPERATING_RANGE));
|
||||||
|
break;
|
||||||
|
case LINUX_EKS_CGROUP_V2:
|
||||||
|
workerMetrics.add(new Cgroupv2CpuWorkerMetric(DEFAULT_100_PERC_UTILIZED_OPERATING_RANGE));
|
||||||
|
break;
|
||||||
|
case LINUX_EKS_CGROUP_V1:
|
||||||
|
workerMetrics.add(new Cgroupv1CpuWorkerMetric(DEFAULT_100_PERC_UTILIZED_OPERATING_RANGE));
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return workerMetrics;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,20 @@
|
||||||
|
package software.amazon.kinesis.worker.metric;
|
||||||
|
|
||||||
|
import com.google.common.base.Preconditions;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
public class OperatingRange {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Max utilization percentage allowed for the workerMetrics.
|
||||||
|
*/
|
||||||
|
private final int maxUtilization;
|
||||||
|
|
||||||
|
private OperatingRange(final int maxUtilization) {
|
||||||
|
Preconditions.checkArgument(!(maxUtilization < 0 || maxUtilization > 100), "Invalid maxUtilization value");
|
||||||
|
this.maxUtilization = maxUtilization;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,52 @@
|
||||||
|
package software.amazon.kinesis.worker.metric;
|
||||||
|
|
||||||
|
import com.google.common.base.Preconditions;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NonNull;
|
||||||
|
|
||||||
|
public interface WorkerMetric {
|
||||||
|
/**
|
||||||
|
* WorkerMetricStats short name that is used as attribute name for it in storage.
|
||||||
|
* @return short name for the WorkerMetricStats
|
||||||
|
*/
|
||||||
|
String getShortName();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Current WorkerMetricValue. WorkerMetricValue is a normalized percentage value to its max configured limits.
|
||||||
|
* E.g., if for a worker max network bandwidth is 10Gbps and current used bandwidth is 2Gbps, then WorkerMetricValue for
|
||||||
|
* NetworkWorkerMetrics will be 20 (%).
|
||||||
|
*
|
||||||
|
* @return WorkerMetricValue between 0 and 100 (both inclusive)
|
||||||
|
*/
|
||||||
|
WorkerMetricValue capture();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the operating range for this workerMetrics
|
||||||
|
* @return Operating range for this workerMetrics
|
||||||
|
*/
|
||||||
|
OperatingRange getOperatingRange();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Type of the current WorkerMetricStats.
|
||||||
|
* @return WorkerMetricType
|
||||||
|
*/
|
||||||
|
WorkerMetricType getWorkerMetricType();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* WorkerMetricValue model class is used as return type for the capture() method to have a strong checks at the build
|
||||||
|
* time of the object itself.
|
||||||
|
*/
|
||||||
|
@Builder
|
||||||
|
class WorkerMetricValue {
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
private final Double value;
|
||||||
|
|
||||||
|
private WorkerMetricValue(@NonNull final Double value) {
|
||||||
|
Preconditions.checkArgument(
|
||||||
|
!(value < 0 || value > 100), value + " is either less than 0 or greater than 100");
|
||||||
|
this.value = value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,16 @@
|
||||||
|
package software.amazon.kinesis.worker.metric;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public enum WorkerMetricType {
|
||||||
|
CPU("C"),
|
||||||
|
MEMORY("M"),
|
||||||
|
NETWORK_IN("NI"),
|
||||||
|
NETWORK_OUT("NO"),
|
||||||
|
THROUGHPUT("T");
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
private final String shortName;
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,128 @@
|
||||||
|
package software.amazon.kinesis.worker.metric.impl.container;
|
||||||
|
|
||||||
|
import java.time.Clock;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||||
|
|
||||||
|
import static software.amazon.kinesis.utils.Cgroup.getAvailableCpusFromEffectiveCpuSet;
|
||||||
|
import static software.amazon.kinesis.utils.Cgroup.readSingleLineFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utilizes Linux Control Groups by reading cpu time and available cpu from cgroup directory.This works for Elastic
|
||||||
|
* Kubernetes Service (EKS) containers running on Linux instances which use cgroupv1.
|
||||||
|
*
|
||||||
|
* EC2 instances must use a Linux instance that uses cgroupv1. Amazon Linux 2 uses cgroupv1.
|
||||||
|
* Fargate versions 1.4.0 and 1.3.0 use Amazon Linux 2 and can use this.
|
||||||
|
*
|
||||||
|
* CPU time is measured in CPU cores time. A container is limited by amount of CPU core time it is allocated. So if over
|
||||||
|
* a second the container uses 0.5 CPU core time and is allocated 2 CPU cores, the cpu utilization would be 25%.
|
||||||
|
*
|
||||||
|
* When this is invoked for the first time, the value returned is always 0 as the prev values are not available
|
||||||
|
* to calculate the diff.
|
||||||
|
* In case the file is not present or any other exception occurs, this throws IllegalArgumentException.
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
@RequiredArgsConstructor(access = AccessLevel.PACKAGE)
|
||||||
|
public class Cgroupv1CpuWorkerMetric implements WorkerMetric {
|
||||||
|
|
||||||
|
private static final Object LOCK_OBJECT = new Object();
|
||||||
|
private static final WorkerMetricType CPU_WORKER_METRICS_TYPE = WorkerMetricType.CPU;
|
||||||
|
private static final String CGROUP_ROOT = "/sys/fs/cgroup/";
|
||||||
|
private static final String CPU_TIME_FILE = CGROUP_ROOT + "cpu/cpuacct.usage";
|
||||||
|
private static final String CPU_CFS_QUOTA_FILE = CGROUP_ROOT + "cpu/cpu.cfs_quota_us";
|
||||||
|
private static final String CPU_CFS_PERIOD_FILE = CGROUP_ROOT + "cpu/cpu.cfs_period_us";
|
||||||
|
private static final String EFFECTIVE_CPU_SET_FILE = CGROUP_ROOT + "cpuset/cpuset.effective_cpus";
|
||||||
|
private final OperatingRange operatingRange;
|
||||||
|
private final String cpuTimeFile;
|
||||||
|
private final String cfsQuotaFile;
|
||||||
|
private final String cfsPeriodFile;
|
||||||
|
private final String effectiveCpuSetFile;
|
||||||
|
private final Clock clock;
|
||||||
|
private double cpuLimit = -1;
|
||||||
|
private long lastCpuUseTimeNanos = 0;
|
||||||
|
private long lastSystemTimeNanos = 0;
|
||||||
|
|
||||||
|
public Cgroupv1CpuWorkerMetric(final OperatingRange operatingRange) {
|
||||||
|
this(
|
||||||
|
operatingRange,
|
||||||
|
CPU_TIME_FILE,
|
||||||
|
CPU_CFS_QUOTA_FILE,
|
||||||
|
CPU_CFS_PERIOD_FILE,
|
||||||
|
EFFECTIVE_CPU_SET_FILE,
|
||||||
|
Clock.systemUTC());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getShortName() {
|
||||||
|
return CPU_WORKER_METRICS_TYPE.getShortName();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public WorkerMetricValue capture() {
|
||||||
|
return WorkerMetricValue.builder().value(calculateCpuUsage()).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
private double calculateCpuUsage() {
|
||||||
|
if (cpuLimit == -1) {
|
||||||
|
cpuLimit = calculateCpuLimit();
|
||||||
|
}
|
||||||
|
|
||||||
|
final long cpuTimeNanos = Long.parseLong(readSingleLineFile(cpuTimeFile));
|
||||||
|
final long currentTimeNanos = TimeUnit.MILLISECONDS.toNanos(clock.millis());
|
||||||
|
|
||||||
|
boolean skip = false;
|
||||||
|
double cpuCoreTimeUsed;
|
||||||
|
synchronized (LOCK_OBJECT) {
|
||||||
|
if (lastCpuUseTimeNanos == 0 && lastSystemTimeNanos == 0) {
|
||||||
|
// Case where this is a first call so no diff available
|
||||||
|
skip = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
final long nanoTimeDiff = currentTimeNanos - lastSystemTimeNanos;
|
||||||
|
final long cpuUseDiff = cpuTimeNanos - lastCpuUseTimeNanos;
|
||||||
|
// This value is not a percent, but rather how much CPU core time was consumed. i.e. this number can be
|
||||||
|
// 2.2 which stands for 2.2 CPU cores were fully utilized. If this number is less than 1 than that means
|
||||||
|
// that less than 1 CPU core was used.
|
||||||
|
cpuCoreTimeUsed = ((double) cpuUseDiff / nanoTimeDiff);
|
||||||
|
|
||||||
|
lastCpuUseTimeNanos = cpuTimeNanos;
|
||||||
|
lastSystemTimeNanos = currentTimeNanos;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (skip) {
|
||||||
|
return 0D;
|
||||||
|
} else {
|
||||||
|
// In case of rounding error, treat everything above 100% as 100%
|
||||||
|
return Math.min(100.0, cpuCoreTimeUsed / cpuLimit * 100.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private double calculateCpuLimit() {
|
||||||
|
// Documentation on these values:
|
||||||
|
// https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/6/html/resource_management_guide/sec-cpu#sect-cfs
|
||||||
|
final long cfsQuota = Long.parseLong(readSingleLineFile(cfsQuotaFile));
|
||||||
|
final long cfsPeriod = Long.parseLong(readSingleLineFile(cfsPeriodFile));
|
||||||
|
if (cfsQuota == -1) {
|
||||||
|
// If quota is -1, a limit is not set on the container. The container can use all available cores.
|
||||||
|
return getAvailableCpusFromEffectiveCpuSet(readSingleLineFile(effectiveCpuSetFile));
|
||||||
|
} else {
|
||||||
|
return ((double) cfsQuota) / cfsPeriod;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public OperatingRange getOperatingRange() {
|
||||||
|
return operatingRange;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public WorkerMetricType getWorkerMetricType() {
|
||||||
|
return CPU_WORKER_METRICS_TYPE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,128 @@
|
||||||
|
package software.amazon.kinesis.worker.metric.impl.container;
|
||||||
|
|
||||||
|
import java.time.Clock;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||||
|
|
||||||
|
import static software.amazon.kinesis.utils.Cgroup.getAvailableCpusFromEffectiveCpuSet;
|
||||||
|
import static software.amazon.kinesis.utils.Cgroup.readSingleLineFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utilizes Linux Control Groups by reading cpu time and available cpu from cgroup directory. This works for Elastic
|
||||||
|
* Kubernetes Service (EKS) containers running on Linux instances which use cgroupv2.
|
||||||
|
*
|
||||||
|
* EC2 instances must use a Linux instance that uses cgroupv2. Amazon Linux 2023 uses cgroupv2.
|
||||||
|
*
|
||||||
|
* CPU time is measured in CPU cores time. A container is limited by amount of CPU core time it is allocated. So if over
|
||||||
|
* a second the container uses 0.5 CPU core time and is allocated 2 CPU cores, the cpu utilization would be 25%.
|
||||||
|
*
|
||||||
|
* When this is invoked for the first time, the value returned is always 0 as the prev values are not available
|
||||||
|
* to calculate the diff.
|
||||||
|
* In case the file is not present or any other exception occurs, this throws IllegalArgumentException.
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
@RequiredArgsConstructor(access = AccessLevel.PACKAGE)
|
||||||
|
public class Cgroupv2CpuWorkerMetric implements WorkerMetric {
|
||||||
|
|
||||||
|
private static final Object LOCK_OBJECT = new Object();
|
||||||
|
private static final WorkerMetricType CPU_WORKER_METRICS_TYPE = WorkerMetricType.CPU;
|
||||||
|
private static final String CGROUP_ROOT = "/sys/fs/cgroup/";
|
||||||
|
private static final String CPU_MAX_FILE = CGROUP_ROOT + "cpu.max";
|
||||||
|
private static final String EFFECTIVE_CPU_SET_FILE = CGROUP_ROOT + "cpuset.cpus.effective";
|
||||||
|
private static final String CPU_STAT_FILE = CGROUP_ROOT + "cpu.stat";
|
||||||
|
private final OperatingRange operatingRange;
|
||||||
|
private final String cpuMaxFile;
|
||||||
|
private final String effectiveCpuSetFile;
|
||||||
|
private final String cpuStatFile;
|
||||||
|
private final Clock clock;
|
||||||
|
private double cpuLimit = -1;
|
||||||
|
private long lastCpuUseTimeMicros = 0;
|
||||||
|
private long lastSystemTimeMicros = 0;
|
||||||
|
|
||||||
|
public Cgroupv2CpuWorkerMetric(final OperatingRange operatingRange) {
|
||||||
|
this(operatingRange, CPU_MAX_FILE, EFFECTIVE_CPU_SET_FILE, CPU_STAT_FILE, Clock.systemUTC());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getShortName() {
|
||||||
|
return CPU_WORKER_METRICS_TYPE.getShortName();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public WorkerMetricValue capture() {
|
||||||
|
return WorkerMetricValue.builder().value(calculateCpuUsage()).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
private double calculateCpuUsage() {
|
||||||
|
if (cpuLimit == -1) {
|
||||||
|
cpuLimit = calculateCpuLimit();
|
||||||
|
}
|
||||||
|
|
||||||
|
// The first line of this file is of the format
|
||||||
|
// usage_usec $MICROSECONDS
|
||||||
|
// where $MICROSECONDS is always a number
|
||||||
|
final String cpuUsageStat = readSingleLineFile(cpuStatFile);
|
||||||
|
final long cpuTimeMicros = Long.parseLong(cpuUsageStat.split(" ")[1]);
|
||||||
|
final long currentTimeMicros = TimeUnit.MILLISECONDS.toMicros(clock.millis());
|
||||||
|
|
||||||
|
boolean skip = false;
|
||||||
|
double cpuCoreTimeUsed;
|
||||||
|
synchronized (LOCK_OBJECT) {
|
||||||
|
if (lastCpuUseTimeMicros == 0 && lastSystemTimeMicros == 0) {
|
||||||
|
// Case where this is a first call so no diff available
|
||||||
|
skip = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
final long microTimeDiff = currentTimeMicros - lastSystemTimeMicros;
|
||||||
|
final long cpuUseDiff = cpuTimeMicros - lastCpuUseTimeMicros;
|
||||||
|
// This value is not a percent, but rather how much CPU core time was consumed. i.e. this number can be
|
||||||
|
// 2.2 which stands for 2.2 CPU cores were fully utilized. If this number is less than 1 than that means
|
||||||
|
// that less than 1 CPU core was used.
|
||||||
|
cpuCoreTimeUsed = ((double) cpuUseDiff / microTimeDiff);
|
||||||
|
|
||||||
|
lastCpuUseTimeMicros = cpuTimeMicros;
|
||||||
|
lastSystemTimeMicros = currentTimeMicros;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (skip) {
|
||||||
|
return 0D;
|
||||||
|
} else {
|
||||||
|
// In case of rounding error, treat everything above 100% as 100%
|
||||||
|
return Math.min(100.0, cpuCoreTimeUsed / cpuLimit * 100.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private double calculateCpuLimit() {
|
||||||
|
// This file contains two values separated by space ($MAX $PERIOD).
|
||||||
|
// $MAX is either a number or "max"
|
||||||
|
// $PERIOD is always a number
|
||||||
|
final String cpuMax = readSingleLineFile(cpuMaxFile);
|
||||||
|
final String[] cpuMaxArr = cpuMax.split(" ");
|
||||||
|
final String max = cpuMaxArr[0];
|
||||||
|
final String period = cpuMaxArr[1];
|
||||||
|
|
||||||
|
if (max.equals("max")) {
|
||||||
|
// if first value in file is "max", a limit is not set on the container. The container can use all available
|
||||||
|
// cores
|
||||||
|
return getAvailableCpusFromEffectiveCpuSet(readSingleLineFile(effectiveCpuSetFile));
|
||||||
|
} else {
|
||||||
|
return Double.parseDouble(max) / Long.parseLong(period);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public OperatingRange getOperatingRange() {
|
||||||
|
return operatingRange;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public WorkerMetricType getWorkerMetricType() {
|
||||||
|
return CPU_WORKER_METRICS_TYPE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,203 @@
|
||||||
|
package software.amazon.kinesis.worker.metric.impl.container;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.JsonNode;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Queries the Amazon ECS task metadata endpoint version 4 to get CPU metric stats as well as allocated CPU to the ECS task and
|
||||||
|
* containers to calculate percent CPU utilization. This works for all ECS containers running on the following
|
||||||
|
* platforms:
|
||||||
|
*
|
||||||
|
* Fargate agent version 1.4.0
|
||||||
|
* EC2 instance running at least 1.39.0 of the Amazon ECS container agent
|
||||||
|
*
|
||||||
|
* For more information, see
|
||||||
|
* https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-metadata-endpoint-v4.html
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
@RequiredArgsConstructor(access = AccessLevel.PACKAGE)
|
||||||
|
public class EcsCpuWorkerMetric implements WorkerMetric {
|
||||||
|
|
||||||
|
private static final WorkerMetricType CPU_WORKER_METRICS_TYPE = WorkerMetricType.CPU;
|
||||||
|
private static final String SYS_VAR_ECS_METADATA_URI = "ECS_CONTAINER_METADATA_URI_V4";
|
||||||
|
private final OperatingRange operatingRange;
|
||||||
|
private final String containerStatsUri;
|
||||||
|
private final String taskMetadataUri;
|
||||||
|
private final String containerMetadataUri;
|
||||||
|
private double containerCpuLimit = -1;
|
||||||
|
private double onlineCpus = -1;
|
||||||
|
|
||||||
|
public EcsCpuWorkerMetric(final OperatingRange operatingRange) {
|
||||||
|
this.operatingRange = operatingRange;
|
||||||
|
|
||||||
|
final String ecsMetadataRootUri = System.getenv(SYS_VAR_ECS_METADATA_URI);
|
||||||
|
if (ecsMetadataRootUri != null) {
|
||||||
|
this.containerStatsUri = ecsMetadataRootUri + "/stats";
|
||||||
|
this.taskMetadataUri = ecsMetadataRootUri + "/task";
|
||||||
|
this.containerMetadataUri = ecsMetadataRootUri;
|
||||||
|
} else {
|
||||||
|
this.containerStatsUri = null;
|
||||||
|
this.taskMetadataUri = null;
|
||||||
|
this.containerMetadataUri = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getShortName() {
|
||||||
|
return CPU_WORKER_METRICS_TYPE.getShortName();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public WorkerMetricValue capture() {
|
||||||
|
return WorkerMetricValue.builder().value(calculateCpuUsage()).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
private double calculateCpuUsage() {
|
||||||
|
// Read current container metrics
|
||||||
|
final JsonNode containerStatsRootNode = readEcsMetadata(containerStatsUri);
|
||||||
|
|
||||||
|
final long cpuUsage = containerStatsRootNode
|
||||||
|
.path("cpu_stats")
|
||||||
|
.path("cpu_usage")
|
||||||
|
.path("total_usage")
|
||||||
|
.asLong();
|
||||||
|
final long systemCpuUsage = containerStatsRootNode
|
||||||
|
.path("cpu_stats")
|
||||||
|
.path("system_cpu_usage")
|
||||||
|
.asLong();
|
||||||
|
final long prevCpuUsage = containerStatsRootNode
|
||||||
|
.path("precpu_stats")
|
||||||
|
.path("cpu_usage")
|
||||||
|
.path("total_usage")
|
||||||
|
.asLong();
|
||||||
|
final long prevSystemCpuUsage = containerStatsRootNode
|
||||||
|
.path("precpu_stats")
|
||||||
|
.path("system_cpu_usage")
|
||||||
|
.asLong();
|
||||||
|
|
||||||
|
if (containerCpuLimit == -1 && onlineCpus == -1) {
|
||||||
|
onlineCpus =
|
||||||
|
containerStatsRootNode.path("cpu_stats").path("online_cpus").asDouble();
|
||||||
|
containerCpuLimit = calculateContainerCpuLimit(onlineCpus);
|
||||||
|
}
|
||||||
|
|
||||||
|
// precpu_stats values will be 0 if it is the first call
|
||||||
|
if (prevCpuUsage == 0 && prevSystemCpuUsage == 0) {
|
||||||
|
return 0D;
|
||||||
|
}
|
||||||
|
|
||||||
|
final long cpuUsageDiff = cpuUsage - prevCpuUsage;
|
||||||
|
final long systemCpuUsageDiff = systemCpuUsage - prevSystemCpuUsage;
|
||||||
|
|
||||||
|
// Edge case when there is no systemCpu usage, then that means that 100% of the cpu is used.
|
||||||
|
if (systemCpuUsageDiff == 0) {
|
||||||
|
return 100D;
|
||||||
|
}
|
||||||
|
|
||||||
|
// This value is not a percent, but rather how much CPU core time was consumed. i.e. this number can be
|
||||||
|
// 2.2 which stands for 2.2 CPU cores were fully utilized. If this number is less than 1 than that means
|
||||||
|
// that less than 1 CPU core was used.
|
||||||
|
final double cpuCoreTimeUsed = ((double) cpuUsageDiff) / systemCpuUsageDiff * onlineCpus;
|
||||||
|
|
||||||
|
// This calculated value is cpu utilization percent. This can burst past 100%, but we will take min with 100%
|
||||||
|
// because only this amount is guaranteed CPU time to the container
|
||||||
|
return Math.min(100.0, cpuCoreTimeUsed / containerCpuLimit * 100.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * All containers in an ECS task can use up to the task level CPU limit. However, CPU is shared among all containers
 * in the task according to the relative ratio of CPU shares allocated to each container.
 * i.e.
 * CPU limit of task is 8 cores
 * Container 1 with 10 CPU shares
 * Container 2 with 30 CPU shares
 * Sum of CPU shares is 40
 * Container 1 can use 25% of the 8 cores in CPU core time, so this function returns 2
 * Container 2 can use 75% of the 8 cores in CPU core time, so this function returns 6
 * @param onlineCpus number of CPUs visible to the container, used as the task limit fallback
 * @return the CPU core time allocated to the container
 */
private double calculateContainerCpuLimit(double onlineCpus) {
    // Read task metadata
    final JsonNode taskStatsRootNode = readEcsMetadata(taskMetadataUri);
    double taskCpuLimit = calculateTaskCpuLimit(taskStatsRootNode, onlineCpus);

    // Read current container metadata to identify this container among the task's containers.
    final String currentContainerId =
            readEcsMetadata(containerMetadataUri).path("DockerId").asText();
    final Iterator<JsonNode> containersIterator =
            taskStatsRootNode.path("Containers").iterator();

    // The default if this value is not provided is 2 CPU shares (in ECS agent versions >= 1.2.0)
    int currentContainerCpuShare = 2;
    int containersCpuShareSum = 0;
    while (containersIterator.hasNext()) {
        final JsonNode containerNode = containersIterator.next();
        // Limits.CPU here is the container-level CPU *share* value, not a core count.
        final int containerCpuShare =
                containerNode.path("Limits").path("CPU").asInt();
        if (containerNode.path("DockerId").asText().equals(currentContainerId)) {
            currentContainerCpuShare = containerCpuShare;
        }
        containersCpuShareSum += containerCpuShare;
    }
    // This container's slice of the task limit, proportional to its share of the total shares.
    // NOTE(review): if no container declares a CPU share, containersCpuShareSum is 0 and this
    // divides by zero (yielding Infinity/NaN) — confirm whether ECS guarantees a nonzero sum.
    return ((double) currentContainerCpuShare) / containersCpuShareSum * taskCpuLimit;
}
|
||||||
|
|
||||||
|
private double calculateTaskCpuLimit(JsonNode taskStatsRootNode, double onlineCpus) {
|
||||||
|
final JsonNode limitsNode = taskStatsRootNode.path("Limits");
|
||||||
|
if (limitsNode.isMissingNode()) {
|
||||||
|
// Neither a memory limit nor cpu limit is set at the task level (possible on EC2 instances)
|
||||||
|
return onlineCpus;
|
||||||
|
}
|
||||||
|
final JsonNode cpuLimitsNode = limitsNode.path("CPU");
|
||||||
|
if (cpuLimitsNode.isMissingNode()) {
|
||||||
|
// When only a memory limit is set at the task level (possible on ec2 instances)
|
||||||
|
return onlineCpus;
|
||||||
|
}
|
||||||
|
return cpuLimitsNode.asDouble();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Fetches the given ECS metadata endpoint over HTTP and parses the response as JSON.
 *
 * @param uri metadata endpoint to read (container stats, task metadata, or container metadata)
 * @return the parsed JSON root node
 * @throws IllegalArgumentException if no metadata endpoint is configured, the URI is malformed,
 *         or the endpoint cannot be read/parsed
 */
private JsonNode readEcsMetadata(String uri) {
    // NOTE(review): this guards on the containerMetadataUri field rather than the uri parameter;
    // presumably all metadata URIs derive from the same environment variable, so a null field
    // implies none are available — confirm against how the URIs are initialized.
    if (this.containerMetadataUri == null) {
        throw new IllegalArgumentException("No ECS metadata endpoint found from environment variables.");
    }

    URL url;
    try {
        url = new URL(uri);
    } catch (MalformedURLException e) {
        throw new IllegalArgumentException(
                "CpuWorkerMetrics is not configured properly. ECS metadata url is malformed", e);
    }
    try {
        // A fresh ObjectMapper per call; metadata reads are infrequent so the cost is minor.
        final ObjectMapper mapper = new ObjectMapper();
        final JsonNode rootNode =
                mapper.readValue(new InputStreamReader(url.openStream(), Charset.defaultCharset()), JsonNode.class);
        return rootNode;
    } catch (IOException e) {
        throw new IllegalArgumentException("Error in parsing ECS metadata", e);
    }
}
|
||||||
|
|
||||||
|
/** Returns the operating range configured for this worker metric. */
@Override
public OperatingRange getOperatingRange() {
    return operatingRange;
}
|
||||||
|
|
||||||
|
/** Identifies this metric as a CPU worker metric. */
@Override
public WorkerMetricType getWorkerMetricType() {
    return CPU_WORKER_METRICS_TYPE;
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,108 @@
|
||||||
|
package software.amazon.kinesis.worker.metric.impl.jmx;
|
||||||
|
|
||||||
|
import java.lang.management.ManagementFactory;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
import javax.management.MBeanServerConnection;
|
||||||
|
import javax.management.ObjectName;
|
||||||
|
import javax.management.openmbean.CompositeDataSupport;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Memory WorkerMetricStats that reads the heap memory after GC. The way memory usage is calculated that, all the
|
||||||
|
* available memory pools are read except Eden (as this is allocation buffer) and used memory and total memory is
|
||||||
|
* computed.
|
||||||
|
* Then percentage is computed by dividing used memory by total memory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class HeapMemoryAfterGCWorkerMetric implements WorkerMetric {
|
||||||
|
|
||||||
|
private static final WorkerMetricType MEMORY_WORKER_METRICS_TYPE = WorkerMetricType.MEMORY;
|
||||||
|
|
||||||
|
private final OperatingRange operatingRange;
|
||||||
|
|
||||||
|
private Set<ObjectName> garbageCollectorMxBeans;
|
||||||
|
private Set<String> memoryPoolNames;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getShortName() {
|
||||||
|
return MEMORY_WORKER_METRICS_TYPE.getShortName();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public WorkerMetricValue capture() {
|
||||||
|
return WorkerMetricValue.builder()
|
||||||
|
.value(getAfterGCMemoryUsage(ManagementFactory.getPlatformMBeanServer()))
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
private double getAfterGCMemoryUsage(final MBeanServerConnection connection) {
|
||||||
|
try {
|
||||||
|
if (garbageCollectorMxBeans == null) {
|
||||||
|
garbageCollectorMxBeans = connection.queryNames(
|
||||||
|
new ObjectName(ManagementFactory.GARBAGE_COLLECTOR_MXBEAN_DOMAIN_TYPE + ",*"), null);
|
||||||
|
|
||||||
|
memoryPoolNames = new HashSet<String>();
|
||||||
|
for (ObjectName on : garbageCollectorMxBeans) {
|
||||||
|
String[] poolNames = (String[]) connection.getAttribute(on, "MemoryPoolNames");
|
||||||
|
// A given MemoryPool may be associated with multiple GarbageCollectors,
|
||||||
|
// but will appear only once in memoryPoolNames
|
||||||
|
Collections.addAll(memoryPoolNames, poolNames);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Report on the sum of non-Eden HEAP spaces after the last gc
|
||||||
|
Long used, max;
|
||||||
|
long usedKb = 0, totalKb = 0;
|
||||||
|
|
||||||
|
for (String poolName : memoryPoolNames) {
|
||||||
|
if (!poolName.contains("Eden")) {
|
||||||
|
// Ignore Eden, since it's just an allocation buffer
|
||||||
|
ObjectName on =
|
||||||
|
new ObjectName(ManagementFactory.MEMORY_POOL_MXBEAN_DOMAIN_TYPE + ",name=" + poolName);
|
||||||
|
String mt = (String) connection.getAttribute(on, "Type");
|
||||||
|
if (mt.equals("HEAP")) {
|
||||||
|
// Paranoia: ignore non-HEAP memory pools
|
||||||
|
CompositeDataSupport data =
|
||||||
|
(CompositeDataSupport) connection.getAttribute(on, "CollectionUsage");
|
||||||
|
|
||||||
|
used = (Long) data.get("used");
|
||||||
|
usedKb += used / 1024;
|
||||||
|
|
||||||
|
max = (Long) data.get("max");
|
||||||
|
// max can be undefined (-1)
|
||||||
|
// http://docs.oracle.com/javase/7/docs/api/java/lang/management/MemoryUsage.html
|
||||||
|
totalKb += max == -1 ? 0 : max / 1024;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (totalKb <= 0) {
|
||||||
|
throw new IllegalArgumentException("Total memory value for JVM is greater than zero");
|
||||||
|
}
|
||||||
|
|
||||||
|
return 100.0 * (double) usedKb / (double) totalKb;
|
||||||
|
} catch (final Exception e) {
|
||||||
|
if (e instanceof IllegalArgumentException) {
|
||||||
|
throw (IllegalArgumentException) e;
|
||||||
|
}
|
||||||
|
throw new IllegalArgumentException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public OperatingRange getOperatingRange() {
|
||||||
|
return operatingRange;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public WorkerMetricType getWorkerMetricType() {
|
||||||
|
return MEMORY_WORKER_METRICS_TYPE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,133 @@
|
||||||
|
package software.amazon.kinesis.worker.metric.impl.linux;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileReader;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||||
|
|
||||||
|
/**
 * Reads CPU usage statistics out of /proc/stat file that is present on the EC2 instances. The value is % utilization
 * of the CPU.
 * When this is invoked for the first time, the value returned is always 0 as the prev values are not available
 * to calculate the diff. If the file hasn't changed this also returns 0.
 * In case the file is not present or any other exception occurs, this throws IllegalArgumentException.
 */
@Slf4j
@RequiredArgsConstructor(access = AccessLevel.PACKAGE)
public class LinuxCpuWorkerMetric implements WorkerMetric {

    // Guards the shared "last*" snapshot fields so concurrent capture() calls compute
    // deltas against a consistent previous reading.
    private static final Object LOCK_OBJECT = new Object();
    private static final WorkerMetricType CPU_WORKER_METRICS_TYPE = WorkerMetricType.CPU;
    private final OperatingRange operatingRange;
    // Stat file path; the package-private constructor lets tests substitute a fixture file.
    private final String statFile;
    // Counter snapshot from the previous invocation, used to compute per-interval deltas.
    private long lastUsr, lastIow, lastSys, lastIdl, lastTot;
    private String lastLine;

    /** Production constructor reading the standard /proc/stat location. */
    public LinuxCpuWorkerMetric(final OperatingRange operatingRange) {
        this(operatingRange, "/proc/stat");
    }

    @Override
    public String getShortName() {
        return CPU_WORKER_METRICS_TYPE.getShortName();
    }

    /** Captures CPU utilization (%) since the previous call. */
    @Override
    public WorkerMetricValue capture() {
        return WorkerMetricValue.builder().value(calculateCpuUsage()).build();
    }

    /**
     * Parses the aggregate "cpu" line of the stat file and derives utilization from the
     * change in idle vs. total jiffies since the previous invocation.
     *
     * @return CPU utilization percentage; 0 on the first call or when the file is unchanged
     * @throws IllegalArgumentException if the file is missing or cannot be read/parsed
     */
    private double calculateCpuUsage() {
        BufferedReader bufferedReader = null;
        try {

            final File stat = new File(statFile);
            if (stat.exists()) {

                bufferedReader = new BufferedReader(new FileReader(stat));
                // First line is the aggregate "cpu" row: user nice system idle iowait ...
                final String line = bufferedReader.readLine();
                final String[] lineVals = line.split("\\s+");

                // usr combines the user and nice columns.
                long usr = Long.parseLong(lineVals[1]) + Long.parseLong(lineVals[2]);
                long sys = Long.parseLong(lineVals[3]);
                long idl = Long.parseLong(lineVals[4]);
                long iow = Long.parseLong(lineVals[5]);
                long tot = usr + sys + idl + iow;
                long diffIdl = -1;
                long diffTot = -1;

                boolean skip = false;
                synchronized (LOCK_OBJECT) {
                    if (lastUsr == 0 || line.equals(lastLine)) {
                        // Case where this is a first call so no diff available or
                        // /proc/stat file is not updated since last time.
                        skip = true;
                    }

                    diffIdl = Math.abs(idl - lastIdl);
                    diffTot = Math.abs(tot - lastTot);
                    if (diffTot < diffIdl) {
                        log.warn(
                                "diffTot is less than diff_idle. \nPrev cpu line : {} and current cpu line : {} ",
                                lastLine,
                                line);
                        if (iow < lastIow) {
                            // this is case where current iow value less than prev, this can happen in rare cases as per
                            // https://docs.kernel.org/filesystems/proc.html, and when the worker is idle
                            // there is no increase in usr or sys values as well resulting in diffTot < diffIdl as
                            // current tot increases less than current idl
                            // return 0 in this case as this is the case where worker is not doing anything anyways.
                            skip = true;
                        }
                    }
                    // Record this reading as the baseline for the next invocation.
                    lastUsr = usr;
                    lastSys = sys;
                    lastIdl = idl;
                    lastIow = iow;
                    lastTot = usr + sys + idl + iow;
                    lastLine = line;
                }

                if (skip) {
                    return 0D;
                }

                // Utilization = non-idle fraction of elapsed jiffies, as a percentage.
                return ((double) (diffTot - diffIdl) / (double) diffTot) * 100.0;

            } else {
                throw new IllegalArgumentException(String.format(
                        "LinuxCpuWorkerMetric is not configured properly, file : %s does not exists", this.statFile));
            }
        } catch (final Throwable t) {
            if (t instanceof IllegalArgumentException) {
                throw (IllegalArgumentException) t;
            }
            throw new IllegalArgumentException(
                    "LinuxCpuWorkerMetric failed to read metric stats or not configured properly.", t);
        } finally {
            // Best-effort close; a close failure is logged rather than masking the result.
            try {
                if (bufferedReader != null) {
                    bufferedReader.close();
                }
            } catch (Throwable x) {
                log.warn("Failed to close bufferedReader ", x);
            }
        }
    }

    @Override
    public OperatingRange getOperatingRange() {
        return operatingRange;
    }

    @Override
    public WorkerMetricType getWorkerMetricType() {
        return CPU_WORKER_METRICS_TYPE;
    }
}
|
||||||
|
|
@ -0,0 +1,42 @@
|
||||||
|
package software.amazon.kinesis.worker.metric.impl.linux;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
import com.google.common.base.Stopwatch;
|
||||||
|
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ref java doc for {@link LinuxNetworkWorkerMetricBase}
|
||||||
|
*/
|
||||||
|
public class LinuxNetworkInWorkerMetric extends LinuxNetworkWorkerMetricBase {
|
||||||
|
private static final WorkerMetricType NETWORK_IN_WORKER_METRICS_TYPE = WorkerMetricType.NETWORK_IN;
|
||||||
|
|
||||||
|
public LinuxNetworkInWorkerMetric(
|
||||||
|
final OperatingRange operatingRange, final String interfaceName, final double maxBandwidthInMB) {
|
||||||
|
this(operatingRange, interfaceName, DEFAULT_NETWORK_STAT_FILE, maxBandwidthInMB, Stopwatch.createUnstarted());
|
||||||
|
}
|
||||||
|
|
||||||
|
public LinuxNetworkInWorkerMetric(final OperatingRange operatingRange, final double maxBandwidthInMB) {
|
||||||
|
this(
|
||||||
|
operatingRange,
|
||||||
|
DEFAULT_INTERFACE_NAME,
|
||||||
|
DEFAULT_NETWORK_STAT_FILE,
|
||||||
|
maxBandwidthInMB,
|
||||||
|
Stopwatch.createUnstarted());
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
LinuxNetworkInWorkerMetric(
|
||||||
|
final OperatingRange operatingRange,
|
||||||
|
final String interfaceName,
|
||||||
|
final String statFile,
|
||||||
|
final double maxBandwidthInMB,
|
||||||
|
final Stopwatch stopwatch) {
|
||||||
|
super(operatingRange, interfaceName, statFile, maxBandwidthInMB, stopwatch);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected WorkerMetricType getWorkerMetricsType() {
|
||||||
|
return NETWORK_IN_WORKER_METRICS_TYPE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,42 @@
|
||||||
|
package software.amazon.kinesis.worker.metric.impl.linux;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
import com.google.common.base.Stopwatch;
|
||||||
|
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ref java doc for {@link LinuxNetworkWorkerMetricBase}
|
||||||
|
*/
|
||||||
|
public class LinuxNetworkOutWorkerMetric extends LinuxNetworkWorkerMetricBase {
|
||||||
|
private static final WorkerMetricType NETWORK_OUT_WORKER_METRICS_TYPE = WorkerMetricType.NETWORK_OUT;
|
||||||
|
|
||||||
|
public LinuxNetworkOutWorkerMetric(
|
||||||
|
final OperatingRange operatingRange, final String interfaceName, final double maxBandwidthInMB) {
|
||||||
|
this(operatingRange, interfaceName, DEFAULT_NETWORK_STAT_FILE, maxBandwidthInMB, Stopwatch.createUnstarted());
|
||||||
|
}
|
||||||
|
|
||||||
|
public LinuxNetworkOutWorkerMetric(final OperatingRange operatingRange, final double maxBandwidthInMB) {
|
||||||
|
this(
|
||||||
|
operatingRange,
|
||||||
|
DEFAULT_INTERFACE_NAME,
|
||||||
|
DEFAULT_NETWORK_STAT_FILE,
|
||||||
|
maxBandwidthInMB,
|
||||||
|
Stopwatch.createUnstarted());
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
LinuxNetworkOutWorkerMetric(
|
||||||
|
final OperatingRange operatingRange,
|
||||||
|
final String interfaceName,
|
||||||
|
final String statFile,
|
||||||
|
final double maxBandwidthInMB,
|
||||||
|
final Stopwatch stopwatch) {
|
||||||
|
super(operatingRange, interfaceName, statFile, maxBandwidthInMB, stopwatch);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected WorkerMetricType getWorkerMetricsType() {
|
||||||
|
return NETWORK_OUT_WORKER_METRICS_TYPE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,188 @@
|
||||||
|
package software.amazon.kinesis.worker.metric.impl.linux;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileReader;
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import com.google.common.base.Preconditions;
|
||||||
|
import com.google.common.base.Stopwatch;
|
||||||
|
import com.google.common.collect.ImmutableMap;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.kinesis.worker.metric.OperatingRange;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||||
|
|
||||||
|
/**
 * Base class for EC2NetworkWorkerMetrics, this reads and parses /proc/net/dev file and look for the specific
 * interface and reads received and transmitted bytes.
 * To get the percentage of bandwidth consumed, the fetch bytes are converted to per second (based on the interval
 * between invocation) and percentage is calculated by dividing it by the maximum bandwidth in MBps.
 *
 * When this is invoked for the first time, the value returned is always 0 as the prev values are not available
 * to calculate the diff.
 * In case the stat file is not present or any other exception occurs, this throws IllegalArgumentException.
 */
@Slf4j
public abstract class LinuxNetworkWorkerMetricBase implements WorkerMetric {

    protected static final String DEFAULT_NETWORK_STAT_FILE = "/proc/net/dev";
    protected static final String DEFAULT_INTERFACE_NAME = "eth0";
    // Guards lastRx/lastTx so concurrent capture() calls compute deltas consistently.
    private final Object lockObject = new Object();

    private final OperatingRange operatingRange;
    // Network interface whose row is parsed out of the stat file (e.g. "eth0").
    private final String interfaceName;
    private final String statFile;
    // Configured maximum bandwidth, used as the denominator for percentage utilization.
    private final double maxBandwidthInMBps;
    // Stopwatch to keep track of elapsed time between invocation.
    private final Stopwatch stopwatch;

    /**
     * @param operatingRange operating range for this metric
     * @param interfaceName network interface to monitor
     * @param statFile path to the proc net-dev stat file (overridable for tests)
     * @param maxBandwidthInMBps maximum bandwidth in MB per second; must be positive
     * @param stopwatch stopwatch used to measure the interval between captures
     */
    public LinuxNetworkWorkerMetricBase(
            final OperatingRange operatingRange,
            final String interfaceName,
            final String statFile,
            final double maxBandwidthInMBps,
            final Stopwatch stopwatch) {
        Preconditions.checkArgument(maxBandwidthInMBps > 0, "maxBandwidthInMBps should be greater than 0.");
        this.operatingRange = operatingRange;
        this.interfaceName = interfaceName;
        this.statFile = statFile;
        this.maxBandwidthInMBps = maxBandwidthInMBps;
        this.stopwatch = stopwatch;
    }

    // Byte counters from the previous capture; -1 marks "no previous reading yet".
    private long lastRx = -1;
    private long lastTx = -1;

    @Override
    public String getShortName() {
        return getWorkerMetricsType().getShortName();
    }

    @Override
    public OperatingRange getOperatingRange() {
        return this.operatingRange;
    }

    @Override
    public WorkerMetricType getWorkerMetricType() {
        return getWorkerMetricsType();
    }

    /**
     * Reads the stat file and find the total bytes (in and out) and divide it by the time elapsed since last read to
     * get the bytes per second.
     * Converts the bytes per second to MBps and then normalizes it to a percentage of the maximum bandwidth.
     * @return WorkerMetricValue with the % of network bandwidth consumed.
     */
    @Override
    public WorkerMetricValue capture() {
        final double percentageOfMaxBandwidth =
                convertToMBps(calculateNetworkUsage().get(getWorkerMetricsType())) / maxBandwidthInMBps * 100;
        return WorkerMetricValue.builder()
                // If maxBandwidthInMBps is less than utilized (could be wrong configuration),
                // default to 100 % bandwidth utilization.
                .value(Math.min(100, percentageOfMaxBandwidth))
                .build();
    }

    /**
     * Converts an absolute byte count into MB per second, using the stopwatch interval
     * since the previous call; restarts the stopwatch for the next interval.
     */
    private double convertToMBps(final long bytes) {
        final double elapsedTimeInSecond;
        if (!stopwatch.isRunning()) {
            // stopwatch is not running during the first request only, in this case assume 1 second as elapsed as
            // during the first request even bytes are zero, any value of elapsedTimeInSecond does not have any effect.
            elapsedTimeInSecond = 1.0;
        } else {
            // Specifically, getting nanos and converting to seconds to get the decimal precision.
            elapsedTimeInSecond = (double) stopwatch.elapsed().toNanos()
                    / Duration.ofSeconds(1).toNanos();
        }
        stopwatch.reset().start();
        // Convert bytes to MB
        final double totalDataMB = (double) bytes / (1024 * 1024);
        if (elapsedTimeInSecond == 0) {
            // This should never happen, as getting called twice within 1 nanoSecond is never expected.
            // If this happens something is real wrong.
            throw new IllegalArgumentException("elapsedTimeInSecond is zero which in incorrect");
        }
        return totalDataMB / elapsedTimeInSecond;
    }

    /** Subclasses identify whether this instance reports NETWORK_IN or NETWORK_OUT. */
    protected abstract WorkerMetricType getWorkerMetricsType();

    /**
     * Returns the absolute bytes in and out since the last invocation of the method.
     * @return Map of WorkerMetricType to bytes
     */
    private Map<WorkerMetricType, Long> calculateNetworkUsage() {
        BufferedReader bufferedReader = null;
        try {
            final File net = new File(statFile);
            if (net.exists()) {
                bufferedReader = new BufferedReader(new FileReader(net));

                // skip over header lines
                bufferedReader.readLine();
                bufferedReader.readLine();

                // find specified interface
                String line = bufferedReader.readLine();
                while (line != null && !line.matches("^\\s*" + interfaceName + ":.*")) {
                    line = bufferedReader.readLine();
                }
                if (line == null) {
                    throw new IllegalArgumentException(
                            "Failed to parse the file and find interface : " + interfaceName);
                }

                // Strip the "ifname:" prefix, leaving whitespace-separated counter columns.
                int n = line.indexOf(':') + 1;
                line = line.substring(n).trim();
                String[] parts = line.split("\\s+");

                // Column 0 is received bytes; column 8 is transmitted bytes (per /proc/net/dev layout).
                long rx = Long.parseLong(parts[0]);
                long tx = Long.parseLong(parts[8]);
                long diffRx = -1, diffTx = -1;
                boolean skip = false;
                synchronized (lockObject) {
                    if (lastRx == -1) {
                        // First invocation: no baseline yet, report zero usage.
                        skip = true;
                    } else {
                        diffRx = Math.abs(rx - lastRx);
                        diffTx = Math.abs(tx - lastTx);
                    }
                    lastRx = rx;
                    lastTx = tx;
                }

                if (skip) {
                    return createResponse(0L, 0L);
                }

                return createResponse(diffRx, diffTx);
            } else {
                throw new IllegalArgumentException(String.format(
                        "NetworkWorkerMetrics is not configured properly, file : %s does not exists", this.statFile));
            }
        } catch (final Throwable t) {
            if (t instanceof IllegalArgumentException) {
                throw (IllegalArgumentException) t;
            }
            throw new IllegalArgumentException("Cannot read/parse " + this.statFile, t);
        } finally {
            // Best-effort close; a close failure is logged rather than masking the result.
            try {
                if (bufferedReader != null) {
                    bufferedReader.close();
                }
            } catch (Throwable x) {
                log.warn("Failed to close bufferedReader ", x);
            }
        }
    }

    /** Packages rx/tx deltas into the map shape expected by capture(). */
    private Map<WorkerMetricType, Long> createResponse(final long diffRx, final long diffTx) {
        return ImmutableMap.of(
                WorkerMetricType.NETWORK_IN, diffRx,
                WorkerMetricType.NETWORK_OUT, diffTx);
    }
}
|
||||||
|
|
@ -0,0 +1,302 @@
|
||||||
|
package software.amazon.kinesis.worker.metricstats;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.enhanced.dynamodb.mapper.annotations.DynamoDbAttribute;
|
||||||
|
import software.amazon.awssdk.enhanced.dynamodb.mapper.annotations.DynamoDbBean;
|
||||||
|
import software.amazon.awssdk.enhanced.dynamodb.mapper.annotations.DynamoDbIgnore;
|
||||||
|
import software.amazon.awssdk.enhanced.dynamodb.mapper.annotations.DynamoDbPartitionKey;
|
||||||
|
import software.amazon.kinesis.utils.ExponentialMovingAverage;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetricType;
|
||||||
|
|
||||||
|
import static java.util.Objects.isNull;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* DataModel for a WorkerMetric, this data model is used to store the current state of a Worker in terms of relevant
|
||||||
|
* WorkerMetric(CPU, Memory, Network).
|
||||||
|
*
|
||||||
|
* workerId : unique worker identifier, this is equivalent to the owner attribute from the lease table.
|
||||||
|
* lastUpdateTime : wall epoch in seconds when the entry was last updated
|
||||||
|
* metricStats : Map of WorkerMetric to last N values for it. e.g. entry "CPU" : [10,20,12,10] etc
|
||||||
|
* operatingRange : Map of WorkerMetric to its operating range. First item in the list of values defines the max limit.
|
||||||
|
* metricStatsMap : runtime computed WorkerMetric name to its average value map. This field is not stored in ddb
|
||||||
|
* and is used during Lease assignment only
|
||||||
|
*/
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@DynamoDbBean
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor(access = AccessLevel.PRIVATE)
|
||||||
|
@Slf4j
|
||||||
|
public class WorkerMetricStats {
|
||||||
|
|
||||||
|
static final String KEY_LAST_UPDATE_TIME = "lut";
|
||||||
|
static final String KEY_WORKER_ID = "wid";
|
||||||
|
|
||||||
|
@Getter(onMethod_ = {@DynamoDbPartitionKey, @DynamoDbAttribute(KEY_WORKER_ID)})
|
||||||
|
private String workerId;
|
||||||
|
|
||||||
|
@Getter(onMethod_ = {@DynamoDbAttribute(KEY_LAST_UPDATE_TIME)})
|
||||||
|
private Long lastUpdateTime;
|
||||||
|
|
||||||
|
@Getter(onMethod_ = {@DynamoDbAttribute("sts")})
|
||||||
|
private Map<String, List<Double>> metricStats;
|
||||||
|
|
||||||
|
@Getter(onMethod_ = {@DynamoDbAttribute("opr")})
|
||||||
|
private Map<String, List<Long>> operatingRange;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This map contains the WorkerMetric to its metric stat value. Metric stat value stored in this is exponentially averaged over
|
||||||
|
* available number of different datapoints.
|
||||||
|
*/
|
||||||
|
@Getter(onMethod_ = {@DynamoDbIgnore})
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
@Builder.Default
|
||||||
|
private Map<String, Double> metricStatsMap = new HashMap<>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Alpha value used to compute the exponential moving average for worker metrics values.
|
||||||
|
*/
|
||||||
|
@Getter(onMethod_ = {@DynamoDbIgnore})
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
@Builder.Default
|
||||||
|
private double emaAlpha = 0.2;
|
||||||
|
|
||||||
|
/**
 * Returns true if given {@param workerMetricName} is available for the current worker else false
 */
public boolean containsMetricStat(final String workerMetricName) {
    return metricStats.containsKey(workerMetricName);
}
|
||||||
|
|
||||||
|
/**
 * Returns the value for given WorkerMetricStats name.
 * The value is computed lazily from the raw stat list and cached in metricStatsMap,
 * so later extrapolation during lease assignment operates on the cached value.
 */
public double getMetricStat(final String workerMetricName) {
    return metricStatsMap.computeIfAbsent(workerMetricName, (key) -> computeAverage(metricStats.get(key)));
}
|
||||||
|
|
||||||
|
/**
 * Increases every cached metric stat value in proportion to the throughput being added to this
 * worker. This is done during execution of LAM and as assignments are happening the current
 * metric stat value is increased so subsequent assignment decisions see the projected load.
 *
 * @param workerMetricsToFleetLevelAverageMap fleet-level average per WorkerMetric name
 * @param averageThroughput fleet-level average throughput per worker
 * @param increaseThroughput throughput being added to this worker
 * @param averageLeaseCount fleet-level average lease count, used when averageThroughput is 0
 */
public void extrapolateMetricStatValuesForAddedThroughput(
        final Map<String, Double> workerMetricsToFleetLevelAverageMap,
        final double averageThroughput,
        final double increaseThroughput,
        final double averageLeaseCount) {

    metricStatsMap.replaceAll((key, value) -> extrapolateMetricsValue(
            key,
            workerMetricsToFleetLevelAverageMap.get(key),
            averageThroughput,
            increaseThroughput,
            averageLeaseCount));
}
|
||||||
|
|
||||||
|
private double extrapolateMetricsValue(
|
||||||
|
final String metricName,
|
||||||
|
final double fleetLevelMetricAverage,
|
||||||
|
final double averageThroughput,
|
||||||
|
final double increaseThroughput,
|
||||||
|
final double averageLeaseCount) {
|
||||||
|
|
||||||
|
if (averageThroughput > 0) {
|
||||||
|
return metricStatsMap.get(metricName) + increaseThroughput * fleetLevelMetricAverage / averageThroughput;
|
||||||
|
} else {
|
||||||
|
return metricStatsMap.get(metricName) + fleetLevelMetricAverage / averageLeaseCount;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean willAnyMetricStatsGoAboveAverageUtilizationOrOperatingRange(
|
||||||
|
final Map<String, Double> workerMetricsToFleetLevelAverageMap,
|
||||||
|
final double averageThroughput,
|
||||||
|
final double increaseThroughput,
|
||||||
|
final double averageLeaseCount) {
|
||||||
|
for (final String metricStatName : metricStats.keySet()) {
|
||||||
|
final double fleetLevelAverageForMetric = workerMetricsToFleetLevelAverageMap.get(metricStatName);
|
||||||
|
final double updatedValueToBe = extrapolateMetricsValue(
|
||||||
|
metricStatName,
|
||||||
|
fleetLevelAverageForMetric,
|
||||||
|
averageThroughput,
|
||||||
|
increaseThroughput,
|
||||||
|
averageLeaseCount);
|
||||||
|
|
||||||
|
if (updatedValueToBe > fleetLevelAverageForMetric
|
||||||
|
|| updatedValueToBe > operatingRange.get(metricStatName).get(0)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Increase the metric stat value corresponding to the added single lease. This is done during execution of LAM and
|
||||||
|
* as assignments are happening the load is increase for LAM to determine workers for assignment.
|
||||||
|
* The increase is done considering that for a WorkerMetric the fleet level average would be met when fleet level
|
||||||
|
* average leases are assigned to a worker and thus 1 lease addition increases the metric stat value by fleet level
|
||||||
|
* average of metric stat by averageLeaseCount
|
||||||
|
*/
|
||||||
|
public void extrapolateMetricStatValuesForAddedLease(
|
||||||
|
final Map<String, Double> workerMetricToFleetLevelAverage, final int averageLeaseCount) {
|
||||||
|
for (Map.Entry<String, Double> workerMetricToMetricStat : metricStatsMap.entrySet()) {
|
||||||
|
final String workerMetricName = workerMetricToMetricStat.getKey();
|
||||||
|
final Double updatedValue = workerMetricToMetricStat.getValue()
|
||||||
|
+ workerMetricToFleetLevelAverage.get(workerMetricName) / averageLeaseCount;
|
||||||
|
metricStatsMap.replace(workerMetricName, updatedValue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines percentage of load to reach the mean for the worker. In case of multiple worker metrics the metric stat
|
||||||
|
* value closest to mean is used to determine the percentage value. This value is indication of how much load in
|
||||||
|
* percentage to current load the worker can take to reach mean value.
|
||||||
|
* @param workerMetricToFleetLevelAverage : WorkerMetric to fleet level mean value.
|
||||||
|
* @return percentage to reach mean based on the WorkerMetric closest to its corresponding average.
|
||||||
|
*/
|
||||||
|
public double computePercentageToReachAverage(final Map<String, Double> workerMetricToFleetLevelAverage) {
|
||||||
|
double minDifferencePercentage = Double.MAX_VALUE;
|
||||||
|
for (final String workerMetricName : metricStats.keySet()) {
|
||||||
|
final double metricStatValue = getMetricStat(workerMetricName);
|
||||||
|
final double differenceRatio;
|
||||||
|
if (metricStatValue == 0D) {
|
||||||
|
// If metric stat value is 0 that means this worker does not have any load so we assume that this worker
|
||||||
|
// can take 100% more load than the current to reach average.
|
||||||
|
differenceRatio = 1;
|
||||||
|
} else {
|
||||||
|
differenceRatio =
|
||||||
|
(workerMetricToFleetLevelAverage.get(workerMetricName) - metricStatValue) / metricStatValue;
|
||||||
|
}
|
||||||
|
minDifferencePercentage = Math.min(minDifferencePercentage, differenceRatio);
|
||||||
|
}
|
||||||
|
return minDifferencePercentage;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Double computeAverage(final List<Double> values) {
|
||||||
|
if (values.isEmpty()) {
|
||||||
|
return 0D;
|
||||||
|
}
|
||||||
|
final ExponentialMovingAverage average = new ExponentialMovingAverage(emaAlpha);
|
||||||
|
// Ignore -1 which denotes the WorkerMetric failure when calculating average, as it possible in past
|
||||||
|
// one of the value is -1 due to some intermediate failure, and it has recovered since.
|
||||||
|
values.forEach(value -> {
|
||||||
|
if (value != -1) {
|
||||||
|
average.add(value);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return average.getValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns true if any of the metric stat values has -1 in last index which represents that the metric stat value
|
||||||
|
* was not successfully fetched in last attempt by worker.
|
||||||
|
*
|
||||||
|
* @return true if any metric stat value has -1 in last index, false otherwise.
|
||||||
|
*/
|
||||||
|
public boolean isAnyWorkerMetricFailing() {
|
||||||
|
boolean response = false;
|
||||||
|
if (isUsingDefaultWorkerMetric()) {
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
for (final Map.Entry<String, List<Double>> resourceStatsEntry : metricStats.entrySet()) {
|
||||||
|
if (resourceStatsEntry.getValue().isEmpty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
final Double lastEntry = resourceStatsEntry
|
||||||
|
.getValue()
|
||||||
|
.get(resourceStatsEntry.getValue().size() - 1);
|
||||||
|
if (lastEntry != null && lastEntry == -1D) {
|
||||||
|
response = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (response) {
|
||||||
|
log.warn("WorkerStats: {} has a WorkerMetric which is failing.", this);
|
||||||
|
}
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* WorkerMetricStats entry is invalid
|
||||||
|
* if any of the field from lastUpdateTime, operatingRange, resourcesStats are not present or
|
||||||
|
* if resourcesStats is empty or
|
||||||
|
* if any of the WorkerMetrics having resourceStats does not have operatingRange or
|
||||||
|
* if operating range values are not present or
|
||||||
|
* if maxUtilization is 0 for any WorkerMetric
|
||||||
|
* @return true if the entry is valid false otherwise.
|
||||||
|
*/
|
||||||
|
public boolean isValidWorkerMetric() {
|
||||||
|
if (isNull(lastUpdateTime)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (isUsingDefaultWorkerMetric()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (isNull(metricStats) || isNull(operatingRange)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
for (final Map.Entry<String, List<Double>> entry : metricStats.entrySet()) {
|
||||||
|
if (!operatingRange.containsKey(entry.getKey())) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (final Map.Entry<String, List<Long>> operatingRangeEntry : operatingRange.entrySet()) {
|
||||||
|
// If operatingRange for a WorkerMetric is missing or if maxUtilization is 0 then its not valid entry.
|
||||||
|
if (operatingRangeEntry.getValue().isEmpty()
|
||||||
|
|| operatingRangeEntry.getValue().get(0) == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isAnyWorkerMetricAboveAverageUtilizationOrOperatingRange(
|
||||||
|
final Map<String, Double> workerMetricToFleetLevelAverage) {
|
||||||
|
for (final String workerMetricName : metricStats.keySet()) {
|
||||||
|
final double value = getMetricStat(workerMetricName);
|
||||||
|
if (value > workerMetricToFleetLevelAverage.get(workerMetricName)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// check if any metric stat value is above operating range.
|
||||||
|
return workerMetricToFleetLevelAverage.keySet().stream().anyMatch(this::isWorkerMetricAboveOperatingRange);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If a worker is not using an explicit WorkerMetric such as CPU, Memory, or Network, then it
|
||||||
|
* is said to be using the default WorkerMetric. Load management then falls back to throughput.
|
||||||
|
* @return true if the worker is not using an explicit WorkerMetric.
|
||||||
|
*/
|
||||||
|
public boolean isUsingDefaultWorkerMetric() {
|
||||||
|
if ((metricStats == null || metricStats.isEmpty()) && (operatingRange == null || operatingRange.isEmpty())) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (metricStats != null) {
|
||||||
|
return metricStats.entrySet().stream()
|
||||||
|
.anyMatch(entry -> entry.getKey().equals(WorkerMetricType.THROUGHPUT.name()));
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Evaluates if the given metric stat is above operatingRange for the given WorkerMetric name. If the WorkerMetric
|
||||||
|
* does not exist returns false
|
||||||
|
* @param workerMetricName WorkerMetric name to evaluate
|
||||||
|
* @return true if metric stat exists and is above operatingRange for the WorkerMetric
|
||||||
|
*/
|
||||||
|
public boolean isWorkerMetricAboveOperatingRange(final String workerMetricName) {
|
||||||
|
return metricStatsMap.containsKey(workerMetricName)
|
||||||
|
&& metricStatsMap.get(workerMetricName)
|
||||||
|
> operatingRange.get(workerMetricName).get(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,219 @@
|
||||||
|
package software.amazon.kinesis.worker.metricstats;
|
||||||
|
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.time.Instant;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.CompletableFuture;
|
||||||
|
import java.util.concurrent.CompletionException;
|
||||||
|
import java.util.function.Supplier;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.google.common.base.Preconditions;
|
||||||
|
import com.google.common.collect.ImmutableMap;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
import software.amazon.awssdk.core.waiters.WaiterResponse;
|
||||||
|
import software.amazon.awssdk.enhanced.dynamodb.DynamoDbAsyncTable;
|
||||||
|
import software.amazon.awssdk.enhanced.dynamodb.DynamoDbEnhancedAsyncClient;
|
||||||
|
import software.amazon.awssdk.enhanced.dynamodb.Expression;
|
||||||
|
import software.amazon.awssdk.enhanced.dynamodb.Key;
|
||||||
|
import software.amazon.awssdk.enhanced.dynamodb.TableSchema;
|
||||||
|
import software.amazon.awssdk.enhanced.dynamodb.model.DeleteItemEnhancedRequest;
|
||||||
|
import software.amazon.awssdk.enhanced.dynamodb.model.UpdateItemEnhancedRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.DynamoDbAsyncClient;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.AttributeValue;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.BillingMode;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ConditionalCheckFailedException;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.DescribeTableRequest;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.DescribeTableResponse;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ProvisionedThroughput;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.ResourceNotFoundException;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.TableDescription;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.model.TableStatus;
|
||||||
|
import software.amazon.awssdk.services.dynamodb.waiters.DynamoDbAsyncWaiter;
|
||||||
|
import software.amazon.kinesis.leases.LeaseManagementConfig.WorkerMetricsTableConfig;
|
||||||
|
import software.amazon.kinesis.leases.exceptions.DependencyException;
|
||||||
|
|
||||||
|
import static java.util.Objects.nonNull;
|
||||||
|
import static software.amazon.kinesis.worker.metricstats.WorkerMetricStats.KEY_LAST_UPDATE_TIME;
|
||||||
|
import static software.amazon.kinesis.worker.metricstats.WorkerMetricStats.KEY_WORKER_ID;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
public class WorkerMetricStatsDAO {
|
||||||
|
private final DynamoDbEnhancedAsyncClient dynamoDbEnhancedAsyncClient;
|
||||||
|
private final DynamoDbAsyncTable<WorkerMetricStats> table;
|
||||||
|
private final DynamoDbAsyncClient dynamoDbAsyncClient;
|
||||||
|
private final WorkerMetricsTableConfig tableConfig;
|
||||||
|
private final Long workerMetricsReporterFrequencyMillis;
|
||||||
|
|
||||||
|
public WorkerMetricStatsDAO(
|
||||||
|
final DynamoDbAsyncClient dynamoDbAsyncClient,
|
||||||
|
final WorkerMetricsTableConfig tableConfig,
|
||||||
|
final Long workerMetricsReporterFrequencyMillis) {
|
||||||
|
this.dynamoDbAsyncClient = dynamoDbAsyncClient;
|
||||||
|
this.dynamoDbEnhancedAsyncClient = DynamoDbEnhancedAsyncClient.builder()
|
||||||
|
.dynamoDbClient(dynamoDbAsyncClient)
|
||||||
|
.build();
|
||||||
|
this.table = dynamoDbEnhancedAsyncClient.table(
|
||||||
|
tableConfig.tableName(), TableSchema.fromBean(WorkerMetricStats.class));
|
||||||
|
this.tableConfig = tableConfig;
|
||||||
|
this.workerMetricsReporterFrequencyMillis = workerMetricsReporterFrequencyMillis;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs initialization of the WorkerMetricStats DAO and table.
|
||||||
|
* This will create the table if it doesn't exist.
|
||||||
|
*/
|
||||||
|
public void initialize() throws DependencyException {
|
||||||
|
createTableIfDoesNotExist();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Updates the workerMetrics for the provided worker, method ignores the null attributes and overrides
|
||||||
|
* the only non-null from {@param workerMetrics}. This is a blocking call.
|
||||||
|
*
|
||||||
|
* @param workerMetrics : Updated WorkerMetricStats object, resourceStats, workerId and lastUpdateTime are
|
||||||
|
* required fields from {@param workerMetrics}
|
||||||
|
*/
|
||||||
|
public void updateMetrics(final WorkerMetricStats workerMetrics) {
|
||||||
|
validateWorkerMetrics(workerMetrics);
|
||||||
|
final UpdateItemEnhancedRequest<WorkerMetricStats> request = UpdateItemEnhancedRequest.builder(
|
||||||
|
WorkerMetricStats.class)
|
||||||
|
.item(workerMetrics)
|
||||||
|
.ignoreNulls(true)
|
||||||
|
.build();
|
||||||
|
unwrappingFuture(() -> table.updateItem(request));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Deletes the WorkerMetricStats entry with conditional check on lastUpdateTime, if the worker has come alive and
|
||||||
|
* updated the lastUpdateTime then we no longer need to perform the deletion.
|
||||||
|
* @param workerMetrics WorkerMetricStats that needs to be deleted.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public boolean deleteMetrics(final WorkerMetricStats workerMetrics) {
|
||||||
|
Preconditions.checkArgument(nonNull(workerMetrics.getWorkerId()), "WorkerID is not provided");
|
||||||
|
Preconditions.checkArgument(nonNull(workerMetrics.getLastUpdateTime()), "LastUpdateTime is not provided");
|
||||||
|
|
||||||
|
final DeleteItemEnhancedRequest request = DeleteItemEnhancedRequest.builder()
|
||||||
|
.key(Key.builder().partitionValue(workerMetrics.getWorkerId()).build())
|
||||||
|
.conditionExpression(Expression.builder()
|
||||||
|
.expression(String.format("#key = :value AND attribute_exists (%s)", KEY_WORKER_ID))
|
||||||
|
.expressionNames(ImmutableMap.of("#key", KEY_LAST_UPDATE_TIME))
|
||||||
|
.expressionValues(ImmutableMap.of(
|
||||||
|
":value", AttributeValue.fromN(Long.toString(workerMetrics.getLastUpdateTime()))))
|
||||||
|
.build())
|
||||||
|
.build();
|
||||||
|
|
||||||
|
try {
|
||||||
|
unwrappingFuture(() -> table.deleteItem(request));
|
||||||
|
return true;
|
||||||
|
} catch (final ConditionalCheckFailedException e) {
|
||||||
|
log.warn(
|
||||||
|
"Failed to delete the WorkerMetricStats due to conditional failure for worker : {}",
|
||||||
|
workerMetrics,
|
||||||
|
e);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void validateWorkerMetrics(final WorkerMetricStats workerMetrics) {
|
||||||
|
Preconditions.checkArgument(nonNull(workerMetrics.getMetricStats()), "ResourceMetrics not provided");
|
||||||
|
|
||||||
|
final List<String> entriesWithoutValues = workerMetrics.getMetricStats().entrySet().stream()
|
||||||
|
.filter(entry -> entry.getValue() == null || entry.getValue().isEmpty())
|
||||||
|
.map(Map.Entry::getKey)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
Preconditions.checkArgument(
|
||||||
|
entriesWithoutValues.isEmpty(), "Following metric stats dont have any values " + entriesWithoutValues);
|
||||||
|
|
||||||
|
Preconditions.checkArgument(nonNull(workerMetrics.getLastUpdateTime()), "LastUpdateTime field not set");
|
||||||
|
|
||||||
|
// If the LastUpdateTime field is 2x older than the reporter interval, it is considered stale.
|
||||||
|
Preconditions.checkArgument(
|
||||||
|
Duration.between(Instant.ofEpochSecond(workerMetrics.getLastUpdateTime()), Instant.now())
|
||||||
|
.toMillis()
|
||||||
|
< 2 * workerMetricsReporterFrequencyMillis,
|
||||||
|
"LastUpdateTime is more than 2x older than workerMetricsReporterFrequencyMillis");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs the scan on the storage and returns list of all workerMetricStats objects.
|
||||||
|
*
|
||||||
|
* @return : List of all worker metric stats
|
||||||
|
*/
|
||||||
|
public List<WorkerMetricStats> getAllWorkerMetricStats() {
|
||||||
|
log.debug("Scanning DDB table {}", table.tableName());
|
||||||
|
final List<WorkerMetricStats> workerMetricStats = new ArrayList<>();
|
||||||
|
unwrappingFuture(() -> table.scan().items().subscribe(workerMetricStats::add));
|
||||||
|
return workerMetricStats;
|
||||||
|
}
|
||||||
|
|
||||||
|
private TableDescription getTableDescription() {
|
||||||
|
try {
|
||||||
|
final DescribeTableResponse response = unwrappingFuture(() -> dynamoDbAsyncClient.describeTable(
|
||||||
|
DescribeTableRequest.builder().tableName(table.tableName()).build()));
|
||||||
|
return response.table();
|
||||||
|
} catch (final ResourceNotFoundException e) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void createTableIfDoesNotExist() throws DependencyException {
|
||||||
|
TableDescription tableDescription = getTableDescription();
|
||||||
|
if (tableDescription == null) {
|
||||||
|
unwrappingFuture(getWorkerMetricsDynamoTableCreator());
|
||||||
|
tableDescription = getTableDescription();
|
||||||
|
log.info("Table : {} created.", table.tableName());
|
||||||
|
} else {
|
||||||
|
log.info("Table : {} already existing, skipping creation...", table.tableName());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tableDescription.tableStatus() != TableStatus.ACTIVE) {
|
||||||
|
log.info("Waiting for DDB Table: {} to become active", table.tableName());
|
||||||
|
try (final DynamoDbAsyncWaiter waiter = dynamoDbAsyncClient.waiter()) {
|
||||||
|
final WaiterResponse<DescribeTableResponse> response =
|
||||||
|
unwrappingFuture(() -> waiter.waitUntilTableExists(
|
||||||
|
r -> r.tableName(table.tableName()), o -> o.waitTimeout(Duration.ofMinutes(10))));
|
||||||
|
response.matched()
|
||||||
|
.response()
|
||||||
|
.orElseThrow(() -> new DependencyException(new IllegalStateException(
|
||||||
|
"Creating WorkerMetricStats table timed out",
|
||||||
|
response.matched().exception().orElse(null))));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@NotNull
|
||||||
|
private Supplier<CompletableFuture<Void>> getWorkerMetricsDynamoTableCreator() {
|
||||||
|
final Supplier<CompletableFuture<Void>> tableCreator;
|
||||||
|
if (tableConfig.billingMode() == BillingMode.PROVISIONED) {
|
||||||
|
log.info(
|
||||||
|
"Creating worker metric stats table {} in provisioned mode with {}wcu and {}rcu",
|
||||||
|
tableConfig.tableName(),
|
||||||
|
tableConfig.writeCapacity(),
|
||||||
|
tableConfig.readCapacity());
|
||||||
|
tableCreator = () -> table.createTable(r -> r.provisionedThroughput(ProvisionedThroughput.builder()
|
||||||
|
.readCapacityUnits(tableConfig.readCapacity())
|
||||||
|
.writeCapacityUnits(tableConfig.writeCapacity())
|
||||||
|
.build()));
|
||||||
|
} else {
|
||||||
|
tableCreator = table::createTable;
|
||||||
|
}
|
||||||
|
return tableCreator;
|
||||||
|
}
|
||||||
|
|
||||||
|
static <T> T unwrappingFuture(final Supplier<CompletableFuture<T>> supplier) {
|
||||||
|
try {
|
||||||
|
return supplier.get().join();
|
||||||
|
} catch (final CompletionException e) {
|
||||||
|
if (e.getCause() instanceof RuntimeException) {
|
||||||
|
throw (RuntimeException) e.getCause();
|
||||||
|
}
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,227 @@
|
||||||
|
package software.amazon.kinesis.worker.metricstats;
|
||||||
|
|
||||||
|
import java.math.BigDecimal;
|
||||||
|
import java.math.RoundingMode;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.Queue;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.ScheduledExecutorService;
|
||||||
|
import java.util.concurrent.ScheduledFuture;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import com.google.common.collect.EvictingQueue;
|
||||||
|
import com.google.common.collect.ImmutableList;
|
||||||
|
import com.google.common.collect.Queues;
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.services.cloudwatch.model.StandardUnit;
|
||||||
|
import software.amazon.awssdk.utils.ThreadFactoryBuilder;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsScope;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||||
|
import software.amazon.kinesis.worker.metric.WorkerMetric;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* WorkerMetricStatsManager is a class that manages the collection of raw WorkerMetricStats values for the list of WorkerMetricStats
|
||||||
|
* periodically and store in a bounded in-memory queue.
|
||||||
|
* This class runs a periodic thread at every {@link #inMemoryStatsCaptureThreadFrequencyMillis} interval which
|
||||||
|
* captures each WorkerMetricStats's raw value and stores them in {@link #workerMetricsToRawHighFreqValuesMap} for each.
|
||||||
|
* When computeStats is invoked, the method drains the in-memory raw values queue for each WorkerMetricStats and computes the
|
||||||
|
* average and stores the computed average in #computedAverageStats for each WorkerMetricStats.
|
||||||
|
* For each WorkerMetricStats last {@link #maxMetricStatsCount} values are captured in {@link #computedAverageMetrics}
|
||||||
|
*
|
||||||
|
* This class is thread safe.
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
public final class WorkerMetricStatsManager {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 6 digit after decimal
|
||||||
|
*/
|
||||||
|
private static final int DEFAULT_AVERAGE_VALUES_DIGIT_AFTER_DECIMAL = 6;
|
||||||
|
|
||||||
|
private static final String METRICS_OPERATION_WORKER_STATS_REPORTER = "WorkerMetricStatsReporter";
|
||||||
|
static final String METRICS_IN_MEMORY_REPORTER_FAILURE = "InMemoryMetricStatsReporterFailure";
|
||||||
|
// 1 value per sec gives 5 minutes worth of past data for 300 count which is sufficient.
|
||||||
|
// In case of reporter running more frequently than 5 minutes the queue will not reach this value anyway.
|
||||||
|
private static final int HIGH_FREQUENCY_STATS_COUNT = 300;
|
||||||
|
private static final long SCHEDULER_SHUTDOWN_TIMEOUT_SECONDS = 60L;
|
||||||
|
|
||||||
|
private final ScheduledExecutorService scheduledExecutorService;
|
||||||
|
/**
|
||||||
|
* Max count of values per WorkerMetricStats that is recorded in the storage.
|
||||||
|
*/
|
||||||
|
private final int maxMetricStatsCount;
|
||||||
|
/**
|
||||||
|
* List of WorkerMetricStats configured for the application, the values from these will be recorded in the storage.
|
||||||
|
*/
|
||||||
|
private final List<WorkerMetric> workerMetricList;
|
||||||
|
/**
|
||||||
|
* Map of WorkerMetricStats to its trailing (#maxMetricStatsCount) values.
|
||||||
|
*/
|
||||||
|
@Getter(AccessLevel.PACKAGE)
|
||||||
|
private final Map<WorkerMetric, Queue<Double>> computedAverageMetrics;
|
||||||
|
/**
|
||||||
|
* Map of the WorkerMetricStats to its raw values since the last flush to storage was done.
|
||||||
|
*/
|
||||||
|
@Getter(AccessLevel.PACKAGE)
|
||||||
|
private final Map<WorkerMetric, Queue<Double>> workerMetricsToRawHighFreqValuesMap;
|
||||||
|
/**
|
||||||
|
* Frequency for capturing raw WorkerMetricsValues in millis.
|
||||||
|
*/
|
||||||
|
private final long inMemoryStatsCaptureThreadFrequencyMillis;
|
||||||
|
|
||||||
|
private final MetricsFactory metricsFactory;
|
||||||
|
private ScheduledFuture<?> managerProcessFuture;
|
||||||
|
|
||||||
|
public WorkerMetricStatsManager(
|
||||||
|
final int maxMetricStatsCount,
|
||||||
|
final List<WorkerMetric> workerMetricList,
|
||||||
|
final MetricsFactory metricsFactory,
|
||||||
|
long inMemoryStatsCaptureThreadFrequencyMillis) {
|
||||||
|
// Set thread as daemon to not block VM from exit.
|
||||||
|
this.scheduledExecutorService = Executors.newScheduledThreadPool(
|
||||||
|
1,
|
||||||
|
new ThreadFactoryBuilder()
|
||||||
|
.daemonThreads(true)
|
||||||
|
.threadNamePrefix("worker-metrics-manager")
|
||||||
|
.build());
|
||||||
|
this.maxMetricStatsCount = maxMetricStatsCount;
|
||||||
|
this.workerMetricList = workerMetricList;
|
||||||
|
this.computedAverageMetrics = new HashMap<>();
|
||||||
|
this.workerMetricsToRawHighFreqValuesMap = new HashMap<>();
|
||||||
|
this.metricsFactory = metricsFactory;
|
||||||
|
this.inMemoryStatsCaptureThreadFrequencyMillis = inMemoryStatsCaptureThreadFrequencyMillis;
|
||||||
|
init();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void init() {
|
||||||
|
for (final WorkerMetric workerMetric : workerMetricList) {
|
||||||
|
computedAverageMetrics.put(workerMetric, EvictingQueue.create(maxMetricStatsCount));
|
||||||
|
workerMetricsToRawHighFreqValuesMap.put(
|
||||||
|
workerMetric, Queues.synchronizedQueue(EvictingQueue.create(HIGH_FREQUENCY_STATS_COUNT)));
|
||||||
|
}
|
||||||
|
log.info(
|
||||||
|
"Completed initialization with maxMetricStatsCount : {} and total WorkerMetricStats : {}",
|
||||||
|
maxMetricStatsCount,
|
||||||
|
workerMetricList.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void startManager() {
|
||||||
|
managerProcessFuture = scheduledExecutorService.scheduleWithFixedDelay(
|
||||||
|
this::recordWorkerMetrics, 0, inMemoryStatsCaptureThreadFrequencyMillis, TimeUnit.MILLISECONDS);
|
||||||
|
log.info("Started manager process...");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void stopManager() {
|
||||||
|
if (managerProcessFuture != null) {
|
||||||
|
managerProcessFuture.cancel(false);
|
||||||
|
}
|
||||||
|
if (!scheduledExecutorService.isShutdown()) {
|
||||||
|
scheduledExecutorService.shutdown();
|
||||||
|
try {
|
||||||
|
if (scheduledExecutorService.awaitTermination(SCHEDULER_SHUTDOWN_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
|
||||||
|
scheduledExecutorService.shutdownNow();
|
||||||
|
}
|
||||||
|
} catch (final InterruptedException e) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
log.warn("Interrupted when shutting down the scheduler, forcing shutdown", e);
|
||||||
|
scheduledExecutorService.shutdownNow();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void recordWorkerMetrics() {
|
||||||
|
for (final WorkerMetric workerMetric : workerMetricList) {
|
||||||
|
final Optional<Double> value = fetchWorkerMetricsValue(workerMetric);
|
||||||
|
value.ifPresent(aDouble ->
|
||||||
|
workerMetricsToRawHighFreqValuesMap.get(workerMetric).add(aDouble));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Optional<Double> fetchWorkerMetricsValue(final WorkerMetric workerMetric) {
|
||||||
|
try {
|
||||||
|
final Double value = workerMetric.capture().getValue();
|
||||||
|
return Optional.of(value);
|
||||||
|
} catch (final Throwable throwable) {
|
||||||
|
log.error(
|
||||||
|
"WorkerMetricStats {} failure : ",
|
||||||
|
workerMetric.getWorkerMetricType().name(),
|
||||||
|
throwable);
|
||||||
|
final MetricsScope scope =
|
||||||
|
MetricsUtil.createMetricsWithOperation(metricsFactory, METRICS_OPERATION_WORKER_STATS_REPORTER);
|
||||||
|
try {
|
||||||
|
scope.addData(METRICS_IN_MEMORY_REPORTER_FAILURE, 1, StandardUnit.COUNT, MetricsLevel.SUMMARY);
|
||||||
|
} finally {
|
||||||
|
MetricsUtil.endScope(scope);
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the metric stats for each WorkerMetricStats by averaging the values in inMemoryQueue and returns last
|
||||||
|
* {@link WorkerMetricStatsManager#maxMetricStatsCount } averaged values for each WorkerMetricStats.
|
||||||
|
*
|
||||||
|
* In the case of empty inMemoryQueue, computedStats has -1 value to denote that specific WorkerMetricStats has failed.
|
||||||
|
* @return Map of WorkerMetricStats shortName to averaged {@link WorkerMetricStatsManager#maxMetricStatsCount } values.
|
||||||
|
*/
|
||||||
|
public synchronized Map<String, List<Double>> computeMetrics() {
|
||||||
|
final Map<String, List<Double>> result = new HashMap<>();
|
||||||
|
workerMetricsToRawHighFreqValuesMap.forEach((workerMetrics, statsQueue) -> {
|
||||||
|
final List<Double> currentWorkerMetricsStats = drainQueue(statsQueue);
|
||||||
|
|
||||||
|
final Queue<Double> computedMetrics = computedAverageMetrics.get(workerMetrics);
|
||||||
|
|
||||||
|
if (currentWorkerMetricsStats.isEmpty()) {
|
||||||
|
// In case currentWorkerMetricsStats is empty that means values from workerMetrics were not capture due
|
||||||
|
// to some
|
||||||
|
// reason, and thus there are no recent values, compute the value to be -1 to denote workerMetrics
|
||||||
|
// failure
|
||||||
|
computedMetrics.add(-1D);
|
||||||
|
} else {
|
||||||
|
computedMetrics.add(computeAverage(currentWorkerMetricsStats));
|
||||||
|
}
|
||||||
|
|
||||||
|
result.put(workerMetrics.getShortName(), new ArrayList<>(computedMetrics));
|
||||||
|
});
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the operating range for each WorkerMetricStats that is registered.
|
||||||
|
* @return Map of WorkerMetricStats to list of two values, first value is max utilization, and second value is variance %.
|
||||||
|
*/
|
||||||
|
public Map<String, List<Long>> getOperatingRange() {
|
||||||
|
final Map<String, List<Long>> operatingRange = new HashMap<>();
|
||||||
|
workerMetricList.forEach(
|
||||||
|
workerMetrics -> operatingRange.put(workerMetrics.getShortName(), ImmutableList.of((long)
|
||||||
|
workerMetrics.getOperatingRange().getMaxUtilization())));
|
||||||
|
return operatingRange;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<Double> drainQueue(final Queue<Double> queue) {
|
||||||
|
final List<Double> elements = new ArrayList<>();
|
||||||
|
final int queueLength = queue.size();
|
||||||
|
for (int i = 0; i < queueLength; ++i) {
|
||||||
|
elements.add(queue.poll());
|
||||||
|
}
|
||||||
|
return elements;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Double computeAverage(final List<Double> values) {
|
||||||
|
final double average =
|
||||||
|
values.stream().mapToDouble(Double::doubleValue).average().orElse(0D);
|
||||||
|
return BigDecimal.valueOf(average)
|
||||||
|
.setScale(DEFAULT_AVERAGE_VALUES_DIGIT_AFTER_DECIMAL, RoundingMode.HALF_UP)
|
||||||
|
.doubleValue();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,68 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Amazon.com, Inc. or its affiliates.
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package software.amazon.kinesis.worker.metricstats;
|
||||||
|
|
||||||
|
import java.time.Instant;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsFactory;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsLevel;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsScope;
|
||||||
|
import software.amazon.kinesis.metrics.MetricsUtil;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reporter that is periodically executed to report WorkerMetricStats. It collects
|
||||||
|
* the in memory metric stats and writes into the DDB WorkerMetricStats table.
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
public class WorkerMetricStatsReporter implements Runnable {
|
||||||
|
private final MetricsFactory metricsFactory;
|
||||||
|
private final String workerIdentifier;
|
||||||
|
private final WorkerMetricStatsManager workerMetricsManager;
|
||||||
|
private final WorkerMetricStatsDAO workerMetricsDAO;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
final MetricsScope scope = MetricsUtil.createMetricsWithOperation(metricsFactory, "WorkerMetricStatsReporter");
|
||||||
|
final long startTime = System.currentTimeMillis();
|
||||||
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
/*
|
||||||
|
* OperatingRange value fetched during the initialization and is same afterwards. It's possible
|
||||||
|
* to update OperatingRange only in first call and then skip, but we do not want to do that to avoid
|
||||||
|
* case where a worker can have a failure for some time and thus does not update the workerMetrics entry
|
||||||
|
* and LeaseAssigmentManager cleans it and then worker ends updating entry without operating range.
|
||||||
|
*/
|
||||||
|
final WorkerMetricStats workerMetrics = WorkerMetricStats.builder()
|
||||||
|
.workerId(workerIdentifier)
|
||||||
|
.metricStats(workerMetricsManager.computeMetrics())
|
||||||
|
.operatingRange(workerMetricsManager.getOperatingRange())
|
||||||
|
.lastUpdateTime(Instant.now().getEpochSecond())
|
||||||
|
.build();
|
||||||
|
workerMetricsDAO.updateMetrics(workerMetrics);
|
||||||
|
success = true;
|
||||||
|
} catch (final Exception e) {
|
||||||
|
log.error("Failed to update worker metric stats for worker : {}", workerIdentifier, e);
|
||||||
|
} finally {
|
||||||
|
MetricsUtil.addWorkerIdentifier(scope, workerIdentifier);
|
||||||
|
MetricsUtil.addSuccessAndLatency(scope, success, startTime, MetricsLevel.SUMMARY);
|
||||||
|
MetricsUtil.endScope(scope);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,111 @@
|
||||||
|
package software.amazon.kinesis.worker.platform;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.net.HttpURLConnection;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.jetbrains.annotations.VisibleForTesting;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
|
||||||
|
import static software.amazon.kinesis.worker.platform.OperatingRangeDataProvider.LINUX_PROC;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provides resource metadata for EC2.
|
||||||
|
*/
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
@Slf4j
|
||||||
|
public class Ec2Resource implements ResourceMetadataProvider {
|
||||||
|
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/retrieve-iid.html
|
||||||
|
private static final String IMDS_URL = "http://169.254.169.254/latest/dynamic/instance-identity/document";
|
||||||
|
private static final String TOKEN_URL = "http://169.254.169.254/latest/api/token";
|
||||||
|
private static final int EC2_INSTANCE_METADATA_TIMEOUT_MILLIS = 5000;
|
||||||
|
|
||||||
|
private final UrlOpener identityDocumentUrl;
|
||||||
|
private final UrlOpener tokenUrl;
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
Ec2Resource(UrlOpener identityDocumentUrl, UrlOpener tokenUrl) {
|
||||||
|
this.identityDocumentUrl = identityDocumentUrl;
|
||||||
|
this.tokenUrl = tokenUrl;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory method to create an instance of Ec2Resource.
|
||||||
|
*
|
||||||
|
* @return Ec2Resource instance
|
||||||
|
*/
|
||||||
|
public static Ec2Resource create() {
|
||||||
|
try {
|
||||||
|
return new Ec2Resource(new UrlOpener(new URL(IMDS_URL)), new UrlOpener(new URL(TOKEN_URL)));
|
||||||
|
} catch (MalformedURLException e) {
|
||||||
|
// It should not throw unless it's unit testing.
|
||||||
|
throw new IllegalArgumentException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isEc2() {
|
||||||
|
try {
|
||||||
|
final HttpURLConnection connection = identityDocumentUrl.openConnection();
|
||||||
|
connection.setRequestMethod("GET");
|
||||||
|
// IMDS v2 requires IMDS token
|
||||||
|
connection.setRequestProperty("X-aws-ec2-metadata-token", fetchImdsToken());
|
||||||
|
connection.setConnectTimeout(EC2_INSTANCE_METADATA_TIMEOUT_MILLIS);
|
||||||
|
connection.setReadTimeout(EC2_INSTANCE_METADATA_TIMEOUT_MILLIS);
|
||||||
|
if (connection.getResponseCode() == 200) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
// TODO: probably need to add retries as well.
|
||||||
|
log.error("Unable to retrieve instance metadata", e);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String fetchImdsToken() {
|
||||||
|
try {
|
||||||
|
final HttpURLConnection connection = tokenUrl.openConnection();
|
||||||
|
connection.setRequestMethod("PUT");
|
||||||
|
connection.setRequestProperty("X-aws-ec2-metadata-token-ttl-seconds", "600");
|
||||||
|
connection.setConnectTimeout(EC2_INSTANCE_METADATA_TIMEOUT_MILLIS);
|
||||||
|
connection.setReadTimeout(EC2_INSTANCE_METADATA_TIMEOUT_MILLIS);
|
||||||
|
if (connection.getResponseCode() == 200) {
|
||||||
|
return new BufferedReader(new InputStreamReader(tokenUrl.getInputStream(connection)))
|
||||||
|
.lines()
|
||||||
|
.collect(Collectors.joining());
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.warn(
|
||||||
|
"Unable to retrieve IMDS token. It could mean that the instance is not EC2 or is using IMDS V1", e);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public boolean isOnPlatform() {
|
||||||
|
return isEc2();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public ComputePlatform getPlatform() {
|
||||||
|
return ComputePlatform.EC2;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Optional<OperatingRangeDataProvider> getOperatingRangeDataProvider() {
|
||||||
|
return Optional.of(LINUX_PROC).filter(OperatingRangeDataProvider::isProvider);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,59 @@
|
||||||
|
package software.amazon.kinesis.worker.platform;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import org.jetbrains.annotations.VisibleForTesting;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
|
||||||
|
import static software.amazon.kinesis.worker.platform.OperatingRangeDataProvider.LINUX_ECS_METADATA_KEY_V4;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provides resource metadata for ECS.
|
||||||
|
*/
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
public class EcsResource implements ResourceMetadataProvider {
|
||||||
|
static final String ECS_METADATA_KEY_V3 = "ECS_CONTAINER_METADATA_URI";
|
||||||
|
static final String ECS_METADATA_KEY_V4 = "ECS_CONTAINER_METADATA_URI_V4";
|
||||||
|
|
||||||
|
private final Map<String, String> sysEnv;
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
EcsResource(Map<String, String> sysEnv) {
|
||||||
|
this.sysEnv = sysEnv;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory method to create an instance of EcsResource.
|
||||||
|
*
|
||||||
|
* @return an instance of EcsResource
|
||||||
|
*/
|
||||||
|
public static EcsResource create() {
|
||||||
|
return new EcsResource(System.getenv());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public boolean isOnPlatform() {
|
||||||
|
return !sysEnv.getOrDefault(ECS_METADATA_KEY_V3, "").isEmpty()
|
||||||
|
|| !sysEnv.getOrDefault(ECS_METADATA_KEY_V4, "").isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public ComputePlatform getPlatform() {
|
||||||
|
return ComputePlatform.ECS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Optional<OperatingRangeDataProvider> getOperatingRangeDataProvider() {
|
||||||
|
return Optional.of(LINUX_ECS_METADATA_KEY_V4).filter(OperatingRangeDataProvider::isProvider);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,61 @@
|
||||||
|
package software.amazon.kinesis.worker.platform;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import org.jetbrains.annotations.VisibleForTesting;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
|
||||||
|
import static software.amazon.kinesis.worker.platform.OperatingRangeDataProvider.LINUX_EKS_CGROUP_V1;
|
||||||
|
import static software.amazon.kinesis.worker.platform.OperatingRangeDataProvider.LINUX_EKS_CGROUP_V2;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provides resource metadata for EKS.
|
||||||
|
*/
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
public class EksResource implements ResourceMetadataProvider {
|
||||||
|
private static final String K8S_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token";
|
||||||
|
private final String k8sTokenPath;
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
EksResource(String k8sTokenPath) {
|
||||||
|
this.k8sTokenPath = k8sTokenPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory method to create an instance of EksResource.
|
||||||
|
*
|
||||||
|
* @return an instance of EksResource
|
||||||
|
*/
|
||||||
|
public static EksResource create() {
|
||||||
|
return new EksResource(K8S_TOKEN_PATH);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public boolean isOnPlatform() {
|
||||||
|
return new File(this.k8sTokenPath).exists();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public ComputePlatform getPlatform() {
|
||||||
|
return ComputePlatform.EKS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Optional<OperatingRangeDataProvider> getOperatingRangeDataProvider() {
|
||||||
|
// It is only possible that either cgroupv1 or cgroupv2 is mounted
|
||||||
|
return Stream.of(LINUX_EKS_CGROUP_V2, LINUX_EKS_CGROUP_V1)
|
||||||
|
.filter(OperatingRangeDataProvider::isProvider)
|
||||||
|
.findFirst();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,73 @@
|
||||||
|
package software.amazon.kinesis.worker.platform;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
import static software.amazon.kinesis.worker.platform.EcsResource.ECS_METADATA_KEY_V4;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enum representing the different operating range metadata providers.
|
||||||
|
*/
|
||||||
|
public enum OperatingRangeDataProvider {
|
||||||
|
LINUX_EKS_CGROUP_V1 {
|
||||||
|
@Override
|
||||||
|
public boolean isProvider() {
|
||||||
|
if (!OperatingRangeDataProvider.isLinux()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Check if the cgroup v2 specific file does NOT exist
|
||||||
|
final File cgroupV2File = new File("/sys/fs/cgroup/cgroup.controllers");
|
||||||
|
if (cgroupV2File.exists()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for common cgroup v1 directories like memory or cpu
|
||||||
|
final File memoryCgroup = new File("/sys/fs/cgroup/memory");
|
||||||
|
final File cpuCgroup = new File("/sys/fs/cgroup/cpu");
|
||||||
|
|
||||||
|
return memoryCgroup.exists() || cpuCgroup.exists();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
LINUX_EKS_CGROUP_V2 {
|
||||||
|
@Override
|
||||||
|
public boolean isProvider() {
|
||||||
|
if (!OperatingRangeDataProvider.isLinux()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if the cgroup v2 specific file exists
|
||||||
|
final File cgroupV2File = new File("/sys/fs/cgroup/cgroup.controllers");
|
||||||
|
|
||||||
|
return cgroupV2File.exists();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
LINUX_ECS_METADATA_KEY_V4 {
|
||||||
|
@Override
|
||||||
|
public boolean isProvider() {
|
||||||
|
if (!OperatingRangeDataProvider.isLinux()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return !System.getenv().getOrDefault(ECS_METADATA_KEY_V4, "").isEmpty();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
LINUX_PROC {
|
||||||
|
@Override
|
||||||
|
public boolean isProvider() {
|
||||||
|
if (!OperatingRangeDataProvider.isLinux()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Check if /proc directory exists (common in Linux environments)
|
||||||
|
return new File("/proc").exists();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
private static boolean isLinux() {
|
||||||
|
return System.getProperty("os.name").toLowerCase().contains("linux");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Abstract method to check if the provider is supported on the current platform.
|
||||||
|
*
|
||||||
|
* @return true if the provider is supported, false otherwise.
|
||||||
|
*/
|
||||||
|
public abstract boolean isProvider();
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,42 @@
|
||||||
|
package software.amazon.kinesis.worker.platform;
|
||||||
|
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Interface for providing resource metadata for worker.
|
||||||
|
*/
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
public interface ResourceMetadataProvider {
|
||||||
|
/**
|
||||||
|
* Enum representing the different compute platforms.
|
||||||
|
*/
|
||||||
|
enum ComputePlatform {
|
||||||
|
EC2,
|
||||||
|
ECS,
|
||||||
|
EKS,
|
||||||
|
UNKNOWN
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if the worker is running on the specific platform.
|
||||||
|
*
|
||||||
|
* @return true if the worker is running on the specific platform, false otherwise.
|
||||||
|
*/
|
||||||
|
boolean isOnPlatform();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the name of the compute platform.
|
||||||
|
*
|
||||||
|
* @return the platform represent by the class.
|
||||||
|
*/
|
||||||
|
ComputePlatform getPlatform();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the operating range data provider.
|
||||||
|
*
|
||||||
|
* @return the operating range data provider.
|
||||||
|
*/
|
||||||
|
Optional<OperatingRangeDataProvider> getOperatingRangeDataProvider();
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,39 @@
|
||||||
|
package software.amazon.kinesis.worker.platform;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.net.HttpURLConnection;
|
||||||
|
import java.net.URL;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import software.amazon.kinesis.annotations.KinesisClientInternalApi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility class to open a URL and get the input stream.
|
||||||
|
*/
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@KinesisClientInternalApi
|
||||||
|
class UrlOpener {
|
||||||
|
private final URL url;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Open the URL and return the connection.
|
||||||
|
*
|
||||||
|
* @return a HttpURLConnection.
|
||||||
|
* @throws IOException if a connection cannot be established.
|
||||||
|
*/
|
||||||
|
public HttpURLConnection openConnection() throws IOException {
|
||||||
|
return (HttpURLConnection) url.openConnection();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the input stream from the connection.
|
||||||
|
*
|
||||||
|
* @param connection the connection to get the input stream from.
|
||||||
|
* @return the InputStream for the data.
|
||||||
|
* @throws IOException if an error occurs while getting the input stream.
|
||||||
|
*/
|
||||||
|
public InputStream getInputStream(HttpURLConnection connection) throws IOException {
|
||||||
|
return connection.getInputStream();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -20,4 +20,4 @@ message AggregatedRecord {
|
||||||
repeated string partition_key_table = 1;
|
repeated string partition_key_table = 1;
|
||||||
repeated string explicit_hash_key_table = 2;
|
repeated string explicit_hash_key_table = 2;
|
||||||
repeated Record records = 3;
|
repeated Record records = 3;
|
||||||
}
|
}
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
Sample test ECS metadata for Amazon ECS task metadata v4. For more information, see https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-metadata-endpoint-v4-examples.html
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
{
|
||||||
|
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||||
|
"Name": "curl",
|
||||||
|
"DockerName": "ecs-curltest-24-curl-cca48e8dcadd97805600",
|
||||||
|
"Image": "111122223333.dkr.ecr.us-west-2.amazonaws.com/curltest:latest",
|
||||||
|
"ImageID": "sha256:d691691e9652791a60114e67b365688d20d19940dde7c4736ea30e660d8d3553",
|
||||||
|
"Labels": {
|
||||||
|
"com.amazonaws.ecs.cluster": "default",
|
||||||
|
"com.amazonaws.ecs.container-name": "curl",
|
||||||
|
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/8f03e41243824aea923aca126495f665",
|
||||||
|
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||||
|
"com.amazonaws.ecs.task-definition-version": "24"
|
||||||
|
},
|
||||||
|
"DesiredStatus": "RUNNING",
|
||||||
|
"KnownStatus": "RUNNING",
|
||||||
|
"Limits": {
|
||||||
|
"CPU": 50,
|
||||||
|
"Memory": 128
|
||||||
|
},
|
||||||
|
"CreatedAt": "2020-10-02T00:15:07.620912337Z",
|
||||||
|
"StartedAt": "2020-10-02T00:15:08.062559351Z",
|
||||||
|
"Type": "NORMAL",
|
||||||
|
"LogDriver": "awslogs",
|
||||||
|
"LogOptions": {
|
||||||
|
"awslogs-create-group": "true",
|
||||||
|
"awslogs-group": "/ecs/metadata",
|
||||||
|
"awslogs-region": "us-west-2",
|
||||||
|
"awslogs-stream": "ecs/curl/8f03e41243824aea923aca126495f665"
|
||||||
|
},
|
||||||
|
"ContainerARN": "arn:aws:ecs:us-west-2:111122223333:container/0206b271-b33f-47ab-86c6-a0ba208a70a9",
|
||||||
|
"Networks": [
|
||||||
|
{
|
||||||
|
"NetworkMode": "awsvpc",
|
||||||
|
"IPv4Addresses": [
|
||||||
|
"10.0.2.100"
|
||||||
|
],
|
||||||
|
"AttachmentIndex": 0,
|
||||||
|
"MACAddress": "0e:9e:32:c7:48:85",
|
||||||
|
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||||
|
"PrivateDNSName": "ip-10-0-2-100.us-west-2.compute.internal",
|
||||||
|
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,130 @@
|
||||||
|
{
|
||||||
|
"read": "2020-10-02T00:61:13.410254284Z",
|
||||||
|
"preread": "2020-10-02T00:51:12.406202398Z",
|
||||||
|
"pids_stats": {
|
||||||
|
"current": 3
|
||||||
|
},
|
||||||
|
"blkio_stats": {
|
||||||
|
"io_service_bytes_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_serviced_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_queue_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_service_time_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_wait_time_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_merged_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_time_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"sectors_recursive": [
|
||||||
|
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"num_procs": 0,
|
||||||
|
"storage_stats": {
|
||||||
|
|
||||||
|
},
|
||||||
|
"cpu_stats": {
|
||||||
|
"cpu_usage": {
|
||||||
|
"total_usage": 150000000,
|
||||||
|
"percpu_usage": [
|
||||||
|
182359190,
|
||||||
|
178608875
|
||||||
|
],
|
||||||
|
"usage_in_kernelmode": 40000000,
|
||||||
|
"usage_in_usermode": 290000000
|
||||||
|
},
|
||||||
|
"system_cpu_usage": 200000000,
|
||||||
|
"online_cpus": 2,
|
||||||
|
"throttling_data": {
|
||||||
|
"periods": 0,
|
||||||
|
"throttled_periods": 0,
|
||||||
|
"throttled_time": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"precpu_stats": {
|
||||||
|
"cpu_usage": {
|
||||||
|
"total_usage": 0,
|
||||||
|
"percpu_usage": [
|
||||||
|
182359190,
|
||||||
|
178608875
|
||||||
|
],
|
||||||
|
"usage_in_kernelmode": 40000000,
|
||||||
|
"usage_in_usermode": 290000000
|
||||||
|
},
|
||||||
|
"system_cpu_usage": 0,
|
||||||
|
"online_cpus": 2,
|
||||||
|
"throttling_data": {
|
||||||
|
"periods": 0,
|
||||||
|
"throttled_periods": 0,
|
||||||
|
"throttled_time": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"memory_stats": {
|
||||||
|
"usage": 1806336,
|
||||||
|
"max_usage": 6299648,
|
||||||
|
"stats": {
|
||||||
|
"active_anon": 606208,
|
||||||
|
"active_file": 0,
|
||||||
|
"cache": 0,
|
||||||
|
"dirty": 0,
|
||||||
|
"hierarchical_memory_limit": 134217728,
|
||||||
|
"hierarchical_memsw_limit": 268435456,
|
||||||
|
"inactive_anon": 0,
|
||||||
|
"inactive_file": 0,
|
||||||
|
"mapped_file": 0,
|
||||||
|
"pgfault": 4185,
|
||||||
|
"pgmajfault": 0,
|
||||||
|
"pgpgin": 2926,
|
||||||
|
"pgpgout": 2778,
|
||||||
|
"rss": 606208,
|
||||||
|
"rss_huge": 0,
|
||||||
|
"total_active_anon": 606208,
|
||||||
|
"total_active_file": 0,
|
||||||
|
"total_cache": 0,
|
||||||
|
"total_dirty": 0,
|
||||||
|
"total_inactive_anon": 0,
|
||||||
|
"total_inactive_file": 0,
|
||||||
|
"total_mapped_file": 0,
|
||||||
|
"total_pgfault": 4185,
|
||||||
|
"total_pgmajfault": 0,
|
||||||
|
"total_pgpgin": 2926,
|
||||||
|
"total_pgpgout": 2778,
|
||||||
|
"total_rss": 606208,
|
||||||
|
"total_rss_huge": 0,
|
||||||
|
"total_unevictable": 0,
|
||||||
|
"total_writeback": 0,
|
||||||
|
"unevictable": 0,
|
||||||
|
"writeback": 0
|
||||||
|
},
|
||||||
|
"limit": 134217728
|
||||||
|
},
|
||||||
|
"name": "/ecs-curltest-26-curl-c2e5f6e0cf91b0bead01",
|
||||||
|
"id": "5fc21e5b015f899d22618f8aede80b6d70d71b2a75465ea49d9462c8f3d2d3af",
|
||||||
|
"networks": {
|
||||||
|
"eth0": {
|
||||||
|
"rx_bytes": 84,
|
||||||
|
"rx_packets": 2,
|
||||||
|
"rx_errors": 0,
|
||||||
|
"rx_dropped": 0,
|
||||||
|
"tx_bytes": 84,
|
||||||
|
"tx_packets": 2,
|
||||||
|
"tx_errors": 0,
|
||||||
|
"tx_dropped": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"network_rate_stats": {
|
||||||
|
"rx_bytes_per_sec": 0,
|
||||||
|
"tx_bytes_per_sec": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,56 @@
|
||||||
|
{
|
||||||
|
"Cluster": "default",
|
||||||
|
"TaskARN": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||||
|
"Family": "curltest",
|
||||||
|
"ServiceName": "MyService",
|
||||||
|
"Revision": "26",
|
||||||
|
"DesiredStatus": "RUNNING",
|
||||||
|
"KnownStatus": "RUNNING",
|
||||||
|
"Limits": {
|
||||||
|
"CPU": 4,
|
||||||
|
"Memory": 128
|
||||||
|
},
|
||||||
|
"PullStartedAt": "2020-10-02T00:43:06.202617438Z",
|
||||||
|
"PullStoppedAt": "2020-10-02T00:43:06.31288465Z",
|
||||||
|
"AvailabilityZone": "us-west-2d",
|
||||||
|
"VPCID": "vpc-1234567890abcdef0",
|
||||||
|
"LaunchType": "EC2",
|
||||||
|
"Containers": [
|
||||||
|
{
|
||||||
|
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||||
|
"Name": "~internal~ecs~pause",
|
||||||
|
"DockerName": "ecs-curltest-26-internalecspause-e292d586b6f9dade4a00",
|
||||||
|
"Image": "amazon/amazon-ecs-pause:0.1.0",
|
||||||
|
"ImageID": "",
|
||||||
|
"Labels": {
|
||||||
|
"com.amazonaws.ecs.cluster": "default",
|
||||||
|
"com.amazonaws.ecs.container-name": "~internal~ecs~pause",
|
||||||
|
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||||
|
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||||
|
"com.amazonaws.ecs.task-definition-version": "26"
|
||||||
|
},
|
||||||
|
"DesiredStatus": "RESOURCES_PROVISIONED",
|
||||||
|
"KnownStatus": "RESOURCES_PROVISIONED",
|
||||||
|
"Limits": {
|
||||||
|
"CPU": 50,
|
||||||
|
"Memory": 128
|
||||||
|
},
|
||||||
|
"CreatedAt": "2020-10-02T00:43:05.602352471Z",
|
||||||
|
"StartedAt": "2020-10-02T00:43:06.076707576Z",
|
||||||
|
"Type": "CNI_PAUSE",
|
||||||
|
"Networks": [
|
||||||
|
{
|
||||||
|
"NetworkMode": "awsvpc",
|
||||||
|
"IPv4Addresses": [
|
||||||
|
"10.0.2.61"
|
||||||
|
],
|
||||||
|
"AttachmentIndex": 0,
|
||||||
|
"MACAddress": "0e:10:e2:01:bd:91",
|
||||||
|
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||||
|
"PrivateDNSName": "ip-10-0-2-61.us-west-2.compute.internal",
|
||||||
|
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
{
|
||||||
|
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||||
|
"Name": "curl",
|
||||||
|
"DockerName": "ecs-curltest-24-curl-cca48e8dcadd97805600",
|
||||||
|
"Image": "111122223333.dkr.ecr.us-west-2.amazonaws.com/curltest:latest",
|
||||||
|
"ImageID": "sha256:d691691e9652791a60114e67b365688d20d19940dde7c4736ea30e660d8d3553",
|
||||||
|
"Labels": {
|
||||||
|
"com.amazonaws.ecs.cluster": "default",
|
||||||
|
"com.amazonaws.ecs.container-name": "curl",
|
||||||
|
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/8f03e41243824aea923aca126495f665",
|
||||||
|
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||||
|
"com.amazonaws.ecs.task-definition-version": "24"
|
||||||
|
},
|
||||||
|
"DesiredStatus": "RUNNING",
|
||||||
|
"KnownStatus": "RUNNING",
|
||||||
|
"Limits": {
|
||||||
|
"CPU": 50,
|
||||||
|
"Memory": 128
|
||||||
|
},
|
||||||
|
"CreatedAt": "2020-10-02T00:15:07.620912337Z",
|
||||||
|
"StartedAt": "2020-10-02T00:15:08.062559351Z",
|
||||||
|
"Type": "NORMAL",
|
||||||
|
"LogDriver": "awslogs",
|
||||||
|
"LogOptions": {
|
||||||
|
"awslogs-create-group": "true",
|
||||||
|
"awslogs-group": "/ecs/metadata",
|
||||||
|
"awslogs-region": "us-west-2",
|
||||||
|
"awslogs-stream": "ecs/curl/8f03e41243824aea923aca126495f665"
|
||||||
|
},
|
||||||
|
"ContainerARN": "arn:aws:ecs:us-west-2:111122223333:container/0206b271-b33f-47ab-86c6-a0ba208a70a9",
|
||||||
|
"Networks": [
|
||||||
|
{
|
||||||
|
"NetworkMode": "awsvpc",
|
||||||
|
"IPv4Addresses": [
|
||||||
|
"10.0.2.100"
|
||||||
|
],
|
||||||
|
"AttachmentIndex": 0,
|
||||||
|
"MACAddress": "0e:9e:32:c7:48:85",
|
||||||
|
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||||
|
"PrivateDNSName": "ip-10-0-2-100.us-west-2.compute.internal",
|
||||||
|
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,130 @@
|
||||||
|
{
|
||||||
|
"read": "2020-10-02T00:61:13.410254284Z",
|
||||||
|
"preread": "2020-10-02T00:51:12.406202398Z",
|
||||||
|
"pids_stats": {
|
||||||
|
"current": 3
|
||||||
|
},
|
||||||
|
"blkio_stats": {
|
||||||
|
"io_service_bytes_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_serviced_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_queue_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_service_time_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_wait_time_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_merged_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_time_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"sectors_recursive": [
|
||||||
|
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"num_procs": 0,
|
||||||
|
"storage_stats": {
|
||||||
|
|
||||||
|
},
|
||||||
|
"cpu_stats": {
|
||||||
|
"cpu_usage": {
|
||||||
|
"total_usage": 150000000,
|
||||||
|
"percpu_usage": [
|
||||||
|
182359190,
|
||||||
|
178608875
|
||||||
|
],
|
||||||
|
"usage_in_kernelmode": 40000000,
|
||||||
|
"usage_in_usermode": 290000000
|
||||||
|
},
|
||||||
|
"system_cpu_usage": 100000000,
|
||||||
|
"online_cpus": 2,
|
||||||
|
"throttling_data": {
|
||||||
|
"periods": 0,
|
||||||
|
"throttled_periods": 0,
|
||||||
|
"throttled_time": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"precpu_stats": {
|
||||||
|
"cpu_usage": {
|
||||||
|
"total_usage": 100000000,
|
||||||
|
"percpu_usage": [
|
||||||
|
182359190,
|
||||||
|
178608875
|
||||||
|
],
|
||||||
|
"usage_in_kernelmode": 40000000,
|
||||||
|
"usage_in_usermode": 290000000
|
||||||
|
},
|
||||||
|
"system_cpu_usage": 100000000,
|
||||||
|
"online_cpus": 2,
|
||||||
|
"throttling_data": {
|
||||||
|
"periods": 0,
|
||||||
|
"throttled_periods": 0,
|
||||||
|
"throttled_time": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"memory_stats": {
|
||||||
|
"usage": 1806336,
|
||||||
|
"max_usage": 6299648,
|
||||||
|
"stats": {
|
||||||
|
"active_anon": 606208,
|
||||||
|
"active_file": 0,
|
||||||
|
"cache": 0,
|
||||||
|
"dirty": 0,
|
||||||
|
"hierarchical_memory_limit": 134217728,
|
||||||
|
"hierarchical_memsw_limit": 268435456,
|
||||||
|
"inactive_anon": 0,
|
||||||
|
"inactive_file": 0,
|
||||||
|
"mapped_file": 0,
|
||||||
|
"pgfault": 4185,
|
||||||
|
"pgmajfault": 0,
|
||||||
|
"pgpgin": 2926,
|
||||||
|
"pgpgout": 2778,
|
||||||
|
"rss": 606208,
|
||||||
|
"rss_huge": 0,
|
||||||
|
"total_active_anon": 606208,
|
||||||
|
"total_active_file": 0,
|
||||||
|
"total_cache": 0,
|
||||||
|
"total_dirty": 0,
|
||||||
|
"total_inactive_anon": 0,
|
||||||
|
"total_inactive_file": 0,
|
||||||
|
"total_mapped_file": 0,
|
||||||
|
"total_pgfault": 4185,
|
||||||
|
"total_pgmajfault": 0,
|
||||||
|
"total_pgpgin": 2926,
|
||||||
|
"total_pgpgout": 2778,
|
||||||
|
"total_rss": 606208,
|
||||||
|
"total_rss_huge": 0,
|
||||||
|
"total_unevictable": 0,
|
||||||
|
"total_writeback": 0,
|
||||||
|
"unevictable": 0,
|
||||||
|
"writeback": 0
|
||||||
|
},
|
||||||
|
"limit": 134217728
|
||||||
|
},
|
||||||
|
"name": "/ecs-curltest-26-curl-c2e5f6e0cf91b0bead01",
|
||||||
|
"id": "5fc21e5b015f899d22618f8aede80b6d70d71b2a75465ea49d9462c8f3d2d3af",
|
||||||
|
"networks": {
|
||||||
|
"eth0": {
|
||||||
|
"rx_bytes": 84,
|
||||||
|
"rx_packets": 2,
|
||||||
|
"rx_errors": 0,
|
||||||
|
"rx_dropped": 0,
|
||||||
|
"tx_bytes": 84,
|
||||||
|
"tx_packets": 2,
|
||||||
|
"tx_errors": 0,
|
||||||
|
"tx_dropped": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"network_rate_stats": {
|
||||||
|
"rx_bytes_per_sec": 0,
|
||||||
|
"tx_bytes_per_sec": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,56 @@
|
||||||
|
{
|
||||||
|
"Cluster": "default",
|
||||||
|
"TaskARN": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||||
|
"Family": "curltest",
|
||||||
|
"ServiceName": "MyService",
|
||||||
|
"Revision": "26",
|
||||||
|
"DesiredStatus": "RUNNING",
|
||||||
|
"KnownStatus": "RUNNING",
|
||||||
|
"Limits": {
|
||||||
|
"CPU": 4,
|
||||||
|
"Memory": 128
|
||||||
|
},
|
||||||
|
"PullStartedAt": "2020-10-02T00:43:06.202617438Z",
|
||||||
|
"PullStoppedAt": "2020-10-02T00:43:06.31288465Z",
|
||||||
|
"AvailabilityZone": "us-west-2d",
|
||||||
|
"VPCID": "vpc-1234567890abcdef0",
|
||||||
|
"LaunchType": "EC2",
|
||||||
|
"Containers": [
|
||||||
|
{
|
||||||
|
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||||
|
"Name": "~internal~ecs~pause",
|
||||||
|
"DockerName": "ecs-curltest-26-internalecspause-e292d586b6f9dade4a00",
|
||||||
|
"Image": "amazon/amazon-ecs-pause:0.1.0",
|
||||||
|
"ImageID": "",
|
||||||
|
"Labels": {
|
||||||
|
"com.amazonaws.ecs.cluster": "default",
|
||||||
|
"com.amazonaws.ecs.container-name": "~internal~ecs~pause",
|
||||||
|
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||||
|
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||||
|
"com.amazonaws.ecs.task-definition-version": "26"
|
||||||
|
},
|
||||||
|
"DesiredStatus": "RESOURCES_PROVISIONED",
|
||||||
|
"KnownStatus": "RESOURCES_PROVISIONED",
|
||||||
|
"Limits": {
|
||||||
|
"CPU": 50,
|
||||||
|
"Memory": 128
|
||||||
|
},
|
||||||
|
"CreatedAt": "2020-10-02T00:43:05.602352471Z",
|
||||||
|
"StartedAt": "2020-10-02T00:43:06.076707576Z",
|
||||||
|
"Type": "CNI_PAUSE",
|
||||||
|
"Networks": [
|
||||||
|
{
|
||||||
|
"NetworkMode": "awsvpc",
|
||||||
|
"IPv4Addresses": [
|
||||||
|
"10.0.2.61"
|
||||||
|
],
|
||||||
|
"AttachmentIndex": 0,
|
||||||
|
"MACAddress": "0e:10:e2:01:bd:91",
|
||||||
|
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||||
|
"PrivateDNSName": "ip-10-0-2-61.us-west-2.compute.internal",
|
||||||
|
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
{
|
||||||
|
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||||
|
"Name": "curl",
|
||||||
|
"DockerName": "ecs-curltest-24-curl-cca48e8dcadd97805600",
|
||||||
|
"Image": "111122223333.dkr.ecr.us-west-2.amazonaws.com/curltest:latest",
|
||||||
|
"ImageID": "sha256:d691691e9652791a60114e67b365688d20d19940dde7c4736ea30e660d8d3553",
|
||||||
|
"Labels": {
|
||||||
|
"com.amazonaws.ecs.cluster": "default",
|
||||||
|
"com.amazonaws.ecs.container-name": "curl",
|
||||||
|
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/8f03e41243824aea923aca126495f665",
|
||||||
|
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||||
|
"com.amazonaws.ecs.task-definition-version": "24"
|
||||||
|
},
|
||||||
|
"DesiredStatus": "RUNNING",
|
||||||
|
"KnownStatus": "RUNNING",
|
||||||
|
"Limits": {
|
||||||
|
"CPU": 50,
|
||||||
|
"Memory": 128
|
||||||
|
},
|
||||||
|
"CreatedAt": "2020-10-02T00:15:07.620912337Z",
|
||||||
|
"StartedAt": "2020-10-02T00:15:08.062559351Z",
|
||||||
|
"Type": "NORMAL",
|
||||||
|
"LogDriver": "awslogs",
|
||||||
|
"LogOptions": {
|
||||||
|
"awslogs-create-group": "true",
|
||||||
|
"awslogs-group": "/ecs/metadata",
|
||||||
|
"awslogs-region": "us-west-2",
|
||||||
|
"awslogs-stream": "ecs/curl/8f03e41243824aea923aca126495f665"
|
||||||
|
},
|
||||||
|
"ContainerARN": "arn:aws:ecs:us-west-2:111122223333:container/0206b271-b33f-47ab-86c6-a0ba208a70a9",
|
||||||
|
"Networks": [
|
||||||
|
{
|
||||||
|
"NetworkMode": "awsvpc",
|
||||||
|
"IPv4Addresses": [
|
||||||
|
"10.0.2.100"
|
||||||
|
],
|
||||||
|
"AttachmentIndex": 0,
|
||||||
|
"MACAddress": "0e:9e:32:c7:48:85",
|
||||||
|
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||||
|
"PrivateDNSName": "ip-10-0-2-100.us-west-2.compute.internal",
|
||||||
|
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,130 @@
|
||||||
|
{
|
||||||
|
"read": "2020-10-02T00:51:13.410254284Z",
|
||||||
|
"preread": "2020-10-02T00:51:12.406202398Z",
|
||||||
|
"pids_stats": {
|
||||||
|
"current": 3
|
||||||
|
},
|
||||||
|
"blkio_stats": {
|
||||||
|
"io_service_bytes_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_serviced_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_queue_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_service_time_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_wait_time_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_merged_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_time_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"sectors_recursive": [
|
||||||
|
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"num_procs": 0,
|
||||||
|
"storage_stats": {
|
||||||
|
|
||||||
|
},
|
||||||
|
"cpu_stats": {
|
||||||
|
"cpu_usage": {
|
||||||
|
"total_usage": 150000000,
|
||||||
|
"percpu_usage": [
|
||||||
|
182359190,
|
||||||
|
178608875
|
||||||
|
],
|
||||||
|
"usage_in_kernelmode": 40000000,
|
||||||
|
"usage_in_usermode": 290000000
|
||||||
|
},
|
||||||
|
"system_cpu_usage": 200000000,
|
||||||
|
"online_cpus": 2,
|
||||||
|
"throttling_data": {
|
||||||
|
"periods": 0,
|
||||||
|
"throttled_periods": 0,
|
||||||
|
"throttled_time": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"precpu_stats": {
|
||||||
|
"cpu_usage": {
|
||||||
|
"total_usage": 100000000,
|
||||||
|
"percpu_usage": [
|
||||||
|
182359190,
|
||||||
|
178608875
|
||||||
|
],
|
||||||
|
"usage_in_kernelmode": 40000000,
|
||||||
|
"usage_in_usermode": 290000000
|
||||||
|
},
|
||||||
|
"system_cpu_usage": 100000000,
|
||||||
|
"online_cpus": 2,
|
||||||
|
"throttling_data": {
|
||||||
|
"periods": 0,
|
||||||
|
"throttled_periods": 0,
|
||||||
|
"throttled_time": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"memory_stats": {
|
||||||
|
"usage": 1806336,
|
||||||
|
"max_usage": 6299648,
|
||||||
|
"stats": {
|
||||||
|
"active_anon": 606208,
|
||||||
|
"active_file": 0,
|
||||||
|
"cache": 0,
|
||||||
|
"dirty": 0,
|
||||||
|
"hierarchical_memory_limit": 134217728,
|
||||||
|
"hierarchical_memsw_limit": 268435456,
|
||||||
|
"inactive_anon": 0,
|
||||||
|
"inactive_file": 0,
|
||||||
|
"mapped_file": 0,
|
||||||
|
"pgfault": 4185,
|
||||||
|
"pgmajfault": 0,
|
||||||
|
"pgpgin": 2926,
|
||||||
|
"pgpgout": 2778,
|
||||||
|
"rss": 606208,
|
||||||
|
"rss_huge": 0,
|
||||||
|
"total_active_anon": 606208,
|
||||||
|
"total_active_file": 0,
|
||||||
|
"total_cache": 0,
|
||||||
|
"total_dirty": 0,
|
||||||
|
"total_inactive_anon": 0,
|
||||||
|
"total_inactive_file": 0,
|
||||||
|
"total_mapped_file": 0,
|
||||||
|
"total_pgfault": 4185,
|
||||||
|
"total_pgmajfault": 0,
|
||||||
|
"total_pgpgin": 2926,
|
||||||
|
"total_pgpgout": 2778,
|
||||||
|
"total_rss": 606208,
|
||||||
|
"total_rss_huge": 0,
|
||||||
|
"total_unevictable": 0,
|
||||||
|
"total_writeback": 0,
|
||||||
|
"unevictable": 0,
|
||||||
|
"writeback": 0
|
||||||
|
},
|
||||||
|
"limit": 134217728
|
||||||
|
},
|
||||||
|
"name": "/ecs-curltest-26-curl-c2e5f6e0cf91b0bead01",
|
||||||
|
"id": "5fc21e5b015f899d22618f8aede80b6d70d71b2a75465ea49d9462c8f3d2d3af",
|
||||||
|
"networks": {
|
||||||
|
"eth0": {
|
||||||
|
"rx_bytes": 84,
|
||||||
|
"rx_packets": 2,
|
||||||
|
"rx_errors": 0,
|
||||||
|
"rx_dropped": 0,
|
||||||
|
"tx_bytes": 84,
|
||||||
|
"tx_packets": 2,
|
||||||
|
"tx_errors": 0,
|
||||||
|
"tx_dropped": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"network_rate_stats": {
|
||||||
|
"rx_bytes_per_sec": 0,
|
||||||
|
"tx_bytes_per_sec": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,55 @@
|
||||||
|
{
|
||||||
|
"Cluster": "default",
|
||||||
|
"TaskARN": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||||
|
"Family": "curltest",
|
||||||
|
"ServiceName": "MyService",
|
||||||
|
"Revision": "26",
|
||||||
|
"DesiredStatus": "RUNNING",
|
||||||
|
"KnownStatus": "RUNNING",
|
||||||
|
"Limits": {
|
||||||
|
"Memory": 128
|
||||||
|
},
|
||||||
|
"PullStartedAt": "2020-10-02T00:43:06.202617438Z",
|
||||||
|
"PullStoppedAt": "2020-10-02T00:43:06.31288465Z",
|
||||||
|
"AvailabilityZone": "us-west-2d",
|
||||||
|
"VPCID": "vpc-1234567890abcdef0",
|
||||||
|
"LaunchType": "EC2",
|
||||||
|
"Containers": [
|
||||||
|
{
|
||||||
|
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||||
|
"Name": "~internal~ecs~pause",
|
||||||
|
"DockerName": "ecs-curltest-26-internalecspause-e292d586b6f9dade4a00",
|
||||||
|
"Image": "amazon/amazon-ecs-pause:0.1.0",
|
||||||
|
"ImageID": "",
|
||||||
|
"Labels": {
|
||||||
|
"com.amazonaws.ecs.cluster": "default",
|
||||||
|
"com.amazonaws.ecs.container-name": "~internal~ecs~pause",
|
||||||
|
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||||
|
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||||
|
"com.amazonaws.ecs.task-definition-version": "26"
|
||||||
|
},
|
||||||
|
"DesiredStatus": "RESOURCES_PROVISIONED",
|
||||||
|
"KnownStatus": "RESOURCES_PROVISIONED",
|
||||||
|
"Limits": {
|
||||||
|
"CPU": 50,
|
||||||
|
"Memory": 128
|
||||||
|
},
|
||||||
|
"CreatedAt": "2020-10-02T00:43:05.602352471Z",
|
||||||
|
"StartedAt": "2020-10-02T00:43:06.076707576Z",
|
||||||
|
"Type": "CNI_PAUSE",
|
||||||
|
"Networks": [
|
||||||
|
{
|
||||||
|
"NetworkMode": "awsvpc",
|
||||||
|
"IPv4Addresses": [
|
||||||
|
"10.0.2.61"
|
||||||
|
],
|
||||||
|
"AttachmentIndex": 0,
|
||||||
|
"MACAddress": "0e:10:e2:01:bd:91",
|
||||||
|
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||||
|
"PrivateDNSName": "ip-10-0-2-61.us-west-2.compute.internal",
|
||||||
|
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
{
|
||||||
|
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||||
|
"Name": "curl",
|
||||||
|
"DockerName": "ecs-curltest-24-curl-cca48e8dcadd97805600",
|
||||||
|
"Image": "111122223333.dkr.ecr.us-west-2.amazonaws.com/curltest:latest",
|
||||||
|
"ImageID": "sha256:d691691e9652791a60114e67b365688d20d19940dde7c4736ea30e660d8d3553",
|
||||||
|
"Labels": {
|
||||||
|
"com.amazonaws.ecs.cluster": "default",
|
||||||
|
"com.amazonaws.ecs.container-name": "curl",
|
||||||
|
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/8f03e41243824aea923aca126495f665",
|
||||||
|
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||||
|
"com.amazonaws.ecs.task-definition-version": "24"
|
||||||
|
},
|
||||||
|
"DesiredStatus": "RUNNING",
|
||||||
|
"KnownStatus": "RUNNING",
|
||||||
|
"Limits": {
|
||||||
|
"CPU": 50,
|
||||||
|
"Memory": 128
|
||||||
|
},
|
||||||
|
"CreatedAt": "2020-10-02T00:15:07.620912337Z",
|
||||||
|
"StartedAt": "2020-10-02T00:15:08.062559351Z",
|
||||||
|
"Type": "NORMAL",
|
||||||
|
"LogDriver": "awslogs",
|
||||||
|
"LogOptions": {
|
||||||
|
"awslogs-create-group": "true",
|
||||||
|
"awslogs-group": "/ecs/metadata",
|
||||||
|
"awslogs-region": "us-west-2",
|
||||||
|
"awslogs-stream": "ecs/curl/8f03e41243824aea923aca126495f665"
|
||||||
|
},
|
||||||
|
"ContainerARN": "arn:aws:ecs:us-west-2:111122223333:container/0206b271-b33f-47ab-86c6-a0ba208a70a9",
|
||||||
|
"Networks": [
|
||||||
|
{
|
||||||
|
"NetworkMode": "awsvpc",
|
||||||
|
"IPv4Addresses": [
|
||||||
|
"10.0.2.100"
|
||||||
|
],
|
||||||
|
"AttachmentIndex": 0,
|
||||||
|
"MACAddress": "0e:9e:32:c7:48:85",
|
||||||
|
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||||
|
"PrivateDNSName": "ip-10-0-2-100.us-west-2.compute.internal",
|
||||||
|
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,130 @@
|
||||||
|
{
|
||||||
|
"read": "2020-10-02T00:51:13.410254284Z",
|
||||||
|
"preread": "2020-10-02T00:51:12.406202398Z",
|
||||||
|
"pids_stats": {
|
||||||
|
"current": 3
|
||||||
|
},
|
||||||
|
"blkio_stats": {
|
||||||
|
"io_service_bytes_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_serviced_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_queue_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_service_time_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_wait_time_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_merged_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"io_time_recursive": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"sectors_recursive": [
|
||||||
|
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"num_procs": 0,
|
||||||
|
"storage_stats": {
|
||||||
|
|
||||||
|
},
|
||||||
|
"cpu_stats": {
|
||||||
|
"cpu_usage": {
|
||||||
|
"total_usage": 150000000,
|
||||||
|
"percpu_usage": [
|
||||||
|
182359190,
|
||||||
|
178608875
|
||||||
|
],
|
||||||
|
"usage_in_kernelmode": 40000000,
|
||||||
|
"usage_in_usermode": 290000000
|
||||||
|
},
|
||||||
|
"system_cpu_usage": 200000000,
|
||||||
|
"online_cpus": 2,
|
||||||
|
"throttling_data": {
|
||||||
|
"periods": 0,
|
||||||
|
"throttled_periods": 0,
|
||||||
|
"throttled_time": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"precpu_stats": {
|
||||||
|
"cpu_usage": {
|
||||||
|
"total_usage": 100000000,
|
||||||
|
"percpu_usage": [
|
||||||
|
182359190,
|
||||||
|
178608875
|
||||||
|
],
|
||||||
|
"usage_in_kernelmode": 40000000,
|
||||||
|
"usage_in_usermode": 290000000
|
||||||
|
},
|
||||||
|
"system_cpu_usage": 100000000,
|
||||||
|
"online_cpus": 2,
|
||||||
|
"throttling_data": {
|
||||||
|
"periods": 0,
|
||||||
|
"throttled_periods": 0,
|
||||||
|
"throttled_time": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"memory_stats": {
|
||||||
|
"usage": 1806336,
|
||||||
|
"max_usage": 6299648,
|
||||||
|
"stats": {
|
||||||
|
"active_anon": 606208,
|
||||||
|
"active_file": 0,
|
||||||
|
"cache": 0,
|
||||||
|
"dirty": 0,
|
||||||
|
"hierarchical_memory_limit": 134217728,
|
||||||
|
"hierarchical_memsw_limit": 268435456,
|
||||||
|
"inactive_anon": 0,
|
||||||
|
"inactive_file": 0,
|
||||||
|
"mapped_file": 0,
|
||||||
|
"pgfault": 4185,
|
||||||
|
"pgmajfault": 0,
|
||||||
|
"pgpgin": 2926,
|
||||||
|
"pgpgout": 2778,
|
||||||
|
"rss": 606208,
|
||||||
|
"rss_huge": 0,
|
||||||
|
"total_active_anon": 606208,
|
||||||
|
"total_active_file": 0,
|
||||||
|
"total_cache": 0,
|
||||||
|
"total_dirty": 0,
|
||||||
|
"total_inactive_anon": 0,
|
||||||
|
"total_inactive_file": 0,
|
||||||
|
"total_mapped_file": 0,
|
||||||
|
"total_pgfault": 4185,
|
||||||
|
"total_pgmajfault": 0,
|
||||||
|
"total_pgpgin": 2926,
|
||||||
|
"total_pgpgout": 2778,
|
||||||
|
"total_rss": 606208,
|
||||||
|
"total_rss_huge": 0,
|
||||||
|
"total_unevictable": 0,
|
||||||
|
"total_writeback": 0,
|
||||||
|
"unevictable": 0,
|
||||||
|
"writeback": 0
|
||||||
|
},
|
||||||
|
"limit": 134217728
|
||||||
|
},
|
||||||
|
"name": "/ecs-curltest-26-curl-c2e5f6e0cf91b0bead01",
|
||||||
|
"id": "5fc21e5b015f899d22618f8aede80b6d70d71b2a75465ea49d9462c8f3d2d3af",
|
||||||
|
"networks": {
|
||||||
|
"eth0": {
|
||||||
|
"rx_bytes": 84,
|
||||||
|
"rx_packets": 2,
|
||||||
|
"rx_errors": 0,
|
||||||
|
"rx_dropped": 0,
|
||||||
|
"tx_bytes": 84,
|
||||||
|
"tx_packets": 2,
|
||||||
|
"tx_errors": 0,
|
||||||
|
"tx_dropped": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"network_rate_stats": {
|
||||||
|
"rx_bytes_per_sec": 0,
|
||||||
|
"tx_bytes_per_sec": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,52 @@
|
||||||
|
{
|
||||||
|
"Cluster": "default",
|
||||||
|
"TaskARN": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||||
|
"Family": "curltest",
|
||||||
|
"ServiceName": "MyService",
|
||||||
|
"Revision": "26",
|
||||||
|
"DesiredStatus": "RUNNING",
|
||||||
|
"KnownStatus": "RUNNING",
|
||||||
|
"PullStartedAt": "2020-10-02T00:43:06.202617438Z",
|
||||||
|
"PullStoppedAt": "2020-10-02T00:43:06.31288465Z",
|
||||||
|
"AvailabilityZone": "us-west-2d",
|
||||||
|
"VPCID": "vpc-1234567890abcdef0",
|
||||||
|
"LaunchType": "EC2",
|
||||||
|
"Containers": [
|
||||||
|
{
|
||||||
|
"DockerId": "ea32192c8553fbff06c9340478a2ff089b2bb5646fb718b4ee206641c9086d66",
|
||||||
|
"Name": "~internal~ecs~pause",
|
||||||
|
"DockerName": "ecs-curltest-26-internalecspause-e292d586b6f9dade4a00",
|
||||||
|
"Image": "amazon/amazon-ecs-pause:0.1.0",
|
||||||
|
"ImageID": "",
|
||||||
|
"Labels": {
|
||||||
|
"com.amazonaws.ecs.cluster": "default",
|
||||||
|
"com.amazonaws.ecs.container-name": "~internal~ecs~pause",
|
||||||
|
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:us-west-2:111122223333:task/default/158d1c8083dd49d6b527399fd6414f5c",
|
||||||
|
"com.amazonaws.ecs.task-definition-family": "curltest",
|
||||||
|
"com.amazonaws.ecs.task-definition-version": "26"
|
||||||
|
},
|
||||||
|
"DesiredStatus": "RESOURCES_PROVISIONED",
|
||||||
|
"KnownStatus": "RESOURCES_PROVISIONED",
|
||||||
|
"Limits": {
|
||||||
|
"CPU": 50,
|
||||||
|
"Memory": 128
|
||||||
|
},
|
||||||
|
"CreatedAt": "2020-10-02T00:43:05.602352471Z",
|
||||||
|
"StartedAt": "2020-10-02T00:43:06.076707576Z",
|
||||||
|
"Type": "CNI_PAUSE",
|
||||||
|
"Networks": [
|
||||||
|
{
|
||||||
|
"NetworkMode": "awsvpc",
|
||||||
|
"IPv4Addresses": [
|
||||||
|
"10.0.2.61"
|
||||||
|
],
|
||||||
|
"AttachmentIndex": 0,
|
||||||
|
"MACAddress": "0e:10:e2:01:bd:91",
|
||||||
|
"IPv4SubnetCIDRBlock": "10.0.2.0/24",
|
||||||
|
"PrivateDNSName": "ip-10-0-2-61.us-west-2.compute.internal",
|
||||||
|
"SubnetGatewayIpv4Address": "10.0.2.1/24"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue