// Source: vmware-go-kcl-v2/clientlibrary/metrics/cloudwatch.go (Go, 322 lines, 8.1 KiB)

package metrics
import (
"sync"
"time"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/cloudwatch"
"github.com/aws/aws-sdk-go/service/cloudwatch/cloudwatchiface"
log "github.com/sirupsen/logrus"
)
// CloudWatchMonitoringService buffers per-shard metrics in memory and
// publishes them to CloudWatch from a background goroutine started by Start.
// Init must populate the unexported fields before the other methods are used.
type CloudWatchMonitoringService struct {
// Namespace is the CloudWatch namespace passed to PutMetricData.
Namespace string
// KinesisStream is reported as the "KinesisStreamName" dimension.
KinesisStream string
// WorkerID is reported as the "WorkerID" dimension on the lease metrics.
WorkerID string
// Region is the AWS region used by Init to create the CloudWatch session.
Region string
// MetricsBufferTimeMillis controls how often to publish to CloudWatch: the
// event loop waits this many milliseconds between two flushes.
MetricsBufferTimeMillis int
// NOTE(review): MetricsMaxQueueSize is not referenced by any code visible
// in this file — confirm whether it is still needed.
MetricsMaxQueueSize int
// stop is closed by Shutdown to tell the event loop to exit.
stop *chan struct{}
// waitGroup lets Shutdown wait for the event loop goroutine to finish.
waitGroup *sync.WaitGroup
// svc is the CloudWatch client used to publish metric data.
svc cloudwatchiface.CloudWatchAPI
// shardMetrics maps a shard name (string) to its *cloudWatchMetrics.
shardMetrics *sync.Map
}
// cloudWatchMetrics holds the values buffered for a single shard between two
// flushes. The methods in this file take the embedded mutex before reading or
// writing any of the fields.
type cloudWatchMetrics struct {
// processedRecords is accumulated by IncrRecordsProcessed.
processedRecords int64
// processedBytes is accumulated by IncrBytesProcessed.
processedBytes int64
// behindLatestMillis collects the samples passed to MillisBehindLatest.
behindLatestMillis []float64
// leasesHeld is incremented by LeaseGained and decremented by LeaseLost;
// unlike the other fields it is not reset by flushShard.
leasesHeld int64
// leaseRenewals is incremented by LeaseRenewed.
leaseRenewals int64
// getRecordsTime collects the samples passed to RecordGetRecordsTime.
getRecordsTime []float64
// processRecordsTime collects the samples passed to RecordProcessRecordsTime.
processRecordsTime []float64
sync.Mutex
}
// Init creates the CloudWatch client for cw.Region and allocates the internal
// state (per-shard metric map, stop channel and wait group) that Start,
// Shutdown and the metric recording methods dereference.
//
// It returns an error when the AWS session cannot be created; in that case
// none of the internal state is modified.
func (cw *CloudWatchMonitoringService) Init() error {
	// session.New is deprecated because it hides session-creation errors;
	// NewSession reports them so the caller can fail fast.
	s, err := session.NewSession(&aws.Config{Region: aws.String(cw.Region)})
	if err != nil {
		return err
	}

	cw.svc = cloudwatch.New(s)
	cw.shardMetrics = new(sync.Map)

	stopChan := make(chan struct{})
	cw.stop = &stopChan
	cw.waitGroup = &sync.WaitGroup{}

	return nil
}
// Start launches the background event loop that sends the buffered metrics to
// CloudWatch. It always returns nil.
func (cw *CloudWatchMonitoringService) Start() error {
	// Register the goroutine with the wait group before launching it, so the
	// counter is already raised when eventloop later calls Done.
	wg := cw.waitGroup
	wg.Add(1)

	go cw.eventloop()

	return nil
}
// Shutdown signals the event loop to stop by closing the stop channel and
// blocks until the wait group reports that the loop has exited.
func (cw *CloudWatchMonitoringService) Shutdown() {
	log.Info("Shutting down cloudwatch metrics system...")

	stopSignal := *cw.stop
	close(stopSignal)
	cw.waitGroup.Wait()

	log.Info("Cloudwatch metrics system has been shutdown.")
}
// eventloop is the body of the background goroutine started by Start. It
// flushes the buffered metrics once immediately and then again after every
// wait of MetricsBufferTimeMillis milliseconds. When the stop channel is
// closed it performs one last flush and returns, marking the wait group done.
func (cw *CloudWatchMonitoringService) eventloop() {
	defer cw.waitGroup.Done()

	for {
		if err := cw.flush(); err != nil {
			log.Errorf("Error sending metrics to CloudWatch. %+v", err)
		}

		select {
		case <-*cw.stop:
			log.Info("Shutting down monitoring system")
			// Last chance to publish what is still buffered; report a
			// failure instead of silently dropping the returned error.
			if err := cw.flush(); err != nil {
				log.Errorf("Error sending metrics to CloudWatch. %+v", err)
			}
			return
		case <-time.After(time.Duration(cw.MetricsBufferTimeMillis) * time.Millisecond):
			// Buffer interval elapsed: loop around and flush again.
		}
	}
}
// flushShard publishes the metrics buffered for one shard to CloudWatch in a
// single PutMetricData call.
//
// The counters RecordsProcessed, DataBytesProcessed, RenewLease.Success and
// CurrentLeases are always sent. The three timing series are sent as
// statistic sets only when at least one sample has been recorded. After a
// successful publish every buffered value except leasesHeld is reset; on
// failure the error is logged and the buffered values are kept, so they are
// part of the next attempt.
//
// It always returns true so that sync.Map.Range keeps iterating over the
// remaining shards.
func (cw *CloudWatchMonitoringService) flushShard(shard string, metric *cloudWatchMetrics) bool {
	metric.Lock()
	// Deferred so the shard lock is released even if the CloudWatch client
	// panics; a manual Unlock at the end would leave the lock held forever.
	// NOTE(review): the lock is held for the duration of the PutMetricData
	// call, so recorders for this shard block until the call returns.
	defer metric.Unlock()

	defaultDimensions := []*cloudwatch.Dimension{
		{Name: aws.String("Shard"), Value: &shard},
		{Name: aws.String("KinesisStreamName"), Value: &cw.KinesisStream},
	}
	leaseDimensions := []*cloudwatch.Dimension{
		{Name: aws.String("Shard"), Value: &shard},
		{Name: aws.String("KinesisStreamName"), Value: &cw.KinesisStream},
		{Name: aws.String("WorkerID"), Value: &cw.WorkerID},
	}
	metricTimestamp := time.Now()

	data := []*cloudwatch.MetricDatum{
		{
			Dimensions: defaultDimensions,
			MetricName: aws.String("RecordsProcessed"),
			Unit:       aws.String("Count"),
			Timestamp:  &metricTimestamp,
			Value:      aws.Float64(float64(metric.processedRecords)),
		},
		{
			Dimensions: defaultDimensions,
			MetricName: aws.String("DataBytesProcessed"),
			Unit:       aws.String("Bytes"),
			Timestamp:  &metricTimestamp,
			Value:      aws.Float64(float64(metric.processedBytes)),
		},
		{
			Dimensions: leaseDimensions,
			MetricName: aws.String("RenewLease.Success"),
			Unit:       aws.String("Count"),
			Timestamp:  &metricTimestamp,
			Value:      aws.Float64(float64(metric.leaseRenewals)),
		},
		{
			Dimensions: leaseDimensions,
			MetricName: aws.String("CurrentLeases"),
			Unit:       aws.String("Count"),
			Timestamp:  &metricTimestamp,
			Value:      aws.Float64(float64(metric.leasesHeld)),
		},
	}

	// Timing series are only reported when samples exist, in the same order
	// as before: lag, fetch time, processing time.
	timings := []struct {
		name    string
		samples []float64
	}{
		{"MillisBehindLatest", metric.behindLatestMillis},
		{"KinesisDataFetcher.getRecords.Time", metric.getRecordsTime},
		{"RecordProcessor.processRecords.Time", metric.processRecordsTime},
	}
	for _, series := range timings {
		if len(series.samples) > 0 {
			data = append(data, newStatisticDatum(series.name, defaultDimensions, &metricTimestamp, series.samples))
		}
	}

	// Publish metrics data to cloud watch
	_, err := cw.svc.PutMetricData(&cloudwatch.PutMetricDataInput{
		Namespace:  aws.String(cw.Namespace),
		MetricData: data,
	})
	if err != nil {
		log.Errorf("Error in publishing cloudwatch metrics. Error: %+v", err)
		return true
	}

	// Published successfully: start a fresh accumulation window. leasesHeld
	// is deliberately left alone because it tracks a current level, not a
	// per-interval count.
	metric.processedRecords = 0
	metric.processedBytes = 0
	metric.leaseRenewals = 0
	metric.behindLatestMillis = nil
	metric.getRecordsTime = nil
	metric.processRecordsTime = nil

	return true
}

// newStatisticDatum summarises samples as a CloudWatch statistic set (sample
// count, sum, maximum, minimum) reported in milliseconds under the given
// metric name, dimensions and timestamp.
func newStatisticDatum(name string, dimensions []*cloudwatch.Dimension, timestamp *time.Time, samples []float64) *cloudwatch.MetricDatum {
	return &cloudwatch.MetricDatum{
		Dimensions: dimensions,
		MetricName: aws.String(name),
		Unit:       aws.String("Milliseconds"),
		Timestamp:  timestamp,
		StatisticValues: &cloudwatch.StatisticSet{
			SampleCount: aws.Float64(float64(len(samples))),
			Sum:         sumFloat64(samples),
			Maximum:     maxFloat64(samples),
			Minimum:     minFloat64(samples),
		},
	}
}
// flush publishes the buffered metrics of every known shard by calling
// flushShard for each entry of the shard map. It always returns nil; publish
// failures are logged inside flushShard.
func (cw *CloudWatchMonitoringService) flush() error {
	log.Debugf("Flushing metrics data. Stream: %s, Worker: %s", cw.KinesisStream, cw.WorkerID)

	// publish per shard metrics
	publishEntry := func(key, value interface{}) bool {
		return cw.flushShard(key.(string), value.(*cloudWatchMetrics))
	}
	cw.shardMetrics.Range(publishEntry)

	return nil
}
// IncrRecordsProcessed adds count to the buffered number of records processed
// for the given shard.
func (cw *CloudWatchMonitoringService) IncrRecordsProcessed(shard string, count int) {
	stats := cw.getOrCreatePerShardMetrics(shard)

	stats.Lock()
	defer stats.Unlock()

	stats.processedRecords = stats.processedRecords + int64(count)
}
// IncrBytesProcessed adds count to the buffered number of bytes processed for
// the given shard.
func (cw *CloudWatchMonitoringService) IncrBytesProcessed(shard string, count int64) {
	stats := cw.getOrCreatePerShardMetrics(shard)

	stats.Lock()
	defer stats.Unlock()

	stats.processedBytes = stats.processedBytes + count
}
// MillisBehindLatest records one sample of how far, in milliseconds, the
// given shard is behind the latest record.
func (cw *CloudWatchMonitoringService) MillisBehindLatest(shard string, millSeconds float64) {
	stats := cw.getOrCreatePerShardMetrics(shard)

	stats.Lock()
	defer stats.Unlock()

	samples := append(stats.behindLatestMillis, millSeconds)
	stats.behindLatestMillis = samples
}
// LeaseGained increases the number of leases currently held for the given
// shard by one.
func (cw *CloudWatchMonitoringService) LeaseGained(shard string) {
	stats := cw.getOrCreatePerShardMetrics(shard)

	stats.Lock()
	defer stats.Unlock()

	stats.leasesHeld += 1
}
// LeaseLost decreases the number of leases currently held for the given shard
// by one.
func (cw *CloudWatchMonitoringService) LeaseLost(shard string) {
	stats := cw.getOrCreatePerShardMetrics(shard)

	stats.Lock()
	defer stats.Unlock()

	stats.leasesHeld -= 1
}
// LeaseRenewed counts one more successful lease renewal for the given shard.
func (cw *CloudWatchMonitoringService) LeaseRenewed(shard string) {
	stats := cw.getOrCreatePerShardMetrics(shard)

	stats.Lock()
	defer stats.Unlock()

	stats.leaseRenewals += 1
}
// RecordGetRecordsTime records one sample of the time a getRecords call took
// for the given shard. The sample is later published with the unit
// "Milliseconds".
func (cw *CloudWatchMonitoringService) RecordGetRecordsTime(shard string, time float64) {
	stats := cw.getOrCreatePerShardMetrics(shard)

	stats.Lock()
	defer stats.Unlock()

	samples := append(stats.getRecordsTime, time)
	stats.getRecordsTime = samples
}
// RecordProcessRecordsTime records one sample of the time a processRecords
// call took for the given shard. The sample is later published with the unit
// "Milliseconds".
func (cw *CloudWatchMonitoringService) RecordProcessRecordsTime(shard string, time float64) {
	stats := cw.getOrCreatePerShardMetrics(shard)

	stats.Lock()
	defer stats.Unlock()

	samples := append(stats.processRecordsTime, time)
	stats.processRecordsTime = samples
}
// getOrCreatePerShardMetrics returns the metrics buffer registered for shard,
// creating and registering an empty one on first use.
//
// Creation goes through LoadOrStore so that, when several goroutines ask for
// a new shard at the same time, exactly one buffer is registered and all of
// them receive it. A separate Load followed by Store would let a later caller
// replace a buffer that an earlier caller is already updating, losing those
// updates.
func (cw *CloudWatchMonitoringService) getOrCreatePerShardMetrics(shard string) *cloudWatchMetrics {
	// Fast path: the shard is already known, no allocation needed.
	if existing, ok := cw.shardMetrics.Load(shard); ok {
		return existing.(*cloudWatchMetrics)
	}

	// Slow path: register a fresh buffer unless another goroutine won the
	// race, in which case its buffer is returned and ours is discarded.
	actual, _ := cw.shardMetrics.LoadOrStore(shard, &cloudWatchMetrics{})
	return actual.(*cloudWatchMetrics)
}
// sumFloat64 returns a pointer to the sum of all values in slice, adding them
// in index order. For an empty or nil slice the result points to 0.
func sumFloat64(slice []float64) *float64 {
	var total float64
	for i := 0; i < len(slice); i++ {
		total += slice[i]
	}
	return &total
}
// maxFloat64 returns a pointer to the largest value in slice. For an empty or
// nil slice the result points to 0.
func maxFloat64(slice []float64) *float64 {
	if len(slice) == 0 {
		return aws.Float64(0)
	}

	largest := slice[0]
	for _, candidate := range slice[1:] {
		if candidate > largest {
			largest = candidate
		}
	}
	return &largest
}
// minFloat64 returns a pointer to the smallest value in slice. For an empty
// or nil slice the result points to 0.
func minFloat64(slice []float64) *float64 {
	if len(slice) == 0 {
		return aws.Float64(0)
	}

	smallest := slice[0]
	for _, candidate := range slice[1:] {
		if candidate < smallest {
			smallest = candidate
		}
	}
	return &smallest
}