KCL: Fix CloudWatch metrics

This change fixes CloudWatch metrics publishing by adding a long-running
goroutine that periodically publishes metrics to CloudWatch. It also shuts
down metrics publishing when the KCL worker is shut down.

Test:
Ran hmake test and verified that CloudWatch metrics were published via the
AWS CloudWatch console.

Jira CNA-702

Change-Id: I78b347cd12939447b0daf93f51acf620d18e2f49
Tao Jiang 2018-04-20 21:07:11 -07:00
parent 2fea884212
commit e1071abc80
6 changed files with 248 additions and 196 deletions
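The core of the change described above is a periodic publish loop guarded by a stop channel and a WaitGroup. Below is a minimal, self-contained sketch of that pattern for reference only; the type and method names are illustrative stand-ins, not the ones in this commit.

    package main

    import (
        "log"
        "sync"
        "time"
    )

    // publisher is a hypothetical stand-in for a metrics service like the
    // one in this commit; all names here are made up for illustration.
    type publisher struct {
        interval time.Duration
        stop     chan struct{}
        wg       sync.WaitGroup
    }

    // flush stands in for pushing buffered metrics to CloudWatch.
    func (p *publisher) flush() { log.Println("flushing metrics") }

    // Start launches the long-running goroutine that publishes periodically.
    func (p *publisher) Start() {
        p.wg.Add(1)
        go func() {
            defer p.wg.Done()
            for {
                p.flush()
                select {
                case <-p.stop:
                    p.flush() // final flush on shutdown
                    return
                case <-time.After(p.interval):
                }
            }
        }()
    }

    // Shutdown signals the goroutine to exit and waits for it to finish.
    func (p *publisher) Shutdown() {
        close(p.stop)
        p.wg.Wait()
    }

    func main() {
        p := &publisher{interval: 10 * time.Second, stop: make(chan struct{})}
        p.Start()
        time.Sleep(30 * time.Second)
        p.Shutdown()
    }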

View file

@ -16,10 +16,15 @@ type CloudWatchMonitoringService struct {
KinesisStream string
WorkerID string
Region string
// how frequently to send data to cloudwatch
ResolutionSec int
// controls how often metrics are published to CloudWatch
MetricsBufferTimeMillis int
MetricsMaxQueueSize int
stop *chan struct{}
waitGroup *sync.WaitGroup
svc cloudwatchiface.CloudWatchAPI
shardMetrics map[string]*cloudWatchMetrics
shardMetrics *sync.Map
}
type cloudWatchMetrics struct {
@ -34,35 +39,53 @@ type cloudWatchMetrics struct {
}
func (cw *CloudWatchMonitoringService) Init() error {
// default to 1 min resolution
if cw.ResolutionSec == 0 {
cw.ResolutionSec = 60
}
s := session.New(&aws.Config{Region: aws.String(cw.Region)})
cw.svc = cloudwatch.New(s)
cw.shardMetrics = make(map[string]*cloudWatchMetrics)
cw.shardMetrics = new(sync.Map)
stopChan := make(chan struct{})
cw.stop = &stopChan
wg := sync.WaitGroup{}
cw.waitGroup = &wg
return nil
}
func (cw *CloudWatchMonitoringService) Start() error {
cw.waitGroup.Add(1)
// enter the event loop that periodically sends metrics to CloudWatch
go cw.eventloop()
return nil
}
func (cw *CloudWatchMonitoringService) Shutdown() {
log.Info("Shutting down cloudwatch metrics system...")
close(*cw.stop)
cw.waitGroup.Wait()
log.Info("Cloudwatch metrics system has been shutdown.")
}
// Start daemon to flush metrics periodically
func (cw *CloudWatchMonitoringService) flushDaemon() {
previousFlushTime := time.Now()
resolutionDuration := time.Duration(cw.ResolutionSec) * time.Second
func (cw *CloudWatchMonitoringService) eventloop() {
defer cw.waitGroup.Done()
for {
time.Sleep(resolutionDuration - time.Now().Sub(previousFlushTime))
err := cw.Flush()
err := cw.flush()
if err != nil {
log.Errorf("Error sending metrics to CloudWatch. %+v", err)
}
previousFlushTime = time.Now()
select {
case <-*cw.stop:
log.Info("Shutting down monitoring system")
cw.flush()
return
case <-time.After(time.Duration(cw.MetricsBufferTimeMillis) * time.Millisecond):
}
}
}
func (cw *CloudWatchMonitoringService) Flush() error {
// publish per shard metrics
for shard, metric := range cw.shardMetrics {
func (cw *CloudWatchMonitoringService) flushShard(shard string, metric *cloudWatchMetrics) bool {
metric.Lock()
defaultDimensions := []*cloudwatch.Dimension{
{
@ -91,10 +114,7 @@ func (cw *CloudWatchMonitoringService) Flush() error {
}
metricTimestamp := time.Now()
// Publish metrics data to cloud watch
_, err := cw.svc.PutMetricData(&cloudwatch.PutMetricDataInput{
Namespace: aws.String(cw.Namespace),
MetricData: []*cloudwatch.MetricDatum{
data := []*cloudwatch.MetricDatum{
{
Dimensions: defaultDimensions,
MetricName: aws.String("RecordsProcessed"),
@ -109,42 +129,6 @@ func (cw *CloudWatchMonitoringService) Flush() error {
Timestamp: &metricTimestamp,
Value: aws.Float64(float64(metric.processedBytes)),
},
{
Dimensions: defaultDimensions,
MetricName: aws.String("MillisBehindLatest"),
Unit: aws.String("Milliseconds"),
Timestamp: &metricTimestamp,
StatisticValues: &cloudwatch.StatisticSet{
SampleCount: aws.Float64(float64(len(metric.behindLatestMillis))),
Sum: sumFloat64(metric.behindLatestMillis),
Maximum: maxFloat64(metric.behindLatestMillis),
Minimum: minFloat64(metric.behindLatestMillis),
},
},
{
Dimensions: defaultDimensions,
MetricName: aws.String("KinesisDataFetcher.getRecords.Time"),
Unit: aws.String("Milliseconds"),
Timestamp: &metricTimestamp,
StatisticValues: &cloudwatch.StatisticSet{
SampleCount: aws.Float64(float64(len(metric.getRecordsTime))),
Sum: sumFloat64(metric.getRecordsTime),
Maximum: maxFloat64(metric.getRecordsTime),
Minimum: minFloat64(metric.getRecordsTime),
},
},
{
Dimensions: defaultDimensions,
MetricName: aws.String("RecordProcessor.processRecords.Time"),
Unit: aws.String("Milliseconds"),
Timestamp: &metricTimestamp,
StatisticValues: &cloudwatch.StatisticSet{
SampleCount: aws.Float64(float64(len(metric.processRecordsTime))),
Sum: sumFloat64(metric.processRecordsTime),
Maximum: maxFloat64(metric.processRecordsTime),
Minimum: minFloat64(metric.processRecordsTime),
},
},
{
Dimensions: leaseDimensions,
MetricName: aws.String("RenewLease.Success"),
@ -159,8 +143,56 @@ func (cw *CloudWatchMonitoringService) Flush() error {
Timestamp: &metricTimestamp,
Value: aws.Float64(float64(metric.leasesHeld)),
},
},
}
if len(metric.behindLatestMillis) > 0 {
data = append(data, &cloudwatch.MetricDatum{
Dimensions: defaultDimensions,
MetricName: aws.String("MillisBehindLatest"),
Unit: aws.String("Milliseconds"),
Timestamp: &metricTimestamp,
StatisticValues: &cloudwatch.StatisticSet{
SampleCount: aws.Float64(float64(len(metric.behindLatestMillis))),
Sum: sumFloat64(metric.behindLatestMillis),
Maximum: maxFloat64(metric.behindLatestMillis),
Minimum: minFloat64(metric.behindLatestMillis),
}})
}
if len(metric.getRecordsTime) > 0 {
data = append(data, &cloudwatch.MetricDatum{
Dimensions: defaultDimensions,
MetricName: aws.String("KinesisDataFetcher.getRecords.Time"),
Unit: aws.String("Milliseconds"),
Timestamp: &metricTimestamp,
StatisticValues: &cloudwatch.StatisticSet{
SampleCount: aws.Float64(float64(len(metric.getRecordsTime))),
Sum: sumFloat64(metric.getRecordsTime),
Maximum: maxFloat64(metric.getRecordsTime),
Minimum: minFloat64(metric.getRecordsTime),
}})
}
if len(metric.processRecordsTime) > 0 {
data = append(data, &cloudwatch.MetricDatum{
Dimensions: defaultDimensions,
MetricName: aws.String("RecordProcessor.processRecords.Time"),
Unit: aws.String("Milliseconds"),
Timestamp: &metricTimestamp,
StatisticValues: &cloudwatch.StatisticSet{
SampleCount: aws.Float64(float64(len(metric.processRecordsTime))),
Sum: sumFloat64(metric.processRecordsTime),
Maximum: maxFloat64(metric.processRecordsTime),
Minimum: minFloat64(metric.processRecordsTime),
}})
}
// Publish metrics data to CloudWatch
_, err := cw.svc.PutMetricData(&cloudwatch.PutMetricDataInput{
Namespace: aws.String(cw.Namespace),
MetricData: data,
})
if err == nil {
metric.processedRecords = 0
metric.processedBytes = 0
@ -173,80 +205,85 @@ func (cw *CloudWatchMonitoringService) Flush() error {
}
metric.Unlock()
return err
}
return true
}
func (cw *CloudWatchMonitoringService) flush() error {
log.Debugf("Flushing metrics data. Stream: %s, Worker: %s", cw.KinesisStream, cw.WorkerID)
// publish per-shard metrics
cw.shardMetrics.Range(func(k, v interface{}) bool {
shard, metric := k.(string), v.(*cloudWatchMetrics)
return cw.flushShard(shard, metric)
})
return nil
}
func (cw *CloudWatchMonitoringService) IncrRecordsProcessed(shard string, count int) {
if _, ok := cw.shardMetrics[shard]; !ok {
cw.shardMetrics[shard] = &cloudWatchMetrics{}
}
cw.shardMetrics[shard].Lock()
defer cw.shardMetrics[shard].Unlock()
cw.shardMetrics[shard].processedRecords += int64(count)
m := cw.getOrCreatePerShardMetrics(shard)
m.Lock()
defer m.Unlock()
m.processedRecords += int64(count)
}
func (cw *CloudWatchMonitoringService) IncrBytesProcessed(shard string, count int64) {
if _, ok := cw.shardMetrics[shard]; !ok {
cw.shardMetrics[shard] = &cloudWatchMetrics{}
}
cw.shardMetrics[shard].Lock()
defer cw.shardMetrics[shard].Unlock()
cw.shardMetrics[shard].processedBytes += count
m := cw.getOrCreatePerShardMetrics(shard)
m.Lock()
defer m.Unlock()
m.processedBytes += count
}
func (cw *CloudWatchMonitoringService) MillisBehindLatest(shard string, millSeconds float64) {
if _, ok := cw.shardMetrics[shard]; !ok {
cw.shardMetrics[shard] = &cloudWatchMetrics{}
}
cw.shardMetrics[shard].Lock()
defer cw.shardMetrics[shard].Unlock()
cw.shardMetrics[shard].behindLatestMillis = append(cw.shardMetrics[shard].behindLatestMillis, millSeconds)
m := cw.getOrCreatePerShardMetrics(shard)
m.Lock()
defer m.Unlock()
m.behindLatestMillis = append(m.behindLatestMillis, millSeconds)
}
func (cw *CloudWatchMonitoringService) LeaseGained(shard string) {
if _, ok := cw.shardMetrics[shard]; !ok {
cw.shardMetrics[shard] = &cloudWatchMetrics{}
}
cw.shardMetrics[shard].Lock()
defer cw.shardMetrics[shard].Unlock()
cw.shardMetrics[shard].leasesHeld++
m := cw.getOrCreatePerShardMetrics(shard)
m.Lock()
defer m.Unlock()
m.leasesHeld++
}
func (cw *CloudWatchMonitoringService) LeaseLost(shard string) {
if _, ok := cw.shardMetrics[shard]; !ok {
cw.shardMetrics[shard] = &cloudWatchMetrics{}
}
cw.shardMetrics[shard].Lock()
defer cw.shardMetrics[shard].Unlock()
cw.shardMetrics[shard].leasesHeld--
m := cw.getOrCreatePerShardMetrics(shard)
m.Lock()
defer m.Unlock()
m.leasesHeld--
}
func (cw *CloudWatchMonitoringService) LeaseRenewed(shard string) {
if _, ok := cw.shardMetrics[shard]; !ok {
cw.shardMetrics[shard] = &cloudWatchMetrics{}
}
cw.shardMetrics[shard].Lock()
defer cw.shardMetrics[shard].Unlock()
cw.shardMetrics[shard].leaseRenewals++
m := cw.getOrCreatePerShardMetrics(shard)
m.Lock()
defer m.Unlock()
m.leaseRenewals++
}
func (cw *CloudWatchMonitoringService) RecordGetRecordsTime(shard string, time float64) {
if _, ok := cw.shardMetrics[shard]; !ok {
cw.shardMetrics[shard] = &cloudWatchMetrics{}
}
cw.shardMetrics[shard].Lock()
defer cw.shardMetrics[shard].Unlock()
cw.shardMetrics[shard].getRecordsTime = append(cw.shardMetrics[shard].getRecordsTime, time)
m := cw.getOrCreatePerShardMetrics(shard)
m.Lock()
defer m.Unlock()
m.getRecordsTime = append(m.getRecordsTime, time)
}
func (cw *CloudWatchMonitoringService) RecordProcessRecordsTime(shard string, time float64) {
if _, ok := cw.shardMetrics[shard]; !ok {
cw.shardMetrics[shard] = &cloudWatchMetrics{}
m := cw.getOrCreatePerShardMetrics(shard)
m.Lock()
defer m.Unlock()
m.processRecordsTime = append(m.processRecordsTime, time)
}
func (cw *CloudWatchMonitoringService) getOrCreatePerShardMetrics(shard string) *cloudWatchMetrics {
var i interface{}
var ok bool
if i, ok = cw.shardMetrics.Load(shard); !ok {
m := &cloudWatchMetrics{}
cw.shardMetrics.Store(shard, m)
return m
}
cw.shardMetrics[shard].Lock()
defer cw.shardMetrics[shard].Unlock()
cw.shardMetrics[shard].processRecordsTime = append(cw.shardMetrics[shard].processRecordsTime, time)
return i.(*cloudWatchMetrics)
}
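A side note on the lookup above: sync.Map also provides LoadOrStore, which performs the check-and-insert in a single atomic step, whereas a separate Load followed by Store can race when two goroutines see the same missing shard at once. A minimal sketch of that idiom, using placeholder types rather than the commit's cloudWatchMetrics:

    package main

    import (
        "fmt"
        "sync"
    )

    // shardMetric is a placeholder for a per-shard metrics record.
    type shardMetric struct {
        sync.Mutex
        processedRecords int64
    }

    // getOrCreate returns the existing entry for shard or atomically inserts a new one.
    func getOrCreate(m *sync.Map, shard string) *shardMetric {
        v, _ := m.LoadOrStore(shard, &shardMetric{})
        return v.(*shardMetric)
    }

    func main() {
        var metrics sync.Map
        sm := getOrCreate(&metrics, "shardId-000000000000")
        sm.Lock()
        sm.processedRecords += 10
        sm.Unlock()
        fmt.Println(sm.processedRecords) // 10
    }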
func sumFloat64(slice []float64) *float64 {

View file

@ -15,6 +15,7 @@ type MonitoringConfiguration struct {
type MonitoringService interface {
Init() error
Start() error
IncrRecordsProcessed(string, int)
IncrBytesProcessed(string, int64)
MillisBehindLatest(string, float64)
@ -23,7 +24,7 @@ type MonitoringService interface {
LeaseRenewed(string)
RecordGetRecordsTime(string, float64)
RecordProcessRecordsTime(string, float64)
Flush() error
Shutdown()
}
func (m *MonitoringConfiguration) Init(nameSpace, streamName string, workerID string) error {
@ -57,9 +58,9 @@ func (m *MonitoringConfiguration) GetMonitoringService() MonitoringService {
type noopMonitoringService struct{}
func (n *noopMonitoringService) Init() error {
return nil
}
func (n *noopMonitoringService) Init() error { return nil }
func (n *noopMonitoringService) Start() error { return nil }
func (n *noopMonitoringService) Shutdown() {}
func (n *noopMonitoringService) IncrRecordsProcessed(shard string, count int) {}
func (n *noopMonitoringService) IncrBytesProcessed(shard string, count int64) {}
@ -69,4 +70,3 @@ func (n *noopMonitoringService) LeaseLost(shard string)
func (n *noopMonitoringService) LeaseRenewed(shard string) {}
func (n *noopMonitoringService) RecordGetRecordsTime(shard string, time float64) {}
func (n *noopMonitoringService) RecordProcessRecordsTime(shard string, time float64) {}
func (n *noopMonitoringService) Flush() error { return nil }

View file

@ -8,6 +8,9 @@ import (
log "github.com/sirupsen/logrus"
)
// PrometheusMonitoringService starts Prometheus as the metrics system.
// It can be tricky if the service onboarding with KCL also uses Prometheus;
// therefore, CloudWatch metrics should be started by default instead.
type PrometheusMonitoringService struct {
ListenAddress string
@ -70,17 +73,25 @@ func (p *PrometheusMonitoringService) Init() error {
}
}
http.Handle("/metrics", promhttp.Handler())
go func() {
log.Debugf("Starting Prometheus listener on %s", p.ListenAddress)
err := http.ListenAndServe(p.ListenAddress, nil)
if err != nil {
log.Errorln("Error starting Prometheus metrics endpoint", err)
}
}()
return nil
}
func (p *PrometheusMonitoringService) Start() error {
http.Handle("/metrics", promhttp.Handler())
go func() {
log.Infof("Starting Prometheus listener on %s", p.ListenAddress)
err := http.ListenAndServe(p.ListenAddress, nil)
if err != nil {
log.Errorf("Error starting Prometheus metrics endpoint. %+v", err)
}
log.Info("Stopped metrics server")
}()
return nil
}
func (p *PrometheusMonitoringService) Shutdown() {}
func (p *PrometheusMonitoringService) IncrRecordsProcessed(shard string, count int) {
p.processedRecords.With(prometheus.Labels{"shard": shard, "kinesisStream": p.KinesisStream}).Add(float64(count))
}
@ -112,5 +123,3 @@ func (p *PrometheusMonitoringService) RecordGetRecordsTime(shard string, time fl
func (p *PrometheusMonitoringService) RecordProcessRecordsTime(shard string, time float64) {
p.processRecordsTime.With(prometheus.Labels{"shard": shard, "kinesisStream": p.KinesisStream}).Observe(time)
}
func (p *PrometheusMonitoringService) Flush() error { return nil }
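Shutdown() above is a no-op, and a listener started with http.ListenAndServe cannot be stopped once running. If a graceful stop of the /metrics endpoint were ever wanted, the standard http.Server pattern would apply; the sketch below illustrates that pattern under those assumptions and is not part of this commit.

    package main

    import (
        "context"
        "log"
        "net/http"
        "time"

        "github.com/prometheus/client_golang/prometheus/promhttp"
    )

    // metricsServer is a hypothetical wrapper around the /metrics listener.
    type metricsServer struct {
        srv *http.Server
    }

    // Start serves the Prometheus handler in the background.
    func (m *metricsServer) Start(addr string) {
        mux := http.NewServeMux()
        mux.Handle("/metrics", promhttp.Handler())
        m.srv = &http.Server{Addr: addr, Handler: mux}
        go func() {
            if err := m.srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
                log.Printf("Error starting Prometheus metrics endpoint. %+v", err)
            }
            log.Println("Stopped metrics server")
        }()
    }

    // Shutdown stops accepting new connections and drains in-flight requests.
    func (m *metricsServer) Shutdown() {
        ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
        defer cancel()
        if err := m.srv.Shutdown(ctx); err != nil {
            log.Printf("Error shutting down metrics endpoint. %+v", err)
        }
    }

    func main() {
        ms := &metricsServer{}
        ms.Start(":8080")
        time.Sleep(30 * time.Second)
        ms.Shutdown()
    }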

View file

@ -209,8 +209,6 @@ func (sc *ShardConsumer) getRecords(shard *shardStatus) error {
case <-*sc.stop:
shutdownInput := &kcl.ShutdownInput{ShutdownReason: kcl.REQUESTED, Checkpointer: recordCheckpointer}
sc.recordProcessor.Shutdown(shutdownInput)
// flush out the metrics data
sc.mService.Flush()
return nil
case <-time.After(1 * time.Nanosecond):
}

View file

@ -106,8 +106,11 @@ func (w *Worker) Start() error {
return err
}
log.Info("Initialization complete. Starting worker event loop.")
// Start monitoring service
log.Info("Starting monitoring service.")
w.mService.Start()
log.Info("Starting worker event loop.")
// enter the worker event loop
go w.eventLoop()
return nil
@ -120,6 +123,7 @@ func (w *Worker) Shutdown() {
close(*w.stop)
w.waitGroup.Wait()
w.mService.Shutdown()
log.Info("Worker loop is complete. Exiting from worker.")
}
@ -170,6 +174,8 @@ func (w *Worker) initialize() error {
return err
}
log.Info("Initialization complete.")
return nil
}

View file

@ -92,7 +92,9 @@ func getMetricsConfig(service string) *metrics.MonitoringConfiguration {
MonitoringService: "cloudwatch",
Region: regionName,
CloudWatch: metrics.CloudWatchMonitoringService{
ResolutionSec: 1,
// These values should come from kclConfig
MetricsBufferTimeMillis: 10000,
MetricsMaxQueueSize: 20,
},
}
}
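For completeness, a hedged usage sketch of how this configuration feeds the monitoring service lifecycle introduced above (Init, Start, per-shard recording calls, Shutdown). The import path, region, namespace, stream, worker, and shard IDs are illustrative assumptions; only methods that appear in this diff are called, and in practice the worker drives most of these calls itself.

    package main

    import (
        "log"

        // import path is an assumption based on this repository's layout
        "github.com/vmware/vmware-go-kcl/clientlibrary/metrics"
    )

    func main() {
        cfg := &metrics.MonitoringConfiguration{
            MonitoringService: "cloudwatch",
            Region:            "us-west-2",
            CloudWatch: metrics.CloudWatchMonitoringService{
                // these values should normally come from kclConfig
                MetricsBufferTimeMillis: 10000,
                MetricsMaxQueueSize:     20,
            },
        }
        if err := cfg.Init("MyAppNamespace", "my-kinesis-stream", "worker-1"); err != nil {
            log.Fatalf("failed to init monitoring configuration: %+v", err)
        }

        // The worker normally drives the lifecycle below; it is shown here
        // only to illustrate the MonitoringService interface surface.
        mService := cfg.GetMonitoringService()
        if err := mService.Init(); err != nil {
            log.Fatalf("failed to init monitoring service: %+v", err)
        }
        if err := mService.Start(); err != nil {
            log.Fatalf("failed to start monitoring service: %+v", err)
        }
        defer mService.Shutdown()

        // record some per-shard activity while records are being processed
        mService.IncrRecordsProcessed("shardId-000000000000", 25)
        mService.IncrBytesProcessed("shardId-000000000000", 4096)
        mService.MillisBehindLatest("shardId-000000000000", 120)
    }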