KCL: Enable Metrics

This change enables metrics reporting and fixes a few bug in metrics reporting.
The current metrics reporting is quite limited. Will add more metrics in
next cr.

Tested with both prometheus and cloudwatch.

Jira CNA-702

Change-Id: I678b3f8a372d83f7b8adc419133c14cd10884f61
This commit is contained in:
Tao Jiang 2018-04-20 08:30:24 -07:00
parent 9d1993547f
commit 2fea884212
5 changed files with 104 additions and 33 deletions

View file

@ -15,8 +15,8 @@ type CloudWatchMonitoringService struct {
Namespace string Namespace string
KinesisStream string KinesisStream string
WorkerID string WorkerID string
// What granularity we should send metrics to CW at. Note setting this to 1 will cost quite a bit of money Region string
// At the time of writing (March 2018) about US$200 per month // how frequently to send data to cloudwatch
ResolutionSec int ResolutionSec int
svc cloudwatchiface.CloudWatchAPI svc cloudwatchiface.CloudWatchAPI
shardMetrics map[string]*cloudWatchMetrics shardMetrics map[string]*cloudWatchMetrics
@ -34,75 +34,82 @@ type cloudWatchMetrics struct {
} }
func (cw *CloudWatchMonitoringService) Init() error { func (cw *CloudWatchMonitoringService) Init() error {
// default to 1 min resolution
if cw.ResolutionSec == 0 { if cw.ResolutionSec == 0 {
cw.ResolutionSec = 60 cw.ResolutionSec = 60
} }
session, err := session.NewSessionWithOptions( s := session.New(&aws.Config{Region: aws.String(cw.Region)})
session.Options{ cw.svc = cloudwatch.New(s)
SharedConfigState: session.SharedConfigEnable,
},
)
if err != nil {
return err
}
cw.svc = cloudwatch.New(session)
cw.shardMetrics = make(map[string]*cloudWatchMetrics) cw.shardMetrics = make(map[string]*cloudWatchMetrics)
return nil return nil
} }
// Start daemon to flush metrics periodically
func (cw *CloudWatchMonitoringService) flushDaemon() { func (cw *CloudWatchMonitoringService) flushDaemon() {
previousFlushTime := time.Now() previousFlushTime := time.Now()
resolutionDuration := time.Duration(cw.ResolutionSec) * time.Second resolutionDuration := time.Duration(cw.ResolutionSec) * time.Second
for { for {
time.Sleep(resolutionDuration - time.Now().Sub(previousFlushTime)) time.Sleep(resolutionDuration - time.Now().Sub(previousFlushTime))
err := cw.flush() err := cw.Flush()
if err != nil { if err != nil {
log.Errorln("Error sending metrics to CloudWatch", err) log.Errorf("Error sending metrics to CloudWatch. %+v", err)
} }
previousFlushTime = time.Now() previousFlushTime = time.Now()
} }
} }
func (cw *CloudWatchMonitoringService) flush() error { func (cw *CloudWatchMonitoringService) Flush() error {
// publish per shard metrics
for shard, metric := range cw.shardMetrics { for shard, metric := range cw.shardMetrics {
metric.Lock() metric.Lock()
defaultDimensions := []*cloudwatch.Dimension{ defaultDimensions := []*cloudwatch.Dimension{
&cloudwatch.Dimension{ {
Name: aws.String("shard"), Name: aws.String("Shard"),
Value: &shard, Value: &shard,
}, },
&cloudwatch.Dimension{ {
Name: aws.String("KinesisStreamName"), Name: aws.String("KinesisStreamName"),
Value: &cw.KinesisStream, Value: &cw.KinesisStream,
}, },
} }
leaseDimensions := make([]*cloudwatch.Dimension, len(defaultDimensions))
copy(defaultDimensions, leaseDimensions) leaseDimensions := []*cloudwatch.Dimension{
leaseDimensions = append(leaseDimensions, &cloudwatch.Dimension{ {
Name: aws.String("Shard"),
Value: &shard,
},
{
Name: aws.String("KinesisStreamName"),
Value: &cw.KinesisStream,
},
{
Name: aws.String("WorkerID"), Name: aws.String("WorkerID"),
Value: &cw.WorkerID, Value: &cw.WorkerID,
}) },
}
metricTimestamp := time.Now() metricTimestamp := time.Now()
// Publish metrics data to cloud watch
_, err := cw.svc.PutMetricData(&cloudwatch.PutMetricDataInput{ _, err := cw.svc.PutMetricData(&cloudwatch.PutMetricDataInput{
Namespace: aws.String(cw.Namespace), Namespace: aws.String(cw.Namespace),
MetricData: []*cloudwatch.MetricDatum{ MetricData: []*cloudwatch.MetricDatum{
&cloudwatch.MetricDatum{ {
Dimensions: defaultDimensions, Dimensions: defaultDimensions,
MetricName: aws.String("RecordsProcessed"), MetricName: aws.String("RecordsProcessed"),
Unit: aws.String("Count"), Unit: aws.String("Count"),
Timestamp: &metricTimestamp, Timestamp: &metricTimestamp,
Value: aws.Float64(float64(metric.processedRecords)), Value: aws.Float64(float64(metric.processedRecords)),
}, },
&cloudwatch.MetricDatum{ {
Dimensions: defaultDimensions, Dimensions: defaultDimensions,
MetricName: aws.String("DataBytesProcessed"), MetricName: aws.String("DataBytesProcessed"),
Unit: aws.String("Byte"), Unit: aws.String("Bytes"),
Timestamp: &metricTimestamp, Timestamp: &metricTimestamp,
Value: aws.Float64(float64(metric.processedBytes)), Value: aws.Float64(float64(metric.processedBytes)),
}, },
&cloudwatch.MetricDatum{ {
Dimensions: defaultDimensions, Dimensions: defaultDimensions,
MetricName: aws.String("MillisBehindLatest"), MetricName: aws.String("MillisBehindLatest"),
Unit: aws.String("Milliseconds"), Unit: aws.String("Milliseconds"),
@ -114,7 +121,7 @@ func (cw *CloudWatchMonitoringService) flush() error {
Minimum: minFloat64(metric.behindLatestMillis), Minimum: minFloat64(metric.behindLatestMillis),
}, },
}, },
&cloudwatch.MetricDatum{ {
Dimensions: defaultDimensions, Dimensions: defaultDimensions,
MetricName: aws.String("KinesisDataFetcher.getRecords.Time"), MetricName: aws.String("KinesisDataFetcher.getRecords.Time"),
Unit: aws.String("Milliseconds"), Unit: aws.String("Milliseconds"),
@ -126,7 +133,7 @@ func (cw *CloudWatchMonitoringService) flush() error {
Minimum: minFloat64(metric.getRecordsTime), Minimum: minFloat64(metric.getRecordsTime),
}, },
}, },
&cloudwatch.MetricDatum{ {
Dimensions: defaultDimensions, Dimensions: defaultDimensions,
MetricName: aws.String("RecordProcessor.processRecords.Time"), MetricName: aws.String("RecordProcessor.processRecords.Time"),
Unit: aws.String("Milliseconds"), Unit: aws.String("Milliseconds"),
@ -138,14 +145,14 @@ func (cw *CloudWatchMonitoringService) flush() error {
Minimum: minFloat64(metric.processRecordsTime), Minimum: minFloat64(metric.processRecordsTime),
}, },
}, },
&cloudwatch.MetricDatum{ {
Dimensions: leaseDimensions, Dimensions: leaseDimensions,
MetricName: aws.String("RenewLease.Success"), MetricName: aws.String("RenewLease.Success"),
Unit: aws.String("Count"), Unit: aws.String("Count"),
Timestamp: &metricTimestamp, Timestamp: &metricTimestamp,
Value: aws.Float64(float64(metric.leaseRenewals)), Value: aws.Float64(float64(metric.leaseRenewals)),
}, },
&cloudwatch.MetricDatum{ {
Dimensions: leaseDimensions, Dimensions: leaseDimensions,
MetricName: aws.String("CurrentLeases"), MetricName: aws.String("CurrentLeases"),
Unit: aws.String("Count"), Unit: aws.String("Count"),
@ -161,7 +168,10 @@ func (cw *CloudWatchMonitoringService) flush() error {
metric.leaseRenewals = 0 metric.leaseRenewals = 0
metric.getRecordsTime = []float64{} metric.getRecordsTime = []float64{}
metric.processRecordsTime = []float64{} metric.processRecordsTime = []float64{}
} else {
log.Errorf("Error in publishing cloudwatch metrics. Error: %+v", err)
} }
metric.Unlock() metric.Unlock()
return err return err
} }

View file

@ -7,6 +7,7 @@ import (
// MonitoringConfiguration allows you to configure how record processing metrics are exposed // MonitoringConfiguration allows you to configure how record processing metrics are exposed
type MonitoringConfiguration struct { type MonitoringConfiguration struct {
MonitoringService string // Type of monitoring to expose. Supported types are "prometheus" MonitoringService string // Type of monitoring to expose. Supported types are "prometheus"
Region string
Prometheus PrometheusMonitoringService Prometheus PrometheusMonitoringService
CloudWatch CloudWatchMonitoringService CloudWatch CloudWatchMonitoringService
service MonitoringService service MonitoringService
@ -22,6 +23,7 @@ type MonitoringService interface {
LeaseRenewed(string) LeaseRenewed(string)
RecordGetRecordsTime(string, float64) RecordGetRecordsTime(string, float64)
RecordProcessRecordsTime(string, float64) RecordProcessRecordsTime(string, float64)
Flush() error
} }
func (m *MonitoringConfiguration) Init(nameSpace, streamName string, workerID string) error { func (m *MonitoringConfiguration) Init(nameSpace, streamName string, workerID string) error {
@ -35,10 +37,13 @@ func (m *MonitoringConfiguration) Init(nameSpace, streamName string, workerID st
m.Prometheus.Namespace = nameSpace m.Prometheus.Namespace = nameSpace
m.Prometheus.KinesisStream = streamName m.Prometheus.KinesisStream = streamName
m.Prometheus.WorkerID = workerID m.Prometheus.WorkerID = workerID
m.Prometheus.Region = m.Region
m.service = &m.Prometheus m.service = &m.Prometheus
case "cloudwatch": case "cloudwatch":
m.CloudWatch.Namespace = nameSpace
m.CloudWatch.KinesisStream = streamName m.CloudWatch.KinesisStream = streamName
m.CloudWatch.WorkerID = workerID m.CloudWatch.WorkerID = workerID
m.CloudWatch.Region = m.Region
m.service = &m.CloudWatch m.service = &m.CloudWatch
default: default:
return fmt.Errorf("Invalid monitoring service type %s", m.MonitoringService) return fmt.Errorf("Invalid monitoring service type %s", m.MonitoringService)
@ -64,3 +69,4 @@ func (n *noopMonitoringService) LeaseLost(shard string)
func (n *noopMonitoringService) LeaseRenewed(shard string) {} func (n *noopMonitoringService) LeaseRenewed(shard string) {}
func (n *noopMonitoringService) RecordGetRecordsTime(shard string, time float64) {} func (n *noopMonitoringService) RecordGetRecordsTime(shard string, time float64) {}
func (n *noopMonitoringService) RecordProcessRecordsTime(shard string, time float64) {} func (n *noopMonitoringService) RecordProcessRecordsTime(shard string, time float64) {}
func (n *noopMonitoringService) Flush() error { return nil }

View file

@ -14,6 +14,7 @@ type PrometheusMonitoringService struct {
Namespace string Namespace string
KinesisStream string KinesisStream string
WorkerID string WorkerID string
Region string
processedRecords *prometheus.CounterVec processedRecords *prometheus.CounterVec
processedBytes *prometheus.CounterVec processedBytes *prometheus.CounterVec
behindLatestMillis *prometheus.GaugeVec behindLatestMillis *prometheus.GaugeVec
@ -111,3 +112,5 @@ func (p *PrometheusMonitoringService) RecordGetRecordsTime(shard string, time fl
func (p *PrometheusMonitoringService) RecordProcessRecordsTime(shard string, time float64) { func (p *PrometheusMonitoringService) RecordProcessRecordsTime(shard string, time float64) {
p.processRecordsTime.With(prometheus.Labels{"shard": shard, "kinesisStream": p.KinesisStream}).Observe(time) p.processRecordsTime.With(prometheus.Labels{"shard": shard, "kinesisStream": p.KinesisStream}).Observe(time)
} }
func (p *PrometheusMonitoringService) Flush() error { return nil }

View file

@ -209,6 +209,8 @@ func (sc *ShardConsumer) getRecords(shard *shardStatus) error {
case <-*sc.stop: case <-*sc.stop:
shutdownInput := &kcl.ShutdownInput{ShutdownReason: kcl.REQUESTED, Checkpointer: recordCheckpointer} shutdownInput := &kcl.ShutdownInput{ShutdownReason: kcl.REQUESTED, Checkpointer: recordCheckpointer}
sc.recordProcessor.Shutdown(shutdownInput) sc.recordProcessor.Shutdown(shutdownInput)
// flush out the metrics data
sc.mService.Flush()
return nil return nil
case <-time.After(1 * time.Nanosecond): case <-time.After(1 * time.Nanosecond):
} }

View file

@ -1,15 +1,18 @@
package worker package worker
import ( import (
"net/http"
"os" "os"
"testing" "testing"
"time" "time"
"github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws"
"github.com/prometheus/common/expfmt"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
cfg "clientlibrary/config" cfg "clientlibrary/config"
kc "clientlibrary/interfaces" kc "clientlibrary/interfaces"
"clientlibrary/metrics"
"clientlibrary/utils" "clientlibrary/utils"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
) )
@ -21,6 +24,7 @@ const (
) )
const specstr = `{"name":"kube-qQyhk","networking":{"containerNetworkCidr":"10.2.0.0/16"},"orgName":"BVT-Org-cLQch","projectName":"project-tDSJd","serviceLevel":"DEVELOPER","size":{"count":1},"version":"1.8.1-4"}` const specstr = `{"name":"kube-qQyhk","networking":{"containerNetworkCidr":"10.2.0.0/16"},"orgName":"BVT-Org-cLQch","projectName":"project-tDSJd","serviceLevel":"DEVELOPER","size":{"count":1},"version":"1.8.1-4"}`
const metricsSystem = "cloudwatch"
func TestWorker(t *testing.T) { func TestWorker(t *testing.T) {
os.Setenv("AWS_ACCESS_KEY_ID", "your aws access key id") os.Setenv("AWS_ACCESS_KEY_ID", "your aws access key id")
@ -40,7 +44,10 @@ func TestWorker(t *testing.T) {
assert.Equal(t, regionName, kclConfig.RegionName) assert.Equal(t, regionName, kclConfig.RegionName)
assert.Equal(t, streamName, kclConfig.StreamName) assert.Equal(t, streamName, kclConfig.StreamName)
worker := NewWorker(recordProcessorFactory(t), kclConfig, nil) // configure cloudwatch as metrics system
metricsConfig := getMetricsConfig(metricsSystem)
worker := NewWorker(recordProcessorFactory(t), kclConfig, metricsConfig)
assert.Equal(t, regionName, worker.regionName) assert.Equal(t, regionName, worker.regionName)
assert.Equal(t, streamName, worker.streamName) assert.Equal(t, streamName, worker.streamName)
@ -56,10 +63,53 @@ func TestWorker(t *testing.T) {
} }
} }
// wait a few seconds before shutdown processing
time.Sleep(10 * time.Second) time.Sleep(10 * time.Second)
if metricsConfig != nil && metricsConfig.MonitoringService == "prometheus" {
res, err := http.Get("http://localhost:8080/metrics")
if err != nil {
t.Fatalf("Error scraping Prometheus endpoint %s", err)
}
var parser expfmt.TextParser
parsed, err := parser.TextToMetricFamilies(res.Body)
res.Body.Close()
if err != nil {
t.Errorf("Error reading monitoring response %s", err)
}
t.Logf("Prometheus: %+v", parsed)
}
worker.Shutdown() worker.Shutdown()
} }
// configure different metrics system
func getMetricsConfig(service string) *metrics.MonitoringConfiguration {
if service == "cloudwatch" {
return &metrics.MonitoringConfiguration{
MonitoringService: "cloudwatch",
Region: regionName,
CloudWatch: metrics.CloudWatchMonitoringService{
ResolutionSec: 1,
},
}
}
if service == "prometheus" {
return &metrics.MonitoringConfiguration{
MonitoringService: "prometheus",
Region: regionName,
Prometheus: metrics.PrometheusMonitoringService{
ListenAddress: ":8080",
},
}
}
return nil
}
// Record processor factory is used to create RecordProcessor // Record processor factory is used to create RecordProcessor
func recordProcessorFactory(t *testing.T) kc.IRecordProcessorFactory { func recordProcessorFactory(t *testing.T) kc.IRecordProcessorFactory {
return &dumpRecordProcessorFactory{t: t} return &dumpRecordProcessorFactory{t: t}