KCL: Enable Metrics
This change enables metrics reporting and fixes a few bugs in metrics reporting. The current metrics reporting is quite limited; more metrics will be added in the next CR. Tested with both Prometheus and CloudWatch. Jira CNA-702 Change-Id: I678b3f8a372d83f7b8adc419133c14cd10884f61
This commit is contained in:
parent
9d1993547f
commit
2fea884212
5 changed files with 104 additions and 33 deletions
|
|
@ -15,8 +15,8 @@ type CloudWatchMonitoringService struct {
|
|||
Namespace string
|
||||
KinesisStream string
|
||||
WorkerID string
|
||||
// What granularity we should send metrics to CW at. Note setting this to 1 will cost quite a bit of money
|
||||
// At the time of writing (March 2018) about US$200 per month
|
||||
Region string
|
||||
// how frequently to send data to cloudwatch
|
||||
ResolutionSec int
|
||||
svc cloudwatchiface.CloudWatchAPI
|
||||
shardMetrics map[string]*cloudWatchMetrics
|
||||
|
|
@ -34,75 +34,82 @@ type cloudWatchMetrics struct {
|
|||
}
|
||||
|
||||
func (cw *CloudWatchMonitoringService) Init() error {
|
||||
// default to 1 min resolution
|
||||
if cw.ResolutionSec == 0 {
|
||||
cw.ResolutionSec = 60
|
||||
}
|
||||
|
||||
session, err := session.NewSessionWithOptions(
|
||||
session.Options{
|
||||
SharedConfigState: session.SharedConfigEnable,
|
||||
},
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
cw.svc = cloudwatch.New(session)
|
||||
s := session.New(&aws.Config{Region: aws.String(cw.Region)})
|
||||
cw.svc = cloudwatch.New(s)
|
||||
cw.shardMetrics = make(map[string]*cloudWatchMetrics)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Start daemon to flush metrics periodically
|
||||
func (cw *CloudWatchMonitoringService) flushDaemon() {
|
||||
previousFlushTime := time.Now()
|
||||
resolutionDuration := time.Duration(cw.ResolutionSec) * time.Second
|
||||
for {
|
||||
time.Sleep(resolutionDuration - time.Now().Sub(previousFlushTime))
|
||||
err := cw.flush()
|
||||
err := cw.Flush()
|
||||
if err != nil {
|
||||
log.Errorln("Error sending metrics to CloudWatch", err)
|
||||
log.Errorf("Error sending metrics to CloudWatch. %+v", err)
|
||||
}
|
||||
previousFlushTime = time.Now()
|
||||
}
|
||||
}
|
||||
|
||||
func (cw *CloudWatchMonitoringService) flush() error {
|
||||
func (cw *CloudWatchMonitoringService) Flush() error {
|
||||
// publish per shard metrics
|
||||
for shard, metric := range cw.shardMetrics {
|
||||
metric.Lock()
|
||||
defaultDimensions := []*cloudwatch.Dimension{
|
||||
&cloudwatch.Dimension{
|
||||
Name: aws.String("shard"),
|
||||
{
|
||||
Name: aws.String("Shard"),
|
||||
Value: &shard,
|
||||
},
|
||||
&cloudwatch.Dimension{
|
||||
{
|
||||
Name: aws.String("KinesisStreamName"),
|
||||
Value: &cw.KinesisStream,
|
||||
},
|
||||
}
|
||||
leaseDimensions := make([]*cloudwatch.Dimension, len(defaultDimensions))
|
||||
copy(defaultDimensions, leaseDimensions)
|
||||
leaseDimensions = append(leaseDimensions, &cloudwatch.Dimension{
|
||||
Name: aws.String("WorkerID"),
|
||||
Value: &cw.WorkerID,
|
||||
})
|
||||
|
||||
leaseDimensions := []*cloudwatch.Dimension{
|
||||
{
|
||||
Name: aws.String("Shard"),
|
||||
Value: &shard,
|
||||
},
|
||||
{
|
||||
Name: aws.String("KinesisStreamName"),
|
||||
Value: &cw.KinesisStream,
|
||||
},
|
||||
{
|
||||
Name: aws.String("WorkerID"),
|
||||
Value: &cw.WorkerID,
|
||||
},
|
||||
}
|
||||
metricTimestamp := time.Now()
|
||||
|
||||
// Publish metrics data to cloud watch
|
||||
_, err := cw.svc.PutMetricData(&cloudwatch.PutMetricDataInput{
|
||||
Namespace: aws.String(cw.Namespace),
|
||||
MetricData: []*cloudwatch.MetricDatum{
|
||||
&cloudwatch.MetricDatum{
|
||||
{
|
||||
Dimensions: defaultDimensions,
|
||||
MetricName: aws.String("RecordsProcessed"),
|
||||
Unit: aws.String("Count"),
|
||||
Timestamp: &metricTimestamp,
|
||||
Value: aws.Float64(float64(metric.processedRecords)),
|
||||
},
|
||||
&cloudwatch.MetricDatum{
|
||||
{
|
||||
Dimensions: defaultDimensions,
|
||||
MetricName: aws.String("DataBytesProcessed"),
|
||||
Unit: aws.String("Byte"),
|
||||
Unit: aws.String("Bytes"),
|
||||
Timestamp: &metricTimestamp,
|
||||
Value: aws.Float64(float64(metric.processedBytes)),
|
||||
},
|
||||
&cloudwatch.MetricDatum{
|
||||
{
|
||||
Dimensions: defaultDimensions,
|
||||
MetricName: aws.String("MillisBehindLatest"),
|
||||
Unit: aws.String("Milliseconds"),
|
||||
|
|
@ -114,7 +121,7 @@ func (cw *CloudWatchMonitoringService) flush() error {
|
|||
Minimum: minFloat64(metric.behindLatestMillis),
|
||||
},
|
||||
},
|
||||
&cloudwatch.MetricDatum{
|
||||
{
|
||||
Dimensions: defaultDimensions,
|
||||
MetricName: aws.String("KinesisDataFetcher.getRecords.Time"),
|
||||
Unit: aws.String("Milliseconds"),
|
||||
|
|
@ -126,7 +133,7 @@ func (cw *CloudWatchMonitoringService) flush() error {
|
|||
Minimum: minFloat64(metric.getRecordsTime),
|
||||
},
|
||||
},
|
||||
&cloudwatch.MetricDatum{
|
||||
{
|
||||
Dimensions: defaultDimensions,
|
||||
MetricName: aws.String("RecordProcessor.processRecords.Time"),
|
||||
Unit: aws.String("Milliseconds"),
|
||||
|
|
@ -138,14 +145,14 @@ func (cw *CloudWatchMonitoringService) flush() error {
|
|||
Minimum: minFloat64(metric.processRecordsTime),
|
||||
},
|
||||
},
|
||||
&cloudwatch.MetricDatum{
|
||||
{
|
||||
Dimensions: leaseDimensions,
|
||||
MetricName: aws.String("RenewLease.Success"),
|
||||
Unit: aws.String("Count"),
|
||||
Timestamp: &metricTimestamp,
|
||||
Value: aws.Float64(float64(metric.leaseRenewals)),
|
||||
},
|
||||
&cloudwatch.MetricDatum{
|
||||
{
|
||||
Dimensions: leaseDimensions,
|
||||
MetricName: aws.String("CurrentLeases"),
|
||||
Unit: aws.String("Count"),
|
||||
|
|
@ -161,7 +168,10 @@ func (cw *CloudWatchMonitoringService) flush() error {
|
|||
metric.leaseRenewals = 0
|
||||
metric.getRecordsTime = []float64{}
|
||||
metric.processRecordsTime = []float64{}
|
||||
} else {
|
||||
log.Errorf("Error in publishing cloudwatch metrics. Error: %+v", err)
|
||||
}
|
||||
|
||||
metric.Unlock()
|
||||
return err
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import (
|
|||
// MonitoringConfiguration allows you to configure how record processing metrics are exposed
|
||||
type MonitoringConfiguration struct {
|
||||
MonitoringService string // Type of monitoring to expose. Supported types are "prometheus"
|
||||
Region string
|
||||
Prometheus PrometheusMonitoringService
|
||||
CloudWatch CloudWatchMonitoringService
|
||||
service MonitoringService
|
||||
|
|
@ -22,6 +23,7 @@ type MonitoringService interface {
|
|||
LeaseRenewed(string)
|
||||
RecordGetRecordsTime(string, float64)
|
||||
RecordProcessRecordsTime(string, float64)
|
||||
Flush() error
|
||||
}
|
||||
|
||||
func (m *MonitoringConfiguration) Init(nameSpace, streamName string, workerID string) error {
|
||||
|
|
@ -35,10 +37,13 @@ func (m *MonitoringConfiguration) Init(nameSpace, streamName string, workerID st
|
|||
m.Prometheus.Namespace = nameSpace
|
||||
m.Prometheus.KinesisStream = streamName
|
||||
m.Prometheus.WorkerID = workerID
|
||||
m.Prometheus.Region = m.Region
|
||||
m.service = &m.Prometheus
|
||||
case "cloudwatch":
|
||||
m.CloudWatch.Namespace = nameSpace
|
||||
m.CloudWatch.KinesisStream = streamName
|
||||
m.CloudWatch.WorkerID = workerID
|
||||
m.CloudWatch.Region = m.Region
|
||||
m.service = &m.CloudWatch
|
||||
default:
|
||||
return fmt.Errorf("Invalid monitoring service type %s", m.MonitoringService)
|
||||
|
|
@ -64,3 +69,4 @@ func (n *noopMonitoringService) LeaseLost(shard string)
|
|||
func (n *noopMonitoringService) LeaseRenewed(shard string) {}
|
||||
func (n *noopMonitoringService) RecordGetRecordsTime(shard string, time float64) {}
|
||||
func (n *noopMonitoringService) RecordProcessRecordsTime(shard string, time float64) {}
|
||||
func (n *noopMonitoringService) Flush() error { return nil }
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ type PrometheusMonitoringService struct {
|
|||
Namespace string
|
||||
KinesisStream string
|
||||
WorkerID string
|
||||
Region string
|
||||
processedRecords *prometheus.CounterVec
|
||||
processedBytes *prometheus.CounterVec
|
||||
behindLatestMillis *prometheus.GaugeVec
|
||||
|
|
@ -111,3 +112,5 @@ func (p *PrometheusMonitoringService) RecordGetRecordsTime(shard string, time fl
|
|||
func (p *PrometheusMonitoringService) RecordProcessRecordsTime(shard string, time float64) {
|
||||
p.processRecordsTime.With(prometheus.Labels{"shard": shard, "kinesisStream": p.KinesisStream}).Observe(time)
|
||||
}
|
||||
|
||||
func (p *PrometheusMonitoringService) Flush() error { return nil }
|
||||
|
|
|
|||
|
|
@ -209,6 +209,8 @@ func (sc *ShardConsumer) getRecords(shard *shardStatus) error {
|
|||
case <-*sc.stop:
|
||||
shutdownInput := &kcl.ShutdownInput{ShutdownReason: kcl.REQUESTED, Checkpointer: recordCheckpointer}
|
||||
sc.recordProcessor.Shutdown(shutdownInput)
|
||||
// flush out the metrics data
|
||||
sc.mService.Flush()
|
||||
return nil
|
||||
case <-time.After(1 * time.Nanosecond):
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,15 +1,18 @@
|
|||
package worker
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/aws/aws-sdk-go/aws"
|
||||
"github.com/prometheus/common/expfmt"
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
cfg "clientlibrary/config"
|
||||
kc "clientlibrary/interfaces"
|
||||
"clientlibrary/metrics"
|
||||
"clientlibrary/utils"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
|
@ -21,6 +24,7 @@ const (
|
|||
)
|
||||
|
||||
const specstr = `{"name":"kube-qQyhk","networking":{"containerNetworkCidr":"10.2.0.0/16"},"orgName":"BVT-Org-cLQch","projectName":"project-tDSJd","serviceLevel":"DEVELOPER","size":{"count":1},"version":"1.8.1-4"}`
|
||||
const metricsSystem = "cloudwatch"
|
||||
|
||||
func TestWorker(t *testing.T) {
|
||||
os.Setenv("AWS_ACCESS_KEY_ID", "your aws access key id")
|
||||
|
|
@ -40,7 +44,10 @@ func TestWorker(t *testing.T) {
|
|||
assert.Equal(t, regionName, kclConfig.RegionName)
|
||||
assert.Equal(t, streamName, kclConfig.StreamName)
|
||||
|
||||
worker := NewWorker(recordProcessorFactory(t), kclConfig, nil)
|
||||
// configure cloudwatch as metrics system
|
||||
metricsConfig := getMetricsConfig(metricsSystem)
|
||||
|
||||
worker := NewWorker(recordProcessorFactory(t), kclConfig, metricsConfig)
|
||||
assert.Equal(t, regionName, worker.regionName)
|
||||
assert.Equal(t, streamName, worker.streamName)
|
||||
|
||||
|
|
@ -56,10 +63,53 @@ func TestWorker(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// wait a few seconds before shutdown processing
|
||||
time.Sleep(10 * time.Second)
|
||||
|
||||
if metricsConfig != nil && metricsConfig.MonitoringService == "prometheus" {
|
||||
res, err := http.Get("http://localhost:8080/metrics")
|
||||
if err != nil {
|
||||
t.Fatalf("Error scraping Prometheus endpoint %s", err)
|
||||
}
|
||||
|
||||
var parser expfmt.TextParser
|
||||
parsed, err := parser.TextToMetricFamilies(res.Body)
|
||||
res.Body.Close()
|
||||
if err != nil {
|
||||
t.Errorf("Error reading monitoring response %s", err)
|
||||
}
|
||||
t.Logf("Prometheus: %+v", parsed)
|
||||
|
||||
}
|
||||
|
||||
worker.Shutdown()
|
||||
}
|
||||
|
||||
// configure different metrics system
|
||||
func getMetricsConfig(service string) *metrics.MonitoringConfiguration {
|
||||
if service == "cloudwatch" {
|
||||
return &metrics.MonitoringConfiguration{
|
||||
MonitoringService: "cloudwatch",
|
||||
Region: regionName,
|
||||
CloudWatch: metrics.CloudWatchMonitoringService{
|
||||
ResolutionSec: 1,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
if service == "prometheus" {
|
||||
return &metrics.MonitoringConfiguration{
|
||||
MonitoringService: "prometheus",
|
||||
Region: regionName,
|
||||
Prometheus: metrics.PrometheusMonitoringService{
|
||||
ListenAddress: ":8080",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Record processor factory is used to create RecordProcessor
|
||||
func recordProcessorFactory(t *testing.T) kc.IRecordProcessorFactory {
|
||||
return &dumpRecordProcessorFactory{t: t}
|
||||
|
|
|
|||
Loading…
Reference in a new issue