Golang library for consuming Kinesis stream data
Find a file
Harlow Ward cd71fd41bc Add Logger interface
To allow for different logging endpoints we'll introduce a Logger
interface that will be passed into the pipeline during initialization.

* Add Logger interface
* Use logger interface in pipeline and emitters
2015-05-03 21:01:14 -07:00
.gitignore Add tags to gitignore 2014-12-13 13:25:36 -08:00
all_pass_filter.go Use golint to update Golang styles 2014-12-10 15:38:19 -08:00
buffer.go Add StringToString transformer 2014-11-15 17:07:12 -08:00
checkpoint.go Use golint to update Golang styles 2014-12-10 15:38:19 -08:00
emitter.go Rename Model to Record 2014-11-15 15:54:54 -08:00
filter.go Use golint to update Golang styles 2014-12-10 15:38:19 -08:00
kinesis.go Remove custom config functions 2014-12-20 19:40:25 -08:00
LICENSE Add MIT License 2015-04-10 09:32:30 -07:00
logger.go Add Logger interface 2015-05-03 21:01:14 -07:00
manifest.go Add Redshift Manifest functionality 2014-12-10 21:59:42 -08:00
pipeline.go Add Logger interface 2015-05-03 21:01:14 -07:00
README.md Add Logger interface 2015-05-03 21:01:14 -07:00
record_buffer.go Use golint to update Golang styles 2014-12-10 15:38:19 -08:00
record_buffer_test.go Use golint to update Golang styles 2014-12-10 15:38:19 -08:00
redis_checkpoint.go Use golint to update Golang styles 2014-12-10 15:38:19 -08:00
redis_checkpoint_test.go Base pipeline components 2014-11-14 20:45:34 -08:00
redshift_basic_emitter.go Add Logger interface 2015-05-03 21:01:14 -07:00
redshift_basic_emitter_test.go Add Logger interface 2015-05-03 21:01:14 -07:00
redshift_manifest_emitter.go Add Logger interface 2015-05-03 21:01:14 -07:00
redshift_manifest_emitter_test.go Add Redshift Manifest functionality 2014-12-10 21:59:42 -08:00
s3_emitter.go Add Logger interface 2015-05-03 21:01:14 -07:00
s3_emitter_test.go Add Redshift Manifest functionality 2014-12-10 21:59:42 -08:00
s3_manifest_emitter.go Add Logger interface 2015-05-03 21:01:14 -07:00
string_to_string_transformer.go Use golint to update Golang styles 2014-12-10 15:38:19 -08:00
transformer.go Sort exported fields alphabetically 2014-11-15 17:20:25 -08:00

Golang Kinesis Connectors

Note: This codebase is under active development. Expect breaking changes until the 1.0 version release.

Kinesis connector applications written in Go

Inspired by the Amazon Kinesis Connector Library. These components are used for extracting streaming event data into S3, Redshift, DynamoDB, and more. See the API Docs for package documentation.

Overview

Each Amazon Kinesis connector application is a pipeline that determines how records from an Amazon Kinesis stream will be handled. Records are retrieved from the stream, transformed according to a user-defined data model, buffered for batch processing, and then emitted to the appropriate AWS service.

golang_kinesis_connector

A connector pipeline uses the following interfaces:

  • Pipeline: The pipeline implementation itself.
  • Transformer: Defines the transformation of records from the Amazon Kinesis stream in order to suit the user-defined data model. Includes methods for custom serializer/deserializers.
  • Filter: Defines a method for excluding irrelevant records from the processing.
  • Buffer: Defines a system for batching the set of records to be processed. The application can specify three thresholds: number of records, total byte count, and time. When one of these thresholds is crossed, the buffer is flushed and the data is emitted to the destination.
  • Emitter: Defines a method that makes client calls to other AWS services and persists the records stored in the buffer. The records can also be sent to another Amazon Kinesis stream.

Usage

Install the library:

$ go get github.com/harlow/kinesis-connectors

Example Redshift Manifest Pipeline

The Redshift Manifest Pipeline works in several steps:

  1. Pull records from Kinesis and buffer them until the desired threshold is reached. The S3 Manifest Emitter will then upload the buffered records to an S3 bucket, set a checkpoint in Redis, and put the file path onto the manifest stream.
  2. Pull S3 path records from Kinesis and batch into a Manifest file. Upload the manifest to S3 and issue the COPY command to Redshift.

The config vars are loaded with gcfg.

package main

import (
	"fmt"
	"log"
	"os"

	"code.google.com/p/gcfg"
	"github.com/harlow/kinesis-connectors"
	"github.com/sendgridlabs/go-kinesis"
)

// Config holds the application settings read from config.cfg via gcfg.
// Each embedded struct corresponds to a gcfg section of the same name.
type Config struct {
	// Pipeline names this connector application; the value is passed as
	// AppName to the Redis checkpoint.
	Pipeline struct {
		Name string
	}
	// Redshift carries the options handed to the Redshift manifest emitter.
	Redshift struct {
		CopyMandatory bool
		DataTable     string
		FileTable     string
		Format        string
	}
	// Kinesis describes the input (event) stream and the output (manifest)
	// stream: their names, shard counts, and per-pipeline buffer sizes.
	Kinesis struct {
		InputBufferSize  int
		InputShardCount  int
		InputStream      string
		OutputBufferSize int
		OutputShardCount int
		OutputStream     string
	}
	// S3 names the bucket that receives the buffered records and manifests.
	S3 struct {
		BucketName string
	}
}

// main wires up two connector pipelines:
//
//  1. The input-stream pipeline buffers event records and hands them to the
//     S3 manifest emitter, which writes to S3 and publishes file paths onto
//     the output stream.
//  2. The output-stream pipeline consumes those S3 paths and hands them to
//     the Redshift manifest emitter.
//
// One goroutine is started per shard of each stream; main then blocks forever.
func main() {
	var cfg Config
	// Bug fix: the gcfg error was previously assigned and never checked, so a
	// missing or malformed config file surfaced only as confusing failures later.
	if err := gcfg.ReadFileInto(&cfg, "config.cfg"); err != nil {
		fmt.Printf("Unable to read config.cfg: %v. Aborting.", err)
		return
	}

	// Set up kinesis client with credentials from the environment.
	accessKey := os.Getenv("AWS_ACCESS_KEY")
	secretKey := os.Getenv("AWS_SECRET_KEY")
	ksis := kinesis.New(accessKey, secretKey, kinesis.Region{})

	// Create and wait for streams.
	connector.CreateStream(ksis, cfg.Kinesis.InputStream, cfg.Kinesis.InputShardCount)
	connector.CreateStream(ksis, cfg.Kinesis.OutputStream, cfg.Kinesis.OutputShardCount)

	// Process mobile event stream.
	args := kinesis.NewArgs()
	args.Add("StreamName", cfg.Kinesis.InputStream)
	streamInfo, err := ksis.DescribeStream(args)
	if err != nil {
		// Bug fix: this message previously named the output stream even though
		// the input stream is the one being described here.
		fmt.Printf("Unable to connect to %s stream. Aborting.", cfg.Kinesis.InputStream)
		return
	}

	for _, shard := range streamInfo.StreamDescription.Shards {
		fmt.Printf("Processing %s on %s\n", shard.ShardId, cfg.Kinesis.InputStream)
		f := connector.AllPassFilter{}
		b := connector.RecordBuffer{NumRecordsToBuffer: cfg.Kinesis.InputBufferSize}
		t := connector.StringToStringTransformer{}
		c := connector.RedisCheckpoint{AppName: cfg.Pipeline.Name, StreamName: cfg.Kinesis.InputStream}
		e := connector.S3ManifestEmitter{
			OutputStream: cfg.Kinesis.OutputStream,
			S3Bucket:     cfg.S3.BucketName,
			Ksis:         ksis,
		}
		p := &connector.Pipeline{
			Buffer:      &b,
			Checkpoint:  &c,
			Emitter:     &e,
			Filter:      &f,
			Logger:      log.New(os.Stdout, "KINESIS: ", log.Ldate|log.Ltime|log.Lshortfile),
			StreamName:  cfg.Kinesis.InputStream,
			Transformer: &t,
		}
		// shard.ShardId is evaluated when the go statement runs, so each
		// goroutine receives its own shard ID.
		go p.ProcessShard(ksis, shard.ShardId)
	}

	// Process manifest stream.
	args = kinesis.NewArgs()
	args.Add("StreamName", cfg.Kinesis.OutputStream)
	streamInfo, err = ksis.DescribeStream(args)
	if err != nil {
		fmt.Printf("Unable to connect to %s stream. Aborting.", cfg.Kinesis.OutputStream)
		return
	}

	for _, shard := range streamInfo.StreamDescription.Shards {
		fmt.Printf("Processing %s on %s\n", shard.ShardId, cfg.Kinesis.OutputStream)
		f := connector.AllPassFilter{}
		b := connector.RecordBuffer{NumRecordsToBuffer: cfg.Kinesis.OutputBufferSize}
		t := connector.StringToStringTransformer{}
		c := connector.RedisCheckpoint{AppName: cfg.Pipeline.Name, StreamName: cfg.Kinesis.OutputStream}
		e := connector.RedshiftManifestEmitter{
			CopyMandatory: cfg.Redshift.CopyMandatory,
			DataTable:     cfg.Redshift.DataTable,
			FileTable:     cfg.Redshift.FileTable,
			Format:        cfg.Redshift.Format,
			S3Bucket:      cfg.S3.BucketName,
		}
		p := &connector.Pipeline{
			Buffer:      &b,
			Checkpoint:  &c,
			Emitter:     &e,
			Filter:      &f,
			Logger:      log.New(os.Stdout, "KINESIS: ", log.Ldate|log.Ltime|log.Lshortfile),
			StreamName:  cfg.Kinesis.OutputStream,
			Transformer: &t,
		}
		go p.ProcessShard(ksis, shard.ShardId)
	}

	// Block forever so the per-shard goroutines keep running.
	<-make(chan int)
}