Simplify example (no Redshift cluster required)

Harlow Ward 2015-05-03 16:00:27 -07:00
parent cd71fd41bc
commit 82d10ab78d

README.md (119 changed lines)
@@ -27,12 +27,13 @@ Install the library:
 $ go get github.com/harlow/kinesis-connectors

-### Example Redshift Manifest Pipeline
+### Example S3 Pipeline

-The Redshift Manifest Pipeline works in several steps:
+The S3 Pipeline performs the following steps:

-1. Pull records from Kinesis and buffer them until the desired threshold is reached. The S3 Manifest Emitter will then upload the buffered records to an S3 bucket, set a checkpoint in Redis, and put the file path onto the manifest stream.
-2. Pull S3 path records from Kinesis and batch them into a manifest file. Upload the manifest to S3 and issue the COPY command to Redshift.
+1. Pull records from Kinesis and buffer them until the desired threshold is reached.
+2. Upload the records to an S3 bucket.
+3. Set the current shard checkpoint in Redis.

 The config vars are loaded with [gcfg][3].
@@ -41,6 +42,8 @@ package main
 import (
     "fmt"
+    "log"
+    "net/http"
     "os"

     "code.google.com/p/gcfg"
@@ -52,28 +55,44 @@ type Config struct {
     Pipeline struct {
         Name string
     }
-    Redshift struct {
-        CopyMandatory bool
-        DataTable     string
-        FileTable     string
-        Format        string
-    }
     Kinesis struct {
-        InputBufferSize  int
-        InputShardCount  int
-        InputStream      string
-        OutputBufferSize int
-        OutputShardCount int
-        OutputStream     string
+        BufferSize int
+        ShardCount int
+        StreamName string
     }
     S3 struct {
         BucketName string
     }
 }
+
+func newS3Pipeline(cfg Config) *connector.Pipeline {
+    f := &connector.AllPassFilter{}
+    b := &connector.RecordBuffer{
+        NumRecordsToBuffer: cfg.Kinesis.BufferSize,
+    }
+    t := &connector.StringToStringTransformer{}
+    c := &connector.RedisCheckpoint{
+        AppName:    cfg.Pipeline.Name,
+        StreamName: cfg.Kinesis.StreamName,
+    }
+    e := &connector.S3ManifestEmitter{
+        S3Bucket: cfg.S3.BucketName,
+    }
+    return &connector.Pipeline{
+        Buffer:      b,
+        Checkpoint:  c,
+        Emitter:     e,
+        Filter:      f,
+        Logger:      log.New(os.Stdout, "KINESIS: ", log.Ldate|log.Ltime|log.Lshortfile),
+        StreamName:  cfg.Kinesis.StreamName,
+        Transformer: t,
+    }
+}

 func main() {
+    // Load config vars
     var cfg Config
-    err := gcfg.ReadFileInto(&cfg, "config.cfg")
+    err := gcfg.ReadFileInto(&cfg, "pipeline.cfg")

     // Set up kinesis client
     accessKey := os.Getenv("AWS_ACCESS_KEY")
@@ -81,74 +100,22 @@ func main() {
     ksis := kinesis.New(accessKey, secretKey, kinesis.Region{})

     // Create and wait for streams
-    connector.CreateStream(ksis, cfg.Kinesis.InputStream, cfg.Kinesis.InputShardCount)
-    connector.CreateStream(ksis, cfg.Kinesis.OutputStream, cfg.Kinesis.OutputShardCount)
+    connector.CreateStream(ksis, cfg.Kinesis.StreamName, cfg.Kinesis.ShardCount)

-    // Process mobile event stream
+    // Fetch stream info
     args := kinesis.NewArgs()
-    args.Add("StreamName", cfg.Kinesis.InputStream)
+    args.Add("StreamName", cfg.Kinesis.StreamName)
     streamInfo, err := ksis.DescribeStream(args)
     if err != nil {
-        fmt.Printf("Unable to connect to %s stream. Aborting.", cfg.Kinesis.OutputStream)
+        fmt.Printf("Unable to connect to %s stream. Aborting.", cfg.Kinesis.StreamName)
         return
     }

+    // Process kinesis shards
     for _, shard := range streamInfo.StreamDescription.Shards {
-        fmt.Printf("Processing %s on %s\n", shard.ShardId, cfg.Kinesis.InputStream)
+        fmt.Printf("Processing %s on %s\n", shard.ShardId, cfg.Kinesis.StreamName)
-        f := connector.AllPassFilter{}
+        p := newS3Pipeline(cfg)
-        b := connector.RecordBuffer{NumRecordsToBuffer: cfg.Kinesis.InputBufferSize}
-        t := connector.StringToStringTransformer{}
-        c := connector.RedisCheckpoint{AppName: cfg.Pipeline.Name, StreamName: cfg.Kinesis.InputStream}
-        e := connector.S3ManifestEmitter{
-            OutputStream: cfg.Kinesis.OutputStream,
-            S3Bucket:     cfg.S3.BucketName,
-            Ksis:         ksis,
-        }
-        p := &connector.Pipeline{
-            Buffer:      &b,
-            Checkpoint:  &c,
-            Emitter:     &e,
-            Filter:      &f,
-            Logger:      log.New(os.Stdout, "KINESIS: ", log.Ldate|log.Ltime|log.Lshortfile),
-            StreamName:  cfg.Kinesis.InputStream,
-            Transformer: &t,
-        }
-        go p.ProcessShard(ksis, shard.ShardId)
-    }
-
-    // Process manifest stream
-    args = kinesis.NewArgs()
-    args.Add("StreamName", cfg.Kinesis.OutputStream)
-    streamInfo, err = ksis.DescribeStream(args)
-    if err != nil {
-        fmt.Printf("Unable to connect to %s stream. Aborting.", cfg.Kinesis.OutputStream)
-        return
-    }
-
-    for _, shard := range streamInfo.StreamDescription.Shards {
-        fmt.Printf("Processing %s on %s\n", shard.ShardId, cfg.Kinesis.OutputStream)
-        f := connector.AllPassFilter{}
-        b := connector.RecordBuffer{NumRecordsToBuffer: cfg.Kinesis.OutputBufferSize}
-        t := connector.StringToStringTransformer{}
-        c := connector.RedisCheckpoint{AppName: cfg.Pipeline.Name, StreamName: cfg.Kinesis.OutputStream}
-        e := connector.RedshiftManifestEmitter{
-            CopyMandatory: cfg.Redshift.CopyMandatory,
-            DataTable:     cfg.Redshift.DataTable,
-            FileTable:     cfg.Redshift.FileTable,
-            Format:        cfg.Redshift.Format,
-            S3Bucket:      cfg.S3.BucketName,
-        }
-        p := &connector.Pipeline{
-            Buffer:      &b,
-            Checkpoint:  &c,
-            Emitter:     &e,
-            Filter:      &f,
-            Logger:      log.New(os.Stdout, "KINESIS: ", log.Ldate|log.Ltime|log.Lshortfile),
-            StreamName:  cfg.Kinesis.OutputStream,
-            Transformer: &t,
-        }
         go p.ProcessShard(ksis, shard.ShardId)
     }
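
For reference, the simplified example now reads its settings from `pipeline.cfg` via [gcfg][3], which parses git-config style INI files. A minimal sketch of what that file might look like for the `Config` struct above (gcfg matches section and key names to struct fields case-insensitively; the names and values here are illustrative, not part of the commit):

```ini
; hypothetical pipeline.cfg matching the Config struct in the example
[pipeline]
name = s3Pipeline

[kinesis]
bufferSize = 100
shardCount = 2
streamName = exampleStream

[s3]
bucketName = example-bucket
```

With a file like this in place, `gcfg.ReadFileInto(&cfg, "pipeline.cfg")` populates the struct before the Kinesis client is created; the example also expects AWS credentials in the `AWS_ACCESS_KEY` environment variable and a matching secret-key variable read outside the visible hunks.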