Base pipeline components
* Create base Interfaces for Pipeline
* Add first base implementations for Pipeline
* Add initial test for core functionality
This commit is contained in:
parent 0fe80b708b
commit 06b40e6ed8
19 changed files with 929 additions and 9 deletions
9 .gitignore vendored

@@ -19,5 +19,12 @@ _cgo_export.*
 _testmain.go
 
-*.exe
 *.test
+*.exe
+
+# vim temp files
+.*.swp
+.*.swo
+
+# System files
+.DS_Store
 
212 README.md

@@ -1,14 +1,210 @@
-# Kinesis Connector Application
-
-A Kinesis Connector Application written in Go for extracting streaming event data
-into S3, Redshift, DynamoDB, and more.
-
-__Note:__ _This codebase is a under active development, and is not condisdered
-production ready._
-
-## Installation
-
-Clone the repository.
-
-    $ git clone git@github.com:harlow/go-etl.git
-
# Golang Kinesis Connectors

__Note:__ _This codebase is under active development._

### Kinesis connector applications written in Go

This is a port of the [AWS Kinesis connector libraries][3] from Java to Go for extracting streaming event data
into S3, Redshift, DynamoDB, and more. See the [API Docs][1] for package documentation.

## Overview

Each Amazon Kinesis connector application is a pipeline that determines how records from an Amazon Kinesis stream will be handled. Records are retrieved from the stream, transformed according to a user-defined data model, buffered for batch processing, and then emitted to the appropriate AWS service.



A connector pipeline uses the following interfaces:

* __Pipeline:__ The pipeline implementation itself.
* __Transformer:__ Defines the transformation of records from the Amazon Kinesis stream in order to suit the user-defined data model. Includes methods for custom serializers/deserializers.
* __Filter:__ Defines a method for excluding irrelevant records from the processing (see the sketch after this list).
* __Buffer:__ Defines a system for batching the set of records to be processed. The application can specify three thresholds: number of records, total byte count, and time. When one of these thresholds is crossed, the buffer is flushed and the data is emitted to the destination.
* __Emitter:__ Defines a method that makes client calls to other AWS services and persists the records stored in the buffer. The records can also be sent to another Amazon Kinesis stream.
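For illustration, implementing one of these interfaces is small. The sketch below is hypothetical and not part of this commit (the `filters` package and `SkipEmptyFilter` name are invented for the example); it shows a custom Filter that drops records whose serialized form is empty:

```go
package filters

import "github.com/harlow/kinesis-connectors"

// SkipEmptyFilter is a hypothetical custom Filter (illustration only, not
// part of this library): it drops records whose string form is empty.
type SkipEmptyFilter struct{}

// KeepRecord returns false for records that serialize to an empty string.
func (f *SkipEmptyFilter) KeepRecord(m connector.Model) bool {
    return m.ToString() != ""
}
```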
## Usage

Install the library:

    $ go get github.com/harlow/kinesis-connectors

The library has been broken into several components (buffers, checkpoints, filters, transformers, and emitters). These components can be mixed and matched to generate the desired functionality.

### Example Redshift Pipeline

The Redshift Pipeline will pull records from Kinesis and buffer them until the desired threshold is reached. The Emitter will then upload the buffered records to an S3 bucket, set a checkpoint in Redis, and copy the data to Redshift.
```go
package main

import (
    "fmt"

    "github.com/harlow/kinesis-connectors"
    "github.com/harlow/sample-connectors/transformers"
    "github.com/joho/godotenv"
    "github.com/sendgridlabs/go-kinesis"
)

type Config struct {
    AppName                 string
    NumRecordsToBuffer      int
    KinesisStream           string
    KinesisStreamShardCount int
    TableName               string
    S3Bucket                string
    Format                  string
    Delimiter               string
}

func NewPipeline(cfg Config) *connector.Pipeline {
    b := connector.RecordBuffer{
        NumRecordsToBuffer: cfg.NumRecordsToBuffer,
    }

    c := connector.RedisCheckpoint{
        AppName:    cfg.AppName,
        StreamName: cfg.KinesisStream,
    }

    e := connector.RedshiftEmitter{
        TableName: cfg.TableName,
        S3Bucket:  cfg.S3Bucket,
        Format:    cfg.Format,
        Delimiter: cfg.Delimiter,
    }

    f := connector.AllPassFilter{}

    t := transformers.UserTransformer{}

    return &connector.Pipeline{
        Buffer:      &b,
        Checkpoint:  &c,
        Emitter:     &e,
        Filter:      &f,
        StreamName:  cfg.KinesisStream,
        Transformer: &t,
    }
}

func main() {
    var cfg Config
    godotenv.Load()
    ksis := kinesis.New("", "", kinesis.Region{})

    connector.LoadConfig(&cfg, "redshift_basic_pipeline.properties")
    connector.CreateAndWaitForStreamToBecomeAvailable(ksis, cfg.KinesisStream, cfg.KinesisStreamShardCount)

    args := kinesis.NewArgs()
    args.Add("StreamName", cfg.KinesisStream)
    streamInfo, err := ksis.DescribeStream(args)

    if err != nil {
        fmt.Printf("Unable to connect to %v stream. Aborting.", cfg.KinesisStream)
        return
    }

    for _, shard := range streamInfo.StreamDescription.Shards {
        var p = NewPipeline(cfg)
        go p.ProcessShard(ksis, shard.ShardId)
    }

    select {}
}
```

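The `redshift_basic_pipeline.properties` file loaded above is plain `key=value` lines, parsed by `connector.LoadConfig` (see `utils.go` in this commit): each key, with its first letter upper-cased, must match an exported field of `Config`. The values below are illustrative only, not taken from this commit:

```
appName=redshiftBasicPipeline
numRecordsToBuffer=500
kinesisStream=userStream
kinesisStreamShardCount=2
tableName=users
s3Bucket=my-bucket
format=delimited
delimiter=|
```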
```go
package models

import (
    "strconv"
    "strings"
)

// User implements the connector Model interface.
type User struct {
    ID            int    `json:"userid"`
    Username      string `json:"username"`
    Firstname     string `json:"firstname"`
    Lastname      string `json:"lastname"`
    City          string `json:"city"`
    State         string `json:"state"`
    Email         string `json:"email"`
    Phone         string `json:"phone"`
    Likesports    bool   `json:"likesports"`
    Liketheatre   bool   `json:"liketheatre"`
    Likeconcerts  bool   `json:"likeconcerts"`
    Likejazz      bool   `json:"likejazz"`
    Likeclassical bool   `json:"likeclassical"`
    Likeopera     bool   `json:"likeopera"`
    Likerock      bool   `json:"likerock"`
    Likevegas     bool   `json:"likevegas"`
    Likebroadway  bool   `json:"likebroadway"`
    Likemusicals  bool   `json:"likemusicals"`
}

// ToString renders the user as a pipe-delimited row (the final element
// appends a newline).
func (u User) ToString() string {
    s := []string{
        strconv.Itoa(u.ID),
        u.Username,
        u.Firstname,
        u.Lastname,
        u.City,
        u.State,
        u.Email,
        u.Phone,
        strconv.FormatBool(u.Likesports),
        strconv.FormatBool(u.Liketheatre),
        strconv.FormatBool(u.Likeconcerts),
        strconv.FormatBool(u.Likejazz),
        strconv.FormatBool(u.Likeclassical),
        strconv.FormatBool(u.Likeopera),
        strconv.FormatBool(u.Likerock),
        strconv.FormatBool(u.Likevegas),
        strconv.FormatBool(u.Likebroadway),
        strconv.FormatBool(u.Likemusicals),
        "\n",
    }

    return strings.Join(s, "|")
}
```

```go
package transformers

import (
    "encoding/json"

    "github.com/harlow/kinesis-connectors"
    "github.com/harlow/sample-connectors/models" // assumed path, matching the transformers import above
)

// UserTransformer implements the connector Transformer interface.
type UserTransformer struct{}

// ToModel deserializes a JSON record from the stream into a User model.
func (t *UserTransformer) ToModel(data []byte) connector.Model {
    user := &models.User{}
    json.Unmarshal(data, user)
    return user
}
```

```sql
CREATE TABLE users (
    id INTEGER,
    username VARCHAR(255),
    first_name VARCHAR(255),
    last_name VARCHAR(255),
    city VARCHAR(255),
    state VARCHAR(255),
    email VARCHAR(255),
    phone VARCHAR(255),
    like_sports BOOLEAN,
    like_theatre BOOLEAN,
    like_concerts BOOLEAN,
    like_jazz BOOLEAN,
    like_classical BOOLEAN,
    like_opera BOOLEAN,
    like_rock BOOLEAN,
    like_vegas BOOLEAN,
    like_broadway BOOLEAN,
    like_musicals BOOLEAN,
    PRIMARY KEY(id)
)
DISTSTYLE KEY
DISTKEY(id)
SORTKEY(id)
```

[1]: http://godoc.org/github.com/harlow/kinesis-connectors
[2]: http://aws.amazon.com/kinesis/
[3]: https://github.com/awslabs/amazon-kinesis-connectors
9 all_pass_filter.go Normal file

@@ -0,0 +1,9 @@
package connector

// A basic implementation of the Filter interface that returns true for all records.
type AllPassFilter struct{}

// Returns true for all records.
func (b *AllPassFilter) KeepRecord(m Model) bool {
    return true
}
16 buffer.go Normal file

@@ -0,0 +1,16 @@
package connector

// Buffer defines a buffer used to store records streamed through Kinesis. It is the part of the
// Pipeline utilized by the Pipeline.ProcessShard function. Records are stored in the buffer by
// calling the Add method. The buffer has two limits defined: a total number of records and a
// time limit in seconds. The ShouldFlush() method may indicate that the buffer is full based on
// these limits.
type Buffer interface {
    Add(data Model, sequenceNumber string)
    FirstSequenceNumber() string
    Flush()
    LastSequenceNumber() string
    NumRecordsInBuffer() int
    Records() []Model
    ShouldFlush() bool
}
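The comment above names a time limit as well as a record count, but the `RecordBuffer` implementation later in this commit only enforces the record-count threshold. A time-based check could be layered on top; the sketch below is hypothetical and not part of this commit (the `TimedBuffer` name and fields are invented), and it assumes the buffer tracks when it last flushed:

```go
package connector

import "time"

// TimedBuffer is a hypothetical sketch (not part of this commit): it embeds
// the count-based RecordBuffer and adds a time threshold. For this to be
// complete, Flush would also need to be overridden to reset lastFlush.
type TimedBuffer struct {
    RecordBuffer            // count-based buffer from this commit
    MaxAgeSeconds int       // flush when this much time has passed since the last flush
    lastFlush     time.Time // would be reset to time.Now() on every flush
}

// ShouldFlush reports true when either the record-count threshold or the
// time threshold has been crossed.
func (b *TimedBuffer) ShouldFlush() bool {
    return b.RecordBuffer.ShouldFlush() ||
        time.Since(b.lastFlush) >= time.Duration(b.MaxAgeSeconds)*time.Second
}
```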
10 checkpoint.go Normal file

@@ -0,0 +1,10 @@
package connector

// Checkpoint is used by Pipeline.ProcessShard to track progress through a shard. The library
// passes an object implementing this interface to ProcessShard so that consumers can record
// the last sequence number they processed and resume from it.
type Checkpoint interface {
    CheckpointExists(shardID string) bool
    SequenceNumber() string
    SetCheckpoint(shardID string, sequenceNumber string)
}
11 emitter.go Normal file

@@ -0,0 +1,11 @@
package connector

// Emitter takes a full buffer and processes the stored records. The Emitter is the member of
// the Pipeline that "emits" the objects that have been deserialized by the Transformer. The
// Emit() method is invoked when the buffer is full, possibly to persist the records or to send
// them to another Kinesis stream. Implementations may choose to fail the entire set of records
// in the buffer or to fail records individually.
type Emitter interface {
    Emit(buffer Buffer)
}
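This commit ships S3 and Redshift emitters (below). For local debugging, a minimal emitter is easy to write; the `StdoutEmitter` below is a hypothetical sketch, not part of this commit:

```go
package connector

import "fmt"

// StdoutEmitter is a hypothetical sketch (not part of this commit) that
// prints each buffered record instead of persisting it; handy for debugging.
type StdoutEmitter struct{}

// Emit writes the string form of every buffered record to stdout.
func (e StdoutEmitter) Emit(buffer Buffer) {
    for _, r := range buffer.Records() {
        fmt.Print(r.ToString())
    }
}
```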
7 filter.go Normal file

@@ -0,0 +1,7 @@
package connector

// The Filter is consulted before records reach the Buffer. The pipeline uses the result of
// calling the KeepRecord() method to decide whether to store a record or discard it.
type Filter interface {
    KeepRecord(m Model) bool
}
6 model.go Normal file

@@ -0,0 +1,6 @@
package connector

// Model is used to map the attributes of the data being sent through the Kinesis stream.
type Model interface {
    ToString() string
}
86 pipeline.go Normal file

@@ -0,0 +1,86 @@
package connector

import (
    "fmt"
    "time"

    "github.com/sendgridlabs/go-kinesis"
)

// Pipeline is used by the main application to configure instances of the user's implemented
// pipeline. Each field should be set to a configured implementation of the corresponding
// interface. Records come in as a []byte and are transformed to a Model; they are buffered in
// Model form, and when the buffer is full the Models are passed to the Emitter.
type Pipeline struct {
    Buffer      Buffer
    Checkpoint  Checkpoint
    Emitter     Emitter
    Filter      Filter
    StreamName  string
    Transformer Transformer
}

// ProcessShard reads records from a single shard: it resumes from the checkpointed sequence
// number when one exists, transforms and filters each record, and flushes the buffer through
// the Emitter whenever the buffer reports that it is full.
func (p Pipeline) ProcessShard(ksis *kinesis.Kinesis, shardID string) {
    args := kinesis.NewArgs()
    args.Add("ShardId", shardID)
    args.Add("StreamName", p.StreamName)

    if p.Checkpoint.CheckpointExists(shardID) {
        args.Add("ShardIteratorType", "AFTER_SEQUENCE_NUMBER")
        args.Add("StartingSequenceNumber", p.Checkpoint.SequenceNumber())
    } else {
        args.Add("ShardIteratorType", "TRIM_HORIZON")
    }

    shardInfo, err := ksis.GetShardIterator(args)

    if err != nil {
        fmt.Printf("Error fetching shard iterator: %v", err)
        return
    }

    shardIterator := shardInfo.ShardIterator

    for {
        args = kinesis.NewArgs()
        args.Add("ShardIterator", shardIterator)
        recordSet, err := ksis.GetRecords(args)

        if err != nil {
            fmt.Printf("GetRecords ERROR: %v\n", err)
            continue
        }

        if len(recordSet.Records) > 0 {
            for _, v := range recordSet.Records {
                data, err := v.GetData()

                if err != nil {
                    fmt.Printf("GetData ERROR: %v\n", err)
                    continue
                }

                var m = p.Transformer.ToModel(data)

                if p.Filter.KeepRecord(m) {
                    p.Buffer.Add(m, v.SequenceNumber)
                }
            }
        } else if recordSet.NextShardIterator == "" || shardIterator == recordSet.NextShardIterator {
            // The shard is closed or the iterator has stopped advancing.
            fmt.Printf("NextShardIterator ERROR: %v\n", err)
            break
        } else {
            fmt.Printf("Sleeping: %v\n", shardID)
            time.Sleep(5 * time.Second)
        }

        if p.Buffer.ShouldFlush() {
            fmt.Printf("Emitting to Shard: %v\n", shardID)
            p.Emitter.Emit(p.Buffer)
            p.Checkpoint.SetCheckpoint(shardID, p.Buffer.LastSequenceNumber())
            p.Buffer.Flush()
        }

        shardIterator = recordSet.NextShardIterator
    }
}
67 record_buffer.go Normal file

@@ -0,0 +1,67 @@
package connector

// RecordBuffer is a basic implementation of the Buffer interface. It is a wrapper around a
// buffer of records that are periodically flushed. Records are deduplicated by sequence number
// as they are added.
type RecordBuffer struct {
    NumRecordsToBuffer  int
    firstSequenceNumber string
    lastSequenceNumber  string
    recordsInBuffer     []Model
    sequencesInBuffer   []string
}

// Adds a message to the buffer.
func (b *RecordBuffer) Add(record Model, sequenceNumber string) {
    if len(b.sequencesInBuffer) == 0 {
        b.firstSequenceNumber = sequenceNumber
    }

    b.lastSequenceNumber = sequenceNumber

    if !b.sequenceExists(sequenceNumber) {
        b.recordsInBuffer = append(b.recordsInBuffer, record)
        b.sequencesInBuffer = append(b.sequencesInBuffer, sequenceNumber)
    }
}

// Returns the records in the buffer.
func (b *RecordBuffer) Records() []Model {
    return b.recordsInBuffer
}

// Returns the number of messages in the buffer.
func (b RecordBuffer) NumRecordsInBuffer() int {
    return len(b.sequencesInBuffer)
}

// Flushes the content in the buffer and resets the sequence counter.
func (b *RecordBuffer) Flush() {
    b.recordsInBuffer = b.recordsInBuffer[:0]
    b.sequencesInBuffer = b.sequencesInBuffer[:0]
}

// Checks if the sequence already exists in the buffer.
func (b *RecordBuffer) sequenceExists(sequenceNumber string) bool {
    for _, v := range b.sequencesInBuffer {
        if v == sequenceNumber {
            return true
        }
    }
    return false
}

// Determines if the buffer has reached its target size.
func (b *RecordBuffer) ShouldFlush() bool {
    return len(b.sequencesInBuffer) >= b.NumRecordsToBuffer
}

// Returns the sequence number of the first message in the buffer.
func (b *RecordBuffer) FirstSequenceNumber() string {
    return b.firstSequenceNumber
}

// Returns the sequence number of the last message in the buffer.
func (b *RecordBuffer) LastSequenceNumber() string {
    return b.lastSequenceNumber
}
118 record_buffer_test.go Normal file

@@ -0,0 +1,118 @@
package connector

import "testing"

type TestModel struct{}

func (u TestModel) ToString() string {
    return "ok"
}

func TestAdd(t *testing.T) {
    var r1, s1 = TestModel{}, "Seq1"
    var r2, s2 = TestModel{}, "Seq2"

    b := RecordBuffer{}
    b.Add(r1, s1)

    if b.NumRecordsInBuffer() != 1 {
        t.Errorf("NumRecordsInBuffer() want %v", 1)
    }

    b.Add(r2, s2)

    if b.NumRecordsInBuffer() != 2 {
        t.Errorf("NumRecordsInBuffer() want %v", 2)
    }

    b.Add(r2, s2)

    if b.NumRecordsInBuffer() != 2 {
        t.Errorf("NumRecordsInBuffer() want %v", 2)
    }
}

func TestSequenceExists(t *testing.T) {
    var r1, s1 = TestModel{}, "Seq1"
    var r2, s2 = TestModel{}, "Seq2"

    b := RecordBuffer{}
    b.Add(r1, s1)

    if b.sequenceExists(s1) != true {
        t.Errorf("sequenceExists() want %v", true)
    }

    b.Add(r2, s2)

    if b.sequenceExists(s2) != true {
        t.Errorf("sequenceExists() want %v", true)
    }
}

func TestFlush(t *testing.T) {
    var r1, s1 = TestModel{}, "SeqNum"
    b := RecordBuffer{}
    b.Add(r1, s1)

    b.Flush()

    if b.NumRecordsInBuffer() != 0 {
        t.Errorf("Count() want %v", 0)
    }
}

func TestLastSequenceNumber(t *testing.T) {
    var r1, s1 = TestModel{}, "Seq1"
    var r2, s2 = TestModel{}, "Seq2"

    b := RecordBuffer{}
    b.Add(r1, s1)

    if b.LastSequenceNumber() != s1 {
        t.Errorf("LastSequenceNumber() want %v", s1)
    }

    b.Add(r2, s2)

    if b.LastSequenceNumber() != s2 {
        t.Errorf("LastSequenceNumber() want %v", s2)
    }
}

func TestFirstSequenceNumber(t *testing.T) {
    var r1, s1 = TestModel{}, "Seq1"
    var r2, s2 = TestModel{}, "Seq2"

    b := RecordBuffer{}
    b.Add(r1, s1)

    if b.FirstSequenceNumber() != s1 {
        t.Errorf("FirstSequenceNumber() want %v", s1)
    }

    b.Add(r2, s2)

    if b.FirstSequenceNumber() != s1 {
        t.Errorf("FirstSequenceNumber() want %v", s1)
    }
}

func TestShouldFlush(t *testing.T) {
    const n = 2
    var r1, s1 = TestModel{}, "Seq1"
    var r2, s2 = TestModel{}, "Seq2"

    b := RecordBuffer{NumRecordsToBuffer: n}
    b.Add(r1, s1)

    if b.ShouldFlush() != false {
        t.Errorf("ShouldFlush() want %v", false)
    }

    b.Add(r2, s2)

    if b.ShouldFlush() != true {
        t.Errorf("ShouldFlush() want %v", true)
    }
}
40 redis_checkpoint.go Normal file

@@ -0,0 +1,40 @@
package connector

import (
    "fmt"

    "github.com/hoisie/redis"
)

// RedisCheckpoint is a Redis implementation of the Checkpoint interface. It is used to enable
// Pipeline.ProcessShard to checkpoint its progress.
type RedisCheckpoint struct {
    AppName        string
    client         redis.Client
    sequenceNumber string
    StreamName     string
}

// CheckpointExists determines whether a checkpoint exists for a particular shard, caching the
// stored sequence number when it does.
func (c *RedisCheckpoint) CheckpointExists(shardID string) bool {
    val, _ := c.client.Get(c.key(shardID))

    if val != nil && string(val) != "" {
        c.sequenceNumber = string(val)
        return true
    }

    return false
}

// SequenceNumber returns the most recently cached sequence number.
func (c *RedisCheckpoint) SequenceNumber() string {
    return c.sequenceNumber
}

// SetCheckpoint stores the given sequence number as the checkpoint for a shard.
func (c *RedisCheckpoint) SetCheckpoint(shardID string, sequenceNumber string) {
    c.client.Set(c.key(shardID), []byte(sequenceNumber))
    c.sequenceNumber = sequenceNumber
}

// key generates the Redis key for a shard's checkpoint: app:checkpoint:stream:shard.
func (c *RedisCheckpoint) key(shardID string) string {
    return fmt.Sprintf("%v:checkpoint:%v:%v", c.AppName, c.StreamName, shardID)
}
48 redis_checkpoint_test.go Normal file

@@ -0,0 +1,48 @@
package connector

import (
    "testing"

    "github.com/hoisie/redis"
)

func TestKey(t *testing.T) {
    k := "app:checkpoint:stream:shard"
    c := RedisCheckpoint{AppName: "app", StreamName: "stream"}

    r := c.key("shard")

    if r != k {
        t.Errorf("key() = %v, want %v", r, k)
    }
}

func TestCheckpointExists(t *testing.T) {
    var rc redis.Client
    k := "app:checkpoint:stream:shard"
    rc.Set(k, []byte("fakeSeqNum"))
    c := RedisCheckpoint{AppName: "app", StreamName: "stream"}

    r := c.CheckpointExists("shard")

    if r != true {
        t.Errorf("CheckpointExists() = %v, want %v", r, true)
    }

    rc.Del(k)
}

func TestSetCheckpoint(t *testing.T) {
    k := "app:checkpoint:stream:shard"
    var rc redis.Client
    c := RedisCheckpoint{AppName: "app", StreamName: "stream"}
    c.SetCheckpoint("shard", "fakeSeqNum")

    r, _ := rc.Get(k)

    if string(r) != "fakeSeqNum" {
        t.Errorf("SetCheckpoint() = %v, want %v", string(r), "fakeSeqNum")
    }

    rc.Del(k)
}
62 redshift_emitter.go Normal file

@@ -0,0 +1,62 @@
package connector

import (
    "bytes"
    "database/sql"
    "fmt"
    "log"
    "os"

    _ "github.com/lib/pq"
)

// RedshiftEmitter is an implementation of Emitter that inserts buffered batches of records into
// Redshift. It first emits the records into S3 and then performs the Redshift COPY command. S3
// storage of the buffered data is achieved using the S3Emitter. A link to jsonpaths must be
// provided when configuring the struct for JSON copies.
type RedshiftEmitter struct {
    Delimiter string
    Format    string
    Jsonpath  string
    S3Bucket  string
    TableName string
}

// Emit is invoked when the buffer is full. It leverages the S3Emitter and then issues a copy
// command to the Redshift data store.
func (e RedshiftEmitter) Emit(b Buffer) {
    s3Emitter := S3Emitter{S3Bucket: e.S3Bucket}
    s3Emitter.Emit(b)
    s3File := s3Emitter.S3FileName(b.FirstSequenceNumber(), b.LastSequenceNumber())
    db, err := sql.Open("postgres", os.Getenv("REDSHIFT_URL"))

    if err != nil {
        log.Fatal(err)
    }

    _, err = db.Exec(e.copyStatement(s3File))

    if err != nil {
        log.Fatal(err)
    }

    fmt.Printf("Redshift load completed.\n")
    db.Close()
}

// copyStatement creates the SQL COPY statement issued to the Redshift cluster.
func (e RedshiftEmitter) copyStatement(s3File string) string {
    var b bytes.Buffer
    b.WriteString(fmt.Sprintf("COPY %v ", e.TableName))
    b.WriteString(fmt.Sprintf("FROM 's3://%v%v' ", e.S3Bucket, s3File))
    b.WriteString(fmt.Sprintf("CREDENTIALS 'aws_access_key_id=%v;", os.Getenv("AWS_ACCESS_KEY")))
    b.WriteString(fmt.Sprintf("aws_secret_access_key=%v' ", os.Getenv("AWS_SECRET_KEY")))

    if e.Format == "json" {
        b.WriteString("json 'auto'")
    } else {
        b.WriteString(fmt.Sprintf("DELIMITER '%v'", e.Delimiter))
    }

    b.WriteString(";")
    return b.String()
}
20 redshift_emitter_test.go Normal file

@@ -0,0 +1,20 @@
package connector

import (
    "testing"
)

func TestCopyStatement(t *testing.T) {
    e := RedshiftEmitter{
        Delimiter: ",",
        S3Bucket:  "test_bucket",
        TableName: "test_table",
    }
    f := e.copyStatement("/test.txt")

    copyStatement := "COPY test_table FROM 's3://test_bucket/test.txt' CREDENTIALS 'aws_access_key_id=;aws_secret_access_key=' DELIMITER ',';"

    if f != copyStatement {
        t.Errorf("copyStatement() = %s want %s", f, copyStatement)
    }
}
48 s3_emitter.go Normal file

@@ -0,0 +1,48 @@
package connector

import (
    "bytes"
    "fmt"
    "time"

    "github.com/crowdmob/goamz/aws"
    "github.com/crowdmob/goamz/s3"
)

// S3Emitter is an implementation of Emitter used to store files from a Kinesis stream in S3. It
// requires the configuration of an S3 bucket and endpoint. When the buffer is full, this
// struct's Emit method adds the contents of the buffer to S3 as one file. The filename is
// generated from the first and last sequence numbers of the records contained in that file,
// separated by a dash.
type S3Emitter struct {
    S3Bucket string
}

// S3FileName generates a file name based on the first and last sequence numbers from the
// buffer. The current UTC date (YYYY-MM-DD) forms the base of the path, to logically group days
// of batches.
func (e S3Emitter) S3FileName(firstSeq string, lastSeq string) string {
    date := time.Now().UTC().Format("2006-01-02")
    return fmt.Sprintf("/%v/%v-%v.txt", date, firstSeq, lastSeq)
}

// Emit is invoked when the buffer is full. It writes the set of filtered records to S3.
func (e S3Emitter) Emit(buf Buffer) {
    auth, _ := aws.EnvAuth()
    s3Con := s3.New(auth, aws.USEast)
    bucket := s3Con.Bucket(e.S3Bucket)
    s3File := e.S3FileName(buf.FirstSequenceNumber(), buf.LastSequenceNumber())

    var buffer bytes.Buffer

    for _, r := range buf.Records() {
        buffer.WriteString(r.ToString())
    }

    err := bucket.Put(s3File, buffer.Bytes(), "text/plain", s3.Private, s3.Options{})

    if err != nil {
        fmt.Printf("Error occurred while uploading to S3: %v\n", err)
    } else {
        fmt.Printf("Emitted %v records to S3 in s3://%v%v\n", buf.NumRecordsInBuffer(), e.S3Bucket, s3File)
    }
}
18 s3_emitter_test.go Normal file

@@ -0,0 +1,18 @@
package connector

import (
    "fmt"
    "testing"
    "time"
)

func TestS3FileName(t *testing.T) {
    d := time.Now().UTC().Format("2006-01-02")
    n := fmt.Sprintf("/%v/a-b.txt", d)
    e := S3Emitter{}
    f := e.S3FileName("a", "b")

    if f != n {
        t.Errorf("S3FileName() = %v, want %v", f, n)
    }
}
7 transformer.go Normal file

@@ -0,0 +1,7 @@
package connector

// Transformer is used to transform data from a Record (byte slice) to the data model for
// processing in the application.
type Transformer interface {
    ToModel(data []byte) Model
}
144 utils.go Normal file

@@ -0,0 +1,144 @@
package connector

import (
    "bufio"
    "fmt"
    "log"
    "os"
    "reflect"
    "regexp"
    "strconv"
    "strings"
    "time"
    "unicode"

    "github.com/sendgridlabs/go-kinesis"
)

// readLines reads a file into a slice of strings, one element per line.
func readLines(path string) ([]string, error) {
    file, err := os.Open(path)

    if err != nil {
        return nil, err
    }

    defer file.Close()
    var lines []string

    scanner := bufio.NewScanner(file)

    for scanner.Scan() {
        lines = append(lines, scanner.Text())
    }

    return lines, scanner.Err()
}

var (
    assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`)
)

// upcaseInitial upper-cases the first rune of a string so that config keys
// match exported struct field names.
func upcaseInitial(str string) string {
    for i, v := range str {
        return string(unicode.ToUpper(v)) + str[i+1:]
    }

    return ""
}

// LoadConfig opens the file path and loads the config values into the struct.
func LoadConfig(config interface{}, filename string) error {
    lines, err := readLines(filename)

    if err != nil {
        log.Fatalf("Config Load ERROR: %s\n", err)
    }

    mutable := reflect.ValueOf(config).Elem()

    for _, line := range lines {
        line = strings.TrimSpace(line)

        // Skip blank lines and comments.
        if len(line) == 0 || line[0] == ';' || line[0] == '#' {
            continue
        }

        if groups := assignRegex.FindStringSubmatch(line); groups != nil {
            key, val := groups[1], groups[2]
            key, val = strings.TrimSpace(key), strings.TrimSpace(val)
            key = upcaseInitial(key)
            field := mutable.FieldByName(key)

            if !field.IsValid() {
                log.Fatalf("Config ERROR: Field %s not found\n", key)
            }

            switch field.Type().Name() {
            case "int":
                val, _ := strconv.ParseInt(val, 0, 64)
                mutable.FieldByName(key).SetInt(val)
            case "bool":
                val, _ := strconv.ParseBool(val)
                mutable.FieldByName(key).SetBool(val)
            default:
                mutable.FieldByName(key).SetString(val)
            }
        }
    }

    return nil
}

// CreateAndWaitForStreamToBecomeAvailable creates a new Kinesis stream (or uses the existing
// stream if one exists) and waits for it to become available.
func CreateAndWaitForStreamToBecomeAvailable(k *kinesis.Kinesis, streamName string, shardCount int) {
    if !StreamExists(k, streamName) {
        err := k.CreateStream(streamName, shardCount)

        if err != nil {
            fmt.Printf("CreateStream ERROR: %v\n", err)
            return
        }
    }

    resp := &kinesis.DescribeStreamResp{}

    for {
        args := kinesis.NewArgs()
        args.Add("StreamName", streamName)
        resp, _ = k.DescribeStream(args)
        streamStatus := resp.StreamDescription.StreamStatus
        fmt.Printf("Stream [%v] is %v\n", streamName, streamStatus)

        if streamStatus != "ACTIVE" {
            time.Sleep(4 * time.Second)
        } else {
            break
        }
    }
}

// StreamExists checks if a Kinesis stream exists.
func StreamExists(k *kinesis.Kinesis, streamName string) bool {
    args := kinesis.NewArgs()
    resp, _ := k.ListStreams(args)
    for _, s := range resp.StreamNames {
        if s == streamName {
            return true
        }
    }
    return false
}

// DeleteStream deletes a Kinesis stream.
func DeleteStream(k *kinesis.Kinesis, streamName string) {
    err := k.DeleteStream(streamName)

    if err != nil {
        fmt.Printf("DeleteStream ERROR: %v\n", err)
        return
    }

    fmt.Printf("Stream [%v] is DELETING\n", streamName)
}