Add Record Processor

* Add GetRecords function to Kinesis Utils
* Add sample .env to showcase ENV vars needed
* Fix RedisCheckpoint to look for empty string (default from Redis)
* Extract example code into its own repository
Harlow Ward 2014-07-29 23:09:57 -07:00
parent 70c3b1bd79
commit 4e909185d1
18 changed files with 288 additions and 135 deletions

7 .gitignore vendored

@@ -19,6 +19,9 @@ _cgo_export.*
 _testmain.go
+*.env
 *.exe
 *.test
+# vim temp files
+.*.swp
+.*.swo
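
The sample .env named in the commit message is not among the hunks shown on this page. Judging from the os.Getenv calls in the Redshift emitter below, it would document at least the AWS credentials; a hypothetical sketch:

# sample .env (hypothetical contents; the actual file is not shown in this diff)
AWS_ACCESS_KEY=your-access-key
AWS_SECRET_KEY=your-secret-key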


@@ -1,8 +0,0 @@
-package buffers
-
-type Buffer interface {
-    Data() []byte
-    FirstSequenceNumber() string
-    LastSequenceNumber() string
-    NumMessagesInBuffer() int
-}


@@ -6,12 +6,12 @@ type MsgBuffer struct {
     buffer              bytes.Buffer
     firstSequenceNumber string
     lastSequenceNumber  string
-    numMessagesToBuffer int
+    NumMessagesToBuffer int
     sequencesInBuffer   []string
 }
 
-func (b MsgBuffer) NumMessagesToBuffer() int {
-    return b.numMessagesToBuffer
+func NewMessageBuffer(n int) *MsgBuffer {
+    return &MsgBuffer{NumMessagesToBuffer: n}
 }
 
 func (b *MsgBuffer) ConsumeRecord(data []byte, sequenceNumber string) {
@@ -44,19 +44,19 @@ func (b MsgBuffer) NumMessagesInBuffer() int {
     return len(b.sequencesInBuffer)
 }
 
-func (b *MsgBuffer) FlushBuffer() {
+func (b *MsgBuffer) Flush() {
     b.buffer.Reset()
     b.sequencesInBuffer = b.sequencesInBuffer[:0]
 }
 
 func (b MsgBuffer) ShouldFlush() bool {
-    return len(b.sequencesInBuffer) >= b.NumMessagesToBuffer()
+    return len(b.sequencesInBuffer) >= b.NumMessagesToBuffer
 }
 
-func (b MsgBuffer) LastSequenceNumber() string {
-    return b.lastSequenceNumber
-}
-
 func (b MsgBuffer) FirstSequenceNumber() string {
     return b.firstSequenceNumber
 }
+
+func (b MsgBuffer) LastSequenceNumber() string {
+    return b.lastSequenceNumber
+}


@@ -5,16 +5,6 @@ import (
     "testing"
 )
 
-func TestNumMessagesToBuffer(t *testing.T) {
-    const n = 25
-    b := MsgBuffer{numMessagesToBuffer: n}
-    r := b.NumMessagesToBuffer()
-
-    if r != n {
-        t.Errorf("NumMessagesToBuffer() = %v, want %v", r, n)
-    }
-}
-
 func TestConsumeRecord(t *testing.T) {
     var r1, s1 = []byte("Record1"), "Seq1"
     var r2, s2 = []byte("Record2"), "Seq2"
@@ -57,12 +47,12 @@ func TestSequenceExists(t *testing.T) {
     }
 }
 
-func TestFlushBuffer(t *testing.T) {
+func TestFlush(t *testing.T) {
     var r1, s1 = []byte("Record"), "SeqNum"
     b := MsgBuffer{}
 
     b.ConsumeRecord(r1, s1)
-    b.FlushBuffer()
+    b.Flush()
 
     if b.NumMessagesInBuffer() != 0 {
         t.Errorf("NumMessagesInBuffer() want %v", 0)
@@ -110,7 +100,7 @@ func TestShouldFlush(t *testing.T) {
     var r1, s1 = []byte("Record1"), "Seq1"
     var r2, s2 = []byte("Record2"), "Seq2"
 
-    b := MsgBuffer{numMessagesToBuffer: n}
+    b := MsgBuffer{NumMessagesToBuffer: n}
 
     b.ConsumeRecord(r1, s1)
     if b.ShouldFlush() != false {


@@ -1,8 +0,0 @@
-package checkpoints
-
-type Checkpoint interface {
-    CheckpointExists(streamName string, shardID string) bool
-    SequenceNumber() string
-    SetCheckpoint(streamName string, shardID string, sequenceNumber string)
-}


@@ -12,15 +12,11 @@ type RedisCheckpoint struct {
     sequenceNumber string
 }
 
-func (c RedisCheckpoint) SequenceNumber() string {
-    return c.sequenceNumber
-}
-
 func (c *RedisCheckpoint) CheckpointExists(streamName string, shardID string) bool {
     key := c.keyGen(streamName, shardID)
     val, _ := c.client.Get(key)
 
-    if val != nil {
+    if val != nil && string(val) != "" {
         c.sequenceNumber = string(val)
         return true
     } else {
@@ -28,6 +24,10 @@ func (c *RedisCheckpoint) CheckpointExists(streamName string, shardID string) bool {
     }
 }
 
+func (c RedisCheckpoint) SequenceNumber() string {
+    return c.sequenceNumber
+}
+
 func (c *RedisCheckpoint) SetCheckpoint(streamName string, shardID string, sequenceNumber string) {
     key := c.keyGen(streamName, shardID)
     c.client.Set(key, []byte(sequenceNumber))
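
Why the guard changed: per the commit message, an unset key can come back from the Redis client as an empty value rather than nil, so the old nil-only check would report a checkpoint and the consumer would resume from an empty sequence number. A minimal sketch of the two checks (the client's empty-default behavior is an assumption taken from the commit message):

package main

import "fmt"

func main() {
    // What the client may hand back for an unset key: a non-nil empty value.
    val := []byte("")

    // Old check: treats the empty default as a real checkpoint.
    fmt.Println("old check says checkpoint exists:", val != nil)

    // New check: an empty string means no checkpoint was stored.
    fmt.Println("new check says checkpoint exists:", val != nil && string(val) != "")
}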


@@ -1,5 +0,0 @@
-package emitters
-
-type Emitter interface {
-    Emit(path string, data []byte)
-}


@@ -1,16 +1,50 @@
 package emitters
 
 import (
+    "bytes"
     "fmt"
+    "os"
+    // "database/sql"
+
+    "github.com/harlow/go-etl/interfaces"
+    // "github.com/lib/pq"
 )
 
 type RedshiftEmitter struct {
+    redshiftDelimiter string
+    redshiftPassword  string
+    redshiftTable     string
+    redshiftURL       string
+    redshiftUsername  string
+    S3Bucket          string
 }
 
-func (e RedshiftEmitter) Emit(path string, data []byte) {
-    // first call S3 bucket
-    // pg.query("COPY file_path TO table_name")
+func (e RedshiftEmitter) Emit(buffer interfaces.Buffer) {
+    s3Emitter := S3Emitter{S3Bucket: e.S3Bucket}
+    s3Emitter.Emit(buffer)
+    // s3File := s3Emitter.S3FileName(buffer.FirstSequenceNumber(), buffer.LastSequenceNumber())
+    // fmt.Printf("Redshift emitted: %v\n", s3File)
+    // db, err := sql.Open("postgres", "user=pqgotest dbname=pqgotest sslmode=verify-full")
+    // if err != nil {
+    //     log.Fatal(err)
+    // }
     // pg.query("INSERT INTO imported_files VALUE file_path")
-    fmt.Printf("debug: emitting %v to Redshift\n", path)
-    fmt.Println(string(data))
+    // err := db.Exec(generateCopyStatement(s3File))
+    // rows, err := db.Query("SELECT pg_last_copy_count();")
+    // log.info("Successfully copied " + getNumberOfCopiedRecords(conn) + " records to Redshift from file s3://" + s3Bucket + "/" + s3File);
+    // db.Close()
+}
+
+func (e RedshiftEmitter) generateCopyStatement(s3File string) string {
+    var b bytes.Buffer
+    b.WriteString(fmt.Sprintf("COPY %v ", e.redshiftTable))
+    b.WriteString(fmt.Sprintf("FROM 's3://%v%v' ", e.S3Bucket, s3File))
+    b.WriteString(fmt.Sprintf("CREDENTIALS 'aws_access_key_id=%v;", os.Getenv("AWS_ACCESS_KEY")))
+    b.WriteString(fmt.Sprintf("aws_secret_access_key=%v' ", os.Getenv("AWS_SECRET_KEY")))
+    b.WriteString(fmt.Sprintf("DELIMITER '%v'", e.redshiftDelimiter))
+    b.WriteString(";")
+    return b.String()
 }
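
For illustration, with hypothetical values redshiftTable "events", S3Bucket "bucketName", s3File "/2014-07-29/Seq1-Seq9.txt", and delimiter "|", generateCopyStatement would return:

COPY events FROM 's3://bucketName/2014-07-29/Seq1-Seq9.txt' CREDENTIALS 'aws_access_key_id=<AWS_ACCESS_KEY>;aws_secret_access_key=<AWS_SECRET_KEY>' DELIMITER '|';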


@@ -6,24 +6,29 @@ import (
     "github.com/crowdmob/goamz/aws"
     "github.com/crowdmob/goamz/s3"
-    "github.com/harlow/go-etl/buffers"
+    "github.com/harlow/go-etl/interfaces"
 )
 
 type S3Emitter struct {
     S3Bucket string
 }
 
-func (e S3Emitter) s3FileName(firstSeq string, lastSeq string) string {
+func (e S3Emitter) S3FileName(firstSeq string, lastSeq string) string {
     date := time.Now().UTC().Format("2006-01-02")
     return fmt.Sprintf("/%v/%v-%v.txt", date, firstSeq, lastSeq)
 }
 
-func (e S3Emitter) Emit(buffer buffers.Buffer) {
+func (e S3Emitter) Emit(buffer interfaces.Buffer) {
     auth, _ := aws.EnvAuth()
-    s := s3.New(auth, aws.USEast)
-    b := s.Bucket(e.S3Bucket)
-    f := e.s3FileName(buffer.FirstSequenceNumber(), buffer.LastSequenceNumber())
-    r := b.Put(f, buffer.Data(), "text/plain", s3.Private, s3.Options{})
-    fmt.Printf("Successfully emitted %v records to S3 in s3://%v/%v", buffer.NumMessagesInBuffer(), b, f)
-    fmt.Println(r)
+    s3Con := s3.New(auth, aws.USEast)
+    bucket := s3Con.Bucket(e.S3Bucket)
+    s3File := e.S3FileName(buffer.FirstSequenceNumber(), buffer.LastSequenceNumber())
+    err := bucket.Put(s3File, buffer.Data(), "text/plain", s3.Private, s3.Options{})
+
+    if err != nil {
+        fmt.Printf("Error occurred while uploading to S3: %v\n", err)
+    } else {
+        fmt.Printf("Emitted %v records to S3 in s3://%v%v\n", buffer.NumMessagesInBuffer(), e.S3Bucket, s3File)
+    }
 }
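
The now-exported S3FileName produces date-partitioned keys. A quick sketch of the output (the date component comes from time.Now(), so the value shown is hypothetical):

package main

import (
    "fmt"

    "github.com/harlow/go-etl/emitters"
)

func main() {
    e := emitters.S3Emitter{S3Bucket: "bucketName"}
    // On 2014-07-29 this prints: /2014-07-29/Seq1-Seq9.txt
    // The emitter would then write to s3://bucketName/2014-07-29/Seq1-Seq9.txt
    fmt.Println(e.S3FileName("Seq1", "Seq9"))
}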


@@ -0,0 +1,30 @@
+package emitters
+
+import (
+    "fmt"
+
+    "github.com/harlow/go-etl/interfaces"
+    "github.com/sendgridlabs/go-kinesis"
+)
+
+type S3ManifestEmitter struct {
+    OutputStream string
+    S3Bucket     string
+    Ksis         *kinesis.Kinesis
+}
+
+func (e S3ManifestEmitter) Emit(buffer interfaces.Buffer) {
+    s3Emitter := S3Emitter{S3Bucket: e.S3Bucket}
+    s3Emitter.Emit(buffer)
+    s3File := s3Emitter.S3FileName(buffer.FirstSequenceNumber(), buffer.LastSequenceNumber())
+
+    args := kinesis.NewArgs()
+    args.Add("StreamName", e.OutputStream)
+    args.Add("PartitionKey", s3File)
+    args.AddData([]byte(s3File))
+
+    _, err := e.Ksis.PutRecord(args)
+    if err != nil {
+        fmt.Printf("S3 Manifest Emitter Error: %v", err)
+    }
+}


@@ -1,36 +0,0 @@
-package main
-
-import (
-    "fmt"
-
-    "github.com/harlow/go-etl/checkpoints"
-    "github.com/harlow/go-etl/emitters"
-    "github.com/harlow/go-etl/utils"
-    "github.com/joho/godotenv"
-    "github.com/sendgridlabs/go-kinesis"
-)
-
-func main() {
-    godotenv.Load()
-    s := "inputStream"
-    k := kinesis.New("", "", kinesis.Region{})
-    c := checkpoints.RedisCheckpoint{AppName: "sampleApp"}
-    e := emitters.S3Emitter{S3Bucket: "bucketName"}
-    // t := transformers.EventTransformer{}
-
-    args := kinesis.NewArgs()
-    args.Add("StreamName", s)
-    streamInfo, err := k.DescribeStream(args)
-
-    if err != nil {
-        fmt.Printf("Unable to connect to %v stream. Aborting.", s)
-        return
-    }
-
-    for _, shard := range streamInfo.StreamDescription.Shards {
-        go utils.GetRecords(k, &c, e, s, shard.ShardId)
-    }
-
-    select {}
-}


@@ -1,35 +0,0 @@
-package main
-
-import (
-    "fmt"
-
-    "github.com/harlow/go-etl/utils"
-    "github.com/joho/godotenv"
-    "github.com/sendgridlabs/go-kinesis"
-)
-
-func putSampleDataOnStream(ksis *kinesis.Kinesis, streamName string, numRecords int) {
-    for i := 0; i < numRecords; i++ {
-        args := kinesis.NewArgs()
-        args.Add("StreamName", streamName)
-        args.AddData([]byte(fmt.Sprintf("Hello AWS Kinesis %d", i)))
-        args.Add("PartitionKey", fmt.Sprintf("partitionKey-%d", i))
-        resp, err := ksis.PutRecord(args)
-        if err != nil {
-            fmt.Printf("PutRecord err: %v\n", err)
-        } else {
-            fmt.Printf("SequenceNumber: %v\n", resp.SequenceNumber)
-        }
-    }
-}
-
-func main() {
-    godotenv.Load()
-    streamName := "inputStream"
-    ksis := kinesis.New("", "", kinesis.Region{})
-    utils.CreateAndWaitForStreamToBecomeAvailable(ksis, streamName, 2)
-    putSampleDataOnStream(ksis, streamName, 50)
-    // deleteStream(ksis, streamName)
-}

11 interfaces/buffer.go Normal file

@@ -0,0 +1,11 @@
+package interfaces
+
+type Buffer interface {
+    ConsumeRecord(data []byte, sequenceNumber string)
+    Data() []byte
+    FirstSequenceNumber() string
+    Flush()
+    LastSequenceNumber() string
+    NumMessagesInBuffer() int
+    ShouldFlush() bool
+}
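
Implementations such as MsgBuffer (in the buffers package, per the old import path, and assuming its Data() method is unchanged by this commit) can be checked against the new interface at compile time. A minimal sketch:

package main

import (
    "fmt"

    "github.com/harlow/go-etl/buffers"
    "github.com/harlow/go-etl/interfaces"
)

// ConsumeRecord and Flush use pointer receivers, so it is *MsgBuffer
// (not MsgBuffer) that satisfies interfaces.Buffer.
var _ interfaces.Buffer = (*buffers.MsgBuffer)(nil)

func main() {
    buf := buffers.NewMessageBuffer(2) // flush threshold of two records
    buf.ConsumeRecord([]byte("Record1"), "Seq1")
    buf.ConsumeRecord([]byte("Record2"), "Seq2")
    fmt.Println(buf.ShouldFlush()) // true
}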

8 interfaces/checkpoint.go Normal file

@@ -0,0 +1,8 @@
+package interfaces
+
+type Checkpoint interface {
+    CheckpointExists(streamName string, shardID string) bool
+    SequenceNumber() string
+    SetCheckpoint(streamName string, shardID string, sequenceNumber string)
+}

5 interfaces/emitter.go Normal file

@@ -0,0 +1,5 @@
+package interfaces
+
+type Emitter interface {
+    Emit(buffer Buffer)
+}

5 interfaces/transformer.go Normal file

@@ -0,0 +1,5 @@
+package interfaces
+
+type Transformer interface {
+    Transform() string
+}

73 pipeline/pipeline.go Normal file

@@ -0,0 +1,73 @@
+package pipeline
+
+import (
+    "fmt"
+    "time"
+
+    "github.com/harlow/go-etl/interfaces"
+    "github.com/sendgridlabs/go-kinesis"
+)
+
+type Pipeline struct {
+    Checkpoint  interfaces.Checkpoint
+    Emitter     interfaces.Emitter
+    Transformer interfaces.Transformer
+}
+
+func (p Pipeline) GetRecords(ksis *kinesis.Kinesis, buf interfaces.Buffer, streamName string, shardId string) {
+    args := kinesis.NewArgs()
+    args.Add("ShardId", shardId)
+    args.Add("StreamName", streamName)
+
+    if p.Checkpoint.CheckpointExists(streamName, shardId) {
+        args.Add("ShardIteratorType", "AFTER_SEQUENCE_NUMBER")
+        args.Add("StartingSequenceNumber", p.Checkpoint.SequenceNumber())
+    } else {
+        args.Add("ShardIteratorType", "TRIM_HORIZON")
+    }
+
+    shardInfo, err := ksis.GetShardIterator(args)
+    if err != nil {
+        fmt.Printf("Error fetching shard iterator: %v", err)
+        return
+    }
+    shardIterator := shardInfo.ShardIterator
+
+    for {
+        args = kinesis.NewArgs()
+        args.Add("ShardIterator", shardIterator)
+        recordSet, err := ksis.GetRecords(args)
+
+        if len(recordSet.Records) > 0 {
+            for _, d := range recordSet.Records {
+                data, err := d.GetData()
+                if err != nil {
+                    fmt.Printf("GetData ERROR: %v\n", err)
+                    continue
+                }
+                // json.Unmarshal(data, &t)
+                // csv := []byte(t.Transform())
+                buf.ConsumeRecord(data, d.SequenceNumber)
+            }
+        } else if recordSet.NextShardIterator == "" || shardIterator == recordSet.NextShardIterator || err != nil {
+            fmt.Printf("GetRecords ERROR: %v\n", err)
+            break
+        } else {
+            fmt.Printf("Sleeping: %v\n", shardId)
+            time.Sleep(5 * time.Second)
+        }
+
+        if buf.ShouldFlush() {
+            fmt.Printf("Emitting shardId: %v\n", shardId)
+            p.Emitter.Emit(buf)
+            p.Checkpoint.SetCheckpoint(streamName, shardId, buf.LastSequenceNumber())
+            buf.Flush()
+        }
+
+        shardIterator = recordSet.NextShardIterator
+    }
+}
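
The new Pipeline.GetRecords replaces the utils.GetRecords call in the example main.go deleted above. A minimal consumer sketch wiring the pieces together, following that deleted example (the buffer size of 100 and the zero-value Transformer are assumptions; the Transformer is not yet called by GetRecords):

package main

import (
    "fmt"

    "github.com/harlow/go-etl/buffers"
    "github.com/harlow/go-etl/checkpoints"
    "github.com/harlow/go-etl/emitters"
    "github.com/harlow/go-etl/pipeline"
    "github.com/joho/godotenv"
    "github.com/sendgridlabs/go-kinesis"
)

func main() {
    godotenv.Load() // pull AWS and Redis settings from .env

    streamName := "inputStream"
    ksis := kinesis.New("", "", kinesis.Region{})

    p := pipeline.Pipeline{
        // CheckpointExists and SetCheckpoint use pointer receivers,
        // so pass *RedisCheckpoint.
        Checkpoint: &checkpoints.RedisCheckpoint{AppName: "sampleApp"},
        Emitter:    emitters.S3Emitter{S3Bucket: "bucketName"},
    }

    args := kinesis.NewArgs()
    args.Add("StreamName", streamName)
    streamInfo, err := ksis.DescribeStream(args)
    if err != nil {
        fmt.Printf("Unable to connect to %v stream. Aborting.", streamName)
        return
    }

    // One buffer and one goroutine per shard.
    for _, shard := range streamInfo.StreamDescription.Shards {
        buf := buffers.NewMessageBuffer(100)
        go p.GetRecords(ksis, buf, streamName, shard.ShardId)
    }
    select {}
}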

81 utils/config_utils.go Normal file

@@ -0,0 +1,81 @@
+package utils
+
+import (
+    "bufio"
+    "log"
+    "os"
+    "reflect"
+    "regexp"
+    "strconv"
+    "strings"
+    "unicode"
+)
+
+func readLines(path string) ([]string, error) {
+    file, err := os.Open(path)
+    if err != nil {
+        return nil, err
+    }
+    defer file.Close()
+
+    var lines []string
+    scanner := bufio.NewScanner(file)
+    for scanner.Scan() {
+        lines = append(lines, scanner.Text())
+    }
+    return lines, scanner.Err()
+}
+
+var (
+    assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`)
+)
+
+func upcaseInitial(str string) string {
+    for i, v := range str {
+        return string(unicode.ToUpper(v)) + str[i+1:]
+    }
+    return ""
+}
+
+func LoadConfig(config interface{}, filename string) error {
+    lines, err := readLines(filename)
+    if err != nil {
+        log.Fatalf("Load error: %s", err)
+    }
+
+    mutable := reflect.ValueOf(config).Elem()
+
+    for _, line := range lines {
+        line = strings.TrimSpace(line)
+        if len(line) == 0 || line[0] == ';' || line[0] == '#' {
+            continue
+        }
+        if groups := assignRegex.FindStringSubmatch(line); groups != nil {
+            key, val := groups[1], groups[2]
+            key, val = strings.TrimSpace(key), strings.TrimSpace(val)
+            key = upcaseInitial(key)
+            fieldType := mutable.FieldByName(key).Type()
+
+            switch fieldType.Name() {
+            case "int":
+                val, _ := strconv.ParseInt(val, 0, 64)
+                mutable.FieldByName(key).SetInt(val)
+            case "bool":
+                val, _ := strconv.ParseBool(val)
+                mutable.FieldByName(key).SetBool(val)
+            default:
+                mutable.FieldByName(key).SetString(val)
+            }
+        }
+    }
+    return nil
+}
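
A usage sketch for LoadConfig (the config struct and file are hypothetical). Keys are matched to struct fields after upcasing the first letter, so "bufferSize" maps to BufferSize; lines starting with ';' or '#' are skipped:

package main

import (
    "fmt"

    "github.com/harlow/go-etl/utils"
)

// Hypothetical config struct; field names must match the upcased keys.
type AppConfig struct {
    AppName    string
    BufferSize int
    Verbose    bool
}

func main() {
    // app.config (hypothetical file contents):
    //   # consumer settings
    //   appName=sampleApp
    //   bufferSize=100
    //   verbose=true
    var cfg AppConfig
    if err := utils.LoadConfig(&cfg, "app.config"); err != nil {
        fmt.Println("config error:", err)
        return
    }
    fmt.Printf("%+v\n", cfg) // {AppName:sampleApp BufferSize:100 Verbose:true}
}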