Add postgres checkpoint implementation (#55)

This commit is contained in:
Emanuel Ramos 2018-06-18 03:27:10 +01:00 committed by Harlow Ward
parent 739e9e39a5
commit b7be26418a
8 changed files with 293 additions and 1 deletions

4
.gitignore vendored
View file

@ -42,3 +42,7 @@ vendor/**
# Benchmark files # Benchmark files
prof.cpu prof.cpu
prof.mem prof.mem
# VSCode files
/.vscode
/**/debug

11
Gopkg.lock generated
View file

@ -60,6 +60,15 @@
packages = ["."] packages = ["."]
revision = "0b12d6b5" revision = "0b12d6b5"
[[projects]]
branch = "master"
name = "github.com/lib/pq"
packages = [
".",
"oid"
]
revision = "90697d60dd844d5ef6ff15135d0203f65d2f53b8"
[[projects]] [[projects]]
name = "github.com/pkg/errors" name = "github.com/pkg/errors"
packages = ["."] packages = ["."]
@ -82,6 +91,6 @@
[solve-meta] [solve-meta]
analyzer-name = "dep" analyzer-name = "dep"
analyzer-version = 1 analyzer-version = 1
inputs-digest = "1b40486b645b81bc2215d0153631e9e002e534ba86713ba55500ce62c07cbad8" inputs-digest = "82ad275b394f4727b9d9ad816f534e512b1b007806a4b785d8a742e5836eee48"
solver-name = "gps-cdcl" solver-name = "gps-cdcl"
solver-version = 1 solver-version = 1

View file

@ -123,6 +123,34 @@ Sort key: shard_id
<img width="727" alt="screen shot 2017-11-22 at 7 59 36 pm" src="https://user-images.githubusercontent.com/739782/33158557-b90e4228-cfbf-11e7-9a99-73b56a446f5f.png"> <img width="727" alt="screen shot 2017-11-22 at 7 59 36 pm" src="https://user-images.githubusercontent.com/739782/33158557-b90e4228-cfbf-11e7-9a99-73b56a446f5f.png">
### Postgres Checkpoint
The Postgres checkpoint requires Table Name, App Name, Stream Name and ConnectionString:
```go
import checkpoint "github.com/harlow/kinesis-consumer/checkpoint/postgres"
// postgres checkpoint
ck, err := checkpoint.New(app, table, connStr)
if err != nil {
log.Fatalf("new checkpoint error: %v", err)
}
```
To leverage the Postgres checkpoint we'll also need to create a table:
```sql
CREATE TABLE kinesis_consumer (
namespace text NOT NULL,
shard_id text NOT NULL,
sequence_number numeric NOT NULL,
CONSTRAINT kinesis_consumer_pk PRIMARY KEY (namespace, shard_id)
);
```
The table name has to be the same that you specify when creating the checkpoint. The primary key composed by namespace and shard_id is mandatory in order to the checkpoint run without issues and also to ensure data integrity.
## Options ## Options
The consumer allows the following optional overrides. The consumer allows the following optional overrides.

View file

@ -0,0 +1,154 @@
package postgres
import (
"database/sql"
"errors"
"fmt"
"sync"
"time"
// this is the postgres package so it makes sense to be here
_ "github.com/lib/pq"
)
var getCheckpointQuery = `SELECT sequence_number
FROM %s
WHERE namespace=$1 AND shard_id=$2`
var upsertCheckpoint = `INSERT INTO %s (namespace, shard_id, sequence_number)
VALUES($1, $2, $3)
ON CONFLICT (namespace, shard_id)
DO
UPDATE
SET sequence_number= $3`
type key struct {
streamName string
shardID string
}
// Option is used to override defaults when creating a new Checkpoint
type Option func(*Checkpoint)
// Checkpoint stores and retreives the last evaluated key from a DDB scan
type Checkpoint struct {
appName string
conn *sql.DB
mu *sync.Mutex // protects the checkpoints
done chan struct{}
checkpoints map[key]string
maxInterval time.Duration
}
// New returns a checkpoint that uses PostgresDB for underlying storage
// Using connectionStr turn it more flexible to use specific db configs
func New(appName, tableName, connectionStr string, opts ...Option) (*Checkpoint, error) {
if tableName == "" {
return nil, errors.New("Table name not defined")
}
conn, err := sql.Open("postgres", connectionStr)
if err != nil {
return nil, err
}
getCheckpointQuery = fmt.Sprintf(getCheckpointQuery, tableName)
upsertCheckpoint = fmt.Sprintf(upsertCheckpoint, tableName)
ck := &Checkpoint{
conn: conn,
appName: appName,
done: make(chan struct{}),
maxInterval: time.Duration(1 * time.Minute),
mu: new(sync.Mutex),
checkpoints: map[key]string{},
}
for _, opt := range opts {
opt(ck)
}
go ck.loop()
return ck, nil
}
// Get determines if a checkpoint for a particular Shard exists.
// Typically used to determine whether we should start processing the shard with
// TRIM_HORIZON or AFTER_SEQUENCE_NUMBER (if checkpoint exists).
func (c *Checkpoint) Get(streamName, shardID string) (string, error) {
namespace := fmt.Sprintf("%s-%s", c.appName, streamName)
var sequenceNumber string
err := c.conn.QueryRow(getCheckpointQuery, namespace, shardID).Scan(&sequenceNumber)
if err != nil {
if err == sql.ErrNoRows {
return "", nil
}
return "", err
}
return sequenceNumber, nil
}
// Set stores a checkpoint for a shard (e.g. sequence number of last record processed by application).
// Upon failover, record processing is resumed from this point.
func (c *Checkpoint) Set(streamName, shardID, sequenceNumber string) error {
c.mu.Lock()
defer c.mu.Unlock()
if sequenceNumber == "" {
return fmt.Errorf("sequence number should not be empty")
}
key := key{
streamName: streamName,
shardID: shardID,
}
c.checkpoints[key] = sequenceNumber
return nil
}
// Shutdown the checkpoint. Save any in-flight data.
func (c *Checkpoint) Shutdown() error {
defer c.conn.Close()
c.done <- struct{}{}
return c.save()
}
func (c *Checkpoint) loop() {
tick := time.NewTicker(c.maxInterval)
defer tick.Stop()
defer close(c.done)
for {
select {
case <-tick.C:
c.save()
case <-c.done:
return
}
}
}
func (c *Checkpoint) save() error {
c.mu.Lock()
defer c.mu.Unlock()
for key, sequenceNumber := range c.checkpoints {
if _, err := c.conn.Exec(upsertCheckpoint, fmt.Sprintf("%s-%s", c.appName, key.streamName), key.shardID, sequenceNumber); err != nil {
return err
}
}
return nil
}

View file

@ -0,0 +1,17 @@
# Consumer with postgres checkpoint
Read records from the Kinesis stream using postgres as checkpoint
## Environment Variables
Export the required environment vars for connecting to the Kinesis stream:
```shell
export AWS_ACCESS_KEY=
export AWS_REGION=
export AWS_SECRET_KEY=
```
## Run the consumer
go run main.go --app appName --stream streamName --table tableName --connection connectionString

View file

@ -0,0 +1,80 @@
package main
import (
"context"
"errors"
"expvar"
"flag"
"fmt"
"log"
"os"
"os/signal"
consumer "github.com/harlow/kinesis-consumer"
checkpoint "github.com/harlow/kinesis-consumer/checkpoint/postgres"
)
func main() {
var (
app = flag.String("app", "", "App name")
stream = flag.String("stream", "", "Stream name")
table = flag.String("table", "", "Table name")
connStr = flag.String("connection", "", "Connection Str")
)
flag.Parse()
// postgres checkpoint
ck, err := checkpoint.New(*app, *table, *connStr)
if err != nil {
log.Fatalf("checkpoint error: %v", err)
}
var (
counter = expvar.NewMap("counters")
)
newKclient := consumer.NewKinesisClient()
// consumer
c, err := consumer.New(
*stream,
consumer.WithCheckpoint(ck),
consumer.WithCounter(counter),
consumer.WithClient(newKclient),
)
if err != nil {
log.Fatalf("consumer error: %v", err)
}
// use cancel \func to signal shutdown
ctx, cancel := context.WithCancel(context.Background())
// trap SIGINT, wait to trigger shutdown
signals := make(chan os.Signal, 1)
signal.Notify(signals, os.Interrupt)
go func() {
<-signals
cancel()
}()
// scan stream
err = c.Scan(ctx, func(r *consumer.Record) consumer.ScanError {
fmt.Println(string(r.Data))
err := errors.New("some error happened")
// continue scanning
return consumer.ScanError{
Error: err,
StopScan: false,
SkipCheckpoint: false,
}
})
if err != nil {
log.Fatalf("scan error: %v", err)
}
if err := ck.Shutdown(); err != nil {
log.Fatalf("checkpoint shutdown error: %v", err)
}
}