Merge pull request #58 from Clever/refactor-decompression
Refactor zlib decompression into the splitter package
This commit is contained in:
commit
4dd769ffca
4 changed files with 93 additions and 106 deletions
|
|
@ -1,12 +1,9 @@
|
||||||
package batchconsumer
|
package batchconsumer
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"compress/zlib"
|
|
||||||
"context"
|
"context"
|
||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io/ioutil"
|
|
||||||
"math/big"
|
"math/big"
|
||||||
|
|
||||||
"golang.org/x/time/rate"
|
"golang.org/x/time/rate"
|
||||||
|
|
@ -60,31 +57,6 @@ func (b *batchedWriter) Initialize(shardID string, checkpointer kcl.Checkpointer
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b *batchedWriter) splitMessageIfNecessary(record []byte) ([][]byte, error) {
|
|
||||||
// We handle three types of records:
|
|
||||||
// - records emitted from CWLogs Subscription (which are gzip compressed)
|
|
||||||
// - uncompressed records emitted from KPL
|
|
||||||
// - zlib compressed records (e.g. as compressed and emitted by Kinesis plugin for Fluent Bit)
|
|
||||||
if splitter.IsGzipped(record) {
|
|
||||||
// Process a batch of messages from a CWLogs Subscription
|
|
||||||
return splitter.GetMessagesFromGzippedInput(record)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try to read it as a zlib-compressed record
|
|
||||||
// zlib.NewReader checks for a zlib header and returns an error if not found
|
|
||||||
zlibReader, err := zlib.NewReader(bytes.NewReader(record))
|
|
||||||
if err == nil {
|
|
||||||
unzlibRecord, err := ioutil.ReadAll(zlibReader)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("reading zlib-compressed record: %v", err)
|
|
||||||
}
|
|
||||||
return [][]byte{unzlibRecord}, nil
|
|
||||||
}
|
|
||||||
// Process a single message, from KPL
|
|
||||||
return [][]byte{record}, nil
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *batchedWriter) ProcessRecords(records []kcl.Record) error {
|
func (b *batchedWriter) ProcessRecords(records []kcl.Record) error {
|
||||||
var pair kcl.SequencePair
|
var pair kcl.SequencePair
|
||||||
prevPair := b.lastProcessedSeq
|
prevPair := b.lastProcessedSeq
|
||||||
|
|
@ -108,7 +80,7 @@ func (b *batchedWriter) ProcessRecords(records []kcl.Record) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
messages, err := b.splitMessageIfNecessary(data)
|
messages, err := splitter.SplitMessageIfNecessary(data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,7 @@
|
||||||
package batchconsumer
|
package batchconsumer
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"compress/gzip"
|
|
||||||
"compress/zlib"
|
|
||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
"encoding/json"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
@ -14,7 +10,6 @@ import (
|
||||||
"gopkg.in/Clever/kayvee-go.v6/logger"
|
"gopkg.in/Clever/kayvee-go.v6/logger"
|
||||||
|
|
||||||
"github.com/Clever/amazon-kinesis-client-go/kcl"
|
"github.com/Clever/amazon-kinesis-client-go/kcl"
|
||||||
"github.com/Clever/amazon-kinesis-client-go/splitter"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type ignoringSender struct{}
|
type ignoringSender struct{}
|
||||||
|
|
@ -426,75 +421,3 @@ func TestStaggeredCheckpointing(t *testing.T) {
|
||||||
assert.Equal("tag3", string(mocksender.batches["tag3"][2][0]))
|
assert.Equal("tag3", string(mocksender.batches["tag3"][2][0]))
|
||||||
assert.Equal("tag3", string(mocksender.batches["tag3"][2][1]))
|
assert.Equal("tag3", string(mocksender.batches["tag3"][2][1]))
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestSplitIfNecesary(t *testing.T) {
|
|
||||||
|
|
||||||
// We provide three different inputs to batchedWriter.splitMessageIfNecessary
|
|
||||||
// plain text
|
|
||||||
// zlib compressed text
|
|
||||||
// gzip compressed CloudWatch logs batch
|
|
||||||
// we verify that the split function matches the input against the correct splitter
|
|
||||||
// and decodes it.
|
|
||||||
|
|
||||||
assert := assert.New(t)
|
|
||||||
|
|
||||||
mockFailedLogsFile := logger.New("testing")
|
|
||||||
mockconfig := withDefaults(Config{
|
|
||||||
BatchInterval: 10 * time.Millisecond,
|
|
||||||
CheckpointFreq: 20 * time.Millisecond,
|
|
||||||
})
|
|
||||||
|
|
||||||
wrt := NewBatchedWriter(mockconfig, ignoringSender{}, mockFailedLogsFile)
|
|
||||||
|
|
||||||
plainTextInput := []byte("hello, world!")
|
|
||||||
|
|
||||||
records, err := wrt.splitMessageIfNecessary(plainTextInput)
|
|
||||||
assert.NoError(err)
|
|
||||||
assert.Equal(
|
|
||||||
records,
|
|
||||||
[][]byte{[]byte("hello, world!")},
|
|
||||||
)
|
|
||||||
|
|
||||||
var z bytes.Buffer
|
|
||||||
zbuf := zlib.NewWriter(&z)
|
|
||||||
zbuf.Write([]byte("hello, world!"))
|
|
||||||
zbuf.Close()
|
|
||||||
zlibSingleInput := z.Bytes()
|
|
||||||
|
|
||||||
records, err = wrt.splitMessageIfNecessary(zlibSingleInput)
|
|
||||||
assert.NoError(err)
|
|
||||||
assert.Equal(
|
|
||||||
records,
|
|
||||||
[][]byte{[]byte("hello, world!")},
|
|
||||||
)
|
|
||||||
|
|
||||||
// the details of this part aren't super important since the actual functionality is
|
|
||||||
// tested in the splitter package, we just want to make sure that OUR split function
|
|
||||||
// correctly realizes it's gzip and call the splitter package's functionality
|
|
||||||
var g bytes.Buffer
|
|
||||||
gbuf := gzip.NewWriter(&g)
|
|
||||||
cwLogBatch := splitter.LogEventBatch{
|
|
||||||
MessageType: "test",
|
|
||||||
Owner: "test",
|
|
||||||
LogGroup: "test",
|
|
||||||
LogStream: "test",
|
|
||||||
SubscriptionFilters: []string{""},
|
|
||||||
LogEvents: []splitter.LogEvent{{
|
|
||||||
ID: "test",
|
|
||||||
Timestamp: splitter.UnixTimestampMillis(time.Date(2020, time.September, 9, 9, 10, 10, 0, time.UTC)),
|
|
||||||
Message: "test",
|
|
||||||
}},
|
|
||||||
}
|
|
||||||
cwLogBatchJSON, _ := json.Marshal(cwLogBatch)
|
|
||||||
gbuf.Write(cwLogBatchJSON)
|
|
||||||
gbuf.Close()
|
|
||||||
gzipBatchInput := g.Bytes()
|
|
||||||
|
|
||||||
expectedRecord := []byte("2020-09-09T09:10:10.000001+00:00 test test--test/arn%3Aaws%3Aecs%3Aus-east-1%3A999988887777%3Atask%2F12345678-1234-1234-1234-555566667777: test")
|
|
||||||
records, err = wrt.splitMessageIfNecessary(gzipBatchInput)
|
|
||||||
assert.NoError(err)
|
|
||||||
assert.Equal(
|
|
||||||
records,
|
|
||||||
[][]byte{expectedRecord},
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ package splitter
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"compress/gzip"
|
"compress/gzip"
|
||||||
|
"compress/zlib"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
|
|
@ -11,6 +12,31 @@ import (
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// SplitMessageIfNecessary handles three types of records:
|
||||||
|
// - records emitted from CWLogs Subscription (which are gzip compressed)
|
||||||
|
// - uncompressed records emitted from KPL
|
||||||
|
// - zlib compressed records (e.g. as compressed and emitted by Kinesis plugin for Fluent Bit)
|
||||||
|
func SplitMessageIfNecessary(record []byte) ([][]byte, error) {
|
||||||
|
if IsGzipped(record) {
|
||||||
|
// Process a batch of messages from a CWLogs Subscription
|
||||||
|
return GetMessagesFromGzippedInput(record)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to read it as a zlib-compressed record
|
||||||
|
// zlib.NewReader checks for a zlib header and returns an error if not found
|
||||||
|
zlibReader, err := zlib.NewReader(bytes.NewReader(record))
|
||||||
|
if err == nil {
|
||||||
|
unzlibRecord, err := ioutil.ReadAll(zlibReader)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("reading zlib-compressed record: %v", err)
|
||||||
|
}
|
||||||
|
return [][]byte{unzlibRecord}, nil
|
||||||
|
}
|
||||||
|
// Process a single message, from KPL
|
||||||
|
return [][]byte{record}, nil
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
// LogEvent is a single log line within a LogEventBatch
|
// LogEvent is a single log line within a LogEventBatch
|
||||||
type LogEvent struct {
|
type LogEvent struct {
|
||||||
ID string `json:"id"`
|
ID string `json:"id"`
|
||||||
|
|
|
||||||
|
|
@ -3,9 +3,11 @@ package splitter
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"compress/gzip"
|
"compress/gzip"
|
||||||
|
"compress/zlib"
|
||||||
b64 "encoding/base64"
|
b64 "encoding/base64"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/Clever/amazon-kinesis-client-go/decode"
|
"github.com/Clever/amazon-kinesis-client-go/decode"
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
|
|
@ -283,3 +285,67 @@ func TestSplitGlue(t *testing.T) {
|
||||||
assert.Equal(t, "jr_8927660fecacbe026ccab656cb80befea8102ac2023df531b92889b112aada28", enhanced["container_task"])
|
assert.Equal(t, "jr_8927660fecacbe026ccab656cb80befea8102ac2023df531b92889b112aada28", enhanced["container_task"])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSplitIfNecesary(t *testing.T) {
|
||||||
|
|
||||||
|
// We provide three different inputs to SplitMessageIfNecessary:
|
||||||
|
// plain text
|
||||||
|
// zlib compressed text
|
||||||
|
// gzip compressed CloudWatch logs batch
|
||||||
|
// we verify that the split function matches the input against the correct splitter
|
||||||
|
// and decodes it.
|
||||||
|
|
||||||
|
assert := assert.New(t)
|
||||||
|
|
||||||
|
plainTextInput := []byte("hello, world!")
|
||||||
|
|
||||||
|
records, err := SplitMessageIfNecessary(plainTextInput)
|
||||||
|
assert.NoError(err)
|
||||||
|
assert.Equal(
|
||||||
|
records,
|
||||||
|
[][]byte{[]byte("hello, world!")},
|
||||||
|
)
|
||||||
|
|
||||||
|
var z bytes.Buffer
|
||||||
|
zbuf := zlib.NewWriter(&z)
|
||||||
|
zbuf.Write([]byte("hello, world!"))
|
||||||
|
zbuf.Close()
|
||||||
|
zlibSingleInput := z.Bytes()
|
||||||
|
|
||||||
|
records, err = SplitMessageIfNecessary(zlibSingleInput)
|
||||||
|
assert.NoError(err)
|
||||||
|
assert.Equal(
|
||||||
|
records,
|
||||||
|
[][]byte{[]byte("hello, world!")},
|
||||||
|
)
|
||||||
|
|
||||||
|
// the details of this part aren't super important since the actual functionality is
|
||||||
|
// tested in other tests; for this test we just want to make sure that the split function
|
||||||
|
// correctly realizes it's gzip and calls the appropriate CW-log-splitting logic
|
||||||
|
var g bytes.Buffer
|
||||||
|
gbuf := gzip.NewWriter(&g)
|
||||||
|
cwLogBatch := LogEventBatch{
|
||||||
|
MessageType: "test",
|
||||||
|
Owner: "test",
|
||||||
|
LogGroup: "test",
|
||||||
|
LogStream: "test",
|
||||||
|
SubscriptionFilters: []string{""},
|
||||||
|
LogEvents: []LogEvent{{
|
||||||
|
ID: "test",
|
||||||
|
Timestamp: UnixTimestampMillis(time.Date(2020, time.September, 9, 9, 10, 10, 0, time.UTC)),
|
||||||
|
Message: "test",
|
||||||
|
}},
|
||||||
|
}
|
||||||
|
cwLogBatchJSON, _ := json.Marshal(cwLogBatch)
|
||||||
|
gbuf.Write(cwLogBatchJSON)
|
||||||
|
gbuf.Close()
|
||||||
|
gzipBatchInput := g.Bytes()
|
||||||
|
|
||||||
|
expectedRecord := []byte("2020-09-09T09:10:10.000001+00:00 test test--test/arn%3Aaws%3Aecs%3Aus-east-1%3A999988887777%3Atask%2F12345678-1234-1234-1234-555566667777: test")
|
||||||
|
records, err = SplitMessageIfNecessary(gzipBatchInput)
|
||||||
|
assert.NoError(err)
|
||||||
|
assert.Equal(
|
||||||
|
records,
|
||||||
|
[][]byte{expectedRecord},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue