From 203953723e17e426b4fd3261e7c9af6b0a98a0fa Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Wed, 2 Aug 2023 10:26:45 -0700 Subject: [PATCH] d2parser: Autodetect UTF-16 based on BOM Turns out I was wrong this is safe. --- d2compiler/compile.go | 10 ++-- d2exporter/export_test.go | 2 +- d2ir/compile.go | 6 +-- d2ir/import.go | 2 +- d2lib/d2.go | 6 +-- d2parser/parse.go | 49 ++++++++++--------- d2parser/parse_test.go | 12 ----- d2parser/utf16_gen.go | 3 ++ e2etests/e2e_test.go | 2 +- .../TestParse/errors/utf16-input.exp.json | 38 -------------- 10 files changed, 44 insertions(+), 86 deletions(-) delete mode 100644 testdata/d2parser/TestParse/errors/utf16-input.exp.json diff --git a/d2compiler/compile.go b/d2compiler/compile.go index 7c9e60ccc..f777ff3dd 100644 --- a/d2compiler/compile.go +++ b/d2compiler/compile.go @@ -21,27 +21,27 @@ import ( ) type CompileOptions struct { - UTF16 bool + UTF16Pos bool // FS is the file system used for resolving imports in the d2 text. // It should correspond to the root path. FS fs.FS } -func Compile(p string, r io.RuneReader, opts *CompileOptions) (*d2graph.Graph, *d2target.Config, error) { +func Compile(p string, r io.Reader, opts *CompileOptions) (*d2graph.Graph, *d2target.Config, error) { if opts == nil { opts = &CompileOptions{} } ast, err := d2parser.Parse(p, r, &d2parser.ParseOptions{ - UTF16: opts.UTF16, + UTF16Pos: opts.UTF16Pos, }) if err != nil { return nil, nil, err } ir, err := d2ir.Compile(ast, &d2ir.CompileOptions{ - UTF16: opts.UTF16, - FS: opts.FS, + UTF16Pos: opts.UTF16Pos, + FS: opts.FS, }) if err != nil { return nil, nil, err diff --git a/d2exporter/export_test.go b/d2exporter/export_test.go index 5b02901ae..4eb1974c9 100644 --- a/d2exporter/export_test.go +++ b/d2exporter/export_test.go @@ -223,7 +223,7 @@ func run(t *testing.T, tc testCase) { ctx = log.Leveled(ctx, slog.LevelDebug) g, config, err := d2compiler.Compile("", strings.NewReader(tc.dsl), &d2compiler.CompileOptions{ - UTF16: true, + UTF16Pos: true, }) if err != nil { t.Fatal(err) diff --git a/d2ir/compile.go b/d2ir/compile.go index a0783923e..9bb92b253 100644 --- a/d2ir/compile.go +++ b/d2ir/compile.go @@ -21,13 +21,13 @@ type compiler struct { importStack []string // importCache enables reuse of files imported multiple times. importCache map[string]*Map - utf16 bool + utf16Pos bool globStack []bool } type CompileOptions struct { - UTF16 bool + UTF16Pos bool // Pass nil to disable imports. FS fs.FS } @@ -45,7 +45,7 @@ func Compile(ast *d2ast.Map, opts *CompileOptions) (*Map, error) { fs: opts.FS, importCache: make(map[string]*Map), - utf16: opts.UTF16, + utf16Pos: opts.UTF16Pos, } m := &Map{} m.initRoot() diff --git a/d2ir/import.go b/d2ir/import.go index 383e5c24c..44cf509ff 100644 --- a/d2ir/import.go +++ b/d2ir/import.go @@ -99,7 +99,7 @@ func (c *compiler) __import(imp *d2ast.Import) (*Map, bool) { defer f.Close() ast, err := d2parser.Parse(impPath, f, &d2parser.ParseOptions{ - UTF16: c.utf16, + UTF16Pos: c.utf16Pos, ParseError: c.err, }) if err != nil { diff --git a/d2lib/d2.go b/d2lib/d2.go index 18372ee73..c34eb9206 100644 --- a/d2lib/d2.go +++ b/d2lib/d2.go @@ -23,7 +23,7 @@ import ( ) type CompileOptions struct { - UTF16 bool + UTF16Pos bool FS fs.FS MeasuredTexts []*d2target.MText Ruler *textmeasure.Ruler @@ -50,8 +50,8 @@ func Compile(ctx context.Context, input string, compileOpts *CompileOptions, ren } g, config, err := d2compiler.Compile(compileOpts.InputPath, strings.NewReader(input), &d2compiler.CompileOptions{ - UTF16: compileOpts.UTF16, - FS: compileOpts.FS, + UTF16Pos: compileOpts.UTF16Pos, + FS: compileOpts.FS, }) if err != nil { return nil, nil, err diff --git a/d2parser/parse.go b/d2parser/parse.go index 2eeae234a..732038480 100644 --- a/d2parser/parse.go +++ b/d2parser/parse.go @@ -2,6 +2,7 @@ package d2parser import ( "bufio" + "bytes" "fmt" "io" "math/big" @@ -23,9 +24,6 @@ type ParseOptions struct { // So you want to read UTF-8 still but adjust the indexes to pretend the input is utf16. UTF16Pos bool - // UTF16Input makes the parser read the input as UTF16 and also sets UTF16Pos. - UTF16Input bool - ParseError *ParseError } @@ -44,25 +42,36 @@ type ParseOptions struct { func Parse(path string, r io.Reader, opts *ParseOptions) (*d2ast.Map, error) { if opts == nil { opts = &ParseOptions{ - UTF16Pos: false, - UTF16Input: false, + UTF16Pos: false, } } p := &parser{ path: path, - utf16Input: opts.UTF16Input, - utf16Pos: opts.UTF16Pos, - err: opts.ParseError, + utf16Pos: opts.UTF16Pos, + err: opts.ParseError, } - if p.utf16Input { - p.utf16Pos = true - tr := transform.NewReader(r, tunicode.UTF16(tunicode.LittleEndian, tunicode.UseBOM).NewDecoder()) - p.reader = bufio.NewReader(tr) - } else { - p.reader = bufio.NewReader(r) + br := bufio.NewReader(r) + p.reader = br + + bom, err := br.Peek(2) + if err == nil { + // 0xFFFE is invalid UTF-8 so this is safe. + // Also a different BOM is used for UTF-8. + // See https://unicode.org/faq/utf_bom.html#bom4 + if bom[0] == 0xFF && bom[1] == 0xFE { + p.utf16Pos = true + + buf := make([]byte, br.Buffered()) + io.ReadFull(br, buf) + + mr := io.MultiReader(bytes.NewBuffer(buf), r) + tr := transform.NewReader(mr, tunicode.UTF16(tunicode.LittleEndian, tunicode.UseBOM).NewDecoder()) + br.Reset(tr) + } } + if p.err == nil { p.err = &ParseError{} } @@ -131,10 +140,9 @@ func ParseValue(value string) (d2ast.Value, error) { // // TODO: ast struct that combines map & errors and pass that around type parser struct { - path string - pos d2ast.Position - utf16Pos bool - utf16Input bool + path string + pos d2ast.Position + utf16Pos bool reader io.RuneReader readerPos d2ast.Position @@ -212,10 +220,7 @@ func (p *parser) _readRune() (r rune, eof bool) { p.readerPos = p.lookaheadPos - r, n, err := p.reader.ReadRune() - if p.utf16Input && n > 0 { - // TODO: - } + r, _, err := p.reader.ReadRune() if err != nil { p.ioerr = true if err != io.EOF { diff --git a/d2parser/parse_test.go b/d2parser/parse_test.go index 2b7300ad3..605be68f7 100644 --- a/d2parser/parse_test.go +++ b/d2parser/parse_test.go @@ -17,7 +17,6 @@ import ( type testCase struct { name string text string - utf16 bool assert func(t testing.TB, ast *d2ast.Map, err error) } @@ -395,20 +394,12 @@ c- }, { name: "utf16-input", - utf16: true, text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00", assert: func(t testing.TB, ast *d2ast.Map, err error) { assert.Success(t, err) assert.Equal(t, "x -> y\n", d2format.Format(ast)) }, }, - { - name: "errors/utf16-input", - text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00", - assert: func(t testing.TB, ast *d2ast.Map, err error) { - assert.ErrorString(t, err, `d2/testdata/d2parser/TestParse/errors/utf16-input.d2:1:13: invalid text beginning unquoted key`) - }, - }, } t.Run("import", testImport) @@ -510,9 +501,6 @@ func runa(t *testing.T, tca []testCase) { d2Path := fmt.Sprintf("d2/testdata/d2parser/%v.d2", t.Name()) opts := &d2parser.ParseOptions{} - if tc.utf16 { - opts.UTF16Input = true - } ast, err := d2parser.Parse(d2Path, strings.NewReader(tc.text), opts) if tc.assert != nil { diff --git a/d2parser/utf16_gen.go b/d2parser/utf16_gen.go index 090705dfe..2acecc16d 100644 --- a/d2parser/utf16_gen.go +++ b/d2parser/utf16_gen.go @@ -10,6 +10,7 @@ import ( "io" "log" "os" + "unicode/utf8" "golang.org/x/text/encoding/unicode" "golang.org/x/text/transform" @@ -27,6 +28,8 @@ func main() { } fmt.Printf("%q\n", b.String()) + fmt.Println("\xFF\xFE") + fmt.Println(utf8.ValidString("\xFF\xFE")) err = os.WriteFile("./utf16.d2", b.Bytes(), 0644) if err != nil { diff --git a/e2etests/e2e_test.go b/e2etests/e2e_test.go index 53201f8d7..2eaa799af 100644 --- a/e2etests/e2e_test.go +++ b/e2etests/e2e_test.go @@ -111,7 +111,7 @@ func serde(t *testing.T, tc testCase, ruler *textmeasure.Ruler) { ctx := context.Background() ctx = log.WithTB(ctx, t, nil) g, _, err := d2compiler.Compile("", strings.NewReader(tc.script), &d2compiler.CompileOptions{ - UTF16: false, + UTF16Pos: false, }) trequire.Nil(t, err) if len(g.Objects) > 0 { diff --git a/testdata/d2parser/TestParse/errors/utf16-input.exp.json b/testdata/d2parser/TestParse/errors/utf16-input.exp.json deleted file mode 100644 index 81f075aa7..000000000 --- a/testdata/d2parser/TestParse/errors/utf16-input.exp.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "ast": { - "range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,0:0:0-1:1:22", - "nodes": [ - { - "map_key": { - "range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22", - "key": { - "range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22", - "path": [ - { - "unquoted_string": { - "range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22", - "value": [ - { - "string": "\u0000", - "raw_string": "\u0000" - } - ] - } - } - ] - }, - "primary": {}, - "value": {} - } - } - ] - }, - "err": { - "errs": [ - { - "range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,0:12:12-0:20:20", - "errmsg": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2:1:13: invalid text beginning unquoted key" - } - ] - } -}