d2parser: Autodetect UTF-16 based on BOM

Turns out I was wrong this is safe.
2023-08-02 10:26:45 -07:00 · 2023-08-02 10:26:45 -07:00 · 203953723e
commit 203953723e
parent b81da1ee62
10 changed files with 44 additions and 86 deletions
--- a/d2compiler/compile.go
+++ b/d2compiler/compile.go
@ -21,27 +21,27 @@ import (
 )

 type CompileOptions struct {
-	UTF16 bool
+	UTF16Pos bool
 	// FS is the file system used for resolving imports in the d2 text.
 	// It should correspond to the root path.
 	FS fs.FS
 }

-func Compile(p string, r io.RuneReader, opts *CompileOptions) (*d2graph.Graph, *d2target.Config, error) {
+func Compile(p string, r io.Reader, opts *CompileOptions) (*d2graph.Graph, *d2target.Config, error) {
 	if opts == nil {
 		opts = &CompileOptions{}
 	}

 	ast, err := d2parser.Parse(p, r, &d2parser.ParseOptions{
-		UTF16: opts.UTF16,
+		UTF16Pos: opts.UTF16Pos,
 	})
 	if err != nil {
 		return nil, nil, err
 	}

 	ir, err := d2ir.Compile(ast, &d2ir.CompileOptions{
-		UTF16: opts.UTF16,
-		FS:    opts.FS,
+		UTF16Pos: opts.UTF16Pos,
+		FS:       opts.FS,
 	})
 	if err != nil {
 		return nil, nil, err
--- a/d2exporter/export_test.go
+++ b/d2exporter/export_test.go
@ -223,7 +223,7 @@ func run(t *testing.T, tc testCase) {
 	ctx = log.Leveled(ctx, slog.LevelDebug)

 	g, config, err := d2compiler.Compile("", strings.NewReader(tc.dsl), &d2compiler.CompileOptions{
-		UTF16: true,
+		UTF16Pos: true,
 	})
 	if err != nil {
 		t.Fatal(err)
--- a/d2ir/compile.go
+++ b/d2ir/compile.go
@ -21,13 +21,13 @@ type compiler struct {
 	importStack []string
 	// importCache enables reuse of files imported multiple times.
 	importCache map[string]*Map
-	utf16       bool
+	utf16Pos    bool

 	globStack []bool
 }

 type CompileOptions struct {
-	UTF16 bool
+	UTF16Pos bool
 	// Pass nil to disable imports.
 	FS fs.FS
 }
@ -45,7 +45,7 @@ func Compile(ast *d2ast.Map, opts *CompileOptions) (*Map, error) {
 		fs:  opts.FS,

 		importCache: make(map[string]*Map),
-		utf16:       opts.UTF16,
+		utf16Pos:    opts.UTF16Pos,
 	}
 	m := &Map{}
 	m.initRoot()
--- a/d2ir/import.go
+++ b/d2ir/import.go
@ -99,7 +99,7 @@ func (c *compiler) __import(imp *d2ast.Import) (*Map, bool) {
 	defer f.Close()

 	ast, err := d2parser.Parse(impPath, f, &d2parser.ParseOptions{
-		UTF16:      c.utf16,
+		UTF16Pos:   c.utf16Pos,
 		ParseError: c.err,
 	})
 	if err != nil {
--- a/d2lib/d2.go
+++ b/d2lib/d2.go
@ -23,7 +23,7 @@ import (
 )

 type CompileOptions struct {
-	UTF16          bool
+	UTF16Pos       bool
 	FS             fs.FS
 	MeasuredTexts  []*d2target.MText
 	Ruler          *textmeasure.Ruler
@ -50,8 +50,8 @@ func Compile(ctx context.Context, input string, compileOpts *CompileOptions, ren
 	}

 	g, config, err := d2compiler.Compile(compileOpts.InputPath, strings.NewReader(input), &d2compiler.CompileOptions{
-		UTF16: compileOpts.UTF16,
-		FS:    compileOpts.FS,
+		UTF16Pos: compileOpts.UTF16Pos,
+		FS:       compileOpts.FS,
 	})
 	if err != nil {
 		return nil, nil, err
--- a/d2parser/parse.go
+++ b/d2parser/parse.go
@ -2,6 +2,7 @@ package d2parser

 import (
 	"bufio"
+	"bytes"
 	"fmt"
 	"io"
 	"math/big"
@ -23,9 +24,6 @@ type ParseOptions struct {
 	// So you want to read UTF-8 still but adjust the indexes to pretend the input is utf16.
 	UTF16Pos bool

-	// UTF16Input makes the parser read the input as UTF16 and also sets UTF16Pos.
-	UTF16Input bool
-
 	ParseError *ParseError
 }

@ -44,25 +42,36 @@ type ParseOptions struct {
 func Parse(path string, r io.Reader, opts *ParseOptions) (*d2ast.Map, error) {
 	if opts == nil {
 		opts = &ParseOptions{
-			UTF16Pos:   false,
-			UTF16Input: false,
+			UTF16Pos: false,
 		}
 	}

 	p := &parser{
 		path: path,

-		utf16Input: opts.UTF16Input,
-		utf16Pos:   opts.UTF16Pos,
-		err:        opts.ParseError,
+		utf16Pos: opts.UTF16Pos,
+		err:      opts.ParseError,
 	}
-	if p.utf16Input {
-		p.utf16Pos = true
-		tr := transform.NewReader(r, tunicode.UTF16(tunicode.LittleEndian, tunicode.UseBOM).NewDecoder())
-		p.reader = bufio.NewReader(tr)
-	} else {
-		p.reader = bufio.NewReader(r)
+	br := bufio.NewReader(r)
+	p.reader = br
+
+	bom, err := br.Peek(2)
+	if err == nil {
+		// 0xFFFE is invalid UTF-8 so this is safe.
+		// Also a different BOM is used for UTF-8.
+		// See https://unicode.org/faq/utf_bom.html#bom4
+		if bom[0] == 0xFF && bom[1] == 0xFE {
+			p.utf16Pos = true
+
+			buf := make([]byte, br.Buffered())
+			io.ReadFull(br, buf)
+
+			mr := io.MultiReader(bytes.NewBuffer(buf), r)
+			tr := transform.NewReader(mr, tunicode.UTF16(tunicode.LittleEndian, tunicode.UseBOM).NewDecoder())
+			br.Reset(tr)
+		}
 	}
+
 	if p.err == nil {
 		p.err = &ParseError{}
 	}
@ -131,10 +140,9 @@ func ParseValue(value string) (d2ast.Value, error) {
 //
 // TODO: ast struct that combines map & errors and pass that around
 type parser struct {
-	path       string
-	pos        d2ast.Position
-	utf16Pos   bool
-	utf16Input bool
+	path     string
+	pos      d2ast.Position
+	utf16Pos bool

 	reader    io.RuneReader
 	readerPos d2ast.Position
@ -212,10 +220,7 @@ func (p *parser) _readRune() (r rune, eof bool) {

 	p.readerPos = p.lookaheadPos

-	r, n, err := p.reader.ReadRune()
-	if p.utf16Input && n > 0 {
-		// TODO:
-	}
+	r, _, err := p.reader.ReadRune()
 	if err != nil {
 		p.ioerr = true
 		if err != io.EOF {
--- a/d2parser/parse_test.go
+++ b/d2parser/parse_test.go
@ -17,7 +17,6 @@ import (
 type testCase struct {
 	name   string
 	text   string
-	utf16  bool
 	assert func(t testing.TB, ast *d2ast.Map, err error)
 }

@ -395,20 +394,12 @@ c-
 		},
 		{
 			name: "utf16-input",
-			utf16: true,
 			text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00",
 			assert: func(t testing.TB, ast *d2ast.Map, err error) {
 				assert.Success(t, err)
 				assert.Equal(t, "x -> y\n", d2format.Format(ast))
 			},
 		},
-		{
-			name: "errors/utf16-input",
-			text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00",
-			assert: func(t testing.TB, ast *d2ast.Map, err error) {
-				assert.ErrorString(t, err, `d2/testdata/d2parser/TestParse/errors/utf16-input.d2:1:13: invalid text beginning unquoted key`)
-			},
-		},
 	}

 	t.Run("import", testImport)
@ -510,9 +501,6 @@ func runa(t *testing.T, tca []testCase) {

 			d2Path := fmt.Sprintf("d2/testdata/d2parser/%v.d2", t.Name())
 			opts := &d2parser.ParseOptions{}
-			if tc.utf16 {
-				opts.UTF16Input = true
-			}
 			ast, err := d2parser.Parse(d2Path, strings.NewReader(tc.text), opts)

 			if tc.assert != nil {
--- a/d2parser/utf16_gen.go
+++ b/d2parser/utf16_gen.go
@ -10,6 +10,7 @@ import (
 	"io"
 	"log"
 	"os"
+	"unicode/utf8"

 	"golang.org/x/text/encoding/unicode"
 	"golang.org/x/text/transform"
@ -27,6 +28,8 @@ func main() {
 	}

 	fmt.Printf("%q\n", b.String())
+	fmt.Println("\xFF\xFE")
+	fmt.Println(utf8.ValidString("\xFF\xFE"))

 	err = os.WriteFile("./utf16.d2", b.Bytes(), 0644)
 	if err != nil {
--- a/e2etests/e2e_test.go
+++ b/e2etests/e2e_test.go
@ -111,7 +111,7 @@ func serde(t *testing.T, tc testCase, ruler *textmeasure.Ruler) {
 	ctx := context.Background()
 	ctx = log.WithTB(ctx, t, nil)
 	g, _, err := d2compiler.Compile("", strings.NewReader(tc.script), &d2compiler.CompileOptions{
-		UTF16: false,
+		UTF16Pos: false,
 	})
 	trequire.Nil(t, err)
 	if len(g.Objects) > 0 {
--- a/testdata/d2parser/TestParse/errors/utf16-input.exp.json
+++ b/testdata/d2parser/TestParse/errors/utf16-input.exp.json
@ -1,38 +0,0 @@
-{
-  "ast": {
-    "range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,0:0:0-1:1:22",
-    "nodes": [
-      {
-        "map_key": {
-          "range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22",
-          "key": {
-            "range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22",
-            "path": [
-              {
-                "unquoted_string": {
-                  "range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22",
-                  "value": [
-                    {
-                      "string": "\u0000",
-                      "raw_string": "\u0000"
-                    }
-                  ]
-                }
-              }
-            ]
-          },
-          "primary": {},
-          "value": {}
-        }
-      }
-    ]
-  },
-  "err": {
-    "errs": [
-      {
-        "range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,0:12:12-0:20:20",
-        "errmsg": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2:1:13: invalid text beginning unquoted key"
-      }
-    ]
-  }
-}