d2parser: Autodetect UTF-16 based on BOM

Turns out I was wrong; this is safe.
This commit is contained in:
Anmol Sethi 2023-08-02 10:26:45 -07:00
parent b81da1ee62
commit 203953723e
No known key found for this signature in database
GPG key ID: 8CEF1878FF10ADEB
10 changed files with 44 additions and 86 deletions

View file

@ -21,27 +21,27 @@ import (
) )
type CompileOptions struct { type CompileOptions struct {
UTF16 bool UTF16Pos bool
// FS is the file system used for resolving imports in the d2 text. // FS is the file system used for resolving imports in the d2 text.
// It should correspond to the root path. // It should correspond to the root path.
FS fs.FS FS fs.FS
} }
func Compile(p string, r io.RuneReader, opts *CompileOptions) (*d2graph.Graph, *d2target.Config, error) { func Compile(p string, r io.Reader, opts *CompileOptions) (*d2graph.Graph, *d2target.Config, error) {
if opts == nil { if opts == nil {
opts = &CompileOptions{} opts = &CompileOptions{}
} }
ast, err := d2parser.Parse(p, r, &d2parser.ParseOptions{ ast, err := d2parser.Parse(p, r, &d2parser.ParseOptions{
UTF16: opts.UTF16, UTF16Pos: opts.UTF16Pos,
}) })
if err != nil { if err != nil {
return nil, nil, err return nil, nil, err
} }
ir, err := d2ir.Compile(ast, &d2ir.CompileOptions{ ir, err := d2ir.Compile(ast, &d2ir.CompileOptions{
UTF16: opts.UTF16, UTF16Pos: opts.UTF16Pos,
FS: opts.FS, FS: opts.FS,
}) })
if err != nil { if err != nil {
return nil, nil, err return nil, nil, err

View file

@ -223,7 +223,7 @@ func run(t *testing.T, tc testCase) {
ctx = log.Leveled(ctx, slog.LevelDebug) ctx = log.Leveled(ctx, slog.LevelDebug)
g, config, err := d2compiler.Compile("", strings.NewReader(tc.dsl), &d2compiler.CompileOptions{ g, config, err := d2compiler.Compile("", strings.NewReader(tc.dsl), &d2compiler.CompileOptions{
UTF16: true, UTF16Pos: true,
}) })
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)

View file

@ -21,13 +21,13 @@ type compiler struct {
importStack []string importStack []string
// importCache enables reuse of files imported multiple times. // importCache enables reuse of files imported multiple times.
importCache map[string]*Map importCache map[string]*Map
utf16 bool utf16Pos bool
globStack []bool globStack []bool
} }
type CompileOptions struct { type CompileOptions struct {
UTF16 bool UTF16Pos bool
// Pass nil to disable imports. // Pass nil to disable imports.
FS fs.FS FS fs.FS
} }
@ -45,7 +45,7 @@ func Compile(ast *d2ast.Map, opts *CompileOptions) (*Map, error) {
fs: opts.FS, fs: opts.FS,
importCache: make(map[string]*Map), importCache: make(map[string]*Map),
utf16: opts.UTF16, utf16Pos: opts.UTF16Pos,
} }
m := &Map{} m := &Map{}
m.initRoot() m.initRoot()

View file

@ -99,7 +99,7 @@ func (c *compiler) __import(imp *d2ast.Import) (*Map, bool) {
defer f.Close() defer f.Close()
ast, err := d2parser.Parse(impPath, f, &d2parser.ParseOptions{ ast, err := d2parser.Parse(impPath, f, &d2parser.ParseOptions{
UTF16: c.utf16, UTF16Pos: c.utf16Pos,
ParseError: c.err, ParseError: c.err,
}) })
if err != nil { if err != nil {

View file

@ -23,7 +23,7 @@ import (
) )
type CompileOptions struct { type CompileOptions struct {
UTF16 bool UTF16Pos bool
FS fs.FS FS fs.FS
MeasuredTexts []*d2target.MText MeasuredTexts []*d2target.MText
Ruler *textmeasure.Ruler Ruler *textmeasure.Ruler
@ -50,8 +50,8 @@ func Compile(ctx context.Context, input string, compileOpts *CompileOptions, ren
} }
g, config, err := d2compiler.Compile(compileOpts.InputPath, strings.NewReader(input), &d2compiler.CompileOptions{ g, config, err := d2compiler.Compile(compileOpts.InputPath, strings.NewReader(input), &d2compiler.CompileOptions{
UTF16: compileOpts.UTF16, UTF16Pos: compileOpts.UTF16Pos,
FS: compileOpts.FS, FS: compileOpts.FS,
}) })
if err != nil { if err != nil {
return nil, nil, err return nil, nil, err

View file

@ -2,6 +2,7 @@ package d2parser
import ( import (
"bufio" "bufio"
"bytes"
"fmt" "fmt"
"io" "io"
"math/big" "math/big"
@ -23,9 +24,6 @@ type ParseOptions struct {
// So you want to read UTF-8 still but adjust the indexes to pretend the input is utf16. // So you want to read UTF-8 still but adjust the indexes to pretend the input is utf16.
UTF16Pos bool UTF16Pos bool
// UTF16Input makes the parser read the input as UTF16 and also sets UTF16Pos.
UTF16Input bool
ParseError *ParseError ParseError *ParseError
} }
@ -44,25 +42,36 @@ type ParseOptions struct {
func Parse(path string, r io.Reader, opts *ParseOptions) (*d2ast.Map, error) { func Parse(path string, r io.Reader, opts *ParseOptions) (*d2ast.Map, error) {
if opts == nil { if opts == nil {
opts = &ParseOptions{ opts = &ParseOptions{
UTF16Pos: false, UTF16Pos: false,
UTF16Input: false,
} }
} }
p := &parser{ p := &parser{
path: path, path: path,
utf16Input: opts.UTF16Input, utf16Pos: opts.UTF16Pos,
utf16Pos: opts.UTF16Pos, err: opts.ParseError,
err: opts.ParseError,
} }
if p.utf16Input { br := bufio.NewReader(r)
p.utf16Pos = true p.reader = br
tr := transform.NewReader(r, tunicode.UTF16(tunicode.LittleEndian, tunicode.UseBOM).NewDecoder())
p.reader = bufio.NewReader(tr) bom, err := br.Peek(2)
} else { if err == nil {
p.reader = bufio.NewReader(r) // 0xFFFE is invalid UTF-8 so this is safe.
// Also a different BOM is used for UTF-8.
// See https://unicode.org/faq/utf_bom.html#bom4
if bom[0] == 0xFF && bom[1] == 0xFE {
p.utf16Pos = true
buf := make([]byte, br.Buffered())
io.ReadFull(br, buf)
mr := io.MultiReader(bytes.NewBuffer(buf), r)
tr := transform.NewReader(mr, tunicode.UTF16(tunicode.LittleEndian, tunicode.UseBOM).NewDecoder())
br.Reset(tr)
}
} }
if p.err == nil { if p.err == nil {
p.err = &ParseError{} p.err = &ParseError{}
} }
@ -131,10 +140,9 @@ func ParseValue(value string) (d2ast.Value, error) {
// //
// TODO: ast struct that combines map & errors and pass that around // TODO: ast struct that combines map & errors and pass that around
type parser struct { type parser struct {
path string path string
pos d2ast.Position pos d2ast.Position
utf16Pos bool utf16Pos bool
utf16Input bool
reader io.RuneReader reader io.RuneReader
readerPos d2ast.Position readerPos d2ast.Position
@ -212,10 +220,7 @@ func (p *parser) _readRune() (r rune, eof bool) {
p.readerPos = p.lookaheadPos p.readerPos = p.lookaheadPos
r, n, err := p.reader.ReadRune() r, _, err := p.reader.ReadRune()
if p.utf16Input && n > 0 {
// TODO:
}
if err != nil { if err != nil {
p.ioerr = true p.ioerr = true
if err != io.EOF { if err != io.EOF {

View file

@ -17,7 +17,6 @@ import (
type testCase struct { type testCase struct {
name string name string
text string text string
utf16 bool
assert func(t testing.TB, ast *d2ast.Map, err error) assert func(t testing.TB, ast *d2ast.Map, err error)
} }
@ -395,20 +394,12 @@ c-
}, },
{ {
name: "utf16-input", name: "utf16-input",
utf16: true,
text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00", text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00",
assert: func(t testing.TB, ast *d2ast.Map, err error) { assert: func(t testing.TB, ast *d2ast.Map, err error) {
assert.Success(t, err) assert.Success(t, err)
assert.Equal(t, "x -> y\n", d2format.Format(ast)) assert.Equal(t, "x -> y\n", d2format.Format(ast))
}, },
}, },
{
name: "errors/utf16-input",
text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00",
assert: func(t testing.TB, ast *d2ast.Map, err error) {
assert.ErrorString(t, err, `d2/testdata/d2parser/TestParse/errors/utf16-input.d2:1:13: invalid text beginning unquoted key`)
},
},
} }
t.Run("import", testImport) t.Run("import", testImport)
@ -510,9 +501,6 @@ func runa(t *testing.T, tca []testCase) {
d2Path := fmt.Sprintf("d2/testdata/d2parser/%v.d2", t.Name()) d2Path := fmt.Sprintf("d2/testdata/d2parser/%v.d2", t.Name())
opts := &d2parser.ParseOptions{} opts := &d2parser.ParseOptions{}
if tc.utf16 {
opts.UTF16Input = true
}
ast, err := d2parser.Parse(d2Path, strings.NewReader(tc.text), opts) ast, err := d2parser.Parse(d2Path, strings.NewReader(tc.text), opts)
if tc.assert != nil { if tc.assert != nil {

View file

@ -10,6 +10,7 @@ import (
"io" "io"
"log" "log"
"os" "os"
"unicode/utf8"
"golang.org/x/text/encoding/unicode" "golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform" "golang.org/x/text/transform"
@ -27,6 +28,8 @@ func main() {
} }
fmt.Printf("%q\n", b.String()) fmt.Printf("%q\n", b.String())
fmt.Println("\xFF\xFE")
fmt.Println(utf8.ValidString("\xFF\xFE"))
err = os.WriteFile("./utf16.d2", b.Bytes(), 0644) err = os.WriteFile("./utf16.d2", b.Bytes(), 0644)
if err != nil { if err != nil {

View file

@ -111,7 +111,7 @@ func serde(t *testing.T, tc testCase, ruler *textmeasure.Ruler) {
ctx := context.Background() ctx := context.Background()
ctx = log.WithTB(ctx, t, nil) ctx = log.WithTB(ctx, t, nil)
g, _, err := d2compiler.Compile("", strings.NewReader(tc.script), &d2compiler.CompileOptions{ g, _, err := d2compiler.Compile("", strings.NewReader(tc.script), &d2compiler.CompileOptions{
UTF16: false, UTF16Pos: false,
}) })
trequire.Nil(t, err) trequire.Nil(t, err)
if len(g.Objects) > 0 { if len(g.Objects) > 0 {

View file

@ -1,38 +0,0 @@
{
"ast": {
"range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,0:0:0-1:1:22",
"nodes": [
{
"map_key": {
"range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22",
"key": {
"range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22",
"path": [
{
"unquoted_string": {
"range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22",
"value": [
{
"string": "\u0000",
"raw_string": "\u0000"
}
]
}
}
]
},
"primary": {},
"value": {}
}
}
]
},
"err": {
"errs": [
{
"range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,0:12:12-0:20:20",
"errmsg": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2:1:13: invalid text beginning unquoted key"
}
]
}
}