d2parser: Autodetect UTF-16 based on BOM

Turns out I was wrong this is safe.
This commit is contained in:
Anmol Sethi 2023-08-02 10:26:45 -07:00
parent b81da1ee62
commit 203953723e
No known key found for this signature in database
GPG key ID: 8CEF1878FF10ADEB
10 changed files with 44 additions and 86 deletions

View file

@ -21,27 +21,27 @@ import (
)
type CompileOptions struct {
UTF16 bool
UTF16Pos bool
// FS is the file system used for resolving imports in the d2 text.
// It should correspond to the root path.
FS fs.FS
}
func Compile(p string, r io.RuneReader, opts *CompileOptions) (*d2graph.Graph, *d2target.Config, error) {
func Compile(p string, r io.Reader, opts *CompileOptions) (*d2graph.Graph, *d2target.Config, error) {
if opts == nil {
opts = &CompileOptions{}
}
ast, err := d2parser.Parse(p, r, &d2parser.ParseOptions{
UTF16: opts.UTF16,
UTF16Pos: opts.UTF16Pos,
})
if err != nil {
return nil, nil, err
}
ir, err := d2ir.Compile(ast, &d2ir.CompileOptions{
UTF16: opts.UTF16,
FS: opts.FS,
UTF16Pos: opts.UTF16Pos,
FS: opts.FS,
})
if err != nil {
return nil, nil, err

View file

@ -223,7 +223,7 @@ func run(t *testing.T, tc testCase) {
ctx = log.Leveled(ctx, slog.LevelDebug)
g, config, err := d2compiler.Compile("", strings.NewReader(tc.dsl), &d2compiler.CompileOptions{
UTF16: true,
UTF16Pos: true,
})
if err != nil {
t.Fatal(err)

View file

@ -21,13 +21,13 @@ type compiler struct {
importStack []string
// importCache enables reuse of files imported multiple times.
importCache map[string]*Map
utf16 bool
utf16Pos bool
globStack []bool
}
type CompileOptions struct {
UTF16 bool
UTF16Pos bool
// Pass nil to disable imports.
FS fs.FS
}
@ -45,7 +45,7 @@ func Compile(ast *d2ast.Map, opts *CompileOptions) (*Map, error) {
fs: opts.FS,
importCache: make(map[string]*Map),
utf16: opts.UTF16,
utf16Pos: opts.UTF16Pos,
}
m := &Map{}
m.initRoot()

View file

@ -99,7 +99,7 @@ func (c *compiler) __import(imp *d2ast.Import) (*Map, bool) {
defer f.Close()
ast, err := d2parser.Parse(impPath, f, &d2parser.ParseOptions{
UTF16: c.utf16,
UTF16Pos: c.utf16Pos,
ParseError: c.err,
})
if err != nil {

View file

@ -23,7 +23,7 @@ import (
)
type CompileOptions struct {
UTF16 bool
UTF16Pos bool
FS fs.FS
MeasuredTexts []*d2target.MText
Ruler *textmeasure.Ruler
@ -50,8 +50,8 @@ func Compile(ctx context.Context, input string, compileOpts *CompileOptions, ren
}
g, config, err := d2compiler.Compile(compileOpts.InputPath, strings.NewReader(input), &d2compiler.CompileOptions{
UTF16: compileOpts.UTF16,
FS: compileOpts.FS,
UTF16Pos: compileOpts.UTF16Pos,
FS: compileOpts.FS,
})
if err != nil {
return nil, nil, err

View file

@ -2,6 +2,7 @@ package d2parser
import (
"bufio"
"bytes"
"fmt"
"io"
"math/big"
@ -23,9 +24,6 @@ type ParseOptions struct {
// So you want to read UTF-8 still but adjust the indexes to pretend the input is utf16.
UTF16Pos bool
// UTF16Input makes the parser read the input as UTF16 and also sets UTF16Pos.
UTF16Input bool
ParseError *ParseError
}
@ -44,25 +42,36 @@ type ParseOptions struct {
func Parse(path string, r io.Reader, opts *ParseOptions) (*d2ast.Map, error) {
if opts == nil {
opts = &ParseOptions{
UTF16Pos: false,
UTF16Input: false,
UTF16Pos: false,
}
}
p := &parser{
path: path,
utf16Input: opts.UTF16Input,
utf16Pos: opts.UTF16Pos,
err: opts.ParseError,
utf16Pos: opts.UTF16Pos,
err: opts.ParseError,
}
if p.utf16Input {
p.utf16Pos = true
tr := transform.NewReader(r, tunicode.UTF16(tunicode.LittleEndian, tunicode.UseBOM).NewDecoder())
p.reader = bufio.NewReader(tr)
} else {
p.reader = bufio.NewReader(r)
br := bufio.NewReader(r)
p.reader = br
bom, err := br.Peek(2)
if err == nil {
// 0xFFFE is invalid UTF-8 so this is safe.
// Also a different BOM is used for UTF-8.
// See https://unicode.org/faq/utf_bom.html#bom4
if bom[0] == 0xFF && bom[1] == 0xFE {
p.utf16Pos = true
buf := make([]byte, br.Buffered())
io.ReadFull(br, buf)
mr := io.MultiReader(bytes.NewBuffer(buf), r)
tr := transform.NewReader(mr, tunicode.UTF16(tunicode.LittleEndian, tunicode.UseBOM).NewDecoder())
br.Reset(tr)
}
}
if p.err == nil {
p.err = &ParseError{}
}
@ -131,10 +140,9 @@ func ParseValue(value string) (d2ast.Value, error) {
//
// TODO: ast struct that combines map & errors and pass that around
type parser struct {
path string
pos d2ast.Position
utf16Pos bool
utf16Input bool
path string
pos d2ast.Position
utf16Pos bool
reader io.RuneReader
readerPos d2ast.Position
@ -212,10 +220,7 @@ func (p *parser) _readRune() (r rune, eof bool) {
p.readerPos = p.lookaheadPos
r, n, err := p.reader.ReadRune()
if p.utf16Input && n > 0 {
// TODO:
}
r, _, err := p.reader.ReadRune()
if err != nil {
p.ioerr = true
if err != io.EOF {

View file

@ -17,7 +17,6 @@ import (
type testCase struct {
name string
text string
utf16 bool
assert func(t testing.TB, ast *d2ast.Map, err error)
}
@ -395,20 +394,12 @@ c-
},
{
name: "utf16-input",
utf16: true,
text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00",
assert: func(t testing.TB, ast *d2ast.Map, err error) {
assert.Success(t, err)
assert.Equal(t, "x -> y\n", d2format.Format(ast))
},
},
{
name: "errors/utf16-input",
text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00",
assert: func(t testing.TB, ast *d2ast.Map, err error) {
assert.ErrorString(t, err, `d2/testdata/d2parser/TestParse/errors/utf16-input.d2:1:13: invalid text beginning unquoted key`)
},
},
}
t.Run("import", testImport)
@ -510,9 +501,6 @@ func runa(t *testing.T, tca []testCase) {
d2Path := fmt.Sprintf("d2/testdata/d2parser/%v.d2", t.Name())
opts := &d2parser.ParseOptions{}
if tc.utf16 {
opts.UTF16Input = true
}
ast, err := d2parser.Parse(d2Path, strings.NewReader(tc.text), opts)
if tc.assert != nil {

View file

@ -10,6 +10,7 @@ import (
"io"
"log"
"os"
"unicode/utf8"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
@ -27,6 +28,8 @@ func main() {
}
fmt.Printf("%q\n", b.String())
fmt.Println("\xFF\xFE")
fmt.Println(utf8.ValidString("\xFF\xFE"))
err = os.WriteFile("./utf16.d2", b.Bytes(), 0644)
if err != nil {

View file

@ -111,7 +111,7 @@ func serde(t *testing.T, tc testCase, ruler *textmeasure.Ruler) {
ctx := context.Background()
ctx = log.WithTB(ctx, t, nil)
g, _, err := d2compiler.Compile("", strings.NewReader(tc.script), &d2compiler.CompileOptions{
UTF16: false,
UTF16Pos: false,
})
trequire.Nil(t, err)
if len(g.Objects) > 0 {

View file

@ -1,38 +0,0 @@
{
"ast": {
"range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,0:0:0-1:1:22",
"nodes": [
{
"map_key": {
"range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22",
"key": {
"range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22",
"path": [
{
"unquoted_string": {
"range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22",
"value": [
{
"string": "\u0000",
"raw_string": "\u0000"
}
]
}
}
]
},
"primary": {},
"value": {}
}
}
]
},
"err": {
"errs": [
{
"range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,0:12:12-0:20:20",
"errmsg": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2:1:13: invalid text beginning unquoted key"
}
]
}
}