d2parser: Autodetect UTF-16 based on BOM
Turns out I was wrong: this is safe.
This commit is contained in:
parent
b81da1ee62
commit
203953723e
10 changed files with 44 additions and 86 deletions
|
|
@ -21,27 +21,27 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
type CompileOptions struct {
|
type CompileOptions struct {
|
||||||
UTF16 bool
|
UTF16Pos bool
|
||||||
// FS is the file system used for resolving imports in the d2 text.
|
// FS is the file system used for resolving imports in the d2 text.
|
||||||
// It should correspond to the root path.
|
// It should correspond to the root path.
|
||||||
FS fs.FS
|
FS fs.FS
|
||||||
}
|
}
|
||||||
|
|
||||||
func Compile(p string, r io.RuneReader, opts *CompileOptions) (*d2graph.Graph, *d2target.Config, error) {
|
func Compile(p string, r io.Reader, opts *CompileOptions) (*d2graph.Graph, *d2target.Config, error) {
|
||||||
if opts == nil {
|
if opts == nil {
|
||||||
opts = &CompileOptions{}
|
opts = &CompileOptions{}
|
||||||
}
|
}
|
||||||
|
|
||||||
ast, err := d2parser.Parse(p, r, &d2parser.ParseOptions{
|
ast, err := d2parser.Parse(p, r, &d2parser.ParseOptions{
|
||||||
UTF16: opts.UTF16,
|
UTF16Pos: opts.UTF16Pos,
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, err
|
return nil, nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
ir, err := d2ir.Compile(ast, &d2ir.CompileOptions{
|
ir, err := d2ir.Compile(ast, &d2ir.CompileOptions{
|
||||||
UTF16: opts.UTF16,
|
UTF16Pos: opts.UTF16Pos,
|
||||||
FS: opts.FS,
|
FS: opts.FS,
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, err
|
return nil, nil, err
|
||||||
|
|
|
||||||
|
|
@ -223,7 +223,7 @@ func run(t *testing.T, tc testCase) {
|
||||||
ctx = log.Leveled(ctx, slog.LevelDebug)
|
ctx = log.Leveled(ctx, slog.LevelDebug)
|
||||||
|
|
||||||
g, config, err := d2compiler.Compile("", strings.NewReader(tc.dsl), &d2compiler.CompileOptions{
|
g, config, err := d2compiler.Compile("", strings.NewReader(tc.dsl), &d2compiler.CompileOptions{
|
||||||
UTF16: true,
|
UTF16Pos: true,
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
|
|
|
||||||
|
|
@ -21,13 +21,13 @@ type compiler struct {
|
||||||
importStack []string
|
importStack []string
|
||||||
// importCache enables reuse of files imported multiple times.
|
// importCache enables reuse of files imported multiple times.
|
||||||
importCache map[string]*Map
|
importCache map[string]*Map
|
||||||
utf16 bool
|
utf16Pos bool
|
||||||
|
|
||||||
globStack []bool
|
globStack []bool
|
||||||
}
|
}
|
||||||
|
|
||||||
type CompileOptions struct {
|
type CompileOptions struct {
|
||||||
UTF16 bool
|
UTF16Pos bool
|
||||||
// Pass nil to disable imports.
|
// Pass nil to disable imports.
|
||||||
FS fs.FS
|
FS fs.FS
|
||||||
}
|
}
|
||||||
|
|
@ -45,7 +45,7 @@ func Compile(ast *d2ast.Map, opts *CompileOptions) (*Map, error) {
|
||||||
fs: opts.FS,
|
fs: opts.FS,
|
||||||
|
|
||||||
importCache: make(map[string]*Map),
|
importCache: make(map[string]*Map),
|
||||||
utf16: opts.UTF16,
|
utf16Pos: opts.UTF16Pos,
|
||||||
}
|
}
|
||||||
m := &Map{}
|
m := &Map{}
|
||||||
m.initRoot()
|
m.initRoot()
|
||||||
|
|
|
||||||
|
|
@ -99,7 +99,7 @@ func (c *compiler) __import(imp *d2ast.Import) (*Map, bool) {
|
||||||
defer f.Close()
|
defer f.Close()
|
||||||
|
|
||||||
ast, err := d2parser.Parse(impPath, f, &d2parser.ParseOptions{
|
ast, err := d2parser.Parse(impPath, f, &d2parser.ParseOptions{
|
||||||
UTF16: c.utf16,
|
UTF16Pos: c.utf16Pos,
|
||||||
ParseError: c.err,
|
ParseError: c.err,
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
||||||
|
|
@ -23,7 +23,7 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
type CompileOptions struct {
|
type CompileOptions struct {
|
||||||
UTF16 bool
|
UTF16Pos bool
|
||||||
FS fs.FS
|
FS fs.FS
|
||||||
MeasuredTexts []*d2target.MText
|
MeasuredTexts []*d2target.MText
|
||||||
Ruler *textmeasure.Ruler
|
Ruler *textmeasure.Ruler
|
||||||
|
|
@ -50,8 +50,8 @@ func Compile(ctx context.Context, input string, compileOpts *CompileOptions, ren
|
||||||
}
|
}
|
||||||
|
|
||||||
g, config, err := d2compiler.Compile(compileOpts.InputPath, strings.NewReader(input), &d2compiler.CompileOptions{
|
g, config, err := d2compiler.Compile(compileOpts.InputPath, strings.NewReader(input), &d2compiler.CompileOptions{
|
||||||
UTF16: compileOpts.UTF16,
|
UTF16Pos: compileOpts.UTF16Pos,
|
||||||
FS: compileOpts.FS,
|
FS: compileOpts.FS,
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, err
|
return nil, nil, err
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ package d2parser
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
|
"bytes"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"math/big"
|
"math/big"
|
||||||
|
|
@ -23,9 +24,6 @@ type ParseOptions struct {
|
||||||
// So you want to read UTF-8 still but adjust the indexes to pretend the input is utf16.
|
// So you want to read UTF-8 still but adjust the indexes to pretend the input is utf16.
|
||||||
UTF16Pos bool
|
UTF16Pos bool
|
||||||
|
|
||||||
// UTF16Input makes the parser read the input as UTF16 and also sets UTF16Pos.
|
|
||||||
UTF16Input bool
|
|
||||||
|
|
||||||
ParseError *ParseError
|
ParseError *ParseError
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -44,25 +42,36 @@ type ParseOptions struct {
|
||||||
func Parse(path string, r io.Reader, opts *ParseOptions) (*d2ast.Map, error) {
|
func Parse(path string, r io.Reader, opts *ParseOptions) (*d2ast.Map, error) {
|
||||||
if opts == nil {
|
if opts == nil {
|
||||||
opts = &ParseOptions{
|
opts = &ParseOptions{
|
||||||
UTF16Pos: false,
|
UTF16Pos: false,
|
||||||
UTF16Input: false,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
p := &parser{
|
p := &parser{
|
||||||
path: path,
|
path: path,
|
||||||
|
|
||||||
utf16Input: opts.UTF16Input,
|
utf16Pos: opts.UTF16Pos,
|
||||||
utf16Pos: opts.UTF16Pos,
|
err: opts.ParseError,
|
||||||
err: opts.ParseError,
|
|
||||||
}
|
}
|
||||||
if p.utf16Input {
|
br := bufio.NewReader(r)
|
||||||
p.utf16Pos = true
|
p.reader = br
|
||||||
tr := transform.NewReader(r, tunicode.UTF16(tunicode.LittleEndian, tunicode.UseBOM).NewDecoder())
|
|
||||||
p.reader = bufio.NewReader(tr)
|
bom, err := br.Peek(2)
|
||||||
} else {
|
if err == nil {
|
||||||
p.reader = bufio.NewReader(r)
|
// 0xFFFE is invalid UTF-8 so this is safe.
|
||||||
|
// Also a different BOM is used for UTF-8.
|
||||||
|
// See https://unicode.org/faq/utf_bom.html#bom4
|
||||||
|
if bom[0] == 0xFF && bom[1] == 0xFE {
|
||||||
|
p.utf16Pos = true
|
||||||
|
|
||||||
|
buf := make([]byte, br.Buffered())
|
||||||
|
io.ReadFull(br, buf)
|
||||||
|
|
||||||
|
mr := io.MultiReader(bytes.NewBuffer(buf), r)
|
||||||
|
tr := transform.NewReader(mr, tunicode.UTF16(tunicode.LittleEndian, tunicode.UseBOM).NewDecoder())
|
||||||
|
br.Reset(tr)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if p.err == nil {
|
if p.err == nil {
|
||||||
p.err = &ParseError{}
|
p.err = &ParseError{}
|
||||||
}
|
}
|
||||||
|
|
@ -131,10 +140,9 @@ func ParseValue(value string) (d2ast.Value, error) {
|
||||||
//
|
//
|
||||||
// TODO: ast struct that combines map & errors and pass that around
|
// TODO: ast struct that combines map & errors and pass that around
|
||||||
type parser struct {
|
type parser struct {
|
||||||
path string
|
path string
|
||||||
pos d2ast.Position
|
pos d2ast.Position
|
||||||
utf16Pos bool
|
utf16Pos bool
|
||||||
utf16Input bool
|
|
||||||
|
|
||||||
reader io.RuneReader
|
reader io.RuneReader
|
||||||
readerPos d2ast.Position
|
readerPos d2ast.Position
|
||||||
|
|
@ -212,10 +220,7 @@ func (p *parser) _readRune() (r rune, eof bool) {
|
||||||
|
|
||||||
p.readerPos = p.lookaheadPos
|
p.readerPos = p.lookaheadPos
|
||||||
|
|
||||||
r, n, err := p.reader.ReadRune()
|
r, _, err := p.reader.ReadRune()
|
||||||
if p.utf16Input && n > 0 {
|
|
||||||
// TODO:
|
|
||||||
}
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
p.ioerr = true
|
p.ioerr = true
|
||||||
if err != io.EOF {
|
if err != io.EOF {
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,6 @@ import (
|
||||||
type testCase struct {
|
type testCase struct {
|
||||||
name string
|
name string
|
||||||
text string
|
text string
|
||||||
utf16 bool
|
|
||||||
assert func(t testing.TB, ast *d2ast.Map, err error)
|
assert func(t testing.TB, ast *d2ast.Map, err error)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -395,20 +394,12 @@ c-
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "utf16-input",
|
name: "utf16-input",
|
||||||
utf16: true,
|
|
||||||
text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00",
|
text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00",
|
||||||
assert: func(t testing.TB, ast *d2ast.Map, err error) {
|
assert: func(t testing.TB, ast *d2ast.Map, err error) {
|
||||||
assert.Success(t, err)
|
assert.Success(t, err)
|
||||||
assert.Equal(t, "x -> y\n", d2format.Format(ast))
|
assert.Equal(t, "x -> y\n", d2format.Format(ast))
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
|
||||||
name: "errors/utf16-input",
|
|
||||||
text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00",
|
|
||||||
assert: func(t testing.TB, ast *d2ast.Map, err error) {
|
|
||||||
assert.ErrorString(t, err, `d2/testdata/d2parser/TestParse/errors/utf16-input.d2:1:13: invalid text beginning unquoted key`)
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
t.Run("import", testImport)
|
t.Run("import", testImport)
|
||||||
|
|
@ -510,9 +501,6 @@ func runa(t *testing.T, tca []testCase) {
|
||||||
|
|
||||||
d2Path := fmt.Sprintf("d2/testdata/d2parser/%v.d2", t.Name())
|
d2Path := fmt.Sprintf("d2/testdata/d2parser/%v.d2", t.Name())
|
||||||
opts := &d2parser.ParseOptions{}
|
opts := &d2parser.ParseOptions{}
|
||||||
if tc.utf16 {
|
|
||||||
opts.UTF16Input = true
|
|
||||||
}
|
|
||||||
ast, err := d2parser.Parse(d2Path, strings.NewReader(tc.text), opts)
|
ast, err := d2parser.Parse(d2Path, strings.NewReader(tc.text), opts)
|
||||||
|
|
||||||
if tc.assert != nil {
|
if tc.assert != nil {
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ import (
|
||||||
"io"
|
"io"
|
||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
"golang.org/x/text/encoding/unicode"
|
"golang.org/x/text/encoding/unicode"
|
||||||
"golang.org/x/text/transform"
|
"golang.org/x/text/transform"
|
||||||
|
|
@ -27,6 +28,8 @@ func main() {
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Printf("%q\n", b.String())
|
fmt.Printf("%q\n", b.String())
|
||||||
|
fmt.Println("\xFF\xFE")
|
||||||
|
fmt.Println(utf8.ValidString("\xFF\xFE"))
|
||||||
|
|
||||||
err = os.WriteFile("./utf16.d2", b.Bytes(), 0644)
|
err = os.WriteFile("./utf16.d2", b.Bytes(), 0644)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
||||||
|
|
@ -111,7 +111,7 @@ func serde(t *testing.T, tc testCase, ruler *textmeasure.Ruler) {
|
||||||
ctx := context.Background()
|
ctx := context.Background()
|
||||||
ctx = log.WithTB(ctx, t, nil)
|
ctx = log.WithTB(ctx, t, nil)
|
||||||
g, _, err := d2compiler.Compile("", strings.NewReader(tc.script), &d2compiler.CompileOptions{
|
g, _, err := d2compiler.Compile("", strings.NewReader(tc.script), &d2compiler.CompileOptions{
|
||||||
UTF16: false,
|
UTF16Pos: false,
|
||||||
})
|
})
|
||||||
trequire.Nil(t, err)
|
trequire.Nil(t, err)
|
||||||
if len(g.Objects) > 0 {
|
if len(g.Objects) > 0 {
|
||||||
|
|
|
||||||
38
testdata/d2parser/TestParse/errors/utf16-input.exp.json
generated
vendored
38
testdata/d2parser/TestParse/errors/utf16-input.exp.json
generated
vendored
|
|
@ -1,38 +0,0 @@
|
||||||
{
|
|
||||||
"ast": {
|
|
||||||
"range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,0:0:0-1:1:22",
|
|
||||||
"nodes": [
|
|
||||||
{
|
|
||||||
"map_key": {
|
|
||||||
"range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22",
|
|
||||||
"key": {
|
|
||||||
"range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22",
|
|
||||||
"path": [
|
|
||||||
{
|
|
||||||
"unquoted_string": {
|
|
||||||
"range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22",
|
|
||||||
"value": [
|
|
||||||
{
|
|
||||||
"string": "\u0000",
|
|
||||||
"raw_string": "\u0000"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"primary": {},
|
|
||||||
"value": {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"err": {
|
|
||||||
"errs": [
|
|
||||||
{
|
|
||||||
"range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,0:12:12-0:20:20",
|
|
||||||
"errmsg": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2:1:13: invalid text beginning unquoted key"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Loading…
Reference in a new issue