From b81da1ee621260d5f7c5ba37030b9be60f01d70e Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Wed, 2 Aug 2023 09:56:35 -0700 Subject: [PATCH] d2parser: Support reading utf16 files --- d2ir/import.go | 3 +- d2parser/.gitignore | 1 + d2parser/parse.go | 98 ++++++++++++------- d2parser/parse_test.go | 24 ++++- d2parser/utf16_gen.go | 35 +++++++ .../TestParse/errors/utf16-input.exp.json | 38 +++++++ .../d2parser/TestParse/utf16-input.exp.json | 54 ++++++++++ 7 files changed, 212 insertions(+), 41 deletions(-) create mode 100644 d2parser/.gitignore create mode 100644 d2parser/utf16_gen.go create mode 100644 testdata/d2parser/TestParse/errors/utf16-input.exp.json create mode 100644 testdata/d2parser/TestParse/utf16-input.exp.json diff --git a/d2ir/import.go b/d2ir/import.go index 147130071..383e5c24c 100644 --- a/d2ir/import.go +++ b/d2ir/import.go @@ -1,7 +1,6 @@ package d2ir import ( - "bufio" "io/fs" "os" "path" @@ -99,7 +98,7 @@ func (c *compiler) __import(imp *d2ast.Import) (*Map, bool) { } defer f.Close() - ast, err := d2parser.Parse(impPath, bufio.NewReader(f), &d2parser.ParseOptions{ + ast, err := d2parser.Parse(impPath, f, &d2parser.ParseOptions{ UTF16: c.utf16, ParseError: c.err, }) diff --git a/d2parser/.gitignore b/d2parser/.gitignore new file mode 100644 index 000000000..e3767e21a --- /dev/null +++ b/d2parser/.gitignore @@ -0,0 +1 @@ +utf16.d2 diff --git a/d2parser/parse.go b/d2parser/parse.go index 50f6c91c1..2eeae234a 100644 --- a/d2parser/parse.go +++ b/d2parser/parse.go @@ -1,6 +1,7 @@ package d2parser import ( + "bufio" "fmt" "io" "math/big" @@ -9,13 +10,22 @@ import ( "unicode" "unicode/utf8" - "oss.terrastruct.com/util-go/go2" + tunicode "golang.org/x/text/encoding/unicode" + "golang.org/x/text/transform" "oss.terrastruct.com/d2/d2ast" + "oss.terrastruct.com/util-go/go2" ) type ParseOptions struct { - UTF16 bool + // UTF16Pos would be used with input received from a browser where the browser will send the text as UTF-8 but + // JavaScript keeps 
strings in memory as UTF-16 and so needs UTF-16 indexes into the text to line up errors correctly. + // So you want to read UTF-8 still but adjust the indexes to pretend the input is utf16. + UTF16Pos bool + + // UTF16Input makes the parser read the input as UTF16 and also sets UTF16Pos. + UTF16Input bool + ParseError *ParseError } @@ -27,23 +37,31 @@ type ParseOptions struct { // The map may be compiled via Compile even if there are errors to keep language tooling // operational. Though autoformat should not run. // -// If UTF16Mode is true, positions will be recorded in UTF-16 codeunits as required by LSP +// If UTF16Pos is true, positions will be recorded in UTF-16 codeunits as required by LSP // and browser clients. See // https://microsoft.github.io/language-server-protocol/specifications/specification-current/#textDocuments // TODO: update godocs -func Parse(path string, r io.RuneReader, opts *ParseOptions) (*d2ast.Map, error) { +func Parse(path string, r io.Reader, opts *ParseOptions) (*d2ast.Map, error) { if opts == nil { opts = &ParseOptions{ - UTF16: false, + UTF16Pos: false, + UTF16Input: false, } } p := &parser{ - path: path, - reader: r, + path: path, - utf16: opts.UTF16, - err: opts.ParseError, + utf16Input: opts.UTF16Input, + utf16Pos: opts.UTF16Pos, + err: opts.ParseError, + } + if p.utf16Input { + p.utf16Pos = true + tr := transform.NewReader(r, tunicode.UTF16(tunicode.LittleEndian, tunicode.UseBOM).NewDecoder()) + p.reader = bufio.NewReader(tr) + } else { + p.reader = bufio.NewReader(r) } if p.err == nil { p.err = &ParseError{} @@ -113,9 +131,10 @@ func ParseValue(value string) (d2ast.Value, error) { // // TODO: ast struct that combines map & errors and pass that around type parser struct { - path string - pos d2ast.Position - utf16 bool + path string + pos d2ast.Position + utf16Pos bool + utf16Input bool reader io.RuneReader readerPos d2ast.Position @@ -193,7 +212,10 @@ func (p *parser) _readRune() (r rune, eof bool) { p.readerPos = 
p.lookaheadPos - r, _, err := p.reader.ReadRune() + r, n, err := p.reader.ReadRune() + if p.utf16Input && n > 0 { + // TODO: + } if err != nil { p.ioerr = true if err != io.EOF { @@ -217,13 +239,13 @@ func (p *parser) read() (r rune, eof bool) { if eof { return 0, true } - p.pos = p.pos.Advance(r, p.utf16) + p.pos = p.pos.Advance(r, p.utf16Pos) p.lookaheadPos = p.pos return r, false } func (p *parser) replay(r rune) { - p.pos = p.pos.Subtract(r, p.utf16) + p.pos = p.pos.Subtract(r, p.utf16Pos) // This is more complex than it needs to be to allow reusing the buffer underlying // p.lookahead. @@ -250,7 +272,7 @@ func (p *parser) peek() (r rune, eof bool) { } p.lookahead = append(p.lookahead, r) - p.lookaheadPos = p.lookaheadPos.Advance(r, p.utf16) + p.lookaheadPos = p.lookaheadPos.Advance(r, p.utf16Pos) return r, false } @@ -364,7 +386,7 @@ func (p *parser) parseMap(isFileMap bool) *d2ast.Map { defer m.Range.End.From(&p.pos) if !isFileMap { - m.Range.Start = m.Range.Start.Subtract('{', p.utf16) + m.Range.Start = m.Range.Start.Subtract('{', p.utf16Pos) p.depth++ defer dec(&p.depth) } @@ -383,7 +405,7 @@ func (p *parser) parseMap(isFileMap bool) *d2ast.Map { continue case '}': if isFileMap { - p.errorf(p.pos.Subtract(r, p.utf16), p.pos, "unexpected map termination character } in file map") + p.errorf(p.pos.Subtract(r, p.utf16Pos), p.pos, "unexpected map termination character } in file map") continue } return m @@ -489,7 +511,7 @@ func (p *parser) parseComment() *d2ast.Comment { c := &d2ast.Comment{ Range: d2ast.Range{ Path: p.path, - Start: p.pos.Subtract('#', p.utf16), + Start: p.pos.Subtract('#', p.utf16Pos), }, } defer c.Range.End.From(&p.pos) @@ -546,7 +568,7 @@ func (p *parser) parseBlockComment() *d2ast.BlockComment { bc := &d2ast.BlockComment{ Range: d2ast.Range{ Path: p.path, - Start: p.pos.SubtractString(`"""`, p.utf16), + Start: p.pos.SubtractString(`"""`, p.utf16Pos), }, } defer bc.Range.End.From(&p.pos) @@ -714,7 +736,7 @@ func (p *parser) 
parseMapKeyValue(mk *d2ast.Key) { } mk.Value = p.parseValue() if mk.Value.Unbox() == nil { - p.errorf(p.pos.Subtract(':', p.utf16), p.pos, "missing value after colon") + p.errorf(p.pos.Subtract(':', p.utf16Pos), p.pos, "missing value after colon") } sb := mk.Value.ScalarBox() @@ -788,7 +810,7 @@ func (p *parser) parseEdgeIndex() *d2ast.EdgeIndex { ei := &d2ast.EdgeIndex{ Range: d2ast.Range{ Path: p.path, - Start: p.pos.Subtract('[', p.utf16), + Start: p.pos.Subtract('[', p.utf16Pos), }, } defer ei.Range.End.From(&p.pos) @@ -816,7 +838,7 @@ func (p *parser) parseEdgeIndex() *d2ast.EdgeIndex { } p.commit() if !unicode.IsDigit(r) { - p.errorf(p.pos.Subtract(r, p.utf16), p.pos, "unexpected character in edge index") + p.errorf(p.pos.Subtract(r, p.utf16Pos), p.pos, "unexpected character in edge index") continue } sb.WriteRune(r) @@ -827,7 +849,7 @@ func (p *parser) parseEdgeIndex() *d2ast.EdgeIndex { p.commit() ei.Glob = true } else { - p.errorf(p.pos.Subtract(r, p.utf16), p.pos, "unexpected character in edge index") + p.errorf(p.pos.Subtract(r, p.utf16Pos), p.pos, "unexpected character in edge index") // TODO: skip to ], maybe add a p.skipTo to skip to certain characters } @@ -870,8 +892,8 @@ func (p *parser) parseEdges(mk *d2ast.Key, src *d2ast.KeyPath) { return } if src == nil { - p.errorf(p.lookaheadPos.Subtract(r, p.utf16), p.lookaheadPos, "connection missing source") - e.Range.Start = p.lookaheadPos.Subtract(r, p.utf16) + p.errorf(p.lookaheadPos.Subtract(r, p.utf16Pos), p.lookaheadPos, "connection missing source") + e.Range.Start = p.lookaheadPos.Subtract(r, p.utf16Pos) } p.commit() @@ -1056,7 +1078,7 @@ func (p *parser) parseUnquotedString(inKey bool) (s *d2ast.UnquotedString) { p.rewind() if !eof { if _s == "...@" { - p.errorf(p.pos, p.pos.AdvanceString("...@", p.utf16), "unquoted strings cannot begin with ...@ as that's import spread syntax") + p.errorf(p.pos, p.pos.AdvanceString("...@", p.utf16Pos), "unquoted strings cannot begin with ...@ as that's import 
spread syntax") } } @@ -1162,7 +1184,7 @@ func (p *parser) parseUnquotedString(inKey bool) (s *d2ast.UnquotedString) { r2, eof := p.read() if eof { - p.errorf(p.pos.Subtract('\\', p.utf16), p.readerPos, "unfinished escape sequence") + p.errorf(p.pos.Subtract('\\', p.utf16Pos), p.readerPos, "unfinished escape sequence") return s } @@ -1214,7 +1236,7 @@ func (p *parser) parseDoubleQuotedString(inKey bool) *d2ast.DoubleQuotedString { s := &d2ast.DoubleQuotedString{ Range: d2ast.Range{ Path: p.path, - Start: p.pos.Subtract('"', p.utf16), + Start: p.pos.Subtract('"', p.utf16Pos), }, } defer s.Range.End.From(&p.pos) @@ -1266,7 +1288,7 @@ func (p *parser) parseDoubleQuotedString(inKey bool) *d2ast.DoubleQuotedString { r2, eof := p.read() if eof { - p.errorf(p.pos.Subtract('\\', p.utf16), p.readerPos, "unfinished escape sequence") + p.errorf(p.pos.Subtract('\\', p.utf16Pos), p.readerPos, "unfinished escape sequence") p.errorf(s.Range.Start, p.readerPos, `double quoted strings must be terminated with "`) return s } @@ -1285,7 +1307,7 @@ func (p *parser) parseSingleQuotedString() *d2ast.SingleQuotedString { s := &d2ast.SingleQuotedString{ Range: d2ast.Range{ Path: p.path, - Start: p.pos.Subtract('\'', p.utf16), + Start: p.pos.Subtract('\'', p.utf16Pos), }, } defer s.Range.End.From(&p.pos) @@ -1347,7 +1369,7 @@ func (p *parser) parseBlockString() *d2ast.BlockString { bs := &d2ast.BlockString{ Range: d2ast.Range{ Path: p.path, - Start: p.pos.Subtract('|', p.utf16), + Start: p.pos.Subtract('|', p.utf16Pos), }, } defer bs.Range.End.From(&p.pos) @@ -1460,7 +1482,7 @@ func (p *parser) parseArray() *d2ast.Array { a := &d2ast.Array{ Range: d2ast.Range{ Path: p.path, - Start: p.pos.Subtract('[', p.utf16), + Start: p.pos.Subtract('[', p.utf16Pos), }, } defer a.Range.End.From(&p.readerPos) @@ -1562,7 +1584,7 @@ func (p *parser) parseArrayNode(r rune) d2ast.ArrayNodeBox { vbox := p.parseValue() if vbox.UnquotedString != nil && vbox.UnquotedString.ScalarString() == "" && 
!(len(vbox.UnquotedString.Value) > 0 && vbox.UnquotedString.Value[0].Substitution != nil) { - p.errorf(p.pos, p.pos.Advance(r, p.utf16), "unquoted strings cannot start on %q", r) + p.errorf(p.pos, p.pos.Advance(r, p.utf16Pos), "unquoted strings cannot start on %q", r) } box.Null = vbox.Null box.Boolean = vbox.Boolean @@ -1661,14 +1683,14 @@ func (p *parser) parseSubstitution(spread bool) *d2ast.Substitution { subst := &d2ast.Substitution{ Range: d2ast.Range{ Path: p.path, - Start: p.pos.SubtractString("$", p.utf16), + Start: p.pos.SubtractString("$", p.utf16Pos), }, Spread: spread, } defer subst.Range.End.From(&p.pos) if subst.Spread { - subst.Range.Start = subst.Range.Start.SubtractString("...", p.utf16) + subst.Range.Start = subst.Range.Start.SubtractString("...", p.utf16Pos) } r, newlines, eof := p.peekNotSpace() @@ -1711,14 +1733,14 @@ func (p *parser) parseImport(spread bool) *d2ast.Import { imp := &d2ast.Import{ Range: d2ast.Range{ Path: p.path, - Start: p.pos.SubtractString("$", p.utf16), + Start: p.pos.SubtractString("$", p.utf16Pos), }, Spread: spread, } defer imp.Range.End.From(&p.pos) if imp.Spread { - imp.Range.Start = imp.Range.Start.SubtractString("...", p.utf16) + imp.Range.Start = imp.Range.Start.SubtractString("...", p.utf16Pos) } var pre strings.Builder diff --git a/d2parser/parse_test.go b/d2parser/parse_test.go index 2dc195d02..2b7300ad3 100644 --- a/d2parser/parse_test.go +++ b/d2parser/parse_test.go @@ -10,12 +10,14 @@ import ( "oss.terrastruct.com/util-go/diff" "oss.terrastruct.com/d2/d2ast" + "oss.terrastruct.com/d2/d2format" "oss.terrastruct.com/d2/d2parser" ) type testCase struct { name string text string + utf16 bool assert func(t testing.TB, ast *d2ast.Map, err error) } @@ -391,6 +393,22 @@ c- assert.Equal(t, "1:13", ast.Nodes[0].MapKey.Edges[1].Dst.Range.End.String()) }, }, + { + name: "utf16-input", + utf16: true, + text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00", + assert: func(t testing.TB, ast *d2ast.Map, err error) { + 
assert.Success(t, err) + assert.Equal(t, "x -> y\n", d2format.Format(ast)) + }, + }, + { + name: "errors/utf16-input", + text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00", + assert: func(t testing.TB, ast *d2ast.Map, err error) { + assert.ErrorString(t, err, `d2/testdata/d2parser/TestParse/errors/utf16-input.d2:1:13: invalid text beginning unquoted key`) + }, + }, } t.Run("import", testImport) @@ -491,7 +509,11 @@ func runa(t *testing.T, tca []testCase) { t.Parallel() d2Path := fmt.Sprintf("d2/testdata/d2parser/%v.d2", t.Name()) - ast, err := d2parser.Parse(d2Path, strings.NewReader(tc.text), nil) + opts := &d2parser.ParseOptions{} + if tc.utf16 { + opts.UTF16Input = true + } + ast, err := d2parser.Parse(d2Path, strings.NewReader(tc.text), opts) if tc.assert != nil { tc.assert(t, ast, err) diff --git a/d2parser/utf16_gen.go b/d2parser/utf16_gen.go new file mode 100644 index 000000000..090705dfe --- /dev/null +++ b/d2parser/utf16_gen.go @@ -0,0 +1,35 @@ +//go:build ignore + +// utf16_gen.go is used to create test UTF-16 input for the UTF-16 input test in parse_test.go +// Confirm that `file utf16.d2` reports little-endian UTF-16 text with CRLF line terminators. +package main + +import ( + "bytes" + "fmt" + "io" + "log" + "os" + + "golang.org/x/text/encoding/unicode" + "golang.org/x/text/transform" +) + +func main() { + // Pretend we're on Windows. 
+ s := "x -> y\r\n" + + b := &bytes.Buffer{} + t := transform.NewWriter(b, unicode.UTF16(unicode.LittleEndian, unicode.UseBOM).NewEncoder()) + _, err := io.WriteString(t, s) + if err != nil { + log.Fatal(err) + } + + fmt.Printf("%q\n", b.String()) + + err = os.WriteFile("./utf16.d2", b.Bytes(), 0644) + if err != nil { + log.Fatal(err) + } +} diff --git a/testdata/d2parser/TestParse/errors/utf16-input.exp.json b/testdata/d2parser/TestParse/errors/utf16-input.exp.json new file mode 100644 index 000000000..81f075aa7 --- /dev/null +++ b/testdata/d2parser/TestParse/errors/utf16-input.exp.json @@ -0,0 +1,38 @@ +{ + "ast": { + "range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,0:0:0-1:1:22", + "nodes": [ + { + "map_key": { + "range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22", + "key": { + "range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22", + "path": [ + { + "unquoted_string": { + "range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,1:0:21-1:1:22", + "value": [ + { + "string": "\u0000", + "raw_string": "\u0000" + } + ] + } + } + ] + }, + "primary": {}, + "value": {} + } + } + ] + }, + "err": { + "errs": [ + { + "range": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2,0:12:12-0:20:20", + "errmsg": "d2/testdata/d2parser/TestParse/errors/utf16-input.d2:1:13: invalid text beginning unquoted key" + } + ] + } +} diff --git a/testdata/d2parser/TestParse/utf16-input.exp.json b/testdata/d2parser/TestParse/utf16-input.exp.json new file mode 100644 index 000000000..24f3479d2 --- /dev/null +++ b/testdata/d2parser/TestParse/utf16-input.exp.json @@ -0,0 +1,54 @@ +{ + "ast": { + "range": "d2/testdata/d2parser/TestParse/utf16-input.d2,0:0:0-1:0:8", + "nodes": [ + { + "map_key": { + "range": "d2/testdata/d2parser/TestParse/utf16-input.d2,0:0:0-0:7:7", + "edges": [ + { + "range": "d2/testdata/d2parser/TestParse/utf16-input.d2,0:0:0-0:6:6", + "src": { + "range": 
"d2/testdata/d2parser/TestParse/utf16-input.d2,0:0:0-0:1:1", + "path": [ + { + "unquoted_string": { + "range": "d2/testdata/d2parser/TestParse/utf16-input.d2,0:0:0-0:1:1", + "value": [ + { + "string": "x", + "raw_string": "x" + } + ] + } + } + ] + }, + "src_arrow": "", + "dst": { + "range": "d2/testdata/d2parser/TestParse/utf16-input.d2,0:5:5-0:6:6", + "path": [ + { + "unquoted_string": { + "range": "d2/testdata/d2parser/TestParse/utf16-input.d2,0:5:5-0:6:6", + "value": [ + { + "string": "y", + "raw_string": "y" + } + ] + } + } + ] + }, + "dst_arrow": ">" + } + ], + "primary": {}, + "value": {} + } + } + ] + }, + "err": null +}