// d2/d2parser/parse.go

package d2parser
import (
"bufio"
"bytes"
"fmt"
"io"
"math/big"
"strconv"
"strings"
"unicode"
"unicode/utf8"
tunicode "golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
"oss.terrastruct.com/d2/d2ast"
"oss.terrastruct.com/util-go/go2"
)
type ParseOptions struct {
// UTF16Pos is used for input received from a browser: the browser sends the text as UTF-8,
// but JavaScript keeps strings in memory as UTF-16 and so needs UTF-16 indexes into the text
// to line up errors correctly. The parser still reads UTF-8 but records positions as though
// the input were UTF-16.
UTF16Pos bool
ParseError *ParseError
}
// Parse parses the .d2 map in r.
//
// The returned map always represents a valid .d2 file. All encountered errors are
// collected in the returned error, which is a *ParseError.
//
// The map may be compiled via Compile even if there are errors, to keep language tooling
// operational, though autoformat should not run on it.
//
// If UTF16Pos is true, positions are recorded in UTF-16 code units as required by LSP
// and browser clients. See
// https://microsoft.github.io/language-server-protocol/specifications/specification-current/#textDocuments
// TODO: update godocs
func Parse(path string, r io.Reader, opts *ParseOptions) (*d2ast.Map, error) {
if opts == nil {
opts = &ParseOptions{
UTF16Pos: false,
}
}
p := &parser{
path: path,
utf16Pos: opts.UTF16Pos,
err: opts.ParseError,
}
br := bufio.NewReader(r)
p.reader = br
bom, err := br.Peek(2)
if err == nil {
// 0xFFFE is invalid UTF-8 so this is safe.
// Also a different BOM is used for UTF-8.
// See https://unicode.org/faq/utf_bom.html#bom4
if bom[0] == 0xFF && bom[1] == 0xFE {
p.utf16Pos = true
buf := make([]byte, br.Buffered())
io.ReadFull(br, buf)
mr := io.MultiReader(bytes.NewBuffer(buf), r)
tr := transform.NewReader(mr, tunicode.UTF16(tunicode.LittleEndian, tunicode.UseBOM).NewDecoder())
br.Reset(tr)
}
}
if p.err == nil {
p.err = &ParseError{}
}
m := p.parseMap(true)
if !p.err.Empty() {
return m, p.err
}
return m, nil
}
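// exampleParseUsage is a minimal usage sketch of Parse (it is not called by the
// parser itself). The input string and variable names are illustrative
// assumptions, not part of the package API.
func exampleParseUsage() {
	src := "x -> y: hello"
	m, err := Parse("example.d2", strings.NewReader(src), &ParseOptions{UTF16Pos: false})
	if err != nil {
		// err is a *ParseError whose Errors carry ranges into the source text.
		fmt.Println(err)
	}
	// The returned map is non-nil even when errors were reported.
	fmt.Println(len(m.Nodes))
}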
func ParseKey(key string) (*d2ast.KeyPath, error) {
p := &parser{
reader: strings.NewReader(key),
err: &ParseError{},
}
k := p.parseKey()
if !p.err.Empty() {
return nil, fmt.Errorf("failed to parse key %q: %w", key, p.err)
}
if k == nil {
return nil, fmt.Errorf("empty key: %q", key)
}
return k, nil
}
func ParseMapKey(mapKey string) (*d2ast.Key, error) {
p := &parser{
reader: strings.NewReader(mapKey),
err: &ParseError{},
}
mk := p.parseMapKey()
if !p.err.Empty() {
return nil, fmt.Errorf("failed to parse map key %q: %w", mapKey, p.err)
}
if mk == nil {
return nil, fmt.Errorf("empty map key: %q", mapKey)
}
return mk, nil
}
func ParseValue(value string) (d2ast.Value, error) {
p := &parser{
reader: strings.NewReader(value),
err: &ParseError{},
}
v := p.parseValue()
if !p.err.Empty() {
return nil, fmt.Errorf("failed to parse value %q: %w", value, p.err)
}
if v.Unbox() == nil {
return nil, fmt.Errorf("empty value: %q", value)
}
return v.Unbox(), nil
}
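// exampleHelperUsage is a hedged sketch of the standalone helpers above:
// ParseKey for a dotted key path, ParseMapKey for a full "key: value" entry,
// and ParseValue for a bare value. The literal inputs are illustrative only.
func exampleHelperUsage() error {
	k, err := ParseKey("a.b.c")
	if err != nil {
		return err
	}
	fmt.Println(len(k.Path)) // 3 path components

	mk, err := ParseMapKey("shape: circle")
	if err != nil {
		return err
	}
	fmt.Println(mk.Value.Unbox() != nil) // true: the key carries a value

	v, err := ParseValue("42")
	if err != nil {
		return err
	}
	fmt.Println(v.Type()) // parsed as a number
	return nil
}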
// TODO: refactor parser to keep entire file in memory as []rune
// - trivial to then convert positions
// - lookahead is gone, just seek forward/back as much as you want :)
// - streaming parser isn't really helpful.
// - just read into a string even and decode runes forward/back as needed
// - the whole file essentially exists within the parser as the AST anyway...
//
// TODO: ast struct that combines map & errors and pass that around
type parser struct {
path string
pos d2ast.Position
utf16Pos bool
reader io.RuneReader
readerPos d2ast.Position
readahead []rune
lookahead []rune
lookaheadPos d2ast.Position
ioerr bool
err *ParseError
inEdgeGroup bool
depth int
}
// TODO: rename to Error and make existing Error a private type errorWithRange
type ParseError struct {
// Errors from globs need to be deduplicated
ErrorsLookup map[d2ast.Error]struct{} `json:"-"`
Errors []d2ast.Error `json:"errs"`
}
func Errorf(n d2ast.Node, f string, v ...interface{}) error {
f = "%v: " + f
v = append([]interface{}{n.GetRange()}, v...)
return d2ast.Error{
Range: n.GetRange(),
Message: fmt.Sprintf(f, v...),
}
}
func (pe *ParseError) Empty() bool {
if pe == nil {
return true
}
return len(pe.Errors) == 0
}
func (pe *ParseError) Error() string {
var sb strings.Builder
for i, err := range pe.Errors {
if i > 0 {
sb.WriteByte('\n')
}
sb.WriteString(err.Error())
}
return sb.String()
}
func (p *parser) errorf(start d2ast.Position, end d2ast.Position, f string, v ...interface{}) {
r := d2ast.Range{
Path: p.path,
Start: start,
End: end,
}
f = "%v: " + f
v = append([]interface{}{r}, v...)
p.err.Errors = append(p.err.Errors, d2ast.Error{
Range: r,
Message: fmt.Sprintf(f, v...),
})
}
// _readRune reads the next rune from the underlying reader or from the p.readahead buffer.
func (p *parser) _readRune() (r rune, eof bool) {
if len(p.readahead) > 0 {
r = p.readahead[0]
p.readahead = append(p.readahead[:0], p.readahead[1:]...)
return r, false
}
if p.ioerr {
p.rewind()
return 0, true
}
p.readerPos = p.lookaheadPos
r, _, err := p.reader.ReadRune()
if err != nil {
p.ioerr = true
if err != io.EOF {
p.err.Errors = append(p.err.Errors, d2ast.Error{
Range: d2ast.Range{
Path: p.path,
Start: p.readerPos,
End: p.readerPos,
},
Message: fmt.Sprintf("io error: %v", err),
})
}
p.rewind()
return 0, true
}
return r, false
}
func (p *parser) read() (r rune, eof bool) {
r, eof = p._readRune()
if eof {
return 0, true
}
p.pos = p.pos.Advance(r, p.utf16Pos)
p.lookaheadPos = p.pos
return r, false
}
func (p *parser) replay(r rune) {
p.pos = p.pos.Subtract(r, p.utf16Pos)
// This is more complex than it needs to be to allow reusing the buffer underlying
// p.lookahead.
newcap := len(p.lookahead) + 1
if newcap > cap(p.lookahead) {
lookahead2 := make([]rune, newcap)
copy(lookahead2[1:], p.lookahead)
p.lookahead = lookahead2
} else {
p.lookahead = p.lookahead[:newcap]
copy(p.lookahead[1:], p.lookahead)
}
p.lookahead[0] = r
p.rewind()
}
// peek returns the next rune without advancing the parser.
// You *must* call commit or rewind afterwards.
func (p *parser) peek() (r rune, eof bool) {
r, eof = p._readRune()
if eof {
return 0, true
}
p.lookahead = append(p.lookahead, r)
p.lookaheadPos = p.lookaheadPos.Advance(r, p.utf16Pos)
return r, false
}
// TODO: this can replace multiple peeks i think, just return []rune instead
func (p *parser) peekn(n int) (s string, eof bool) {
var sb strings.Builder
for i := 0; i < n; i++ {
r, eof := p.peek()
if eof {
return sb.String(), true
}
sb.WriteRune(r)
}
return sb.String(), false
}
func (p *parser) readNotSpace() (r rune, eof bool) {
for {
r, eof = p.read()
if eof {
return 0, true
}
if unicode.IsSpace(r) {
continue
}
return r, false
}
}
// peekNotSpace returns the next non space rune without advancing the parser.
//
// newlines counts the newlines skipped, so it is nonzero if the next non space
// character is on a different line than the current one.
//
// TODO: everywhere this is used, we support newline escapes and so can just
// add the logic here and it should *just* work
// except line comments iirc
// not entirely sure, maybe i can put it into peek somehow
func (p *parser) peekNotSpace() (r rune, newlines int, eof bool) {
for {
r, eof = p.peek()
if eof {
return 0, 0, true
}
if unicode.IsSpace(r) {
if r == '\n' {
newlines++
}
continue
}
return r, newlines, false
}
}
// commit advances p.pos past all peeked runes and then resets the p.lookahead buffer.
func (p *parser) commit() {
p.pos = p.lookaheadPos
p.lookahead = p.lookahead[:0]
}
// rewind copies p.lookahead to the front of p.readahead and then resets the p.lookahead buffer.
// All peeked runes will again be available via p.read or p.peek.
// TODO:
// peek
// peekn
// peekNotSpace
// commit
// rewind
//
// TODO: make each parse function read its delimiter and return nil if not as expected
// TODO: lookahead *must* always be empty in between parse calls. you either commit or
// rewind in each function. if you don't, you pass a hint.
// TODO: omg we don't need two buffers, just a single lookahead and an index...
// TODO: get rid of lookaheadPos or at least never use directly. maybe rename to beforePeekPos?
// or better yet keep positions in the lookahead buffer.
// ok so plan here is to get rid of lookaheadPos and add a rewindPos that stores
// the pos to rewind to.
func (p *parser) rewind() {
if len(p.lookahead) == 0 {
return
}
// This is more complex than it needs to be to allow reusing the buffer underlying
// p.readahead.
newcap := len(p.lookahead) + len(p.readahead)
if cap(p.readahead) < newcap {
readahead2 := make([]rune, newcap)
copy(readahead2[len(p.lookahead):], p.readahead)
p.readahead = readahead2
} else {
p.readahead = p.readahead[:newcap]
copy(p.readahead[len(p.lookahead):], p.readahead)
}
copy(p.readahead, p.lookahead)
p.lookahead = p.lookahead[:0]
p.lookaheadPos = p.pos
}
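// lookaheadProtocolSketch is an illustrative, unused example of the
// peek/commit/rewind contract documented above: every peek must be followed by
// either commit, which accepts the peeked runes into p.pos, or rewind, which
// pushes them back so later read/peek calls see them again.
func (p *parser) lookaheadProtocolSketch() {
	r, eof := p.peek()
	if eof {
		return
	}
	if r == '#' {
		p.commit() // keep the '#'; p.pos now points past it
		return
	}
	p.rewind() // not what we wanted; make the rune readable again
}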
// TODO: remove isFileMap like in printer. can't rn as we have to subtract delim
func (p *parser) parseMap(isFileMap bool) *d2ast.Map {
m := &d2ast.Map{
Range: d2ast.Range{
Path: p.path,
Start: p.pos,
},
}
defer m.Range.End.From(&p.pos)
if !isFileMap {
m.Range.Start = m.Range.Start.Subtract('{', p.utf16Pos)
p.depth++
defer dec(&p.depth)
}
for {
r, eof := p.readNotSpace()
if eof {
if !isFileMap {
p.errorf(m.Range.Start, p.readerPos, "maps must be terminated with }")
}
return m
}
switch r {
case ';':
continue
case '}':
if isFileMap {
p.errorf(p.pos.Subtract(r, p.utf16Pos), p.pos, "unexpected map termination character } in file map")
continue
}
return m
}
n := p.parseMapNode(r)
if n.Unbox() != nil {
m.Nodes = append(m.Nodes, n)
// TODO: all subsequent non-comment characters on the current line (or till ;)
// need to be considered errors.
// TODO: add specific msg for each bad rune type
}
if n.BlockComment != nil {
// Anything after a block comment is ok.
continue
}
after := p.pos
for {
r, newlines, eof := p.peekNotSpace()
if eof || newlines != 0 || r == ';' || r == '}' || r == '#' {
p.rewind()
break
}
p.commit()
}
// TODO: maybe better idea here is to make parseUnquotedString aware of its delimiters
// better and so it would read technically invalid characters and just complain.
// TODO: that way broken syntax will be parsed more "intently". would work better with
// language tooling I think though not sure. yes definitely, eaterr!
if after != p.pos {
if n.Unbox() != nil {
if n.MapKey != nil && n.MapKey.Value.Unbox() != nil {
ps := ""
if _, ok := n.MapKey.Value.Unbox().(*d2ast.BlockString); ok {
ps = ". See https://d2lang.com/tour/text#advanced-block-strings."
}
p.errorf(after, p.pos, "unexpected text after %v%s", n.MapKey.Value.Unbox().Type(), ps)
} else {
p.errorf(after, p.pos, "unexpected text after %v", n.Unbox().Type())
}
} else {
p.errorf(after, p.pos, "invalid text beginning unquoted key")
}
}
}
}
func (p *parser) parseMapNode(r rune) d2ast.MapNodeBox {
var box d2ast.MapNodeBox
switch r {
case '#':
box.Comment = p.parseComment()
return box
case '"':
s, eof := p.peekn(2)
if eof {
break
}
if s != `""` {
p.rewind()
break
}
p.commit()
box.BlockComment = p.parseBlockComment()
return box
case '.':
s, eof := p.peekn(2)
if eof {
break
}
if s != ".." {
p.rewind()
break
}
r, eof := p.peek()
if eof {
break
}
if r == '$' {
p.commit()
box.Substitution = p.parseSubstitution(true)
return box
}
if r == '@' {
p.commit()
box.Import = p.parseImport(true)
return box
}
p.rewind()
break
}
p.replay(r)
box.MapKey = p.parseMapKey()
return box
}
func (p *parser) parseComment() *d2ast.Comment {
c := &d2ast.Comment{
Range: d2ast.Range{
Path: p.path,
Start: p.pos.Subtract('#', p.utf16Pos),
},
}
defer c.Range.End.From(&p.pos)
var sb strings.Builder
defer func() {
c.Value = sb.String()
}()
p.parseCommentLine(c, &sb)
for {
r, newlines, eof := p.peekNotSpace()
if eof {
return c
}
if r != '#' || newlines >= 2 {
p.rewind()
return c
}
p.commit()
if newlines == 1 {
sb.WriteByte('\n')
}
p.parseCommentLine(c, &sb)
}
}
func (p *parser) parseCommentLine(c *d2ast.Comment, sb *strings.Builder) {
firstRune := true
for {
r, eof := p.peek()
if eof {
return
}
if r == '\n' {
p.rewind()
return
}
p.commit()
if firstRune {
firstRune = false
if r == ' ' {
continue
}
}
sb.WriteRune(r)
}
}
func (p *parser) parseBlockComment() *d2ast.BlockComment {
bc := &d2ast.BlockComment{
Range: d2ast.Range{
Path: p.path,
Start: p.pos.SubtractString(`"""`, p.utf16Pos),
},
}
defer bc.Range.End.From(&p.pos)
p.depth++
defer dec(&p.depth)
var sb strings.Builder
defer func() {
bc.Value = trimSpaceAfterLastNewline(sb.String())
bc.Value = trimCommonIndent(bc.Value)
}()
for {
r, eof := p.peek()
if eof {
p.errorf(bc.Range.Start, p.readerPos, `block comments must be terminated with """`)
return bc
}
if !unicode.IsSpace(r) {
p.rewind()
break
}
p.commit()
if r == '\n' {
break
}
}
for {
r, eof := p.read()
if eof {
p.errorf(bc.Range.Start, p.readerPos, `block comments must be terminated with """`)
return bc
}
if r != '"' {
sb.WriteRune(r)
continue
}
s, eof := p.peekn(2)
if eof {
p.errorf(bc.Range.Start, p.readerPos, `block comments must be terminated with """`)
return bc
}
if s != `""` {
sb.WriteByte('"')
p.rewind()
continue
}
p.commit()
return bc
}
}
func trimSpaceAfterLastNewline(s string) string {
lastNewline := strings.LastIndexByte(s, '\n')
if lastNewline == -1 {
return strings.TrimRightFunc(s, unicode.IsSpace)
}
lastLine := s[lastNewline+1:]
lastLine = strings.TrimRightFunc(lastLine, unicode.IsSpace)
if len(lastLine) == 0 {
return s[:lastNewline]
}
return s[:lastNewline+1] + lastLine
}
func (p *parser) parseMapKey() (mk *d2ast.Key) {
mk = &d2ast.Key{
Range: d2ast.Range{
Path: p.path,
Start: p.pos,
},
}
defer mk.Range.End.From(&p.pos)
defer func() {
if mk.Key == nil && len(mk.Edges) == 0 {
mk = nil
}
}()
// Check for a leading !& (not-ampersand) or & (ampersand) prefix.
r, eof := p.peek()
if eof {
return mk
}
if r == '!' {
r, eof := p.peek()
if eof {
return mk
}
if r == '&' {
p.commit()
mk.NotAmpersand = true
} else {
p.rewind()
}
} else if r == '&' {
p.commit()
mk.Ampersand = true
} else {
p.rewind()
}
r, eof = p.peek()
if eof {
return mk
}
if r == '(' {
p.commit()
p.parseEdgeGroup(mk)
return mk
}
p.rewind()
k := p.parseKey()
if k != nil {
mk.Key = k
}
r, newlines, eof := p.peekNotSpace()
if eof {
return mk
}
if newlines > 0 {
p.rewind()
return mk
}
switch r {
case '(':
p.commit()
p.parseEdgeGroup(mk)
return mk
case '<', '>', '-':
p.rewind()
mk.Key = nil
p.parseEdges(mk, k)
p.parseMapKeyValue(mk)
return mk
default:
p.rewind()
p.parseMapKeyValue(mk)
return mk
}
}
func (p *parser) parseMapKeyValue(mk *d2ast.Key) {
r, newlines, eof := p.peekNotSpace()
if eof {
return
}
if newlines > 0 {
p.rewind()
return
}
switch r {
case '{':
p.rewind()
if mk.Key == nil && len(mk.Edges) == 0 {
return
}
case ':':
p.commit()
if mk.Key == nil && len(mk.Edges) == 0 {
p.errorf(mk.Range.Start, p.pos, "map value without key")
}
default:
p.rewind()
return
}
mk.Value = p.parseValue()
if mk.Value.Unbox() == nil {
p.errorf(p.pos.Subtract(':', p.utf16Pos), p.pos, "missing value after colon")
}
sb := mk.Value.ScalarBox()
// If the value is a scalar, then check if it's the primary value.
if sb.Unbox() != nil {
r, newlines, eof := p.peekNotSpace()
if eof || newlines > 0 || r != '{' {
p.rewind()
return
}
// Next character is on the same line without ; separator so it must mean
// our current value is the Primary and the next is the Value.
p.commit()
p.replay(r)
mk.Primary = sb
mk.Value = p.parseValue()
}
}
func (p *parser) parseEdgeGroup(mk *d2ast.Key) {
// To prevent p.parseUnquotedString from consuming terminating parentheses.
p.inEdgeGroup = true
defer func() {
p.inEdgeGroup = false
}()
src := p.parseKey()
p.parseEdges(mk, src)
r, newlines, eof := p.peekNotSpace()
if eof || newlines > 0 {
p.rewind()
return
}
if r != ')' {
p.rewind()
p.errorf(mk.Range.Start, p.pos, "edge groups must be terminated with )")
return
}
p.commit()
r, newlines, eof = p.peekNotSpace()
if eof || newlines > 0 {
p.rewind()
return
}
if r == '[' {
p.commit()
mk.EdgeIndex = p.parseEdgeIndex()
} else {
p.rewind()
}
r, newlines, eof = p.peekNotSpace()
if eof || newlines > 0 {
p.rewind()
return
}
if r == '.' {
p.commit()
mk.EdgeKey = p.parseKey()
} else {
p.rewind()
}
p.inEdgeGroup = false
p.parseMapKeyValue(mk)
}
func (p *parser) parseEdgeIndex() *d2ast.EdgeIndex {
ei := &d2ast.EdgeIndex{
Range: d2ast.Range{
Path: p.path,
Start: p.pos.Subtract('[', p.utf16Pos),
},
}
defer ei.Range.End.From(&p.pos)
r, newlines, eof := p.peekNotSpace()
if eof || newlines > 0 {
p.rewind()
return nil
}
if unicode.IsDigit(r) {
p.commit()
var sb strings.Builder
sb.WriteRune(r)
for {
r, newlines, eof = p.peekNotSpace()
if eof || newlines > 0 {
p.rewind()
p.errorf(ei.Range.Start, p.pos, "unterminated edge index")
return nil
}
if r == ']' {
p.rewind()
break
}
p.commit()
if !unicode.IsDigit(r) {
p.errorf(p.pos.Subtract(r, p.utf16Pos), p.pos, "unexpected character in edge index")
continue
}
sb.WriteRune(r)
}
i, _ := strconv.Atoi(sb.String())
ei.Int = &i
} else if r == '*' {
p.commit()
ei.Glob = true
} else {
p.errorf(p.pos.Subtract(r, p.utf16Pos), p.pos, "unexpected character in edge index")
// TODO: skip to ], maybe add a p.skipTo to skip to certain characters
}
r, newlines, eof = p.peekNotSpace()
if eof || newlines > 0 || r != ']' {
p.rewind()
p.errorf(ei.Range.Start, p.pos, "unterminated edge index")
return ei
}
p.commit()
return ei
}
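// parseEdges parses a chain of connections such as "a -> b <-> c -- d" (the
// example input is illustrative). Each hop appends one d2ast.Edge to mk.Edges,
// reusing the previous destination as the next source.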
func (p *parser) parseEdges(mk *d2ast.Key, src *d2ast.KeyPath) {
for {
e := &d2ast.Edge{
Range: d2ast.Range{
Path: p.path,
},
Src: src,
}
if src != nil {
e.Range.Start = src.Range.Start
} else {
e.Range.Start = p.pos
}
r, newlines, eof := p.peekNotSpace()
if eof {
return
}
if newlines > 0 {
p.rewind()
return
}
if r == '<' || r == '*' {
e.SrcArrow = string(r)
} else if r != '-' {
p.rewind()
return
}
if src == nil {
p.errorf(p.lookaheadPos.Subtract(r, p.utf16Pos), p.lookaheadPos, "connection missing source")
e.Range.Start = p.lookaheadPos.Subtract(r, p.utf16Pos)
}
p.commit()
if !p.parseEdge(e) {
return
}
dst := p.parseKey()
if dst == nil {
p.errorf(e.Range.Start, p.pos, "connection missing destination")
} else {
e.Dst = dst
e.Range.End = e.Dst.Range.End
}
mk.Edges = append(mk.Edges, e)
src = dst
}
}
func (p *parser) parseEdge(e *d2ast.Edge) (ok bool) {
defer e.Range.End.From(&p.pos)
for {
r, eof := p.peek()
if eof {
p.errorf(e.Range.Start, p.readerPos, "unterminated connection")
return false
}
switch r {
case '>', '*':
e.DstArrow = string(r)
p.commit()
return true
case '\\':
p.commit()
r, newlines, eof := p.peekNotSpace()
if eof {
continue
}
if newlines == 0 {
p.rewind()
p.errorf(e.Range.Start, p.readerPos, "only newline escapes are allowed in connections")
return false
}
if newlines > 1 {
p.rewind()
continue
}
p.commit()
p.replay(r)
case '-':
p.commit()
default:
p.rewind()
return true
}
}
}
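// parseKey parses a dotted key path such as "a.b.c" (illustrative input) and
// returns nil if no path components could be read.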
func (p *parser) parseKey() (k *d2ast.KeyPath) {
k = &d2ast.KeyPath{
Range: d2ast.Range{
Path: p.path,
Start: p.pos,
},
}
defer func() {
if len(k.Path) == 0 {
k = nil
} else {
k.Range.End = k.Path[len(k.Path)-1].Unbox().GetRange().End
}
}()
for {
r, newlines, eof := p.peekNotSpace()
if eof {
return k
}
if newlines > 0 || r == '(' {
p.rewind()
return k
}
// TODO: error if begin, but see below too
if r == '.' {
continue
}
p.rewind()
sb := p.parseString(true)
s := sb.Unbox()
if s == nil {
return k
}
if sb.UnquotedString != nil && strings.HasPrefix(s.ScalarString(), "@") {
p.errorf(s.GetRange().Start, s.GetRange().End, "%s is not a valid import, did you mean ...%[2]s?", s.ScalarString())
}
if len(k.Path) == 0 {
k.Range.Start = s.GetRange().Start
}
k.Path = append(k.Path, &sb)
r, newlines, eof = p.peekNotSpace()
if eof {
return k
}
if newlines > 0 || r != '.' {
p.rewind()
return k
}
// TODO: error if not string or ( after, see above too
p.commit()
}
}
// TODO: inKey -> p.inKey (means I have to restore though)
func (p *parser) parseString(inKey bool) d2ast.StringBox {
var box d2ast.StringBox
r, newlines, eof := p.peekNotSpace()
if eof || newlines > 0 {
p.rewind()
return box
}
p.commit()
switch r {
case '"':
box.DoubleQuotedString = p.parseDoubleQuotedString(inKey)
return box
case '\'':
box.SingleQuotedString = p.parseSingleQuotedString()
return box
case '|':
box.BlockString = p.parseBlockString()
return box
default:
p.replay(r)
box.UnquotedString = p.parseUnquotedString(inKey)
return box
}
}
func (p *parser) parseUnquotedString(inKey bool) (s *d2ast.UnquotedString) {
s = &d2ast.UnquotedString{
Range: d2ast.Range{
Path: p.path,
Start: p.pos,
},
}
// TODO: fix unquoted end whitespace handling to peekNotSpace
lastNonSpace := p.pos
defer s.Range.End.From(&lastNonSpace)
var sb strings.Builder
var rawb strings.Builder
lastPatternIndex := 0
defer func() {
sv := strings.TrimRightFunc(sb.String(), unicode.IsSpace)
rawv := strings.TrimRightFunc(rawb.String(), unicode.IsSpace)
if s.Pattern != nil {
if lastPatternIndex < len(sv) {
s.Pattern = append(s.Pattern, sv[lastPatternIndex:])
}
}
if sv == "" {
if len(s.Value) > 0 {
return
}
s = nil
// TODO: this should be in the parent and instead they check the delimiters first
// or last really. only in parseMapNode && parseArrayNode
// TODO: give specific descriptions for each kind of special character that could have caused this.
return
}
s.Value = append(s.Value, d2ast.InterpolationBox{String: &sv, StringRaw: &rawv})
}()
_s, eof := p.peekn(4)
p.rewind()
if !eof {
if _s == "...@" {
p.errorf(p.pos, p.pos.AdvanceString("...@", p.utf16Pos), "unquoted strings cannot begin with ...@ as that's import spread syntax")
}
}
for {
r, eof := p.peek()
if eof {
return s
}
if p.inEdgeGroup && r == ')' {
// TODO: need a peekNotSpace across escaped newlines
r2, newlines, eof := p.peekNotSpace()
if eof || newlines > 0 {
p.rewind()
return s
}
switch r2 {
case '\n', '#', '{', '}', '[', ']', ':', '.':
p.rewind()
return s
}
p.rewind()
p.peek()
p.commit()
lastNonSpace = p.pos
sb.WriteRune(r)
rawb.WriteRune(r)
continue
}
// top: '\n', '#', '{', '}', '[', ']'
// keys: ':', '.'
// edges: '<', '>', '(', ')',
// edges: --, ->, -*, *-
switch r {
case '\n', ';', '#', '{', '}', '[', ']':
p.rewind()
return s
}
if inKey {
switch r {
case ':', '.', '<', '>', '&':
p.rewind()
return s
case '-':
// TODO: need a peekNotSpace across escaped newlines
r2, eof := p.peek()
if eof {
return s
}
switch r2 {
case '\n', ';', '#', '{', '}', '[', ']':
p.rewind()
p.peek()
p.commit()
sb.WriteRune(r)
rawb.WriteRune(r)
return s
}
if r2 == '-' || r2 == '>' || r2 == '*' {
p.rewind()
return s
}
sb.WriteRune(r)
rawb.WriteRune(r)
r = r2
}
}
if r == '*' {
if sb.Len() == 0 {
s.Pattern = append(s.Pattern, "*")
} else {
s.Pattern = append(s.Pattern, sb.String()[lastPatternIndex:], "*")
}
lastPatternIndex = len(sb.String()) + 1
}
p.commit()
if !unicode.IsSpace(r) {
lastNonSpace = p.pos
}
if !inKey && r == '$' {
subst := p.parseSubstitution(false)
if subst != nil {
if sb.Len() > 0 {
sv := sb.String()
rawv := rawb.String()
s.Value = append(s.Value, d2ast.InterpolationBox{String: &sv, StringRaw: &rawv})
sb.Reset()
rawb.Reset()
}
s.Value = append(s.Value, d2ast.InterpolationBox{Substitution: subst})
continue
}
continue
}
if r != '\\' {
sb.WriteRune(r)
rawb.WriteRune(r)
continue
}
r2, eof := p.read()
if eof {
p.errorf(p.pos.Subtract('\\', p.utf16Pos), p.readerPos, "unfinished escape sequence")
return s
}
if r2 == '\n' {
r, newlines, eof := p.peekNotSpace()
if eof || newlines > 0 {
p.rewind()
return s
}
p.commit()
p.replay(r)
continue
}
sb.WriteRune(decodeEscape(r2))
rawb.WriteByte('\\')
rawb.WriteRune(r2)
}
}
// https://go.dev/ref/spec#Rune_literals
// TODO: implement all Go escapes like the unicode ones
func decodeEscape(r2 rune) rune {
switch r2 {
case 'a':
return '\a'
case 'b':
return '\b'
case 'f':
return '\f'
case 'n':
return '\n'
case 'r':
return '\r'
case 't':
return '\t'
case 'v':
return '\v'
case '\\':
return '\\'
case '"':
return '"'
default:
return r2
}
}
func (p *parser) parseDoubleQuotedString(inKey bool) *d2ast.DoubleQuotedString {
s := &d2ast.DoubleQuotedString{
Range: d2ast.Range{
Path: p.path,
Start: p.pos.Subtract('"', p.utf16Pos),
},
}
defer s.Range.End.From(&p.pos)
var sb strings.Builder
var rawb strings.Builder
defer func() {
if sb.Len() > 0 {
sv := sb.String()
rawv := rawb.String()
s.Value = append(s.Value, d2ast.InterpolationBox{String: &sv, StringRaw: &rawv})
}
}()
for {
r, eof := p.peek()
if eof {
p.errorf(s.Range.Start, p.readerPos, `double quoted strings must be terminated with "`)
return s
}
if r == '\n' {
p.rewind()
p.errorf(s.Range.Start, p.pos, `double quoted strings must be terminated with "`)
return s
}
p.commit()
if !inKey && r == '$' {
subst := p.parseSubstitution(false)
if subst != nil {
if sb.Len() > 0 {
s.Value = append(s.Value, d2ast.InterpolationBox{String: go2.Pointer(sb.String())})
sb.Reset()
}
s.Value = append(s.Value, d2ast.InterpolationBox{Substitution: subst})
continue
}
}
if r == '"' {
return s
}
if r != '\\' {
sb.WriteRune(r)
rawb.WriteRune(r)
continue
}
r2, eof := p.read()
if eof {
p.errorf(p.pos.Subtract('\\', p.utf16Pos), p.readerPos, "unfinished escape sequence")
p.errorf(s.Range.Start, p.readerPos, `double quoted strings must be terminated with "`)
return s
}
if r2 == '\n' {
// TODO: deindent
continue
}
sb.WriteRune(decodeEscape(r2))
rawb.WriteByte('\\')
rawb.WriteRune(r2)
}
}
func (p *parser) parseSingleQuotedString() *d2ast.SingleQuotedString {
s := &d2ast.SingleQuotedString{
Range: d2ast.Range{
Path: p.path,
Start: p.pos.Subtract('\'', p.utf16Pos),
},
}
defer s.Range.End.From(&p.pos)
var sb strings.Builder
defer func() {
s.Value = sb.String()
}()
for {
r, eof := p.peek()
if eof {
p.errorf(s.Range.Start, p.readerPos, `single quoted strings must be terminated with '`)
return s
}
if r == '\n' {
p.rewind()
p.errorf(s.Range.Start, p.pos, `single quoted strings must be terminated with '`)
return s
}
p.commit()
if r == '\'' {
r, eof = p.peek()
if eof {
return s
}
if r == '\'' {
p.commit()
sb.WriteByte('\'')
continue
}
p.rewind()
return s
}
if r != '\\' {
sb.WriteRune(r)
continue
}
r2, eof := p.peek()
if eof {
continue
}
switch r2 {
case '\n':
p.commit()
continue
default:
sb.WriteRune(r)
p.rewind()
}
}
}
func (p *parser) parseBlockString() *d2ast.BlockString {
bs := &d2ast.BlockString{
Range: d2ast.Range{
Path: p.path,
Start: p.pos.Subtract('|', p.utf16Pos),
},
}
defer bs.Range.End.From(&p.pos)
p.depth++
defer dec(&p.depth)
var sb strings.Builder
defer func() {
bs.Value = trimSpaceAfterLastNewline(sb.String())
bs.Value = trimCommonIndent(bs.Value)
}()
// Do we have more symbol quotes?
bs.Quote = ""
for {
r, eof := p.peek()
if eof {
p.errorf(bs.Range.Start, p.readerPos, `block string must be terminated with %v`, bs.Quote+"|")
return bs
}
if unicode.IsSpace(r) || unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' {
p.rewind()
break
}
p.commit()
bs.Quote += string(r)
}
// Do we have a tag?
for {
r, eof := p.peek()
if eof {
p.errorf(bs.Range.Start, p.readerPos, `block string must be terminated with %v`, bs.Quote+"|")
return bs
}
if unicode.IsSpace(r) {
p.rewind()
break
}
p.commit()
bs.Tag += string(r)
}
if bs.Tag == "" {
// TODO: no and fix compiler to not set text/markdown shape always.
// reason being not all multiline text is markdown by default.
// for example markdown edge labels or other random text.
// maybe we can be smart about this at some point and only set
// if the block string is being interpreted as markdown.
bs.Tag = "md"
}
// Skip non newline whitespace.
for {
r, eof := p.peek()
if eof {
p.errorf(bs.Range.Start, p.readerPos, `block string must be terminated with %v`, bs.Quote+"|")
return bs
}
if !unicode.IsSpace(r) {
// Non whitespace characters on the first line have an implicit indent.
sb.WriteString(p.getIndent())
p.rewind()
break
}
p.commit()
if r == '\n' {
break
}
}
endHint := '|'
endRest := ""
if len(bs.Quote) > 0 {
var size int
endHint, size = utf8.DecodeLastRuneInString(bs.Quote)
endRest = bs.Quote[size:] + "|"
}
for {
r, eof := p.read()
if eof {
p.errorf(bs.Range.Start, p.readerPos, `block string must be terminated with %v`, bs.Quote+"|")
return bs
}
if r != endHint {
if (bs.Tag == "latex" || bs.Tag == "tex") && r == '\\' {
// For LaTeX, where single backslash is common, we escape it so that users don't have to write double the backslashes
sb.WriteRune('\\')
sb.WriteRune('\\')
continue
}
sb.WriteRune(r)
continue
}
s, eof := p.peekn(len(endRest))
if eof {
p.errorf(bs.Range.Start, p.readerPos, `block string must be terminated with %v`, bs.Quote+"|")
return bs
}
if s != endRest {
sb.WriteRune(endHint)
p.rewind()
continue
}
p.commit()
return bs
}
}
func (p *parser) parseArray() *d2ast.Array {
a := &d2ast.Array{
Range: d2ast.Range{
Path: p.path,
Start: p.pos.Subtract('[', p.utf16Pos),
},
}
defer a.Range.End.From(&p.readerPos)
p.depth++
defer dec(&p.depth)
for {
r, eof := p.readNotSpace()
if eof {
p.errorf(a.Range.Start, p.readerPos, "arrays must be terminated with ]")
return a
}
switch r {
case ';':
continue
case ']':
return a
}
n := p.parseArrayNode(r)
if n.Unbox() != nil {
a.Nodes = append(a.Nodes, n)
}
if n.BlockComment != nil {
// Anything after a block comment is ok.
continue
}
after := p.pos
for {
r, newlines, eof := p.peekNotSpace()
if eof || newlines != 0 || r == ';' || r == ']' || r == '#' {
p.rewind()
break
}
p.commit()
}
if after != p.pos {
if n.Unbox() != nil {
p.errorf(after, p.pos, "unexpected text after %v", n.Unbox().Type())
} else {
p.errorf(after, p.pos, "invalid text beginning unquoted string")
}
}
}
}
func (p *parser) parseArrayNode(r rune) d2ast.ArrayNodeBox {
var box d2ast.ArrayNodeBox
switch r {
case '#':
box.Comment = p.parseComment()
return box
case '"':
s, eof := p.peekn(2)
if eof {
break
}
if s != `""` {
p.rewind()
break
}
p.commit()
box.BlockComment = p.parseBlockComment()
return box
case '.':
s, eof := p.peekn(2)
if eof {
break
}
if s != ".." {
p.rewind()
break
}
r, eof := p.peek()
if eof {
break
}
if r == '$' {
p.commit()
box.Substitution = p.parseSubstitution(true)
return box
}
if r == '@' {
p.commit()
box.Import = p.parseImport(true)
return box
}
p.rewind()
break
}
p.replay(r)
vbox := p.parseValue()
if vbox.UnquotedString != nil && vbox.UnquotedString.ScalarString() == "" &&
!(len(vbox.UnquotedString.Value) > 0 && vbox.UnquotedString.Value[0].Substitution != nil) {
p.errorf(p.pos, p.pos.Advance(r, p.utf16Pos), "unquoted strings cannot start on %q", r)
}
box.Null = vbox.Null
box.Boolean = vbox.Boolean
box.Number = vbox.Number
box.UnquotedString = vbox.UnquotedString
box.DoubleQuotedString = vbox.DoubleQuotedString
box.SingleQuotedString = vbox.SingleQuotedString
box.BlockString = vbox.BlockString
box.Array = vbox.Array
box.Map = vbox.Map
box.Import = vbox.Import
return box
}
func (p *parser) parseValue() d2ast.ValueBox {
var box d2ast.ValueBox
r, newlines, eof := p.peekNotSpace()
if eof || newlines > 0 {
p.rewind()
return box
}
p.commit()
switch r {
case '[':
box.Array = p.parseArray()
return box
case '{':
box.Map = p.parseMap(false)
return box
case '@':
box.Import = p.parseImport(false)
return box
}
p.replay(r)
sb := p.parseString(false)
if sb.DoubleQuotedString != nil {
box.DoubleQuotedString = sb.DoubleQuotedString
return box
}
if sb.SingleQuotedString != nil {
box.SingleQuotedString = sb.SingleQuotedString
return box
}
if sb.BlockString != nil {
box.BlockString = sb.BlockString
return box
}
if sb.UnquotedString == nil {
return box
}
s := sb.UnquotedString
if strings.EqualFold(s.ScalarString(), "null") {
box.Null = &d2ast.Null{
Range: s.Range,
}
return box
}
if strings.EqualFold(s.ScalarString(), "true") {
box.Boolean = &d2ast.Boolean{
Range: s.Range,
Value: true,
}
return box
}
if strings.EqualFold(s.ScalarString(), "false") {
box.Boolean = &d2ast.Boolean{
Range: s.Range,
Value: false,
}
return box
}
// TODO: only if matches regex
rat, ok := big.NewRat(0, 1).SetString(s.ScalarString())
if ok {
box.Number = &d2ast.Number{
Range: s.Range,
Raw: s.ScalarString(),
Value: rat,
}
return box
}
box.UnquotedString = s
return box
}
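// parseSubstitution parses a "${...}" substitution; the caller has already
// consumed the leading '$' (and the "..." prefix for a spread). This summary
// is inferred from the code below rather than taken from upstream docs.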
func (p *parser) parseSubstitution(spread bool) *d2ast.Substitution {
subst := &d2ast.Substitution{
Range: d2ast.Range{
Path: p.path,
Start: p.pos.SubtractString("$", p.utf16Pos),
},
Spread: spread,
}
defer subst.Range.End.From(&p.pos)
if subst.Spread {
subst.Range.Start = subst.Range.Start.SubtractString("...", p.utf16Pos)
}
r, newlines, eof := p.peekNotSpace()
if eof {
return nil
}
if newlines > 0 {
p.rewind()
return nil
}
if r != '{' {
p.rewind()
p.errorf(subst.Range.Start, p.readerPos, "substitutions must begin on {")
return nil
} else {
p.commit()
}
k := p.parseKey()
if k != nil {
subst.Path = k.Path
}
r, newlines, eof = p.peekNotSpace()
if eof {
p.errorf(subst.Range.Start, p.readerPos, "substitutions must be terminated by }")
return subst
}
if newlines > 0 || r != '}' {
p.rewind()
p.errorf(subst.Range.Start, p.pos, "substitutions must be terminated by }")
return subst
}
p.commit()
return subst
}
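// parseImport parses an import reference; the leading '@' (and "..." for a
// spread) has already been consumed by the caller. As an illustration inferred
// from the code below, it accepts inputs like "@x", "@../x", and "@x.d2",
// where a ".d2" element directly after the first path component is dropped
// from the key path.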
func (p *parser) parseImport(spread bool) *d2ast.Import {
imp := &d2ast.Import{
Range: d2ast.Range{
Path: p.path,
Start: p.pos.SubtractString("$", p.utf16Pos),
},
Spread: spread,
}
defer imp.Range.End.From(&p.pos)
if imp.Spread {
imp.Range.Start = imp.Range.Start.SubtractString("...", p.utf16Pos)
}
var pre strings.Builder
for {
r, eof := p.peek()
if eof {
break
}
if r != '.' && r != '/' {
p.rewind()
break
}
pre.WriteRune(r)
p.commit()
}
imp.Pre = pre.String()
k := p.parseKey()
if k == nil {
return imp
}
if k.Path[0].UnquotedString != nil && len(k.Path) > 1 && k.Path[1].UnquotedString != nil && k.Path[1].Unbox().ScalarString() == "d2" {
k.Path = append(k.Path[:1], k.Path[2:]...)
}
imp.Path = k.Path
return imp
}
// func marshalKey(k *d2ast.Key) string {
// var sb strings.Builder
// for i, s := range k.Path {
// // TODO: Need to encode specials and quotes.
// sb.WriteString(s.Unbox().ScalarString())
// if i < len(k.Path)-1 {
// sb.WriteByte('.')
// }
// }
// return sb.String()
// }
func dec(i *int) {
*i -= 1
}
func (p *parser) getIndent() string {
return strings.Repeat(" ", p.depth*2)
}
func trimIndent(s, indent string) string {
lines := strings.Split(s, "\n")
for i, l := range lines {
if l == "" {
continue
}
_, l = splitLeadingIndent(l, len(indent))
lines[i] = l
}
return strings.Join(lines, "\n")
}
func trimCommonIndent(s string) string {
commonIndent := ""
for _, l := range strings.Split(s, "\n") {
if l == "" {
continue
}
lineIndent, l := splitLeadingIndent(l, -1)
if lineIndent == "" {
// No common indent; return as is.
return s
}
if l == "" {
// Whitespace only line.
continue
}
if commonIndent == "" || len(lineIndent) < len(commonIndent) {
commonIndent = lineIndent
}
}
if commonIndent == "" {
return s
}
return trimIndent(s, commonIndent)
}
func splitLeadingIndent(s string, maxSpaces int) (indent, rets string) {
var indentb strings.Builder
i := 0
for _, r := range s {
if !unicode.IsSpace(r) {
break
}
i++
if r != '\t' {
indentb.WriteRune(r)
} else {
indentb.WriteByte(' ')
indentb.WriteByte(' ')
}
if maxSpaces > -1 && indentb.Len() == maxSpaces {
break
}
}
return indentb.String(), s[i:]
}
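// exampleTrimCommonIndent is a small, hedged illustration of the indent
// helpers above: trimCommonIndent strips the smallest leading indent shared by
// all non-blank lines (with tabs counted as two spaces by splitLeadingIndent).
func exampleTrimCommonIndent() {
	in := "    line one\n      line two\n    line three"
	fmt.Println(trimCommonIndent(in))
	// Prints:
	// line one
	//   line two
	// line three
}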