Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
752 lines
14 KiB
752 lines
14 KiB
// TOML lexer.
//
// Written using the principles developed by Rob Pike in
// http://www.youtube.com/watch?v=HxaD_trXwRE
|
|
|
package toml |
|
|
|
import ( |
|
"bytes" |
|
"errors" |
|
"fmt" |
|
"regexp" |
|
"strconv" |
|
"strings" |
|
) |
|
|
|
// dateRegexp recognizes date-time values at the start of a string. It is
// compiled in init and used by lexRvalue to detect dates.
var dateRegexp *regexp.Regexp
|
|
|
// tomlLexStateFn is a lexer state: a function that performs one lexing step
// and returns the state to run next. The run loop stops when a state returns
// nil.
type tomlLexStateFn func() tomlLexStateFn
|
|
|
// tomlLexer holds the state of one lexing pass over a TOML document.
type tomlLexer struct {
	// inputIdx is the index in input of the next rune to be read.
	inputIdx int
	// input is the textual source, decoded into runes.
	input []rune // Textual source
	// currentTokenStart and currentTokenStop delimit the pending token as a
	// half-open range of indexes into input (see emit).
	currentTokenStart int
	currentTokenStop  int
	// tokens accumulates every token emitted so far, including error tokens.
	tokens []token
	// depth is incremented on '[' and decremented on ']' by lexRvalue; a
	// newline only ends the value when depth is zero.
	depth int
	// line and col are the position attached to the next emitted token; they
	// are refreshed from endbufferLine/endbufferCol by ignore.
	line int
	col  int
	// endbufferLine and endbufferCol track the position of the read cursor;
	// read bumps the line and resets the column to 1 after each '\n'.
	endbufferLine int
	endbufferCol  int
}
|
|
|
// Basic read operations on input
|
|
|
func (l *tomlLexer) read() rune { |
|
r := l.peek() |
|
if r == '\n' { |
|
l.endbufferLine++ |
|
l.endbufferCol = 1 |
|
} else { |
|
l.endbufferCol++ |
|
} |
|
l.inputIdx++ |
|
return r |
|
} |
|
|
|
func (l *tomlLexer) next() rune { |
|
r := l.read() |
|
|
|
if r != eof { |
|
l.currentTokenStop++ |
|
} |
|
return r |
|
} |
|
|
|
func (l *tomlLexer) ignore() { |
|
l.currentTokenStart = l.currentTokenStop |
|
l.line = l.endbufferLine |
|
l.col = l.endbufferCol |
|
} |
|
|
|
func (l *tomlLexer) skip() { |
|
l.next() |
|
l.ignore() |
|
} |
|
|
|
func (l *tomlLexer) fastForward(n int) { |
|
for i := 0; i < n; i++ { |
|
l.next() |
|
} |
|
} |
|
|
|
func (l *tomlLexer) emitWithValue(t tokenType, value string) { |
|
l.tokens = append(l.tokens, token{ |
|
Position: Position{l.line, l.col}, |
|
typ: t, |
|
val: value, |
|
}) |
|
l.ignore() |
|
} |
|
|
|
func (l *tomlLexer) emit(t tokenType) { |
|
l.emitWithValue(t, string(l.input[l.currentTokenStart:l.currentTokenStop])) |
|
} |
|
|
|
func (l *tomlLexer) peek() rune { |
|
if l.inputIdx >= len(l.input) { |
|
return eof |
|
} |
|
return l.input[l.inputIdx] |
|
} |
|
|
|
func (l *tomlLexer) peekString(size int) string { |
|
maxIdx := len(l.input) |
|
upperIdx := l.inputIdx + size // FIXME: potential overflow |
|
if upperIdx > maxIdx { |
|
upperIdx = maxIdx |
|
} |
|
return string(l.input[l.inputIdx:upperIdx]) |
|
} |
|
|
|
func (l *tomlLexer) follow(next string) bool { |
|
return next == l.peekString(len(next)) |
|
} |
|
|
|
// Error management
|
|
|
func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn { |
|
l.tokens = append(l.tokens, token{ |
|
Position: Position{l.line, l.col}, |
|
typ: tokenError, |
|
val: fmt.Sprintf(format, args...), |
|
}) |
|
return nil |
|
} |
|
|
|
// State functions
|
|
|
func (l *tomlLexer) lexVoid() tomlLexStateFn { |
|
for { |
|
next := l.peek() |
|
switch next { |
|
case '[': |
|
return l.lexTableKey |
|
case '#': |
|
return l.lexComment(l.lexVoid) |
|
case '=': |
|
return l.lexEqual |
|
case '\r': |
|
fallthrough |
|
case '\n': |
|
l.skip() |
|
continue |
|
} |
|
|
|
if isSpace(next) { |
|
l.skip() |
|
} |
|
|
|
if l.depth > 0 { |
|
return l.lexRvalue |
|
} |
|
|
|
if isKeyStartChar(next) { |
|
return l.lexKey |
|
} |
|
|
|
if next == eof { |
|
l.next() |
|
break |
|
} |
|
} |
|
|
|
l.emit(tokenEOF) |
|
return nil |
|
} |
|
|
|
func (l *tomlLexer) lexRvalue() tomlLexStateFn { |
|
for { |
|
next := l.peek() |
|
switch next { |
|
case '.': |
|
return l.errorf("cannot start float with a dot") |
|
case '=': |
|
return l.lexEqual |
|
case '[': |
|
l.depth++ |
|
return l.lexLeftBracket |
|
case ']': |
|
l.depth-- |
|
return l.lexRightBracket |
|
case '{': |
|
return l.lexLeftCurlyBrace |
|
case '}': |
|
return l.lexRightCurlyBrace |
|
case '#': |
|
return l.lexComment(l.lexRvalue) |
|
case '"': |
|
return l.lexString |
|
case '\'': |
|
return l.lexLiteralString |
|
case ',': |
|
return l.lexComma |
|
case '\r': |
|
fallthrough |
|
case '\n': |
|
l.skip() |
|
if l.depth == 0 { |
|
return l.lexVoid |
|
} |
|
return l.lexRvalue |
|
case '_': |
|
return l.errorf("cannot start number with underscore") |
|
} |
|
|
|
if l.follow("true") { |
|
return l.lexTrue |
|
} |
|
|
|
if l.follow("false") { |
|
return l.lexFalse |
|
} |
|
|
|
if l.follow("inf") { |
|
return l.lexInf |
|
} |
|
|
|
if l.follow("nan") { |
|
return l.lexNan |
|
} |
|
|
|
if isSpace(next) { |
|
l.skip() |
|
continue |
|
} |
|
|
|
if next == eof { |
|
l.next() |
|
break |
|
} |
|
|
|
possibleDate := l.peekString(35) |
|
dateMatch := dateRegexp.FindString(possibleDate) |
|
if dateMatch != "" { |
|
l.fastForward(len(dateMatch)) |
|
return l.lexDate |
|
} |
|
|
|
if next == '+' || next == '-' || isDigit(next) { |
|
return l.lexNumber |
|
} |
|
|
|
if isAlphanumeric(next) { |
|
return l.lexKey |
|
} |
|
|
|
return l.errorf("no value can start with %c", next) |
|
} |
|
|
|
l.emit(tokenEOF) |
|
return nil |
|
} |
|
|
|
// lexLeftCurlyBrace consumes one rune (lexRvalue dispatches here on '{'),
// emits it as tokenLeftCurlyBrace and continues in lexRvalue.
func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn {
	l.next()
	l.emit(tokenLeftCurlyBrace)
	return l.lexRvalue
}
|
|
|
// lexRightCurlyBrace consumes one rune (lexRvalue dispatches here on '}'),
// emits it as tokenRightCurlyBrace and continues in lexRvalue.
func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn {
	l.next()
	l.emit(tokenRightCurlyBrace)
	return l.lexRvalue
}
|
|
|
// lexDate emits the pending token text as tokenDate and continues in
// lexRvalue. It consumes nothing itself: lexRvalue fast-forwards over the
// matched date before returning this state.
func (l *tomlLexer) lexDate() tomlLexStateFn {
	l.emit(tokenDate)
	return l.lexRvalue
}
|
|
|
func (l *tomlLexer) lexTrue() tomlLexStateFn { |
|
l.fastForward(4) |
|
l.emit(tokenTrue) |
|
return l.lexRvalue |
|
} |
|
|
|
func (l *tomlLexer) lexFalse() tomlLexStateFn { |
|
l.fastForward(5) |
|
l.emit(tokenFalse) |
|
return l.lexRvalue |
|
} |
|
|
|
func (l *tomlLexer) lexInf() tomlLexStateFn { |
|
l.fastForward(3) |
|
l.emit(tokenInf) |
|
return l.lexRvalue |
|
} |
|
|
|
func (l *tomlLexer) lexNan() tomlLexStateFn { |
|
l.fastForward(3) |
|
l.emit(tokenNan) |
|
return l.lexRvalue |
|
} |
|
|
|
// lexEqual consumes one rune (callers dispatch here on '='), emits it as
// tokenEqual and continues in lexRvalue to lex the assigned value.
func (l *tomlLexer) lexEqual() tomlLexStateFn {
	l.next()
	l.emit(tokenEqual)
	return l.lexRvalue
}
|
|
|
// lexComma consumes one rune (lexRvalue dispatches here on ','), emits it as
// tokenComma and continues in lexRvalue.
func (l *tomlLexer) lexComma() tomlLexStateFn {
	l.next()
	l.emit(tokenComma)
	return l.lexRvalue
}
|
|
|
// Parse the key and emits its value without escape sequences. |
|
// bare keys, basic string keys and literal string keys are supported. |
|
func (l *tomlLexer) lexKey() tomlLexStateFn { |
|
growingString := "" |
|
|
|
for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() { |
|
if r == '"' { |
|
l.next() |
|
str, err := l.lexStringAsString(`"`, false, true) |
|
if err != nil { |
|
return l.errorf(err.Error()) |
|
} |
|
growingString += "\"" + str + "\"" |
|
l.next() |
|
continue |
|
} else if r == '\'' { |
|
l.next() |
|
str, err := l.lexLiteralStringAsString(`'`, false) |
|
if err != nil { |
|
return l.errorf(err.Error()) |
|
} |
|
growingString += "'" + str + "'" |
|
l.next() |
|
continue |
|
} else if r == '\n' { |
|
return l.errorf("keys cannot contain new lines") |
|
} else if isSpace(r) { |
|
break |
|
} else if r == '.' { |
|
// skip |
|
} else if !isValidBareChar(r) { |
|
return l.errorf("keys cannot contain %c character", r) |
|
} |
|
growingString += string(r) |
|
l.next() |
|
} |
|
l.emitWithValue(tokenKey, growingString) |
|
return l.lexVoid |
|
} |
|
|
|
func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn { |
|
return func() tomlLexStateFn { |
|
for next := l.peek(); next != '\n' && next != eof; next = l.peek() { |
|
if next == '\r' && l.follow("\r\n") { |
|
break |
|
} |
|
l.next() |
|
} |
|
l.ignore() |
|
return previousState |
|
} |
|
} |
|
|
|
// lexLeftBracket consumes one rune (lexRvalue dispatches here on '['), emits
// it as tokenLeftBracket and continues in lexRvalue.
func (l *tomlLexer) lexLeftBracket() tomlLexStateFn {
	l.next()
	l.emit(tokenLeftBracket)
	return l.lexRvalue
}
|
|
|
func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) { |
|
growingString := "" |
|
|
|
if discardLeadingNewLine { |
|
if l.follow("\r\n") { |
|
l.skip() |
|
l.skip() |
|
} else if l.peek() == '\n' { |
|
l.skip() |
|
} |
|
} |
|
|
|
// find end of string |
|
for { |
|
if l.follow(terminator) { |
|
return growingString, nil |
|
} |
|
|
|
next := l.peek() |
|
if next == eof { |
|
break |
|
} |
|
growingString += string(l.next()) |
|
} |
|
|
|
return "", errors.New("unclosed string") |
|
} |
|
|
|
func (l *tomlLexer) lexLiteralString() tomlLexStateFn { |
|
l.skip() |
|
|
|
// handle special case for triple-quote |
|
terminator := "'" |
|
discardLeadingNewLine := false |
|
if l.follow("''") { |
|
l.skip() |
|
l.skip() |
|
terminator = "'''" |
|
discardLeadingNewLine = true |
|
} |
|
|
|
str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine) |
|
if err != nil { |
|
return l.errorf(err.Error()) |
|
} |
|
|
|
l.emitWithValue(tokenString, str) |
|
l.fastForward(len(terminator)) |
|
l.ignore() |
|
return l.lexRvalue |
|
} |
|
|
|
// Lex a string and return the results as a string. |
|
// Terminator is the substring indicating the end of the token. |
|
// The resulting string does not include the terminator. |
|
func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) { |
|
growingString := "" |
|
|
|
if discardLeadingNewLine { |
|
if l.follow("\r\n") { |
|
l.skip() |
|
l.skip() |
|
} else if l.peek() == '\n' { |
|
l.skip() |
|
} |
|
} |
|
|
|
for { |
|
if l.follow(terminator) { |
|
return growingString, nil |
|
} |
|
|
|
if l.follow("\\") { |
|
l.next() |
|
switch l.peek() { |
|
case '\r': |
|
fallthrough |
|
case '\n': |
|
fallthrough |
|
case '\t': |
|
fallthrough |
|
case ' ': |
|
// skip all whitespace chars following backslash |
|
for strings.ContainsRune("\r\n\t ", l.peek()) { |
|
l.next() |
|
} |
|
case '"': |
|
growingString += "\"" |
|
l.next() |
|
case 'n': |
|
growingString += "\n" |
|
l.next() |
|
case 'b': |
|
growingString += "\b" |
|
l.next() |
|
case 'f': |
|
growingString += "\f" |
|
l.next() |
|
case '/': |
|
growingString += "/" |
|
l.next() |
|
case 't': |
|
growingString += "\t" |
|
l.next() |
|
case 'r': |
|
growingString += "\r" |
|
l.next() |
|
case '\\': |
|
growingString += "\\" |
|
l.next() |
|
case 'u': |
|
l.next() |
|
code := "" |
|
for i := 0; i < 4; i++ { |
|
c := l.peek() |
|
if !isHexDigit(c) { |
|
return "", errors.New("unfinished unicode escape") |
|
} |
|
l.next() |
|
code = code + string(c) |
|
} |
|
intcode, err := strconv.ParseInt(code, 16, 32) |
|
if err != nil { |
|
return "", errors.New("invalid unicode escape: \\u" + code) |
|
} |
|
growingString += string(rune(intcode)) |
|
case 'U': |
|
l.next() |
|
code := "" |
|
for i := 0; i < 8; i++ { |
|
c := l.peek() |
|
if !isHexDigit(c) { |
|
return "", errors.New("unfinished unicode escape") |
|
} |
|
l.next() |
|
code = code + string(c) |
|
} |
|
intcode, err := strconv.ParseInt(code, 16, 64) |
|
if err != nil { |
|
return "", errors.New("invalid unicode escape: \\U" + code) |
|
} |
|
growingString += string(rune(intcode)) |
|
default: |
|
return "", errors.New("invalid escape sequence: \\" + string(l.peek())) |
|
} |
|
} else { |
|
r := l.peek() |
|
|
|
if 0x00 <= r && r <= 0x1F && !(acceptNewLines && (r == '\n' || r == '\r')) { |
|
return "", fmt.Errorf("unescaped control character %U", r) |
|
} |
|
l.next() |
|
growingString += string(r) |
|
} |
|
|
|
if l.peek() == eof { |
|
break |
|
} |
|
} |
|
|
|
return "", errors.New("unclosed string") |
|
} |
|
|
|
func (l *tomlLexer) lexString() tomlLexStateFn { |
|
l.skip() |
|
|
|
// handle special case for triple-quote |
|
terminator := `"` |
|
discardLeadingNewLine := false |
|
acceptNewLines := false |
|
if l.follow(`""`) { |
|
l.skip() |
|
l.skip() |
|
terminator = `"""` |
|
discardLeadingNewLine = true |
|
acceptNewLines = true |
|
} |
|
|
|
str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines) |
|
|
|
if err != nil { |
|
return l.errorf(err.Error()) |
|
} |
|
|
|
l.emitWithValue(tokenString, str) |
|
l.fastForward(len(terminator)) |
|
l.ignore() |
|
return l.lexRvalue |
|
} |
|
|
|
func (l *tomlLexer) lexTableKey() tomlLexStateFn { |
|
l.next() |
|
|
|
if l.peek() == '[' { |
|
// token '[[' signifies an array of tables |
|
l.next() |
|
l.emit(tokenDoubleLeftBracket) |
|
return l.lexInsideTableArrayKey |
|
} |
|
// vanilla table key |
|
l.emit(tokenLeftBracket) |
|
return l.lexInsideTableKey |
|
} |
|
|
|
// Parse the key till "]]", but only bare keys are supported |
|
func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn { |
|
for r := l.peek(); r != eof; r = l.peek() { |
|
switch r { |
|
case ']': |
|
if l.currentTokenStop > l.currentTokenStart { |
|
l.emit(tokenKeyGroupArray) |
|
} |
|
l.next() |
|
if l.peek() != ']' { |
|
break |
|
} |
|
l.next() |
|
l.emit(tokenDoubleRightBracket) |
|
return l.lexVoid |
|
case '[': |
|
return l.errorf("table array key cannot contain ']'") |
|
default: |
|
l.next() |
|
} |
|
} |
|
return l.errorf("unclosed table array key") |
|
} |
|
|
|
// Parse the key till "]" but only bare keys are supported |
|
func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn { |
|
for r := l.peek(); r != eof; r = l.peek() { |
|
switch r { |
|
case ']': |
|
if l.currentTokenStop > l.currentTokenStart { |
|
l.emit(tokenKeyGroup) |
|
} |
|
l.next() |
|
l.emit(tokenRightBracket) |
|
return l.lexVoid |
|
case '[': |
|
return l.errorf("table key cannot contain ']'") |
|
default: |
|
l.next() |
|
} |
|
} |
|
return l.errorf("unclosed table key") |
|
} |
|
|
|
// lexRightBracket consumes one rune (lexRvalue dispatches here on ']'),
// emits it as tokenRightBracket and continues in lexRvalue.
func (l *tomlLexer) lexRightBracket() tomlLexStateFn {
	l.next()
	l.emit(tokenRightBracket)
	return l.lexRvalue
}
|
|
|
// validRuneFn reports whether r is acceptable in a number written in some
// base; lexNumber selects one according to the 0x / 0o / 0b prefix.
type validRuneFn func(r rune) bool
|
|
|
// isValidHexRune reports whether r is a hexadecimal digit (either case) or
// the '_' digit separator.
func isValidHexRune(r rune) bool {
	switch {
	case 'a' <= r && r <= 'f',
		'A' <= r && r <= 'F',
		'0' <= r && r <= '9':
		return true
	}
	return r == '_'
}
|
|
|
// isValidOctalRune reports whether r is an octal digit or the '_' digit
// separator.
func isValidOctalRune(r rune) bool {
	if r == '_' {
		return true
	}
	return '0' <= r && r <= '7'
}
|
|
|
// isValidBinaryRune reports whether r is a binary digit or the '_' digit
// separator.
func isValidBinaryRune(r rune) bool {
	switch r {
	case '0', '1', '_':
		return true
	default:
		return false
	}
}
|
|
|
func (l *tomlLexer) lexNumber() tomlLexStateFn { |
|
r := l.peek() |
|
|
|
if r == '0' { |
|
follow := l.peekString(2) |
|
if len(follow) == 2 { |
|
var isValidRune validRuneFn |
|
switch follow[1] { |
|
case 'x': |
|
isValidRune = isValidHexRune |
|
case 'o': |
|
isValidRune = isValidOctalRune |
|
case 'b': |
|
isValidRune = isValidBinaryRune |
|
default: |
|
if follow[1] >= 'a' && follow[1] <= 'z' || follow[1] >= 'A' && follow[1] <= 'Z' { |
|
return l.errorf("unknown number base: %s. possible options are x (hex) o (octal) b (binary)", string(follow[1])) |
|
} |
|
} |
|
|
|
if isValidRune != nil { |
|
l.next() |
|
l.next() |
|
digitSeen := false |
|
for { |
|
next := l.peek() |
|
if !isValidRune(next) { |
|
break |
|
} |
|
digitSeen = true |
|
l.next() |
|
} |
|
|
|
if !digitSeen { |
|
return l.errorf("number needs at least one digit") |
|
} |
|
|
|
l.emit(tokenInteger) |
|
|
|
return l.lexRvalue |
|
} |
|
} |
|
} |
|
|
|
if r == '+' || r == '-' { |
|
l.next() |
|
if l.follow("inf") { |
|
return l.lexInf |
|
} |
|
if l.follow("nan") { |
|
return l.lexNan |
|
} |
|
} |
|
|
|
pointSeen := false |
|
expSeen := false |
|
digitSeen := false |
|
for { |
|
next := l.peek() |
|
if next == '.' { |
|
if pointSeen { |
|
return l.errorf("cannot have two dots in one float") |
|
} |
|
l.next() |
|
if !isDigit(l.peek()) { |
|
return l.errorf("float cannot end with a dot") |
|
} |
|
pointSeen = true |
|
} else if next == 'e' || next == 'E' { |
|
expSeen = true |
|
l.next() |
|
r := l.peek() |
|
if r == '+' || r == '-' { |
|
l.next() |
|
} |
|
} else if isDigit(next) { |
|
digitSeen = true |
|
l.next() |
|
} else if next == '_' { |
|
l.next() |
|
} else { |
|
break |
|
} |
|
if pointSeen && !digitSeen { |
|
return l.errorf("cannot start float with a dot") |
|
} |
|
} |
|
|
|
if !digitSeen { |
|
return l.errorf("no digit in that number") |
|
} |
|
if pointSeen || expSeen { |
|
l.emit(tokenFloat) |
|
} else { |
|
l.emit(tokenInteger) |
|
} |
|
return l.lexRvalue |
|
} |
|
|
|
func (l *tomlLexer) run() { |
|
for state := l.lexVoid; state != nil; { |
|
state = state() |
|
} |
|
} |
|
|
|
// init compiles dateRegexp. The pattern is anchored at the start of the
// string and matches a date-time of the form YYYY-MM-DDTHH:MM:SS (year of 1
// to 4 digits), an optional fractional second of 1 to 9 digits, and a
// mandatory offset: either "Z" or a signed HH:MM.
func init() {
	dateRegexp = regexp.MustCompile(`^\d{1,4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})`)
}
|
|
|
// Entry point |
|
func lexToml(inputBytes []byte) []token { |
|
runes := bytes.Runes(inputBytes) |
|
l := &tomlLexer{ |
|
input: runes, |
|
tokens: make([]token, 0, 256), |
|
line: 1, |
|
col: 1, |
|
endbufferLine: 1, |
|
endbufferCol: 1, |
|
} |
|
l.run() |
|
return l.tokens |
|
}
|
|
|