Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
461 lines
12 KiB
461 lines
12 KiB
package chroma |
|
|
|
import ( |
|
"fmt" |
|
"os" |
|
"regexp" |
|
"strings" |
|
"sync" |
|
"unicode/utf8" |
|
|
|
"github.com/dlclark/regexp2" |
|
) |
|
|
|
// A Rule is the fundamental matching unit of the Regex lexer state machine. |
|
type Rule struct { |
|
Pattern string |
|
Type Emitter |
|
Mutator Mutator |
|
} |
|
|
|
// An Emitter takes group matches and returns tokens. |
|
type Emitter interface { |
|
// Emit tokens for the given regex groups. |
|
Emit(groups []string, lexer Lexer) Iterator |
|
} |
|
|
|
// EmitterFunc is a function that is an Emitter. |
|
type EmitterFunc func(groups []string, lexer Lexer) Iterator |
|
|
|
// Emit tokens for groups. |
|
func (e EmitterFunc) Emit(groups []string, lexer Lexer) Iterator { return e(groups, lexer) } |
|
|
|
// ByGroups emits a token for each matching group in the rule's regex. |
|
func ByGroups(emitters ...Emitter) Emitter { |
|
return EmitterFunc(func(groups []string, lexer Lexer) Iterator { |
|
iterators := make([]Iterator, 0, len(groups)-1) |
|
if len(emitters) != len(groups)-1 { |
|
iterators = append(iterators, Error.Emit(groups, lexer)) |
|
// panic(errors.Errorf("number of groups %q does not match number of emitters %v", groups, emitters)) |
|
} else { |
|
for i, group := range groups[1:] { |
|
iterators = append(iterators, emitters[i].Emit([]string{group}, lexer)) |
|
} |
|
} |
|
return Concaterator(iterators...) |
|
}) |
|
} |
|
|
|
// UsingByGroup emits tokens for the matched groups in the regex using a |
|
// "sublexer". Used when lexing code blocks where the name of a sublexer is |
|
// contained within the block, for example on a Markdown text block or SQL |
|
// language block. |
|
// |
|
// The sublexer will be retrieved using sublexerGetFunc (typically |
|
// internal.Get), using the captured value from the matched sublexerNameGroup. |
|
// |
|
// If sublexerGetFunc returns a non-nil lexer for the captured sublexerNameGroup, |
|
// then tokens for the matched codeGroup will be emitted using the retrieved |
|
// lexer. Otherwise, if the sublexer is nil, then tokens will be emitted from |
|
// the passed emitter. |
|
// |
|
// Example: |
|
// |
|
// var Markdown = internal.Register(MustNewLexer( |
|
// &Config{ |
|
// Name: "markdown", |
|
// Aliases: []string{"md", "mkd"}, |
|
// Filenames: []string{"*.md", "*.mkd", "*.markdown"}, |
|
// MimeTypes: []string{"text/x-markdown"}, |
|
// }, |
|
// Rules{ |
|
// "root": { |
|
// {"^(```)(\\w+)(\\n)([\\w\\W]*?)(^```$)", |
|
// UsingByGroup( |
|
// internal.Get, |
|
// 2, 4, |
|
// String, String, String, Text, String, |
|
// ), |
|
// nil, |
|
// }, |
|
// }, |
|
// }, |
|
// )) |
|
// |
|
// See the lexers/m/markdown.go for the complete example. |
|
// |
|
// Note: panic's if the number emitters does not equal the number of matched |
|
// groups in the regex. |
|
func UsingByGroup(sublexerGetFunc func(string) Lexer, sublexerNameGroup, codeGroup int, emitters ...Emitter) Emitter { |
|
return EmitterFunc(func(groups []string, lexer Lexer) Iterator { |
|
// bounds check |
|
if len(emitters) != len(groups)-1 { |
|
panic("UsingByGroup expects number of emitters to be the same as len(groups)-1") |
|
} |
|
|
|
// grab sublexer |
|
sublexer := sublexerGetFunc(groups[sublexerNameGroup]) |
|
|
|
// build iterators |
|
iterators := make([]Iterator, len(groups)-1) |
|
for i, group := range groups[1:] { |
|
if i == codeGroup-1 && sublexer != nil { |
|
var err error |
|
iterators[i], err = sublexer.Tokenise(nil, groups[codeGroup]) |
|
if err != nil { |
|
panic(err) |
|
} |
|
} else { |
|
iterators[i] = emitters[i].Emit([]string{group}, lexer) |
|
} |
|
} |
|
|
|
return Concaterator(iterators...) |
|
}) |
|
} |
|
|
|
// Using returns an Emitter that uses a given Lexer for parsing and emitting. |
|
func Using(lexer Lexer) Emitter { |
|
return EmitterFunc(func(groups []string, _ Lexer) Iterator { |
|
it, err := lexer.Tokenise(&TokeniseOptions{State: "root", Nested: true}, groups[0]) |
|
if err != nil { |
|
panic(err) |
|
} |
|
return it |
|
}) |
|
} |
|
|
|
// UsingSelf is like Using, but uses the current Lexer. |
|
func UsingSelf(state string) Emitter { |
|
return EmitterFunc(func(groups []string, lexer Lexer) Iterator { |
|
it, err := lexer.Tokenise(&TokeniseOptions{State: state, Nested: true}, groups[0]) |
|
if err != nil { |
|
panic(err) |
|
} |
|
return it |
|
}) |
|
} |
|
|
|
// Words creates a regex that matches any of the given literal words. |
|
func Words(prefix, suffix string, words ...string) string { |
|
for i, word := range words { |
|
words[i] = regexp.QuoteMeta(word) |
|
} |
|
return prefix + `(` + strings.Join(words, `|`) + `)` + suffix |
|
} |
|
|
|
// Tokenise text using lexer, returning tokens as a slice. |
|
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) { |
|
var out []Token |
|
it, err := lexer.Tokenise(options, text) |
|
if err != nil { |
|
return nil, err |
|
} |
|
for t := it(); t != EOF; t = it() { |
|
out = append(out, t) |
|
} |
|
return out, nil |
|
} |
|
|
|
// Rules maps from state to a sequence of Rules. |
|
type Rules map[string][]Rule |
|
|
|
// Clone returns a clone of the Rules. |
|
func (r Rules) Clone() Rules { |
|
out := map[string][]Rule{} |
|
for key, rules := range r { |
|
out[key] = make([]Rule, len(rules)) |
|
copy(out[key], rules) |
|
} |
|
return out |
|
} |
|
|
|
// MustNewLexer creates a new Lexer or panics. |
|
func MustNewLexer(config *Config, rules Rules) *RegexLexer { |
|
lexer, err := NewLexer(config, rules) |
|
if err != nil { |
|
panic(err) |
|
} |
|
return lexer |
|
} |
|
|
|
// NewLexer creates a new regex-based Lexer. |
|
// |
|
// "rules" is a state machine transitition map. Each key is a state. Values are sets of rules |
|
// that match input, optionally modify lexer state, and output tokens. |
|
func NewLexer(config *Config, rules Rules) (*RegexLexer, error) { |
|
if config == nil { |
|
config = &Config{} |
|
} |
|
if _, ok := rules["root"]; !ok { |
|
return nil, fmt.Errorf("no \"root\" state") |
|
} |
|
compiledRules := map[string][]*CompiledRule{} |
|
for state, rules := range rules { |
|
compiledRules[state] = nil |
|
for _, rule := range rules { |
|
flags := "" |
|
if !config.NotMultiline { |
|
flags += "m" |
|
} |
|
if config.CaseInsensitive { |
|
flags += "i" |
|
} |
|
if config.DotAll { |
|
flags += "s" |
|
} |
|
compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags}) |
|
} |
|
} |
|
return &RegexLexer{ |
|
config: config, |
|
rules: compiledRules, |
|
}, nil |
|
} |
|
|
|
// Trace enables debug tracing. |
|
func (r *RegexLexer) Trace(trace bool) *RegexLexer { |
|
r.trace = trace |
|
return r |
|
} |
|
|
|
// A CompiledRule is a Rule with a pre-compiled regex. |
|
// |
|
// Note that regular expressions are lazily compiled on first use of the lexer. |
|
type CompiledRule struct { |
|
Rule |
|
Regexp *regexp2.Regexp |
|
flags string |
|
} |
|
|
|
// CompiledRules is a map of rule name to sequence of compiled rules in that rule. |
|
type CompiledRules map[string][]*CompiledRule |
|
|
|
// LexerState contains the state for a single lex. |
|
type LexerState struct { |
|
Lexer *RegexLexer |
|
Text []rune |
|
Pos int |
|
Rules CompiledRules |
|
Stack []string |
|
State string |
|
Rule int |
|
// Group matches. |
|
Groups []string |
|
// Custum context for mutators. |
|
MutatorContext map[interface{}]interface{} |
|
iteratorStack []Iterator |
|
options *TokeniseOptions |
|
} |
|
|
|
// Set mutator context. |
|
func (l *LexerState) Set(key interface{}, value interface{}) { |
|
l.MutatorContext[key] = value |
|
} |
|
|
|
// Get mutator context. |
|
func (l *LexerState) Get(key interface{}) interface{} { |
|
return l.MutatorContext[key] |
|
} |
|
|
|
// Iterator returns the next Token from the lexer. |
|
func (l *LexerState) Iterator() Token { // nolint: gocognit |
|
for l.Pos < len(l.Text) && len(l.Stack) > 0 { |
|
// Exhaust the iterator stack, if any. |
|
for len(l.iteratorStack) > 0 { |
|
n := len(l.iteratorStack) - 1 |
|
t := l.iteratorStack[n]() |
|
if t == EOF { |
|
l.iteratorStack = l.iteratorStack[:n] |
|
continue |
|
} |
|
return t |
|
} |
|
|
|
l.State = l.Stack[len(l.Stack)-1] |
|
if l.Lexer.trace { |
|
fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:])) |
|
} |
|
selectedRule, ok := l.Rules[l.State] |
|
if !ok { |
|
panic("unknown state " + l.State) |
|
} |
|
ruleIndex, rule, groups := matchRules(l.Text, l.Pos, selectedRule) |
|
// No match. |
|
if groups == nil { |
|
// From Pygments :\ |
|
// |
|
// If the RegexLexer encounters a newline that is flagged as an error token, the stack is |
|
// emptied and the lexer continues scanning in the 'root' state. This can help producing |
|
// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not |
|
// closed. |
|
if l.Text[l.Pos] == '\n' && l.State != l.options.State { |
|
l.Stack = []string{l.options.State} |
|
continue |
|
} |
|
l.Pos++ |
|
return Token{Error, string(l.Text[l.Pos-1 : l.Pos])} |
|
} |
|
l.Rule = ruleIndex |
|
l.Groups = groups |
|
l.Pos += utf8.RuneCountInString(groups[0]) |
|
if rule.Mutator != nil { |
|
if err := rule.Mutator.Mutate(l); err != nil { |
|
panic(err) |
|
} |
|
} |
|
if rule.Type != nil { |
|
l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l.Lexer)) |
|
} |
|
} |
|
// Exhaust the IteratorStack, if any. |
|
// Duplicate code, but eh. |
|
for len(l.iteratorStack) > 0 { |
|
n := len(l.iteratorStack) - 1 |
|
t := l.iteratorStack[n]() |
|
if t == EOF { |
|
l.iteratorStack = l.iteratorStack[:n] |
|
continue |
|
} |
|
return t |
|
} |
|
|
|
// If we get to here and we still have text, return it as an error. |
|
if l.Pos != len(l.Text) && len(l.Stack) == 0 { |
|
value := string(l.Text[l.Pos:]) |
|
l.Pos = len(l.Text) |
|
return Token{Type: Error, Value: value} |
|
} |
|
return EOF |
|
} |
|
|
|
// RegexLexer is the default lexer implementation used in Chroma. |
|
type RegexLexer struct { |
|
config *Config |
|
analyser func(text string) float32 |
|
trace bool |
|
|
|
mu sync.Mutex |
|
compiled bool |
|
rules map[string][]*CompiledRule |
|
} |
|
|
|
// SetAnalyser sets the analyser function used to perform content inspection. |
|
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) *RegexLexer { |
|
r.analyser = analyser |
|
return r |
|
} |
|
|
|
func (r *RegexLexer) AnalyseText(text string) float32 { // nolint |
|
if r.analyser != nil { |
|
return r.analyser(text) |
|
} |
|
return 0.0 |
|
} |
|
|
|
func (r *RegexLexer) Config() *Config { // nolint |
|
return r.config |
|
} |
|
|
|
// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs. |
|
func (r *RegexLexer) maybeCompile() (err error) { |
|
r.mu.Lock() |
|
defer r.mu.Unlock() |
|
if r.compiled { |
|
return nil |
|
} |
|
for state, rules := range r.rules { |
|
for i, rule := range rules { |
|
if rule.Regexp == nil { |
|
pattern := "(?:" + rule.Pattern + ")" |
|
if rule.flags != "" { |
|
pattern = "(?" + rule.flags + ")" + pattern |
|
} |
|
pattern = `\G` + pattern |
|
rule.Regexp, err = regexp2.Compile(pattern, 0) |
|
if err != nil { |
|
return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err) |
|
} |
|
} |
|
} |
|
} |
|
restart: |
|
seen := map[LexerMutator]bool{} |
|
for state := range r.rules { |
|
for i := 0; i < len(r.rules[state]); i++ { |
|
rule := r.rules[state][i] |
|
if compile, ok := rule.Mutator.(LexerMutator); ok { |
|
if seen[compile] { |
|
return fmt.Errorf("saw mutator %T twice; this should not happen", compile) |
|
} |
|
seen[compile] = true |
|
if err := compile.MutateLexer(r.rules, state, i); err != nil { |
|
return err |
|
} |
|
// Process the rules again in case the mutator added/removed rules. |
|
// |
|
// This sounds bad, but shouldn't be significant in practice. |
|
goto restart |
|
} |
|
} |
|
} |
|
r.compiled = true |
|
return nil |
|
} |
|
|
|
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint |
|
if err := r.maybeCompile(); err != nil { |
|
return nil, err |
|
} |
|
if options == nil { |
|
options = defaultOptions |
|
} |
|
if options.EnsureLF { |
|
text = ensureLF(text) |
|
} |
|
if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") { |
|
text += "\n" |
|
} |
|
state := &LexerState{ |
|
options: options, |
|
Lexer: r, |
|
Text: []rune(text), |
|
Stack: []string{options.State}, |
|
Rules: r.rules, |
|
MutatorContext: map[interface{}]interface{}{}, |
|
} |
|
return state.Iterator, nil |
|
} |
|
|
|
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string) { |
|
for i, rule := range rules { |
|
match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos) |
|
if match != nil && err == nil && match.Index == pos { |
|
groups := []string{} |
|
for _, g := range match.Groups() { |
|
groups = append(groups, g.String()) |
|
} |
|
return i, rule, groups |
|
} |
|
} |
|
return 0, &CompiledRule{}, nil |
|
} |
|
|
|
// replace \r and \r\n with \n |
|
// same as strings.ReplaceAll but more efficient |
|
func ensureLF(text string) string { |
|
buf := make([]byte, len(text)) |
|
var j int |
|
for i := 0; i < len(text); i++ { |
|
c := text[i] |
|
if c == '\r' { |
|
if i < len(text)-1 && text[i+1] == '\n' { |
|
continue |
|
} |
|
c = '\n' |
|
} |
|
buf[j] = c |
|
j++ |
|
} |
|
return string(buf[:j]) |
|
}
|
|
|