Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
250 lines
6.4 KiB
250 lines
6.4 KiB
// Copyright (c) 2018 Couchbase, Inc. |
|
// |
|
// Licensed under the Apache License, Version 2.0 (the "License"); |
|
// you may not use this file except in compliance with the License. |
|
// You may obtain a copy of the License at |
|
// |
|
// http://www.apache.org/licenses/LICENSE-2.0 |
|
// |
|
// Unless required by applicable law or agreed to in writing, software |
|
// distributed under the License is distributed on an "AS IS" BASIS, |
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
// See the License for the specific language governing permissions and |
|
// limitations under the License. |
|
|
|
package levenshtein2 |
|
|
|
import ( |
|
"fmt" |
|
"math" |
|
) |
|
|
|
// SinkState is state ID 0. CanMatch reports false for it, so it is
// presumably the dead state from which no match can be reached.
const SinkState uint32 = 0
|
|
|
type DFA struct { |
|
transitions [][256]uint32 |
|
distances []Distance |
|
initState int |
|
ed uint8 |
|
} |
|
|
|
/// Returns the initial state |
|
func (d *DFA) initialState() int { |
|
return d.initState |
|
} |
|
|
|
/// Returns the Levenshtein distance associated to the |
|
/// current state. |
|
func (d *DFA) distance(stateId int) Distance { |
|
return d.distances[stateId] |
|
} |
|
|
|
/// Returns the number of states in the `DFA`. |
|
func (d *DFA) numStates() int { |
|
return len(d.transitions) |
|
} |
|
|
|
/// Returns the destination state reached after consuming a given byte. |
|
func (d *DFA) transition(fromState int, b uint8) int { |
|
return int(d.transitions[fromState][b]) |
|
} |
|
|
|
func (d *DFA) eval(bytes []uint8) Distance { |
|
state := d.initialState() |
|
|
|
for _, b := range bytes { |
|
state = d.transition(state, b) |
|
} |
|
|
|
return d.distance(state) |
|
} |
|
|
|
func (d *DFA) Start() int { |
|
return int(d.initialState()) |
|
} |
|
|
|
func (d *DFA) IsMatch(state int) bool { |
|
if _, ok := d.distance(state).(Exact); ok { |
|
return true |
|
} |
|
return false |
|
} |
|
|
|
func (d *DFA) CanMatch(state int) bool { |
|
return state > 0 && state < d.numStates() |
|
} |
|
|
|
func (d *DFA) Accept(state int, b byte) int { |
|
return int(d.transition(state, b)) |
|
} |
|
|
|
// WillAlwaysMatch returns if the specified state will always end in a |
|
// matching state. |
|
func (d *DFA) WillAlwaysMatch(state int) bool { |
|
return false |
|
} |
|
|
|
// fill overwrites every element of dest with val. A nil or empty slice
// is left untouched.
func fill(dest []uint32, val uint32) {
	for i := 0; i < len(dest); i++ {
		dest[i] = val
	}
}
|
|
|
// fillTransitions sets all 256 entries of the transition row dest to val,
// so that every input byte leads to the same destination state.
func fillTransitions(dest *[256]uint32, val uint32) {
	for b := 0; b < len(dest); b++ {
		dest[b] = val
	}
}
|
|
|
type Utf8DFAStateBuilder struct { |
|
dfaBuilder *Utf8DFABuilder |
|
stateID uint32 |
|
defaultSuccessor []uint32 |
|
} |
|
|
|
func (sb *Utf8DFAStateBuilder) addTransitionID(fromStateID uint32, b uint8, |
|
toStateID uint32) { |
|
sb.dfaBuilder.transitions[fromStateID][b] = toStateID |
|
} |
|
|
|
func (sb *Utf8DFAStateBuilder) addTransition(in rune, toStateID uint32) { |
|
fromStateID := sb.stateID |
|
chars := []byte(string(in)) |
|
lastByte := chars[len(chars)-1] |
|
|
|
for i, ch := range chars[:len(chars)-1] { |
|
remNumBytes := len(chars) - i - 1 |
|
defaultSuccessor := sb.defaultSuccessor[remNumBytes] |
|
intermediateStateID := sb.dfaBuilder.transitions[fromStateID][ch] |
|
|
|
if intermediateStateID == defaultSuccessor { |
|
intermediateStateID = sb.dfaBuilder.allocate() |
|
fillTransitions(&sb.dfaBuilder.transitions[intermediateStateID], |
|
sb.defaultSuccessor[remNumBytes-1]) |
|
} |
|
|
|
sb.addTransitionID(fromStateID, ch, intermediateStateID) |
|
fromStateID = intermediateStateID |
|
} |
|
|
|
toStateIDDecoded := sb.dfaBuilder.getOrAllocate(original(toStateID)) |
|
sb.addTransitionID(fromStateID, lastByte, toStateIDDecoded) |
|
} |
|
|
|
// Utf8StateId identifies a state of the byte-level automaton before it is
// mapped to a dense DFA state ID. Each character-level state ID owns four
// consecutive values: stateId*4 for the state itself and stateId*4+k for
// its k-step predecessor.
type Utf8StateId uint32

// original returns the Utf8StateId of the character-level state itself,
// i.e. its zero-step predecessor (stateId*4).
func original(stateId uint32) Utf8StateId {
	return Utf8StateId(stateId * 4)
}

// predecessor returns the Utf8StateId for the state that lies numSteps
// steps before the character-level state stateId (stateId*4 + numSteps).
func predecessor(stateId uint32, numSteps uint8) Utf8StateId {
	base := stateId * 4
	return Utf8StateId(base + uint32(numSteps))
}
|
|
|
// Utf8DFABuilder makes it possible to define a DFA |
|
// that takes unicode character, and build a `DFA` |
|
// that operates on utf-8 encoded |
|
type Utf8DFABuilder struct { |
|
index []uint32 |
|
distances []Distance |
|
transitions [][256]uint32 |
|
initialState uint32 |
|
numStates uint32 |
|
maxNumStates uint32 |
|
} |
|
|
|
func withMaxStates(maxStates uint32) *Utf8DFABuilder { |
|
rv := &Utf8DFABuilder{ |
|
index: make([]uint32, maxStates*2+100), |
|
distances: make([]Distance, 0, maxStates), |
|
transitions: make([][256]uint32, 0, maxStates), |
|
maxNumStates: maxStates, |
|
} |
|
|
|
for i := range rv.index { |
|
rv.index[i] = math.MaxUint32 |
|
} |
|
|
|
return rv |
|
} |
|
|
|
func (dfab *Utf8DFABuilder) allocate() uint32 { |
|
newState := dfab.numStates |
|
dfab.numStates++ |
|
|
|
dfab.distances = append(dfab.distances, Atleast{d: 255}) |
|
dfab.transitions = append(dfab.transitions, [256]uint32{}) |
|
|
|
return newState |
|
} |
|
|
|
func (dfab *Utf8DFABuilder) getOrAllocate(state Utf8StateId) uint32 { |
|
if int(state) >= cap(dfab.index) { |
|
cloneIndex := make([]uint32, int(state)*2) |
|
copy(cloneIndex, dfab.index) |
|
dfab.index = cloneIndex |
|
} |
|
if dfab.index[state] != math.MaxUint32 { |
|
return dfab.index[state] |
|
} |
|
|
|
nstate := dfab.allocate() |
|
dfab.index[state] = nstate |
|
|
|
return nstate |
|
} |
|
|
|
func (dfab *Utf8DFABuilder) setInitialState(iState uint32) { |
|
decodedID := dfab.getOrAllocate(original(iState)) |
|
dfab.initialState = decodedID |
|
} |
|
|
|
func (dfab *Utf8DFABuilder) build(ed uint8) *DFA { |
|
return &DFA{ |
|
transitions: dfab.transitions, |
|
distances: dfab.distances, |
|
initState: int(dfab.initialState), |
|
ed: ed, |
|
} |
|
} |
|
|
|
func (dfab *Utf8DFABuilder) addState(state, default_suc_orig uint32, |
|
distance Distance) (*Utf8DFAStateBuilder, error) { |
|
if state > dfab.maxNumStates { |
|
return nil, fmt.Errorf("State id is larger than maxNumStates") |
|
} |
|
|
|
stateID := dfab.getOrAllocate(original(state)) |
|
dfab.distances[stateID] = distance |
|
|
|
defaultSuccID := dfab.getOrAllocate(original(default_suc_orig)) |
|
// creates a chain of states of predecessors of `default_suc_orig`. |
|
// Accepting k-bytes (whatever the bytes are) from `predecessor_states[k-1]` |
|
// leads to the `default_suc_orig` state. |
|
predecessorStates := []uint32{defaultSuccID, |
|
defaultSuccID, |
|
defaultSuccID, |
|
defaultSuccID} |
|
|
|
for numBytes := uint8(1); numBytes < 4; numBytes++ { |
|
predecessorState := predecessor(default_suc_orig, numBytes) |
|
predecessorStateID := dfab.getOrAllocate(predecessorState) |
|
predecessorStates[numBytes] = predecessorStateID |
|
succ := predecessorStates[numBytes-1] |
|
fillTransitions(&dfab.transitions[predecessorStateID], succ) |
|
} |
|
|
|
// 1-byte encoded chars. |
|
fill(dfab.transitions[stateID][0:192], predecessorStates[0]) |
|
// 2-bytes encoded chars. |
|
fill(dfab.transitions[stateID][192:224], predecessorStates[1]) |
|
// 3-bytes encoded chars. |
|
fill(dfab.transitions[stateID][224:240], predecessorStates[2]) |
|
// 4-bytes encoded chars. |
|
fill(dfab.transitions[stateID][240:256], predecessorStates[3]) |
|
|
|
return &Utf8DFAStateBuilder{ |
|
dfaBuilder: dfab, |
|
stateID: stateID, |
|
defaultSuccessor: predecessorStates}, nil |
|
}
|
|
|