Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
125 lines
2.6 KiB
125 lines
2.6 KiB
// Copyright (c) 2018 Couchbase, Inc. |
|
// |
|
// Licensed under the Apache License, Version 2.0 (the "License"); |
|
// you may not use this file except in compliance with the License. |
|
// You may obtain a copy of the License at |
|
// |
|
// http://www.apache.org/licenses/LICENSE-2.0 |
|
// |
|
// Unless required by applicable law or agreed to in writing, software |
|
// distributed under the License is distributed on an "AS IS" BASIS, |
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
// See the License for the specific language governing permissions and |
|
// limitations under the License. |
|
|
|
package levenshtein2 |
|
|
|
import ( |
|
"fmt" |
|
"sort" |
|
"unicode/utf8" |
|
) |
|
|
|
type FullCharacteristicVector []uint32 |
|
|
|
func (fcv FullCharacteristicVector) shiftAndMask(offset, mask uint32) uint32 { |
|
bucketID := offset / 32 |
|
align := offset - bucketID*32 |
|
if align == 0 { |
|
return fcv[bucketID] & mask |
|
} |
|
left := fcv[bucketID] >> align |
|
right := fcv[bucketID+1] << (32 - align) |
|
return (left | right) & mask |
|
} |
|
|
|
type tuple struct { |
|
char rune |
|
fcv FullCharacteristicVector |
|
} |
|
|
|
type sortRunes []rune |
|
|
|
func (s sortRunes) Less(i, j int) bool { |
|
return s[i] < s[j] |
|
} |
|
|
|
func (s sortRunes) Swap(i, j int) { |
|
s[i], s[j] = s[j], s[i] |
|
} |
|
|
|
func (s sortRunes) Len() int { |
|
return len(s) |
|
} |
|
|
|
func sortRune(r []rune) []rune { |
|
sort.Sort(sortRunes(r)) |
|
return r |
|
} |
|
|
|
type Alphabet struct { |
|
charset []tuple |
|
index uint32 |
|
} |
|
|
|
func (a *Alphabet) resetNext() { |
|
a.index = 0 |
|
} |
|
|
|
func (a *Alphabet) next() (rune, FullCharacteristicVector, error) { |
|
if int(a.index) >= len(a.charset) { |
|
return 0, nil, fmt.Errorf("eof") |
|
} |
|
|
|
rv := a.charset[a.index] |
|
a.index++ |
|
return rv.char, rv.fcv, nil |
|
} |
|
|
|
func dedupe(in string) string { |
|
lookUp := make(map[rune]struct{}, len(in)) |
|
var rv string |
|
for len(in) > 0 { |
|
r, size := utf8.DecodeRuneInString(in) |
|
in = in[size:] |
|
if _, ok := lookUp[r]; !ok { |
|
rv += string(r) |
|
lookUp[r] = struct{}{} |
|
} |
|
} |
|
return rv |
|
} |
|
|
|
func queryChars(qChars string) Alphabet { |
|
chars := dedupe(qChars) |
|
inChars := sortRune([]rune(chars)) |
|
charsets := make([]tuple, 0, len(inChars)) |
|
|
|
for _, c := range inChars { |
|
tempChars := qChars |
|
var bits []uint32 |
|
for len(tempChars) > 0 { |
|
var chunk string |
|
if len(tempChars) > 32 { |
|
chunk = tempChars[0:32] |
|
tempChars = tempChars[32:] |
|
} else { |
|
chunk = tempChars |
|
tempChars = tempChars[:0] |
|
} |
|
|
|
chunkBits := uint32(0) |
|
bit := uint32(1) |
|
for _, chr := range chunk { |
|
if chr == c { |
|
chunkBits |= bit |
|
} |
|
bit <<= 1 |
|
} |
|
bits = append(bits, chunkBits) |
|
} |
|
bits = append(bits, 0) |
|
charsets = append(charsets, tuple{char: c, fcv: FullCharacteristicVector(bits)}) |
|
} |
|
return Alphabet{charset: charsets} |
|
}
|
|
|