Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
268 lines
6.0 KiB
268 lines
6.0 KiB
// Copyright (c) 2017 Couchbase, Inc. |
|
// |
|
// Licensed under the Apache License, Version 2.0 (the "License"); |
|
// you may not use this file except in compliance with the License. |
|
// You may obtain a copy of the License at |
|
// |
|
// http://www.apache.org/licenses/LICENSE-2.0 |
|
// |
|
// Unless required by applicable law or agreed to in writing, software |
|
// distributed under the License is distributed on an "AS IS" BASIS, |
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
// See the License for the specific language governing permissions and |
|
// limitations under the License. |
|
|
|
package utf8 |
|
|
|
import ( |
|
"fmt" |
|
"unicode/utf8" |
|
) |
|
|
|
// Sequences is a collection of Sequence values. NewSequences and
// NewSequencesPrealloc return it to describe the byte ranges covered by a
// span of runes.
type Sequences []Sequence
|
|
|
// NewSequences constructs a collection of Sequence which describe the |
|
// byte ranges covered between the start and end runes. |
|
func NewSequences(start, end rune) (Sequences, error) { |
|
rv, _, err := NewSequencesPrealloc(start, end, nil, nil, nil, nil) |
|
return rv, err |
|
} |
|
|
|
func NewSequencesPrealloc(start, end rune, |
|
preallocSequences Sequences, |
|
preallocRangeStack RangeStack, |
|
preallocStartBytes, preallocEndBytes []byte) (Sequences, RangeStack, error) { |
|
rv := preallocSequences[:0] |
|
|
|
startBytes := preallocStartBytes |
|
if cap(startBytes) < utf8.UTFMax { |
|
startBytes = make([]byte, utf8.UTFMax) |
|
} |
|
startBytes = startBytes[:utf8.UTFMax] |
|
|
|
endBytes := preallocEndBytes |
|
if cap(endBytes) < utf8.UTFMax { |
|
endBytes = make([]byte, utf8.UTFMax) |
|
} |
|
endBytes = endBytes[:utf8.UTFMax] |
|
|
|
rangeStack := preallocRangeStack[:0] |
|
rangeStack = rangeStack.Push(scalarRange{start, end}) |
|
|
|
rangeStack, r := rangeStack.Pop() |
|
TOP: |
|
for r != nilScalarRange { |
|
INNER: |
|
for { |
|
r1, r2 := r.split() |
|
if r1 != nilScalarRange { |
|
rangeStack = rangeStack.Push(scalarRange{r2.start, r2.end}) |
|
r.start = r1.start |
|
r.end = r1.end |
|
continue INNER |
|
} |
|
if !r.valid() { |
|
rangeStack, r = rangeStack.Pop() |
|
continue TOP |
|
} |
|
for i := 1; i < utf8.UTFMax; i++ { |
|
max := maxScalarValue(i) |
|
if r.start <= max && max < r.end { |
|
rangeStack = rangeStack.Push(scalarRange{max + 1, r.end}) |
|
r.end = max |
|
continue INNER |
|
} |
|
} |
|
asciiRange := r.ascii() |
|
if asciiRange != nilRange { |
|
rv = append(rv, Sequence{ |
|
asciiRange, |
|
}) |
|
rangeStack, r = rangeStack.Pop() |
|
continue TOP |
|
} |
|
for i := uint(1); i < utf8.UTFMax; i++ { |
|
m := rune((1 << (6 * i)) - 1) |
|
if (r.start & ^m) != (r.end & ^m) { |
|
if (r.start & m) != 0 { |
|
rangeStack = rangeStack.Push(scalarRange{(r.start | m) + 1, r.end}) |
|
r.end = r.start | m |
|
continue INNER |
|
} |
|
if (r.end & m) != m { |
|
rangeStack = rangeStack.Push(scalarRange{r.end & ^m, r.end}) |
|
r.end = (r.end & ^m) - 1 |
|
continue INNER |
|
} |
|
} |
|
} |
|
n, m := r.encode(startBytes, endBytes) |
|
seq, err := SequenceFromEncodedRange(startBytes[0:n], endBytes[0:m]) |
|
if err != nil { |
|
return nil, nil, err |
|
} |
|
rv = append(rv, seq) |
|
rangeStack, r = rangeStack.Pop() |
|
continue TOP |
|
} |
|
} |
|
|
|
return rv, rangeStack, nil |
|
} |
|
|
|
// Sequence is a collection of Range values, one per byte position: Matches
// checks the i-th input byte against the i-th Range.
type Sequence []Range
|
|
|
// SequenceFromEncodedRange creates sequence from the encoded bytes |
|
func SequenceFromEncodedRange(start, end []byte) (Sequence, error) { |
|
if len(start) != len(end) { |
|
return nil, fmt.Errorf("byte slices must be the same length") |
|
} |
|
switch len(start) { |
|
case 2: |
|
return Sequence{ |
|
Range{start[0], end[0]}, |
|
Range{start[1], end[1]}, |
|
}, nil |
|
case 3: |
|
return Sequence{ |
|
Range{start[0], end[0]}, |
|
Range{start[1], end[1]}, |
|
Range{start[2], end[2]}, |
|
}, nil |
|
case 4: |
|
return Sequence{ |
|
Range{start[0], end[0]}, |
|
Range{start[1], end[1]}, |
|
Range{start[2], end[2]}, |
|
Range{start[3], end[3]}, |
|
}, nil |
|
} |
|
|
|
return nil, fmt.Errorf("invalid encoded byte length") |
|
} |
|
|
|
// Matches checks to see if the provided byte slice matches the Sequence |
|
func (u Sequence) Matches(bytes []byte) bool { |
|
if len(bytes) < len(u) { |
|
return false |
|
} |
|
for i := 0; i < len(u); i++ { |
|
if !u[i].matches(bytes[i]) { |
|
return false |
|
} |
|
} |
|
return true |
|
} |
|
|
|
func (u Sequence) String() string { |
|
switch len(u) { |
|
case 1: |
|
return fmt.Sprintf("%v", u[0]) |
|
case 2: |
|
return fmt.Sprintf("%v%v", u[0], u[1]) |
|
case 3: |
|
return fmt.Sprintf("%v%v%v", u[0], u[1], u[2]) |
|
case 4: |
|
return fmt.Sprintf("%v%v%v%v", u[0], u[1], u[2], u[3]) |
|
default: |
|
return fmt.Sprintf("invalid utf8 sequence") |
|
} |
|
} |
|
|
|
// Range describes a single range of byte values. Start and End are both
// inclusive bounds (see matches).
type Range struct {
	Start, End byte
}
|
|
|
var nilRange = Range{0xff, 0} |
|
|
|
func (u Range) matches(b byte) bool { |
|
if u.Start <= b && b <= u.End { |
|
return true |
|
} |
|
return false |
|
} |
|
|
|
func (u Range) String() string { |
|
if u.Start == u.End { |
|
return fmt.Sprintf("[%X]", u.Start) |
|
} |
|
return fmt.Sprintf("[%X-%X]", u.Start, u.End) |
|
} |
|
|
|
// scalarRange is a span of runes from start to end; valid treats it as
// non-empty when start <= end.
type scalarRange struct {
	start, end rune
}
|
|
|
var nilScalarRange = scalarRange{0xffff, 0} |
|
|
|
func (s *scalarRange) String() string { |
|
return fmt.Sprintf("ScalarRange(%d,%d)", s.start, s.end) |
|
} |
|
|
|
// split this scalar range if it overlaps with a surrogate codepoint |
|
func (s *scalarRange) split() (scalarRange, scalarRange) { |
|
if s.start < 0xe000 && s.end > 0xd7ff { |
|
return scalarRange{ |
|
start: s.start, |
|
end: 0xd7ff, |
|
}, |
|
scalarRange{ |
|
start: 0xe000, |
|
end: s.end, |
|
} |
|
} |
|
return nilScalarRange, nilScalarRange |
|
} |
|
|
|
func (s *scalarRange) valid() bool { |
|
return s.start <= s.end |
|
} |
|
|
|
func (s *scalarRange) ascii() Range { |
|
if s.valid() && s.end <= 0x7f { |
|
return Range{ |
|
Start: byte(s.start), |
|
End: byte(s.end), |
|
} |
|
} |
|
return nilRange |
|
} |
|
|
|
// start and end MUST have capacity for utf8.UTFMax bytes |
|
func (s *scalarRange) encode(start, end []byte) (int, int) { |
|
n := utf8.EncodeRune(start, s.start) |
|
m := utf8.EncodeRune(end, s.end) |
|
return n, m |
|
} |
|
|
|
// RangeStack is a last-in, first-out stack of scalarRange values, operated
// through Push and Pop. NewSequencesPrealloc uses it as its work list.
type RangeStack []scalarRange
|
|
|
func (s RangeStack) Push(v scalarRange) RangeStack { |
|
return append(s, v) |
|
} |
|
|
|
func (s RangeStack) Pop() (RangeStack, scalarRange) { |
|
l := len(s) |
|
if l < 1 { |
|
return s, nilScalarRange |
|
} |
|
return s[:l-1], s[l-1] |
|
} |
|
|
|
// maxScalarValue returns the largest scalar value for the given encoded
// length: 0x7f for 1 byte, 0x7ff for 2 and 0xffff for 3. Every other nbytes
// value yields 0x10ffff.
func maxScalarValue(nbytes int) rune {
	limits := [...]rune{0x007f, 0x07FF, 0xFFFF}
	if nbytes >= 1 && nbytes <= len(limits) {
		return limits[nbytes-1]
	}
	return 0x10FFFF
}
|
|
|