Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
120 lines
2.2 KiB
120 lines
2.2 KiB
// Copyright 2015 Huan Du. All rights reserved. |
|
// Licensed under the MIT license that can be found in the LICENSE file. |
|
|
|
package xstrings |
|
|
|
import ( |
|
"unicode" |
|
"unicode/utf8" |
|
) |
|
|
|
// Len reports how many runes str contains when it is decoded as UTF-8.
// The result is a rune count, not a byte count.
func Len(str string) int {
	runes := utf8.RuneCountInString(str)
	return runes
}
|
|
|
// WordCount returns number of words in a string. |
|
// |
|
// Word is defined as a locale dependent string containing alphabetic characters, |
|
// which may also contain but not start with `'` and `-` characters. |
|
func WordCount(str string) int { |
|
var r rune |
|
var size, n int |
|
|
|
inWord := false |
|
|
|
for len(str) > 0 { |
|
r, size = utf8.DecodeRuneInString(str) |
|
|
|
switch { |
|
case isAlphabet(r): |
|
if !inWord { |
|
inWord = true |
|
n++ |
|
} |
|
|
|
case inWord && (r == '\'' || r == '-'): |
|
// Still in word. |
|
|
|
default: |
|
inWord = false |
|
} |
|
|
|
str = str[size:] |
|
} |
|
|
|
return n |
|
} |
|
|
|
// minCJKCharacter is the lowest code point that isAlphabet considers a
// possible CJK ideograph. Letters below it are accepted without consulting
// the Han script table.
const minCJKCharacter = '\u3400'

// isAlphabet reports whether r is a letter that is not a CJK ideograph.
//
// Non-letters always yield false. Letters below minCJKCharacter always yield
// true. At or above minCJKCharacter a letter yields true unless the unicode
// package classifies it as Han script.
//
// The Han table replaces a hand-maintained list of ranges that ended the
// rare-ideograph range at U+4D85 instead of the end of CJK Extension A, and
// so reported ideographs such as U+4DA0 as alphabetic.
func isAlphabet(r rune) bool {
	if !unicode.IsLetter(r) {
		return false
	}

	// Quick check for non-CJK character.
	if r < minCJKCharacter {
		return true
	}

	return !unicode.Is(unicode.Han, r)
}
|
|
|
// Width returns string width in monotype font. |
|
// Multi-byte characters are usually twice the width of single byte characters. |
|
// |
|
// Algorithm comes from `mb_strwidth` in PHP. |
|
// http://php.net/manual/en/function.mb-strwidth.php |
|
func Width(str string) int { |
|
var r rune |
|
var size, n int |
|
|
|
for len(str) > 0 { |
|
r, size = utf8.DecodeRuneInString(str) |
|
n += RuneWidth(r) |
|
str = str[size:] |
|
} |
|
|
|
return n |
|
} |
|
|
|
// RuneWidth returns the display width of r in a monotype font: 0, 1 or 2.
//
//	r below U+0020, or utf8.RuneError   0
//	U+0020 .. U+1FFF                    1
//	U+2000 .. U+FF60                    2
//	U+FF61 .. U+FF9F                    1
//	U+FFA0 and above                    2
//
// The algorithm follows PHP's `mb_strwidth`.
// http://php.net/manual/en/function.mb-strwidth.php
func RuneWidth(r rune) int {
	// utf8.RuneError must be tested first: as a code point it would
	// otherwise land in the final, double-width range.
	if r == utf8.RuneError || r < '\x20' {
		return 0
	}
	if r < '\u2000' {
		return 1
	}
	if r < '\uFF61' {
		return 2
	}
	if r < '\uFFA0' {
		return 1
	}
	return 2
}
|
|
|