Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
136 lines
3.6 KiB
136 lines
3.6 KiB
// Package chardet ports character set detection from ICU.
|
package chardet |
|
|
|
import ( |
|
"errors" |
|
"sort" |
|
) |
|
|
|
// Result contains all the information that the charset detector gives for
// one candidate charset.
type Result struct {
	// Charset is the IANA name of the detected charset.
	Charset string
	// Language is the IANA name of the detected language. It may be empty
	// for some charsets.
	Language string
	// Confidence is the confidence of the Result, on a scale from 1 to 100.
	// The bigger, the more confident.
	Confidence int
}
|
|
|
// Detector implements charset detection. |
|
type Detector struct { |
|
recognizers []recognizer |
|
stripTag bool |
|
} |
|
|
|
// List of charset recognizers |
|
var recognizers = []recognizer{ |
|
newRecognizer_utf8(), |
|
newRecognizer_utf16be(), |
|
newRecognizer_utf16le(), |
|
newRecognizer_utf32be(), |
|
newRecognizer_utf32le(), |
|
newRecognizer_8859_1_en(), |
|
newRecognizer_8859_1_da(), |
|
newRecognizer_8859_1_de(), |
|
newRecognizer_8859_1_es(), |
|
newRecognizer_8859_1_fr(), |
|
newRecognizer_8859_1_it(), |
|
newRecognizer_8859_1_nl(), |
|
newRecognizer_8859_1_no(), |
|
newRecognizer_8859_1_pt(), |
|
newRecognizer_8859_1_sv(), |
|
newRecognizer_8859_2_cs(), |
|
newRecognizer_8859_2_hu(), |
|
newRecognizer_8859_2_pl(), |
|
newRecognizer_8859_2_ro(), |
|
newRecognizer_8859_5_ru(), |
|
newRecognizer_8859_6_ar(), |
|
newRecognizer_8859_7_el(), |
|
newRecognizer_8859_8_I_he(), |
|
newRecognizer_8859_8_he(), |
|
newRecognizer_windows_1251(), |
|
newRecognizer_windows_1256(), |
|
newRecognizer_KOI8_R(), |
|
newRecognizer_8859_9_tr(), |
|
|
|
newRecognizer_sjis(), |
|
newRecognizer_gb_18030(), |
|
newRecognizer_euc_jp(), |
|
newRecognizer_euc_kr(), |
|
newRecognizer_big5(), |
|
|
|
newRecognizer_2022JP(), |
|
newRecognizer_2022KR(), |
|
newRecognizer_2022CN(), |
|
|
|
newRecognizer_IBM424_he_rtl(), |
|
newRecognizer_IBM424_he_ltr(), |
|
newRecognizer_IBM420_ar_rtl(), |
|
newRecognizer_IBM420_ar_ltr(), |
|
} |
|
|
|
// NewTextDetector creates a Detector for plain text. |
|
func NewTextDetector() *Detector { |
|
return &Detector{recognizers, false} |
|
} |
|
|
|
// NewHtmlDetector creates a Detector for Html. |
|
func NewHtmlDetector() *Detector { |
|
return &Detector{recognizers, true} |
|
} |
|
|
|
// NotDetectedError is returned by DetectAll and DetectBest when no recognizer
// reports a positive confidence for the input.
//
// The identifier and message predate the usual Go conventions (ErrXxx naming,
// lowercase message without trailing punctuation); both are kept unchanged
// because callers may compare against either.
var NotDetectedError = errors.New("Charset not detected.")
|
|
|
// DetectBest returns the Result with highest Confidence. |
|
func (d *Detector) DetectBest(b []byte) (r *Result, err error) { |
|
var all []Result |
|
if all, err = d.DetectAll(b); err == nil { |
|
r = &all[0] |
|
} |
|
return |
|
} |
|
|
|
// DetectAll returns all Results which have non-zero Confidence. The Results are sorted by Confidence in descending order. |
|
func (d *Detector) DetectAll(b []byte) ([]Result, error) { |
|
input := newRecognizerInput(b, d.stripTag) |
|
outputChan := make(chan recognizerOutput) |
|
for _, r := range d.recognizers { |
|
go matchHelper(r, input, outputChan) |
|
} |
|
outputs := make([]recognizerOutput, 0, len(d.recognizers)) |
|
for i := 0; i < len(d.recognizers); i++ { |
|
o := <-outputChan |
|
if o.Confidence > 0 { |
|
outputs = append(outputs, o) |
|
} |
|
} |
|
if len(outputs) == 0 { |
|
return nil, NotDetectedError |
|
} |
|
|
|
sort.Sort(recognizerOutputs(outputs)) |
|
dedupOutputs := make([]Result, 0, len(outputs)) |
|
foundCharsets := make(map[string]struct{}, len(outputs)) |
|
for _, o := range outputs { |
|
if _, found := foundCharsets[o.Charset]; !found { |
|
dedupOutputs = append(dedupOutputs, Result(o)) |
|
foundCharsets[o.Charset] = struct{}{} |
|
} |
|
} |
|
if len(dedupOutputs) == 0 { |
|
return nil, NotDetectedError |
|
} |
|
return dedupOutputs, nil |
|
} |
|
|
|
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) { |
|
outputChan <- r.Match(input) |
|
} |
|
|
|
type recognizerOutputs []recognizerOutput |
|
|
|
func (r recognizerOutputs) Len() int { return len(r) } |
|
func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence } |
|
func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
|
|
|