Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
107 lines
2.7 KiB
107 lines
2.7 KiB
package enry |
|
|
|
import ( |
|
"math" |
|
"sort" |
|
|
|
"github.com/src-d/enry/v2/internal/tokenizer" |
|
) |
|
|
|
// Classifier detects which languages a piece of content may be written in.
type Classifier interface {
	// Classify returns the names of the possible languages for content.
	//
	// candidates maps language names to weights and can be used to assign
	// weights to languages dynamically.
	Classify(content []byte, candidates map[string]float64) (languages []string)
}
|
|
|
// classifier ranks languages by summing precomputed log-probabilities: one
// base value per language plus one value per token of the content.
type classifier struct {
	// languagesLogProbabilities holds the base score of each language, keyed
	// by language name. Its keys are also the languages considered when no
	// candidates are supplied.
	languagesLogProbabilities map[string]float64

	// tokensLogProbabilities holds, per language, the score contributed by
	// each token that has an entry for that language.
	tokensLogProbabilities map[string]map[string]float64

	// tokensTotal is the denominator of the fallback score
	// log(1/tokensTotal) used for a token with no entry for a language.
	tokensTotal float64
}
|
|
|
// scoredLanguage pairs a language name with the score computed for it while
// classifying a piece of content.
type scoredLanguage struct {
	// language is the name of the language being ranked.
	language string

	// score is the value the language is ranked by; higher scores sort first.
	score float64
}
|
|
|
// Classify returns a sorted slice of possible languages sorted by decreasing language's probability |
|
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string { |
|
|
|
var languages map[string]float64 |
|
if len(candidates) == 0 { |
|
languages = c.knownLangs() |
|
} else { |
|
languages = make(map[string]float64, len(candidates)) |
|
for candidate, weight := range candidates { |
|
if lang, ok := GetLanguageByAlias(candidate); ok { |
|
candidate = lang |
|
} |
|
|
|
languages[candidate] = weight |
|
} |
|
} |
|
|
|
empty := len(content) == 0 |
|
scoredLangs := make([]*scoredLanguage, 0, len(languages)) |
|
|
|
var tokens []string |
|
if !empty { |
|
tokens = tokenizer.Tokenize(content) |
|
} |
|
|
|
for language := range languages { |
|
score := c.languagesLogProbabilities[language] |
|
if !empty { |
|
score += c.tokensLogProbability(tokens, language) |
|
} |
|
scoredLangs = append(scoredLangs, &scoredLanguage{ |
|
language: language, |
|
score: score, |
|
}) |
|
} |
|
|
|
return sortLanguagesByScore(scoredLangs) |
|
} |
|
|
|
func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string { |
|
sort.Stable(byScore(scoredLangs)) |
|
sortedLanguages := make([]string, 0, len(scoredLangs)) |
|
for _, scoredLang := range scoredLangs { |
|
sortedLanguages = append(sortedLanguages, scoredLang.language) |
|
} |
|
|
|
return sortedLanguages |
|
} |
|
|
|
func (c *classifier) knownLangs() map[string]float64 { |
|
langs := make(map[string]float64, len(c.languagesLogProbabilities)) |
|
for lang := range c.languagesLogProbabilities { |
|
langs[lang]++ |
|
} |
|
|
|
return langs |
|
} |
|
|
|
func (c *classifier) tokensLogProbability(tokens []string, language string) float64 { |
|
var sum float64 |
|
for _, token := range tokens { |
|
sum += c.tokenProbability(token, language) |
|
} |
|
|
|
return sum |
|
} |
|
|
|
func (c *classifier) tokenProbability(token, language string) float64 { |
|
tokenProb, ok := c.tokensLogProbabilities[language][token] |
|
if !ok { |
|
tokenProb = math.Log(1.000000 / c.tokensTotal) |
|
} |
|
|
|
return tokenProb |
|
} |
|
|
|
type byScore []*scoredLanguage |
|
|
|
func (b byScore) Len() int { return len(b) } |
|
func (b byScore) Swap(i, j int) { b[i], b[j] = b[j], b[i] } |
|
func (b byScore) Less(i, j int) bool { return b[j].score < b[i].score }
|
|
|