// Copyright 2021 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package charset
import (
"bytes"
"fmt"
"io"
"strings"
"unicode"
"unicode/utf8"
"golang.org/x/text/unicode/bidi"
)
// EscapeStatus summarises what the unicode escaper found in a piece of text.
// Every field is an independent flag; the zero value means nothing was found.
type EscapeStatus struct {
	// Escaped is set when at least one code point was wrapped in escape markup.
	Escaped bool
	// HasError is set when reading the input or writing the output failed.
	HasError bool
	// HasBadRunes is set when the input contained bytes that are not valid UTF-8.
	HasBadRunes bool
	// HasControls is set when a code point from Unicode category C was escaped.
	HasControls bool
	// HasSpaces is set when an unusual white space code point was escaped.
	HasSpaces bool
	// HasMarks is set when a code point from Unicode category M was escaped.
	HasMarks bool
	// HasBIDI is set when a bidirectional control character was escaped.
	HasBIDI bool
	// BadBIDI is set when a line has a bidirectional control character and
	// left-to-right text but no right-to-left text.
	BadBIDI bool
	// HasRTLScript is set when right-to-left text was seen.
	HasRTLScript bool
	// HasLTRScript is set when left-to-right text was seen.
	HasLTRScript bool
}

// Or returns the field-wise logical OR of status and other: a flag is set in
// the result when it is set in either operand. Neither operand is modified.
func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus {
	return EscapeStatus{
		Escaped:      status.Escaped || other.Escaped,
		HasError:     status.HasError || other.HasError,
		HasBadRunes:  status.HasBadRunes || other.HasBadRunes,
		HasControls:  status.HasControls || other.HasControls,
		HasSpaces:    status.HasSpaces || other.HasSpaces,
		HasMarks:     status.HasMarks || other.HasMarks,
		HasBIDI:      status.HasBIDI || other.HasBIDI,
		BadBIDI:      status.BadBIDI || other.BadBIDI,
		HasRTLScript: status.HasRTLScript || other.HasRTLScript,
		HasLTRScript: status.HasLTRScript || other.HasLTRScript,
	}
}
// EscapeControlString escapes the unicode control sequences in text. It
// returns the findings as an EscapeStatus together with the escaped string.
func EscapeControlString(text string) (EscapeStatus, string) {
	var out strings.Builder
	// The error is dropped on purpose: any failure is also recorded in the
	// returned status as HasError.
	status, _ := EscapeControlReader(strings.NewReader(text), &out)
	return status, out.String()
}
// EscapeControlBytes escapes the unicode control sequences in text. It returns
// the findings as an EscapeStatus together with the escaped bytes.
func EscapeControlBytes(text []byte) (EscapeStatus, []byte) {
	var out bytes.Buffer
	// The error is dropped on purpose: any failure is also recorded in the
	// returned status as HasError.
	status, _ := EscapeControlReader(bytes.NewReader(text), &out)
	return status, out.Bytes()
}
// EscapeControlReader copies text to output, wrapping every suspicious code
// point in HTML escape markup, and reports what it found.
//
// The following are escaped:
//   - bytes that are not valid UTF-8 (HasBadRunes),
//   - white space other than '\n', '\r', '\t' and ' ' (HasSpaces),
//   - Unicode bidirectional control characters (HasBIDI),
//   - other code points in category C, except the byte order mark (HasControls),
//   - code points in category M (HasMarks).
//
// Everything else is copied through unchanged. BadBIDI is set when a line
// contains a bidirectional control character and left-to-right text but no
// right-to-left text.
//
// Reaching io.EOF is the normal end of input and yields a nil error. Any other
// read error, and any write error, is returned and also sets HasError.
func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) {
	buf := make([]byte, 4096)
	readStart := 0 // length of the incomplete rune carried over to the front of buf
	writePos := 0  // offset in the current chunk of the first byte not yet written
	runeCount := 0 // number of runes processed so far, used to spot a leading BOM

	lineHasBIDI := false
	lineHasRTLScript := false
	lineHasLTRScript := false

	// flush writes the pending pass-through bytes chunk[writePos:upTo].
	flush := func(chunk []byte, upTo int) error {
		if writePos >= upTo {
			return nil
		}
		_, werr := output.Write(chunk[writePos:upTo])
		return werr
	}

	// The read error is kept apart from write errors so that a successful
	// write can never hide an error the reader returned together with data.
	var readErr error

readingloop:
	for readErr == nil {
		var n int
		n, readErr = text.Read(buf[readStart:])
		bs := buf[:readStart+n]
		i := 0

		for i < len(bs) {
			r, size := utf8.DecodeRune(bs[i:])
			// DecodeRune reports undecodable input as (RuneError, 1). A
			// correctly encoded U+FFFD has size 3 and is an ordinary character.
			invalid := r == utf8.RuneError && size == 1

			if invalid && !utf8.FullRune(bs[i:]) {
				// The chunk ends part-way through a multi-byte sequence: move the
				// partial rune to the front of buf and try to read the rest. If
				// the reader is already exhausted the loop ends and the leftover
				// bytes are reported as broken below.
				if err = flush(bs, i); err != nil {
					escaped.HasError = true
					return escaped, err
				}
				readStart = copy(buf, bs[i:])
				writePos = 0
				continue readingloop
			}

			runeCount++
			escape := false

			// Now handle the codepoints
			switch {
			case invalid:
				// this is a real broken rune
				if err = flush(bs, i); err != nil {
					escaped.HasError = true
					return escaped, err
				}
				escaped.HasBadRunes = true
				escaped.Escaped = true
				if err = writeBroken(output, bs[i:i+size]); err != nil {
					escaped.HasError = true
					return escaped, err
				}
				writePos = i + size
			case r == '\n':
				if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
					escaped.BadBIDI = true
				}
				lineHasBIDI = false
				lineHasRTLScript = false
				lineHasLTRScript = false
			case runeCount == 1 && r == 0xFEFF: // UTF BOM
				// the first BOM is safe
			case r == '\r' || r == '\t' || r == ' ':
				// These are acceptable control characters and space characters
			case unicode.IsSpace(r):
				escaped.HasSpaces = true
				escape = true
			case unicode.Is(unicode.Bidi_Control, r):
				escaped.HasBIDI = true
				lineHasBIDI = true
				escape = true
			case unicode.Is(unicode.C, r) && r != 0xFEFF: // the BOM is never escaped
				escaped.HasControls = true
				escape = true
			case unicode.Is(unicode.M, r):
				escaped.HasMarks = true
				escape = true
			default:
				p, _ := bidi.Lookup(bs[i : i+size])
				c := p.Class()
				if c == bidi.R || c == bidi.AL {
					lineHasRTLScript = true
					escaped.HasRTLScript = true
				} else if c == bidi.L {
					lineHasLTRScript = true
					escaped.HasLTRScript = true
				}
			}

			if escape {
				escaped.Escaped = true
				if err = flush(bs, i); err != nil {
					escaped.HasError = true
					return escaped, err
				}
				if err = writeEscaped(output, r); err != nil {
					escaped.HasError = true
					return escaped, err
				}
				writePos = i + size
			}
			i += size
		}

		if len(bs) > 0 {
			// we read something...
			// write everything unwritten
			if err = flush(bs, i); err != nil {
				escaped.HasError = true
				return escaped, err
			}
			// reset the starting positions for the next read
			readStart = 0
			writePos = 0
		}
	}

	if readStart > 0 {
		// The input stopped in the middle of a multi-byte sequence; the
		// leftover bytes can never become a valid rune.
		escaped.Escaped = true
		escaped.HasBadRunes = true
		if err = writeBroken(output, buf[:readStart]); err != nil {
			escaped.HasError = true
			return escaped, err
		}
	}

	if readErr == io.EOF {
		// The last line may not be terminated by a newline.
		if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
			escaped.BadBIDI = true
		}
		return escaped, nil
	}
	escaped.HasError = true
	return escaped, readErr
}
// writeBroken writes the bytes of an undecodable sequence to output as
// upper-case hexadecimal inside a "broken-code-point" span.
//
// The angle brackets around the hex digits are emitted as HTML entities.
// Written literally, a dump such as <FF> would be parsed by a browser as a
// start tag and not displayed. The spaces at either end of the markup are
// unchanged from the existing output.
//
// It returns the error from writing to output, if any.
func writeBroken(output io.Writer, bs []byte) (err error) {
	_, err = fmt.Fprintf(output, ` <span class="broken-code-point">&lt;%X&gt;</span> `, bs)
	return err
}
// writeEscaped writes r to output wrapped in "escaped-code-point" markup: the
// outer span carries the code point as [U+XXXX] in its data-escaped attribute
// and the inner "char" span holds the character itself.
//
// It returns the error from writing to output, if any.
func writeEscaped(output io.Writer, r rune) (err error) {
	const markup = ` <span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span> `
	_, err = fmt.Fprintf(output, markup, r, r)
	return err
}