fckeuspy-go/vendor/github.com/makiuchi-d/gozxing/common/string_utils.go
2025-09-28 21:03:39 +02:00

209 lines
5.8 KiB
Go

package common
import (
"fmt"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/ianaindex"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/unicode"
"github.com/makiuchi-d/gozxing"
)
// Compile-time settings and legacy charset name strings for the charset guesser.
const (
// StringUtils_ASSUME_SHIFT_JIS is fixed to false.
// NOTE(review): nothing in the visible part of this file reads this constant;
// presumably it is meant to bias the guess toward Shift_JIS -- confirm before
// relying on it.
StringUtils_ASSUME_SHIFT_JIS = false
// Retained for ABI compatibility with earlier versions
// StringUtils_SHIFT_JIS is the charset name string "SJIS", the same text
// StringUtils_guessEncoding returns when the guess is Shift_JIS.
StringUtils_SHIFT_JIS = "SJIS"
// StringUtils_GB2312 is the charset name string "GB2312".
StringUtils_GB2312 = "GB2312"
)
// Encodings shared by the charset guesser. These are package-level variables,
// not constants, so their values can be reassigned at run time.
var (
// StringUtils_PLATFORM_DEFAULT_ENCODING is what StringUtils_guessCharset
// returns when no candidate encoding fits the input; it is set to UTF-8.
StringUtils_PLATFORM_DEFAULT_ENCODING = unicode.UTF8
// StringUtils_SHIFT_JIS_CHARSET is the encoding StringUtils_guessCharset
// returns when it decides the input is Shift_JIS.
StringUtils_SHIFT_JIS_CHARSET = japanese.ShiftJIS // "SJIS"
// StringUtils_GB2312_CHARSET is bound to the GB18030 codec, not to a
// GB2312-only one, despite its name.
StringUtils_GB2312_CHARSET = simplifiedchinese.GB18030 // "GB2312"
// StringUtils_EUC_JP is the EUC-JP encoding.
StringUtils_EUC_JP = japanese.EUCJP // "EUC_JP"
)
// StringUtils_guessEncoding reports the name of the charset that
// StringUtils_guessCharset selects for bytes under the given hints.
//
// Shift_JIS, UTF-8 and ISO-8859-1 are reported with the fixed names "SJIS",
// "UTF8" and "ISO8859_1". Every other encoding is named through the IANA
// index. An error from the charset guess or from the IANA name lookup is
// returned unchanged.
func StringUtils_guessEncoding(bytes []byte, hints map[gozxing.DecodeHintType]interface{}) (string, error) {
	charset, err := StringUtils_guessCharset(bytes, hints)
	if err != nil {
		return "", err
	}

	// The cases are tried top to bottom, exactly like an if/else-if chain.
	switch charset {
	case StringUtils_SHIFT_JIS_CHARSET:
		return "SJIS", nil
	case unicode.UTF8:
		return "UTF8", nil
	case charmap.ISO8859_1:
		return "ISO8859_1", nil
	}
	return ianaindex.IANA.Name(charset)
}
// StringUtils_guessCharset picks the character encoding most likely used by
// bytes.
//
// Resolution order:
//  1. If hints contains DecodeHintType_CHARACTER_SET, the hint alone decides.
//     An encoding.Encoding value is returned directly; any other value is
//     formatted with %v and looked up first with GetCharacterSetECIByName and
//     then in the IANA index.
//  2. Input longer than two bytes that starts with a UTF-16 byte-order mark is
//     reported as UTF-16 of the matching endianness.
//  3. Otherwise one pass over the input tracks whether it is still consistent
//     with UTF-8, ISO-8859-1 and Shift_JIS, and the heuristics after the loop
//     choose among the survivors.
//  4. If no candidate survives, StringUtils_PLATFORM_DEFAULT_ENCODING is
//     returned.
//
// Only the IANA lookup in step 1 can produce a non-nil error; every other
// path returns a nil error.
func StringUtils_guessCharset(bytes []byte, hints map[gozxing.DecodeHintType]interface{}) (encoding.Encoding, error) {
	if hint, ok := hints[gozxing.DecodeHintType_CHARACTER_SET]; ok {
		if charset, ok := hint.(encoding.Encoding); ok {
			return charset, nil
		}
		name := fmt.Sprintf("%v", hint)
		if eci, ok := GetCharacterSetECIByName(name); ok {
			return eci.GetCharset(), nil
		}
		return ianaindex.IANA.Encoding(name)
	}

	// First try UTF-16, assuming anything with its BOM is UTF-16.
	// The check only fires when at least one byte follows the two BOM bytes.
	if len(bytes) > 2 {
		if bytes[0] == 0xfe && bytes[1] == 0xff {
			return unicode.UTF16(unicode.BigEndian, unicode.UseBOM), nil
		}
		if bytes[0] == 0xff && bytes[1] == 0xfe {
			return unicode.UTF16(unicode.LittleEndian, unicode.UseBOM), nil
		}
	}

	// For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
	// which should be by far the most common encodings.
	length := len(bytes)
	canBeISO88591 := true
	canBeShiftJIS := true
	canBeUTF8 := true
	utf8BytesLeft := 0 // continuation bytes still owed to the current UTF-8 sequence
	utf2BytesChars := 0
	utf3BytesChars := 0
	utf4BytesChars := 0
	sjisBytesLeft := 0 // trail bytes still owed to the current Shift_JIS character
	sjisKatakanaChars := 0
	sjisCurKatakanaWordLength := 0
	sjisCurDoubleBytesWordLength := 0
	sjisMaxKatakanaWordLength := 0
	sjisMaxDoubleBytesWordLength := 0
	isoHighOther := 0

	// Set only when the UTF-8 BOM is followed by at least one more byte.
	utf8bom := len(bytes) > 3 &&
		bytes[0] == 0xEF &&
		bytes[1] == 0xBB &&
		bytes[2] == 0xBF

	// Stop early once every candidate has been ruled out.
	for i := 0; i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8); i++ {
		value := bytes[i]

		// UTF-8 stuff
		if canBeUTF8 {
			if utf8BytesLeft > 0 {
				// A continuation byte must have the form 10xxxxxx. Testing
				// only the top bit would also accept lead bytes (11xxxxxx),
				// so e.g. a run of Shift_JIS half-width katakana such as
				// C3 C3 C3 C3 would be mistaken for UTF-8.
				if (value & 0xC0) != 0x80 {
					canBeUTF8 = false
				} else {
					utf8BytesLeft--
				}
			} else if (value & 0x80) != 0 {
				if (value & 0x40) == 0 {
					// 10xxxxxx cannot start a sequence.
					canBeUTF8 = false
				} else {
					// Each further set bit below 0x40 adds one continuation byte.
					utf8BytesLeft++
					if (value & 0x20) == 0 {
						utf2BytesChars++
					} else {
						utf8BytesLeft++
						if (value & 0x10) == 0 {
							utf3BytesChars++
						} else {
							utf8BytesLeft++
							if (value & 0x08) == 0 {
								utf4BytesChars++
							} else {
								// 11111xxx is not a valid lead byte.
								canBeUTF8 = false
							}
						}
					}
				}
			}
		}

		// ISO-8859-1 stuff
		if canBeISO88591 {
			if value > 0x7F && value < 0xA0 {
				canBeISO88591 = false
			} else if value > 0x9F && (value < 0xC0 || value == 0xD7 || value == 0xF7) {
				// High bytes outside the letter ranges; counted for the
				// 10% heuristic applied after the loop.
				isoHighOther++
			}
		}

		// Shift_JIS stuff
		if canBeShiftJIS {
			if sjisBytesLeft > 0 {
				if value < 0x40 || value == 0x7F || value > 0xFC {
					canBeShiftJIS = false
				} else {
					sjisBytesLeft--
				}
			} else if value == 0x80 || value == 0xA0 || value > 0xEF {
				canBeShiftJIS = false
			} else if value > 0xA0 && value < 0xE0 {
				// Single-byte katakana: extends the katakana run and ends
				// any double-byte run.
				sjisKatakanaChars++
				sjisCurDoubleBytesWordLength = 0
				sjisCurKatakanaWordLength++
				if sjisCurKatakanaWordLength > sjisMaxKatakanaWordLength {
					sjisMaxKatakanaWordLength = sjisCurKatakanaWordLength
				}
			} else if value > 0x7F {
				// Lead byte of a double-byte character: one trail byte is owed.
				sjisBytesLeft++
				sjisCurKatakanaWordLength = 0
				sjisCurDoubleBytesWordLength++
				if sjisCurDoubleBytesWordLength > sjisMaxDoubleBytesWordLength {
					sjisMaxDoubleBytesWordLength = sjisCurDoubleBytesWordLength
				}
			} else {
				// A 7-bit byte ends both runs.
				sjisCurKatakanaWordLength = 0
				sjisCurDoubleBytesWordLength = 0
			}
		}
	}

	// Input that ends in the middle of a multi-byte character is invalid.
	if canBeUTF8 && utf8BytesLeft > 0 {
		canBeUTF8 = false
	}
	if canBeShiftJIS && sjisBytesLeft > 0 {
		canBeShiftJIS = false
	}

	// Easy -- if there is BOM or at least 1 valid not-single byte character (and no evidence it can't be UTF-8), done
	if canBeUTF8 && (utf8bom || utf2BytesChars+utf3BytesChars+utf4BytesChars > 0) {
		return unicode.UTF8, nil
	}
	// Easy -- if at least 3 valid consecutive not-ascii characters (and no evidence it can't be Shift_JIS), done.
	// NOTE: StringUtils_ASSUME_SHIFT_JIS is not consulted by this check.
	if canBeShiftJIS && (sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3) {
		return StringUtils_SHIFT_JIS_CHARSET, nil
	}
	// Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for short words. The crude heuristic is:
	// - If we saw
	//   - only two consecutive katakana chars in the whole text, or
	//   - at least 10% of bytes that could be "upper" not-alphanumeric Latin1,
	// - then we conclude Shift_JIS, else ISO-8859-1
	// Empty input also lands here: 0*10 >= 0 holds, so it is reported as Shift_JIS.
	if canBeISO88591 && canBeShiftJIS {
		if (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHighOther*10 >= length {
			return StringUtils_SHIFT_JIS_CHARSET, nil
		}
		return charmap.ISO8859_1, nil
	}

	// Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to default platform encoding
	if canBeISO88591 {
		return charmap.ISO8859_1, nil
	}
	if canBeShiftJIS {
		return StringUtils_SHIFT_JIS_CHARSET, nil
	}
	if canBeUTF8 {
		return unicode.UTF8, nil
	}
	// Otherwise, we take a wild guess with platform encoding
	return StringUtils_PLATFORM_DEFAULT_ENCODING, nil
}