package common

import (
	"fmt"

	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/charmap"
	"golang.org/x/text/encoding/ianaindex"
	"golang.org/x/text/encoding/japanese"
	"golang.org/x/text/encoding/simplifiedchinese"
	"golang.org/x/text/encoding/unicode"

	"github.com/makiuchi-d/gozxing"
)

const (
	// StringUtils_ASSUME_SHIFT_JIS is always false; the guessing code in this
	// file does not consult it.
	StringUtils_ASSUME_SHIFT_JIS = false
	// Retained for ABI compatibility with earlier versions
	StringUtils_SHIFT_JIS = "SJIS"
	StringUtils_GB2312    = "GB2312"
)

var (
	// StringUtils_PLATFORM_DEFAULT_ENCODING is the last-resort answer of
	// StringUtils_guessCharset when no candidate encoding survives.
	StringUtils_PLATFORM_DEFAULT_ENCODING = unicode.UTF8
	StringUtils_SHIFT_JIS_CHARSET         = japanese.ShiftJIS         // "SJIS"
	StringUtils_GB2312_CHARSET            = simplifiedchinese.GB18030 // "GB2312"
	StringUtils_EUC_JP                    = japanese.EUCJP            // "EUC_JP"
)

// StringUtils_guessEncoding guesses the encoding of bytes via
// StringUtils_guessCharset and returns its name.
//
// Shift_JIS, UTF-8 and ISO-8859-1 are reported as "SJIS", "UTF8" and
// "ISO8859_1" respectively; any other encoding is reported by the name the
// IANA index gives it. An error from StringUtils_guessCharset or from the
// IANA name lookup is returned unchanged.
func StringUtils_guessEncoding(bytes []byte, hints map[gozxing.DecodeHintType]interface{}) (string, error) {
	c, err := StringUtils_guessCharset(bytes, hints)
	if err != nil {
		return "", err
	}
	switch c {
	case StringUtils_SHIFT_JIS_CHARSET:
		return "SJIS", nil
	case unicode.UTF8:
		return "UTF8", nil
	case charmap.ISO8859_1:
		return "ISO8859_1", nil
	}
	return ianaindex.IANA.Name(c)
}

// StringUtils_guessCharset guesses the character encoding of bytes.
//
// If hints contains DecodeHintType_CHARACTER_SET, that hint decides the result
// and bytes is not inspected: an encoding.Encoding value is returned as is,
// anything else is formatted with %v and looked up by name, first through
// GetCharacterSetECIByName and then through the IANA index. A name that
// cannot be resolved to a usable encoding yields an error.
//
// Without a hint, input longer than two bytes that starts with a UTF-16 byte
// order mark is reported as UTF-16. Otherwise a single pass over bytes tries
// to distinguish ISO-8859-1, UTF-8 and Shift_JIS, falling back to
// StringUtils_PLATFORM_DEFAULT_ENCODING when none of them fits. The heuristic
// path never returns an error.
func StringUtils_guessCharset(bytes []byte, hints map[gozxing.DecodeHintType]interface{}) (encoding.Encoding, error) {
	if hint, ok := hints[gozxing.DecodeHintType_CHARACTER_SET]; ok {
		if charset, ok := hint.(encoding.Encoding); ok {
			return charset, nil
		}
		name := fmt.Sprintf("%v", hint)
		if eci, ok := GetCharacterSetECIByName(name); ok {
			return eci.GetCharset(), nil
		}
		charset, err := ianaindex.IANA.Encoding(name)
		if err != nil {
			return nil, err
		}
		// The IANA index answers (nil, nil) for charsets that are registered
		// but not implemented; report that instead of handing the caller a
		// nil Encoding together with a nil error.
		if charset == nil {
			return nil, fmt.Errorf("unsupported charset %q", name)
		}
		return charset, nil
	}

	// First try UTF-16, assuming anything with its BOM is UTF-16
	if len(bytes) > 2 {
		if bytes[0] == 0xfe && bytes[1] == 0xff {
			return unicode.UTF16(unicode.BigEndian, unicode.UseBOM), nil
		}
		if bytes[0] == 0xff && bytes[1] == 0xfe {
			return unicode.UTF16(unicode.LittleEndian, unicode.UseBOM), nil
		}
	}

	// For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
	// which should be by far the most common encodings.
	length := len(bytes)
	canBeISO88591 := true
	canBeShiftJIS := true
	canBeUTF8 := true
	utf8BytesLeft := 0
	utf2BytesChars := 0
	utf3BytesChars := 0
	utf4BytesChars := 0
	sjisBytesLeft := 0
	sjisKatakanaChars := 0
	sjisCurKatakanaWordLength := 0
	sjisCurDoubleBytesWordLength := 0
	sjisMaxKatakanaWordLength := 0
	sjisMaxDoubleBytesWordLength := 0
	isoHighOther := 0

	utf8bom := len(bytes) > 3 &&
		bytes[0] == 0xEF &&
		bytes[1] == 0xBB &&
		bytes[2] == 0xBF

	// Stop scanning as soon as every candidate has been ruled out.
	for i := 0; i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8); i++ {
		value := bytes[i]

		// UTF-8 stuff
		if canBeUTF8 {
			if utf8BytesLeft > 0 {
				// Inside a multi-byte sequence: the high bit must be set.
				if value&0x80 == 0 {
					canBeUTF8 = false
				} else {
					utf8BytesLeft--
				}
			} else if value&0x80 != 0 {
				// Lead byte: the run of high one-bits gives the sequence length.
				switch {
				case value&0x40 == 0:
					canBeUTF8 = false
				case value&0x20 == 0:
					utf8BytesLeft = 1
					utf2BytesChars++
				case value&0x10 == 0:
					utf8BytesLeft = 2
					utf3BytesChars++
				case value&0x08 == 0:
					utf8BytesLeft = 3
					utf4BytesChars++
				default:
					canBeUTF8 = false
				}
			}
		}

		// ISO-8859-1 stuff
		if canBeISO88591 {
			if value > 0x7F && value < 0xA0 {
				canBeISO88591 = false
			} else if value > 0x9F && (value < 0xC0 || value == 0xD7 || value == 0xF7) {
				isoHighOther++
			}
		}

		// Shift_JIS stuff
		if canBeShiftJIS {
			switch {
			case sjisBytesLeft > 0:
				// Second byte of a double-byte character.
				if value < 0x40 || value == 0x7F || value > 0xFC {
					canBeShiftJIS = false
				} else {
					sjisBytesLeft--
				}
			case value == 0x80 || value == 0xA0 || value > 0xEF:
				canBeShiftJIS = false
			case value > 0xA0 && value < 0xE0:
				// Single-byte (half-width) katakana range.
				sjisKatakanaChars++
				sjisCurDoubleBytesWordLength = 0
				sjisCurKatakanaWordLength++
				if sjisCurKatakanaWordLength > sjisMaxKatakanaWordLength {
					sjisMaxKatakanaWordLength = sjisCurKatakanaWordLength
				}
			case value > 0x7F:
				// First byte of a double-byte character.
				sjisBytesLeft++
				sjisCurKatakanaWordLength = 0
				sjisCurDoubleBytesWordLength++
				if sjisCurDoubleBytesWordLength > sjisMaxDoubleBytesWordLength {
					sjisMaxDoubleBytesWordLength = sjisCurDoubleBytesWordLength
				}
			default:
				// Low (7-bit) byte: ends any current katakana or double-byte run.
				sjisCurKatakanaWordLength = 0
				sjisCurDoubleBytesWordLength = 0
			}
		}
	}

	// A multi-byte sequence cut off by the end of input rules the encoding out.
	if canBeUTF8 && utf8BytesLeft > 0 {
		canBeUTF8 = false
	}
	if canBeShiftJIS && sjisBytesLeft > 0 {
		canBeShiftJIS = false
	}

	// Easy -- if there is BOM or at least 1 valid not-single byte character (and no evidence it can't be UTF-8), done
	if canBeUTF8 && (utf8bom || utf2BytesChars+utf3BytesChars+utf4BytesChars > 0) {
		return unicode.UTF8, nil
	}
	// Easy -- if at least 3 valid consecutive not-ascii characters (and no evidence it can't be Shift_JIS), done
	if canBeShiftJIS && (sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3) {
		return StringUtils_SHIFT_JIS_CHARSET, nil
	}
	// Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for short words. The crude heuristic is:
	// - If we saw
	//   - only two consecutive katakana chars in the whole text, or
	//   - at least 10% of bytes that could be "upper" not-alphanumeric Latin1,
	// - then we conclude Shift_JIS, else ISO-8859-1
	if canBeISO88591 && canBeShiftJIS {
		if (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHighOther*10 >= length {
			return StringUtils_SHIFT_JIS_CHARSET, nil
		}
		return charmap.ISO8859_1, nil
	}

	// Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to default platform encoding
	if canBeISO88591 {
		return charmap.ISO8859_1, nil
	}
	if canBeShiftJIS {
		return StringUtils_SHIFT_JIS_CHARSET, nil
	}
	if canBeUTF8 {
		return unicode.UTF8, nil
	}
	// Otherwise, we take a wild guess with platform encoding
	return StringUtils_PLATFORM_DEFAULT_ENCODING, nil
}