使用encoding库做字符集转换

This commit is contained in:
wenzi1
2019-06-11 16:03:09 +08:00
parent e5c255200c
commit dd6152fe8a

View File

@ -9,24 +9,48 @@
// Package gcharset provides converting string to requested character encoding.
//
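// Character set conversion helpers.
// Charsets registered in encodingMap (GBK, GB18030, HZGB2312, GB2312, EUCJP, ISO2022JP, SHIFTJIS, EUCKR, BIG5)
// are converted with the golang.org/x/text/encoding packages; any other charset falls back to mahonia,
// which supports common charsets such as utf8/UTF-16/UTF-16LE/macintosh/big5/gbk/gb18030.
// See the mahonia package for the full list of charsets it supports.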
package gcharset
import (
"bytes"
"errors"
"fmt"
"github.com/gogf/gf/g/container/gmap"
"github.com/gogf/gf/third/github.com/axgle/mahonia"
"github.com/gogf/gf/third/golang.org/x/text/encoding"
"github.com/gogf/gf/third/golang.org/x/text/encoding/japanese"
"github.com/gogf/gf/third/golang.org/x/text/encoding/korean"
"github.com/gogf/gf/third/golang.org/x/text/encoding/simplifiedchinese"
"github.com/gogf/gf/third/golang.org/x/text/encoding/traditionalchinese"
"github.com/gogf/gf/third/golang.org/x/text/transform"
"io/ioutil"
"strings"
)
// encodingMap holds the charset-name -> encoding table that init fills in.
// Convert looks a charset up here first and only falls back to mahonia when
// no entry is found.
var encodingMap *gmap.Map
// init registers the charsets that are converted with the x/text encoding
// packages. Keys are upper-case charset names; values are encoding.Encoding
// implementations that Convert type-asserts before use.
//
// "GB2312" is bound to GBK rather than HZGB2312: HZ-GB-2312 is a 7-bit,
// escape-sequence based encoding and cannot decode ordinary 8-bit GB2312
// (EUC-CN) data. x/text ships no standalone GB2312 codec; GBK is a superset
// of GB2312, so it decodes GB2312 input correctly. The explicit "HZGB2312"
// name still selects the HZ codec.
func init() {
	encodingMap = gmap.New()
	encodingMap.Sets(
		map[interface{}]interface{}{
			"GBK":       simplifiedchinese.GBK,
			"GB18030":   simplifiedchinese.GB18030,
			"HZGB2312":  simplifiedchinese.HZGB2312,
			"GB2312":    simplifiedchinese.GBK,
			"EUCJP":     japanese.EUCJP,
			"ISO2022JP": japanese.ISO2022JP,
			"SHIFTJIS":  japanese.ShiftJIS,
			"EUCKR":     korean.EUCKR,
			"BIG5":      traditionalchinese.Big5,
		})
}
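// Convert converts src from the srcCharset character set to dstCharset and returns the result.
// When both charset names are equal ignoring case, src is returned unchanged.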
func Convert(dstCharset string, srcCharset string, src string) (dst string, err error) {
srcCharsetUpper := strings.ToUpper(srcCharset)
dstCharsetUpper := strings.ToUpper(dstCharset)
if strings.EqualFold(srcCharset, dstCharset) {
if srcCharsetUpper == dstCharsetUpper {
return src, nil
}
@ -34,72 +58,48 @@ func Convert(dstCharset string, srcCharset string, src string) (dst string, err
d := new(mahonia.Charset)
srctmp := src
switch {
case strings.EqualFold("GBK", srcCharset):
tmp, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(src)), simplifiedchinese.GBK.NewDecoder()))
if err != nil {
return "", fmt.Errorf("gbk to utf8 failed. %v", err)
}
srctmp = string(tmp)
case strings.EqualFold("GB18030", srcCharset):
tmp, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(src)), simplifiedchinese.GB18030.NewDecoder()))
if err != nil {
return "", fmt.Errorf("GB18030 to utf8 failed. %v", err)
}
srctmp = string(tmp)
case strings.EqualFold("GB2312", srcCharset) || strings.EqualFold("HZGB2312", srcCharset):
tmp, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(src)), simplifiedchinese.HZGB2312.NewDecoder()))
if err != nil {
return "", fmt.Errorf("GB2312 to utf8 failed. %v", err)
}
srctmp = string(tmp)
case strings.EqualFold("UTF-8", srcCharset):
default:
s = mahonia.GetCharset(srcCharset)
if s == nil {
return "", errors.New(fmt.Sprintf("not support charset:%s", srcCharset))
}
if srcCharset != "UTF-8" {
enc := encodingMap.Get(srcCharset)
if enc != nil {
tmp, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(src)), enc.(encoding.Encoding).NewDecoder()))
if err != nil {
return "", fmt.Errorf("%s to utf8 failed. %v", srcCharset, err)
}
srctmp = string(tmp)
} else {
s = mahonia.GetCharset(srcCharsetUpper)
if s == nil {
return "", errors.New(fmt.Sprintf("not support charset:%s", srcCharset))
}
if s.Name != "UTF-8" {
srctmp = s.NewDecoder().ConvertString(srctmp)
if s.Name != "UTF-8" {
srctmp = s.NewDecoder().ConvertString(srctmp)
}
}
}
dst = srctmp
switch {
case strings.EqualFold("GBK", dstCharset):
tmp, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(srctmp)), simplifiedchinese.GBK.NewEncoder()))
if err != nil {
return "", fmt.Errorf("utf to gbk failed. %v", err)
}
dst = string(tmp)
case strings.EqualFold("GB18030", dstCharset):
tmp, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(srctmp)), simplifiedchinese.GB18030.NewEncoder()))
if err != nil {
return "", fmt.Errorf("utf8 to gb18030 failed. %v", err)
}
dst = string(tmp)
case strings.EqualFold("GB2312", dstCharset) || strings.EqualFold("HZGB2312", dstCharset):
tmp, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(srctmp)), simplifiedchinese.HZGB2312.NewEncoder()))
if err != nil {
return "", fmt.Errorf("utf8 to gb2312 failed. %v", err)
}
dst = string(tmp)
case strings.EqualFold("UTF-8", dstCharset):
default:
d = mahonia.GetCharset(dstCharset)
if d == nil {
return "", errors.New(fmt.Sprintf("not support charset:%s", dstCharset))
}
if dstCharset != "UTF-8" {
enc := encodingMap.Get(dstCharset)
if enc != nil {
tmp, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(srctmp)), enc.(encoding.Encoding).NewEncoder()))
if err != nil {
return "", fmt.Errorf("utf to %s failed. %v", dstCharset, err)
}
dst = string(tmp)
} else {
d = mahonia.GetCharset(dstCharsetUpper)
if d == nil {
return "", errors.New(fmt.Sprintf("not support charset:%s", dstCharset))
}
dst = srctmp
if d.Name != "UTF-8" {
dst = d.NewEncoder().ConvertString(dst)
dst = srctmp
if d.Name != "UTF-8" {
dst = d.NewEncoder().ConvertString(dst)
}
}
}
return dst, nil
}