From d97fda794cf37cef7b83373a1dc65d2947f88fab Mon Sep 17 00:00:00 2001 From: wenzi1 Date: Tue, 11 Jun 2019 17:25:30 +0800 Subject: [PATCH] =?UTF-8?q?=E6=A1=86=E6=9E=B6=E4=B8=AD=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E5=AD=97=E7=AC=A6=E9=9B=86=E8=BD=AC=E6=8D=A2=E7=9A=84=E6=A0=87?= =?UTF-8?q?=E5=87=86=E5=BA=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- g/encoding/gcharset/gcharset.go | 87 +------------- g/encoding/gxml/gxml.go | 15 ++- g/encoding/gxml/gxml_test.go | 6 +- g/encoding/internal/gcharsetconvert.go | 114 ++++++++++++++++++ g/encoding/internal/gcharsetconvert_test.go | 127 ++++++++++++++++++++ 5 files changed, 256 insertions(+), 93 deletions(-) create mode 100644 g/encoding/internal/gcharsetconvert.go create mode 100644 g/encoding/internal/gcharsetconvert_test.go diff --git a/g/encoding/gcharset/gcharset.go b/g/encoding/gcharset/gcharset.go index 39e8ad3d9..c8cb56718 100644 --- a/g/encoding/gcharset/gcharset.go +++ b/g/encoding/gcharset/gcharset.go @@ -12,95 +12,12 @@ package gcharset import ( - "bytes" - "errors" - "fmt" - "github.com/gogf/gf/g/container/gmap" - "github.com/gogf/gf/third/github.com/axgle/mahonia" - "github.com/gogf/gf/third/golang.org/x/text/encoding" - "github.com/gogf/gf/third/golang.org/x/text/encoding/japanese" - "github.com/gogf/gf/third/golang.org/x/text/encoding/korean" - "github.com/gogf/gf/third/golang.org/x/text/encoding/simplifiedchinese" - "github.com/gogf/gf/third/golang.org/x/text/encoding/traditionalchinese" - "github.com/gogf/gf/third/golang.org/x/text/transform" - "io/ioutil" - "strings" + "github.com/gogf/gf/g/encoding/internal" ) -var encodingMap *gmap.Map - -func init() { - encodingMap = gmap.New() - encodingMap.Sets( - map[interface{}]interface{}{ - "GBK": simplifiedchinese.GBK, - "GB18030": simplifiedchinese.GB18030, - "HZGB2312": simplifiedchinese.HZGB2312, - "GB2312": simplifiedchinese.HZGB2312, - "EUCJP": japanese.EUCJP, - "ISO2022JP": japanese.ISO2022JP, - "SHIFTJIS": 
japanese.ShiftJIS, - "EUCKR": korean.EUCKR, - "BIG5": traditionalchinese.Big5, - }) -} - // 2个字符集之间的转换 func Convert(dstCharset string, srcCharset string, src string) (dst string, err error) { - srcCharsetUpper := strings.ToUpper(srcCharset) - dstCharsetUpper := strings.ToUpper(dstCharset) - - if srcCharsetUpper == dstCharsetUpper { - return src, nil - } - - s := new(mahonia.Charset) - d := new(mahonia.Charset) - srctmp := src - - if srcCharset != "UTF-8" { - enc := encodingMap.Get(srcCharset) - if enc != nil { - tmp, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(src)), enc.(encoding.Encoding).NewDecoder())) - if err != nil { - return "", fmt.Errorf("%s to utf8 failed. %v", srcCharset, err) - } - srctmp = string(tmp) - } else { - s = mahonia.GetCharset(srcCharsetUpper) - if s == nil { - return "", errors.New(fmt.Sprintf("not support charset:%s", srcCharset)) - } - - if s.Name != "UTF-8" { - srctmp = s.NewDecoder().ConvertString(srctmp) - } - } - } - - dst = srctmp - - if dstCharset != "UTF-8" { - enc := encodingMap.Get(dstCharset) - if enc != nil { - tmp, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(srctmp)), enc.(encoding.Encoding).NewEncoder())) - if err != nil { - return "", fmt.Errorf("utf to %s failed. 
%v", dstCharset, err) - } - dst = string(tmp) - } else { - d = mahonia.GetCharset(dstCharsetUpper) - if d == nil { - return "", errors.New(fmt.Sprintf("not support charset:%s", dstCharset)) - } - - dst = srctmp - if d.Name != "UTF-8" { - dst = d.NewEncoder().ConvertString(dst) - } - } - } - return dst, nil + return internal.Convert(dstCharset, srcCharset, src) } // 指定字符集转UTF8 diff --git a/g/encoding/gxml/gxml.go b/g/encoding/gxml/gxml.go index 6d4b1ea97..374641a11 100644 --- a/g/encoding/gxml/gxml.go +++ b/g/encoding/gxml/gxml.go @@ -9,8 +9,8 @@ package gxml import ( "fmt" + "github.com/gogf/gf/g/encoding/internal" "github.com/gogf/gf/g/text/gregex" - "github.com/gogf/gf/third/github.com/axgle/mahonia" "github.com/gogf/gf/third/github.com/clbanning/mxj" "strings" ) @@ -60,16 +60,21 @@ func convert(xml []byte) (res []byte, err error) { if len(matchStr) == 2 { xmlEncode = matchStr[1] } - s := mahonia.GetCharset(xmlEncode) - if s == nil { + xmlEncode = strings.ToUpper(xmlEncode) + s := internal.GetCharset(xmlEncode) + if s == false { return nil, fmt.Errorf("not support charset:%s\n", xmlEncode) } res, err = gregex.Replace(patten, []byte(""), xml) if err != nil { return nil, err } - if !strings.EqualFold(s.Name, "UTF-8") { - res = []byte(s.NewDecoder().ConvertString(string(res))) + if xmlEncode != "UTF-8" && xmlEncode != "UTF8" { + dst, err := internal.Convert("UTF-8", xmlEncode, string(res)) + if err != nil { + return nil, err + } + res = []byte(dst) } return res, nil } diff --git a/g/encoding/gxml/gxml_test.go b/g/encoding/gxml/gxml_test.go index 183a3c8b5..785d900d1 100644 --- a/g/encoding/gxml/gxml_test.go +++ b/g/encoding/gxml/gxml_test.go @@ -112,7 +112,7 @@ func Test_Encode(t *testing.T) { if err != nil { t.Errorf("encode error.") } - t.Logf("%s\n", string(xmlStr)) + //t.Logf("%s\n", string(xmlStr)) res := `true100.92123hello world` if string(xmlStr) != res { @@ -130,11 +130,11 @@ func Test_EncodeIndent(t *testing.T) { } m["root"] = interface{}(v) - xmlStr, err 
:= gxml.EncodeWithIndent(m, "xml") + _, err := gxml.EncodeWithIndent(m, "xml") if err != nil { t.Errorf("encodeWithIndent error.") } - t.Logf("%s\n", string(xmlStr)) + //t.Logf("%s\n", string(xmlStr)) }
diff --git a/g/encoding/internal/gcharsetconvert.go b/g/encoding/internal/gcharsetconvert.go
new file mode 100644
index 000000000..ba301da90
--- /dev/null
+++ b/g/encoding/internal/gcharsetconvert.go
@@ -0,0 +1,114 @@
+// Copyright 2018 gf Author(https://github.com/gogf/gf). All Rights Reserved.
+//
+// This Source Code Form is subject to the terms of the MIT License.
+// If a copy of the MIT was not distributed with this file,
+// You can obtain one at https://github.com/gogf/gf.
+// @author wenzi1
+// @date 20180604
+
+// Package internal implements the charset conversion shared by the encoding packages.
+//
+// Both gcharset and gxml delegate their charset handling to this package.
+package internal
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"github.com/gogf/gf/g/container/gmap"
+	"github.com/gogf/gf/third/github.com/axgle/mahonia"
+	"github.com/gogf/gf/third/golang.org/x/text/encoding"
+	"github.com/gogf/gf/third/golang.org/x/text/encoding/japanese"
+	"github.com/gogf/gf/third/golang.org/x/text/encoding/korean"
+	"github.com/gogf/gf/third/golang.org/x/text/encoding/simplifiedchinese"
+	"github.com/gogf/gf/third/golang.org/x/text/encoding/traditionalchinese"
+	"github.com/gogf/gf/third/golang.org/x/text/transform"
+	"io/ioutil"
+	"strings"
+)
+
+var encodingMap *gmap.Map
+
+func init() {
+	encodingMap = gmap.New()
+	encodingMap.Sets(
+		map[interface{}]interface{}{
+			"GBK":       simplifiedchinese.GBK,
+			"GB18030":   simplifiedchinese.GB18030,
+			"HZGB2312":  simplifiedchinese.HZGB2312,
+			"GB2312":    simplifiedchinese.HZGB2312,
+			"EUCJP":     japanese.EUCJP,
+			"ISO2022JP": japanese.ISO2022JP,
+			"SHIFTJIS":  japanese.ShiftJIS,
+			"EUCKR":     korean.EUCKR,
+			"BIG5":      traditionalchinese.Big5,
+		})
+}
+
+// GetCharset reports whether charset is known to encodingMap or, failing that,
+// to mahonia. The name is upper-cased before the lookup, so case is ignored.
+func GetCharset(charset string) bool {
+	c := strings.ToUpper(charset)
+	if encodingMap.Contains(c) {
+		return true
+	}
+	return mahonia.GetCharset(c) != nil
+}
+
+// Convert converts src from srcCharset to dstCharset, pivoting through UTF-8.
+// Charset names are matched case-insensitively; encodingMap is consulted first
+// and mahonia is the fallback. An error is returned for an unknown charset or
+// when an encodingMap transform fails; dst is then empty.
+func Convert(dstCharset string, srcCharset string, src string) (dst string, err error) {
+	srcCharsetUpper := strings.ToUpper(srcCharset)
+	dstCharsetUpper := strings.ToUpper(dstCharset)
+
+	if srcCharsetUpper == dstCharsetUpper {
+		return src, nil
+	}
+
+	srctmp := src
+
+	// Use the upper-cased names below: the encodingMap keys are upper case, so
+	// a raw lower-case name such as "gbk" would never match an entry.
+	if srcCharsetUpper != "UTF-8" {
+		enc := encodingMap.Get(srcCharsetUpper)
+		if enc != nil {
+			tmp, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(src)), enc.(encoding.Encoding).NewDecoder()))
+			if err != nil {
+				return "", fmt.Errorf("%s to utf8 failed. %v", srcCharset, err)
+			}
+			srctmp = string(tmp)
+		} else {
+			s := mahonia.GetCharset(srcCharsetUpper)
+			if s == nil {
+				return "", errors.New(fmt.Sprintf("not support charset:%s", srcCharset))
+			}
+			if s.Name != "UTF-8" {
+				srctmp = s.NewDecoder().ConvertString(srctmp)
+			}
+		}
+	}
+
+	dst = srctmp
+
+	if dstCharsetUpper != "UTF-8" {
+		enc := encodingMap.Get(dstCharsetUpper)
+		if enc != nil {
+			tmp, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(srctmp)), enc.(encoding.Encoding).NewEncoder()))
+			if err != nil {
+				return "", fmt.Errorf("utf to %s failed. %v", dstCharset, err)
+			}
+			dst = string(tmp)
+		} else {
+			d := mahonia.GetCharset(dstCharsetUpper)
+			if d == nil {
+				return "", errors.New(fmt.Sprintf("not support charset:%s", dstCharset))
+			}
+			if d.Name != "UTF-8" {
+				dst = d.NewEncoder().ConvertString(dst)
+			}
+		}
+	}
+	return dst, nil
+}
diff --git a/g/encoding/internal/gcharsetconvert_test.go b/g/encoding/internal/gcharsetconvert_test.go
new file mode 100644
index 000000000..c39d9a9cd
--- /dev/null
+++ b/g/encoding/internal/gcharsetconvert_test.go
@@ -0,0 +1,127 @@
+// Copyright 2018 gf Author(https://github.com/gogf/gf). All Rights Reserved.
+//
+// This Source Code Form is subject to the terms of the MIT License.
+// If a copy of the MIT was not distributed with this file,
+// You can obtain one at https://github.com/gogf/gf.
+
+package internal_test
+
+import (
+	"github.com/gogf/gf/g/encoding/internal"
+	"github.com/gogf/gf/g/test/gtest"
+	"testing"
+)
+
+var testData = []struct {
+	utf8, other, otherEncoding string // UTF-8 text, the same text in otherEncoding, charset name
+}{
+	{"Résumé", "Résumé", "utf8"},
+	{"Résumé", "R\xe9sum\xe9", "latin-1"},
+	{"これは漢字です。", "S0\x8c0o0\"oW[g0Y0\x020", "UTF-16LE"},
+	{"これは漢字です。", "0S0\x8c0oo\"[W0g0Y0\x02", "UTF-16BE"},
+	{"これは漢字です。", "\xfe\xff0S0\x8c0oo\"[W0g0Y0\x02", "UTF-16"},
+	{"𝄢𝄞𝄪𝄫", "\xfe\xff\xd8\x34\xdd\x22\xd8\x34\xdd\x1e\xd8\x34\xdd\x2a\xd8\x34\xdd\x2b", "UTF-16"},
+	{"Hello, world", "Hello, world", "ASCII"},
+	{"Gdańsk", "Gda\xf1sk", "ISO-8859-2"},
+	{"Ââ Čč Đđ Ŋŋ Õõ Šš Žž Åå Ää", "\xc2\xe2 \xc8\xe8 \xa9\xb9 \xaf\xbf \xd5\xf5 \xaa\xba \xac\xbc \xc5\xe5 \xc4\xe4", "ISO-8859-10"},
+	{"สำหรับ", "\xca\xd3\xcb\xc3\u047a", "ISO-8859-11"},
+	{"latviešu", "latvie\xf0u", "ISO-8859-13"},
+	{"Seònaid", "Se\xf2naid", "ISO-8859-14"},
+	{"€1 is cheap", "\xa41 is cheap", "ISO-8859-15"},
+	{"românește", "rom\xe2ne\xbate", "ISO-8859-16"},
+	{"nutraĵo", "nutra\xbco", "ISO-8859-3"},
+	{"Kalâdlit", "Kal\xe2dlit", "ISO-8859-4"},
+	{"русский", "\xe0\xe3\xe1\xe1\xda\xd8\xd9", "ISO-8859-5"},
+	{"ελληνικά", "\xe5\xeb\xeb\xe7\xed\xe9\xea\xdc", "ISO-8859-7"},
+	{"Kağan", "Ka\xf0an", "ISO-8859-9"},
+	{"Résumé", "R\x8esum\x8e", "macintosh"},
+	{"Gdańsk", "Gda\xf1sk", "windows-1250"},
+	{"русский", "\xf0\xf3\xf1\xf1\xea\xe8\xe9", "windows-1251"},
+	{"Résumé", "R\xe9sum\xe9", "windows-1252"},
+	{"ελληνικά", "\xe5\xeb\xeb\xe7\xed\xe9\xea\xdc", "windows-1253"},
+	{"Kağan", "Ka\xf0an", "windows-1254"},
+	{"עִבְרִית", "\xf2\xc4\xe1\xc0\xf8\xc4\xe9\xfa", "windows-1255"},
+	{"العربية", "\xc7\xe1\xda\xd1\xc8\xed\xc9", "windows-1256"},
+	{"latviešu", "latvie\xf0u", "windows-1257"},
+	{"Việt", "Vi\xea\xf2t", "windows-1258"},
+	{"สำหรับ", "\xca\xd3\xcb\xc3\u047a", "windows-874"},
+	{"русский", "\xd2\xd5\xd3\xd3\xcb\xc9\xca", "KOI8-R"},
+	{"українська", "\xd5\xcb\xd2\xc1\xa7\xce\xd3\xd8\xcb\xc1", "KOI8-U"},
+	{"Hello 常用國字標準字體表", "Hello \xb1`\xa5\u03b0\xea\xa6r\xbc\u0437\u01e6r\xc5\xe9\xaa\xed", "big5"},
+	{"Hello 常用國字標準字體表", "Hello \xb3\xa3\xd3\xc3\x87\xf8\xd7\xd6\x98\xcb\x9c\xca\xd7\xd6\xf3\x77\xb1\xed", "gbk"},
+	{"Hello 常用國字標準字體表", "Hello \xb3\xa3\xd3\xc3\x87\xf8\xd7\xd6\x98\xcb\x9c\xca\xd7\xd6\xf3\x77\xb1\xed", "gb18030"},
+	{"花间一壶酒,独酌无相亲。", "~{;(F#,6@WCN^O`GW!#", "GB2312"},
+	{"花间一壶酒,独酌无相亲。", "~{;(F#,6@WCN^O`GW!#", "HZGB2312"},
+	{"עִבְרִית", "\x81\x30\xfb\x30\x81\x30\xf6\x34\x81\x30\xf9\x33\x81\x30\xf6\x30\x81\x30\xfb\x36\x81\x30\xf6\x34\x81\x30\xfa\x31\x81\x30\xfb\x38", "gb18030"},
+	{"㧯", "\x82\x31\x89\x38", "gb18030"},
+	{"㧯", "㧯", "UTF-8"},
+	{"これは漢字です。", "\x82\xb1\x82\xea\x82\xcd\x8a\xbf\x8e\x9a\x82\xc5\x82\xb7\x81B", "SJIS"},
+	{"これは漢字です。", "\xa4\xb3\xa4\xec\xa4\u03f4\xc1\xbb\xfa\xa4\u01e4\xb9\xa1\xa3", "EUC-JP"},
+}
+
+// TestDecode converts every sample from its own charset to UTF-8.
+func TestDecode(t *testing.T) {
+	for _, data := range testData {
+		str, err := internal.Convert("UTF-8", data.otherEncoding, data.other)
+		if err != nil {
+			t.Errorf("decode %s failed: %v", data.otherEncoding, err)
+			continue
+		}
+
+		if str != data.utf8 {
+			t.Errorf("Unexpected value: %#v (expected %#v) %v", str, data.utf8, data.otherEncoding)
+		}
+	}
+}
+
+// TestEncode converts every sample from UTF-8 to its own charset.
+func TestEncode(t *testing.T) {
+	for _, data := range testData {
+		str, err := internal.Convert(data.otherEncoding, "UTF-8", data.utf8)
+		if err != nil {
+			t.Errorf("encode %s failed: %v", data.otherEncoding, err)
+			continue
+		}
+
+		if str != data.other {
+			t.Errorf("Unexpected value: %#v (expected %#v) %v", str, data.other, data.otherEncoding)
+		}
+	}
+}
+
+// TestConvert converts directly between two non-UTF-8 charsets (big5 to gbk).
+func TestConvert(t *testing.T) {
+	srcCharset := "big5"
+	src := "Hello \xb1`\xa5\u03b0\xea\xa6r\xbc\u0437\u01e6r\xc5\xe9\xaa\xed"
+	dstCharset := "gbk"
+	dst := "Hello \xb3\xa3\xd3\xc3\x87\xf8\xd7\xd6\x98\xcb\x9c\xca\xd7\xd6\xf3\x77\xb1\xed"
+
+	str, err := internal.Convert(dstCharset, srcCharset, src)
+	if err != nil {
+		t.Fatalf("convert error. %v", err)
+	}
+
+	if str != dst {
+		t.Errorf("unexpected value:%#v (expected %#v)", str, dst)
+	}
+}
+
+// TestGetCharset checks the lookup for an unknown, a UTF-8 and a lower-case name.
+func TestGetCharset(t *testing.T) {
+	cases := []struct {
+		charset string
+		want    bool
+	}{
+		{"XX", false},
+		{"UTF-8", true},
+		{"gbk", true},
+	}
+
+	for _, c := range cases {
+		gtest.Case(t, func() {
+			if got := internal.GetCharset(c.charset); got != c.want {
+				t.Errorf("unexpected value:%v (expected %v)", got, c.want)
+			}
+		})
+	}
+}