From ca7a0e4b5bbd370fffac5e8b798f691830bbe053 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lauris=20Buk=C5=A1is-Haberkorns?= Date: Mon, 17 Sep 2018 23:59:31 +0300 Subject: [PATCH 1/6] Detect charset and convert non UTF-8 files for display --- modules/base/tool.go | 19 +++++++++++++++++++ routers/repo/view.go | 6 ++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/modules/base/tool.go b/modules/base/tool.go index 2dfd8ffec04c3..09524c16a0f22 100644 --- a/modules/base/tool.go +++ b/modules/base/tool.go @@ -1,3 +1,4 @@ +// Copyright 2018 The Gitea Authors. All rights reserved. // Copyright 2014 The Gogs Authors. All rights reserved. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. @@ -27,9 +28,12 @@ import ( "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/util" + "github.com/Unknwon/com" "github.com/Unknwon/i18n" "github.com/gogits/chardet" + "golang.org/x/net/html/charset" + "golang.org/x/text/transform" ) // EncodeMD5 encodes string to md5 hex value. 
@@ -72,6 +76,21 @@ func DetectEncoding(content []byte) (string, error) { return result.Charset, err } +// DetectEncodingAndConvert detects the encoding of content and coverts to UTF-8 if possible +func DetectEncodingAndConvert(content []byte) []byte { + charsetLabel, err := DetectEncoding(content) + if charsetLabel != "UTF-8" && err == nil { + encoding, _ := charset.Lookup(charsetLabel) + if encoding != nil { + d := encoding.NewDecoder() + if buf, _, err := transform.Bytes(d, content); err == nil { + return buf + } + } + } + return content +} + // BasicAuthDecode decode basic auth string func BasicAuthDecode(encoded string) (string, string, error) { s, err := base64.StdEncoding.DecodeString(encoded) diff --git a/routers/repo/view.go b/routers/repo/view.go index ff5c1afb49590..9fc897042b769 100644 --- a/routers/repo/view.go +++ b/routers/repo/view.go @@ -25,6 +25,7 @@ import ( "code.gitea.io/gitea/modules/markup" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/templates" + "github.com/Unknwon/paginater" ) @@ -99,7 +100,8 @@ func renderDirectory(ctx *context.Context, treeLink string) { ctx.Data["FileSize"] = readmeFile.Size() } else { d, _ := ioutil.ReadAll(dataRc) - buf = append(buf, d...) + buf = base.DetectEncodingAndConvert(append(buf, d...)) + if markup.Type(readmeFile.Name()) != "" { ctx.Data["IsMarkup"] = true ctx.Data["FileContent"] = string(markup.Render(readmeFile.Name(), buf, treeLink, ctx.Repo.Repository.ComposeMetas())) @@ -203,7 +205,7 @@ func renderFile(ctx *context.Context, entry *git.TreeEntry, treeLink, rawLink st } d, _ := ioutil.ReadAll(dataRc) - buf = append(buf, d...) 
+ buf = base.DetectEncodingAndConvert(append(buf, d...)) readmeExist := markup.IsReadmeFile(blob.Name()) ctx.Data["ReadmeExist"] = readmeExist From a1ee5edb27526b34746507450a4d49e03f37088a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lauris=20Buk=C5=A1is-Haberkorns?= Date: Tue, 18 Sep 2018 21:33:01 +0300 Subject: [PATCH 2/6] Refactor and move function to correct module --- modules/base/tool.go | 18 ------------------ modules/templates/helper.go | 35 +++++++++++++++++++++++++++++------ routers/repo/view.go | 4 ++-- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/modules/base/tool.go b/modules/base/tool.go index 09524c16a0f22..a1a01adb62c3a 100644 --- a/modules/base/tool.go +++ b/modules/base/tool.go @@ -1,4 +1,3 @@ -// Copyright 2018 The Gitea Authors. All rights reserved. // Copyright 2014 The Gogs Authors. All rights reserved. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. @@ -32,8 +31,6 @@ import ( "github.com/Unknwon/com" "github.com/Unknwon/i18n" "github.com/gogits/chardet" - "golang.org/x/net/html/charset" - "golang.org/x/text/transform" ) // EncodeMD5 encodes string to md5 hex value. 
@@ -76,21 +73,6 @@ func DetectEncoding(content []byte) (string, error) { return result.Charset, err } -// DetectEncodingAndConvert detects the encoding of content and coverts to UTF-8 if possible -func DetectEncodingAndConvert(content []byte) []byte { - charsetLabel, err := DetectEncoding(content) - if charsetLabel != "UTF-8" && err == nil { - encoding, _ := charset.Lookup(charsetLabel) - if encoding != nil { - d := encoding.NewDecoder() - if buf, _, err := transform.Bytes(d, content); err == nil { - return buf - } - } - } - return content -} - // BasicAuthDecode decode basic auth string func BasicAuthDecode(encoded string) (string, string, error) { s, err := base64.StdEncoding.DecodeString(encoded) diff --git a/modules/templates/helper.go b/modules/templates/helper.go index d55c122df0999..182349b43684d 100644 --- a/modules/templates/helper.go +++ b/modules/templates/helper.go @@ -1,3 +1,4 @@ +// Copyright 2018 The Gitea Authors. All rights reserved. // Copyright 2014 The Gogs Authors. All rights reserved. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. 
@@ -103,12 +104,12 @@ func NewFuncMap() []template.FuncMap { } return str[start:end] }, - "EllipsisString": base.EllipsisString, - "DiffTypeToStr": DiffTypeToStr, - "DiffLineTypeToStr": DiffLineTypeToStr, - "Sha1": Sha1, - "ShortSha": base.ShortSha, - "MD5": base.EncodeMD5, + "EllipsisString": base.EllipsisString, + "DiffTypeToStr": DiffTypeToStr, + "DiffLineTypeToStr": DiffLineTypeToStr, + "Sha1": Sha1, + "ShortSha": base.ShortSha, + "MD5": base.EncodeMD5, "ActionContent2Commits": ActionContent2Commits, "PathEscape": url.PathEscape, "EscapePound": func(str string) string { @@ -284,6 +285,28 @@ func ToUTF8WithErr(content []byte) (string, error) { return result, err } +// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible +func ToUTF8WithFallback(content []byte) []byte { + charsetLabel, err := base.DetectEncoding(content) + if err != nil || charsetLabel == "UTF-8" { + return content + } + + encoding, _ := charset.Lookup(charsetLabel) + if encoding == nil { + return content + } + + // If there is an error, we concatenate the nicely decoded part and the + // original left over. This way we won't loose data. + result, n, err := transform.Bytes(encoding.NewDecoder(), content) + if err != nil { + return append(result, content[n:]...) 
+ } + + return result +} + // ToUTF8 converts content to UTF8 encoding and ignore error func ToUTF8(content string) string { res, _ := ToUTF8WithErr([]byte(content)) diff --git a/routers/repo/view.go b/routers/repo/view.go index 9fc897042b769..210eb9fe5ffb4 100644 --- a/routers/repo/view.go +++ b/routers/repo/view.go @@ -100,7 +100,7 @@ func renderDirectory(ctx *context.Context, treeLink string) { ctx.Data["FileSize"] = readmeFile.Size() } else { d, _ := ioutil.ReadAll(dataRc) - buf = base.DetectEncodingAndConvert(append(buf, d...)) + buf = templates.ToUTF8WithFallback(append(buf, d...)) if markup.Type(readmeFile.Name()) != "" { ctx.Data["IsMarkup"] = true @@ -205,7 +205,7 @@ func renderFile(ctx *context.Context, entry *git.TreeEntry, treeLink, rawLink st } d, _ := ioutil.ReadAll(dataRc) - buf = base.DetectEncodingAndConvert(append(buf, d...)) + buf = templates.ToUTF8WithFallback(append(buf, d...)) readmeExist := markup.IsReadmeFile(blob.Name()) ctx.Data["ReadmeExist"] = readmeExist From f80681bc9ccdc4e26c8cdacd71abdcb2bafbd805 Mon Sep 17 00:00:00 2001 From: Lauris BH Date: Tue, 18 Sep 2018 21:35:12 +0300 Subject: [PATCH 3/6] Revert unrelated changes --- modules/templates/helper.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/templates/helper.go b/modules/templates/helper.go index 182349b43684d..90321658500b4 100644 --- a/modules/templates/helper.go +++ b/modules/templates/helper.go @@ -104,12 +104,12 @@ func NewFuncMap() []template.FuncMap { } return str[start:end] }, - "EllipsisString": base.EllipsisString, - "DiffTypeToStr": DiffTypeToStr, - "DiffLineTypeToStr": DiffLineTypeToStr, - "Sha1": Sha1, - "ShortSha": base.ShortSha, - "MD5": base.EncodeMD5, + "EllipsisString": base.EllipsisString, + "DiffTypeToStr": DiffTypeToStr, + "DiffLineTypeToStr": DiffLineTypeToStr, + "Sha1": Sha1, + "ShortSha": base.ShortSha, + "MD5": base.EncodeMD5, "ActionContent2Commits": ActionContent2Commits, "PathEscape": url.PathEscape, 
"EscapePound": func(str string) string { From 5c4dac7286f066dea14c55bdec19e5e266b0aea2 Mon Sep 17 00:00:00 2001 From: Lauris BH Date: Tue, 18 Sep 2018 21:35:37 +0300 Subject: [PATCH 4/6] More unrelated changes --- modules/base/tool.go | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/base/tool.go b/modules/base/tool.go index a1a01adb62c3a..2dfd8ffec04c3 100644 --- a/modules/base/tool.go +++ b/modules/base/tool.go @@ -27,7 +27,6 @@ import ( "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/util" - "github.com/Unknwon/com" "github.com/Unknwon/i18n" "github.com/gogits/chardet" From fab01bbb6a4fd657920c4629edd23353f82f6594 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lauris=20Buk=C5=A1is-Haberkorns?= Date: Mon, 24 Sep 2018 20:51:24 +0300 Subject: [PATCH 5/6] Duplicate content for small text to have better encoding detection --- modules/base/tool.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/modules/base/tool.go b/modules/base/tool.go index 2dfd8ffec04c3..512682cb8e3c5 100644 --- a/modules/base/tool.go +++ b/modules/base/tool.go @@ -59,7 +59,17 @@ func DetectEncoding(content []byte) (string, error) { return "UTF-8", nil } - result, err := chardet.NewTextDetector().DetectBest(content) + var detectContent []byte + if len(content) < 1024 { + times := 1024 / len(content) + detectContent = make([]byte, 0, times*len(content)) + for i := 0; i < times; i++ { + detectContent = append(detectContent, content...) 
+ } + } else { + detectContent = content + } + result, err := chardet.NewTextDetector().DetectBest(detectContent) if err != nil { return "", err } From 92d7a6973960ede9024b190fa184caf6fc4a63c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lauris=20Buk=C5=A1is-Haberkorns?= Date: Sat, 29 Sep 2018 10:51:03 +0300 Subject: [PATCH 6/6] Check if original content is valid before duplicating it --- modules/base/tool.go | 7 ++++++- modules/templates/helper.go | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/modules/base/tool.go b/modules/base/tool.go index 512682cb8e3c5..d5ec9e83fcbea 100644 --- a/modules/base/tool.go +++ b/modules/base/tool.go @@ -59,8 +59,13 @@ func DetectEncoding(content []byte) (string, error) { return "UTF-8", nil } + textDetector := chardet.NewTextDetector() var detectContent []byte if len(content) < 1024 { + // Check if original content is valid + if _, err := textDetector.DetectBest(content); err != nil { + return "", err + } times := 1024 / len(content) detectContent = make([]byte, 0, times*len(content)) for i := 0; i < times; i++ { @@ -69,7 +74,7 @@ func DetectEncoding(content []byte) (string, error) { } else { detectContent = content } - result, err := chardet.NewTextDetector().DetectBest(detectContent) + result, err := textDetector.DetectBest(detectContent) if err != nil { return "", err } diff --git a/modules/templates/helper.go b/modules/templates/helper.go index 90321658500b4..ce077d1a928e7 100644 --- a/modules/templates/helper.go +++ b/modules/templates/helper.go @@ -276,7 +276,7 @@ func ToUTF8WithErr(content []byte) (string, error) { } // If there is an error, we concatenate the nicely decoded part and the - // original left over. This way we won't loose data. + // original left over. This way we won't lose data. 
result, n, err := transform.String(encoding.NewDecoder(), string(content)) if err != nil { result = result + string(content[n:]) @@ -298,7 +298,7 @@ func ToUTF8WithFallback(content []byte) []byte { } // If there is an error, we concatenate the nicely decoded part and the - // original left over. This way we won't loose data. + // original left over. This way we won't lose data. result, n, err := transform.Bytes(encoding.NewDecoder(), content) if err != nil { return append(result, content[n:]...)