From 689f29c9833c3e56c91266ccb0cf67f6fda2a62e Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Tue, 23 Apr 2019 21:29:27 +0100 Subject: [PATCH 1/8] detect and remove a decoded BOM Signed-off-by: Andrew Thornton --- modules/templates/helper.go | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/modules/templates/helper.go b/modules/templates/helper.go index d3eb8c48b8415..fc6d0897bc615 100644 --- a/modules/templates/helper.go +++ b/modules/templates/helper.go @@ -31,6 +31,8 @@ import ( "gopkg.in/editorconfig/editorconfig-core-go.v1" ) +var utf8bom = []byte{'\xef', '\xbb', '\xbf'} + // NewFuncMap returns functions for injecting to templates func NewFuncMap() []template.FuncMap { return []template.FuncMap{map[string]interface{}{ @@ -267,6 +269,10 @@ func ToUTF8WithErr(content []byte) (string, error) { if err != nil { return "", err } else if charsetLabel == "UTF-8" { + if len(content) > 2 && bytes.Equal(content[0:3], utf8bom) { + log.Debug("Removing BOM from UTF-8 string") + return string(content[3:]), nil + } return string(content), nil } @@ -282,6 +288,11 @@ func ToUTF8WithErr(content []byte) (string, error) { result = result + string(content[n:]) } + if len(result) > 2 && bytes.Equal([]byte(result[0:3]), utf8bom) { + log.Debug("Removing BOM from decoded string") + result = result[3:] + } + return result, err } @@ -289,6 +300,10 @@ func ToUTF8WithErr(content []byte) (string, error) { func ToUTF8WithFallback(content []byte) []byte { charsetLabel, err := base.DetectEncoding(content) if err != nil || charsetLabel == "UTF-8" { + if len(content) > 2 && bytes.Equal(content[0:3], utf8bom) { + log.Debug("Removing BOM from UTF-8 string") + return content[3:] + } return content } @@ -304,6 +319,11 @@ func ToUTF8WithFallback(content []byte) []byte { return append(result, content[n:]...) 
} + if len(result) > 2 && bytes.Equal(result[0:3], utf8bom) { + log.Debug("Removing BOM from decoded string") + result = result[3:] + } + return result } From f2d4e167848e4ff73246067aaae743197ca7e47b Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Tue, 23 Apr 2019 22:46:49 +0100 Subject: [PATCH 2/8] Restore the previous encoding and BOM --- modules/base/tool.go | 3 ++ modules/repofiles/update.go | 60 +++++++++++++++++++++++++++++++++++++ modules/templates/helper.go | 10 +++---- 3 files changed, 67 insertions(+), 6 deletions(-) diff --git a/modules/base/tool.go b/modules/base/tool.go index 97fd87e85c402..7b9676d40e7dd 100644 --- a/modules/base/tool.go +++ b/modules/base/tool.go @@ -36,6 +36,9 @@ import ( "github.com/gogits/chardet" ) +// UTF8BOM is the utf-8 byte-order marker +var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'} + // EncodeMD5 encodes string to md5 hex value. func EncodeMD5(str string) string { m := md5.New() diff --git a/modules/repofiles/update.go b/modules/repofiles/update.go index e9b3077535602..1d295ac87db8e 100644 --- a/modules/repofiles/update.go +++ b/modules/repofiles/update.go @@ -5,13 +5,19 @@ package repofiles import ( + "bytes" "fmt" "path" "strings" + "golang.org/x/net/html/charset" + "golang.org/x/text/transform" + "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/base" "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/lfs" + "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" "code.gitea.io/sdk/gitea" ) @@ -37,6 +43,41 @@ type UpdateRepoFileOptions struct { Committer *IdentityOptions } +func detectEncodingAndBOM(entry *git.TreeEntry) (string, bool) { + reader, err := entry.Blob().DataAsync() + if err != nil { + // just default to utf-8 and no bom + return "UTF-8", false + } + buf := make([]byte, 1024) + n, err := reader.Read(buf) + if err != nil { + // just default to utf-8 and no bom + return "UTF-8", false + } + buf = buf[:n] + encoding, err := base.DetectEncoding(buf) + if err != nil { + // 
just default to utf-8 and no bom + return "UTF-8", false + } + if encoding == "UTF-8" { + return encoding, bytes.Equal(buf[0:3], base.UTF8BOM) + } + charsetEncoding, _ := charset.Lookup(encoding) + if charsetEncoding == nil { + return "UTF-8", false + } + + result, n, err := transform.String(charsetEncoding.NewDecoder(), string(buf)) + + if n > 2 { + return encoding, bytes.Equal([]byte(result)[0:3], base.UTF8BOM) + } + + return encoding, false +} + // CreateOrUpdateRepoFile adds or updates a file in the given repository func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *UpdateRepoFileOptions) (*gitea.FileResponse, error) { // If no branch name is set, assume master @@ -118,6 +159,9 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up opts.LastCommitID = commit.ID.String() } + encoding := "UTF-8" + bom := false + if !opts.IsNewFile { fromEntry, err := commit.GetTreeEntryByPath(fromTreePath) if err != nil { @@ -151,6 +195,7 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up // haven't been made. We throw an error if one wasn't provided. 
return nil, models.ErrSHAOrCommitIDNotProvided{} } + encoding, bom = detectEncodingAndBOM(fromEntry) } // For the path where this file will be created/updated, we need to make @@ -235,6 +280,21 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up } content := opts.Content + if bom { + content = string(base.UTF8BOM) + content + } + if encoding != "UTF-8" { + charsetEncoding, _ := charset.Lookup(encoding) + if charsetEncoding != nil { + result, n, err := transform.String(charsetEncoding.NewEncoder(), string(content)) + if err != nil { + result = result + string(content[n:]) + } + content = result + } else { + log.Error("Unknown encoding: %s", encoding) + } + } var lfsMetaObject *models.LFSMetaObject if filename2attribute2info[treePath] != nil && filename2attribute2info[treePath]["filter"] == "lfs" { diff --git a/modules/templates/helper.go b/modules/templates/helper.go index fc6d0897bc615..eb0c273f60c9a 100644 --- a/modules/templates/helper.go +++ b/modules/templates/helper.go @@ -31,8 +31,6 @@ import ( "gopkg.in/editorconfig/editorconfig-core-go.v1" ) -var utf8bom = []byte{'\xef', '\xbb', '\xbf'} - // NewFuncMap returns functions for injecting to templates func NewFuncMap() []template.FuncMap { return []template.FuncMap{map[string]interface{}{ @@ -269,7 +267,7 @@ func ToUTF8WithErr(content []byte) (string, error) { if err != nil { return "", err } else if charsetLabel == "UTF-8" { - if len(content) > 2 && bytes.Equal(content[0:3], utf8bom) { + if len(content) > 2 && bytes.Equal(content[0:3], base.UTF8BOM) { log.Debug("Removing BOM from UTF-8 string") return string(content[3:]), nil } @@ -288,7 +286,7 @@ func ToUTF8WithErr(content []byte) (string, error) { result = result + string(content[n:]) } - if len(result) > 2 && bytes.Equal([]byte(result[0:3]), utf8bom) { + if len(result) > 2 && bytes.Equal([]byte(result[0:3]), base.UTF8BOM) { log.Debug("Removing BOM from decoded string") result = result[3:] } @@ -300,7 +298,7 @@ func 
ToUTF8WithErr(content []byte) (string, error) { func ToUTF8WithFallback(content []byte) []byte { charsetLabel, err := base.DetectEncoding(content) if err != nil || charsetLabel == "UTF-8" { - if len(content) > 2 && bytes.Equal(content[0:3], utf8bom) { + if len(content) > 2 && bytes.Equal(content[0:3], base.UTF8BOM) { log.Debug("Removing BOM from UTF-8 string") return content[3:] } @@ -319,7 +317,7 @@ func ToUTF8WithFallback(content []byte) []byte { return append(result, content[n:]...) } - if len(result) > 2 && bytes.Equal(result[0:3], utf8bom) { + if len(result) > 2 && bytes.Equal(result[0:3], base.UTF8BOM) { log.Debug("Removing BOM from decoded string") result = result[3:] } From 508aa4aa128ef06c2d1d4a42dd7e3beb59ae4642 Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Tue, 23 Apr 2019 22:51:23 +0100 Subject: [PATCH 3/8] On error keep as UTF-8 Signed-off-by: Andrew Thornton --- modules/repofiles/update.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/repofiles/update.go b/modules/repofiles/update.go index 1d295ac87db8e..299513a026ec8 100644 --- a/modules/repofiles/update.go +++ b/modules/repofiles/update.go @@ -286,9 +286,11 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up if encoding != "UTF-8" { charsetEncoding, _ := charset.Lookup(encoding) if charsetEncoding != nil { - result, n, err := transform.String(charsetEncoding.NewEncoder(), string(content)) + result, _, err := transform.String(charsetEncoding.NewEncoder(), string(content)) if err != nil { - result = result + string(content[n:]) + // If we can't encode back into the original encoding we should just stick with UTF-8 + log.Error("Error re-encoding %s (%s) as %s - will stay as UTF-8: %v", opts.TreePath, opts.FromTreePath, encoding, err) + result = content } content = result } else { From c179e7c7ba82c32aefc3ee75879470410db52c12 Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Fri, 26 Apr 2019 08:39:52 +0100 Subject: [PATCH 4/8] 
create remove BOM function --- modules/base/tool.go | 9 +++++++++ modules/templates/helper.go | 30 +++++++----------------------- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/modules/base/tool.go b/modules/base/tool.go index 7b9676d40e7dd..d99ed2bab4f1b 100644 --- a/modules/base/tool.go +++ b/modules/base/tool.go @@ -5,6 +5,7 @@ package base import ( + "bytes" "crypto/md5" "crypto/rand" "crypto/sha1" @@ -94,6 +95,14 @@ func DetectEncoding(content []byte) (string, error) { return result.Charset, err } +// RemoveBOMIfPresent removes a UTF-8 BOM from a []byte +func RemoveBOMIfPresent(content []byte) []byte { + if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) { + return content[3:] + } + return content +} + // BasicAuthDecode decode basic auth string func BasicAuthDecode(encoded string) (string, string, error) { s, err := base64.StdEncoding.DecodeString(encoded) diff --git a/modules/templates/helper.go b/modules/templates/helper.go index eb0c273f60c9a..b6c5cc594549f 100644 --- a/modules/templates/helper.go +++ b/modules/templates/helper.go @@ -267,11 +267,7 @@ func ToUTF8WithErr(content []byte) (string, error) { if err != nil { return "", err } else if charsetLabel == "UTF-8" { - if len(content) > 2 && bytes.Equal(content[0:3], base.UTF8BOM) { - log.Debug("Removing BOM from UTF-8 string") - return string(content[3:]), nil - } - return string(content), nil + return string(base.RemoveBOMIfPresent(content)), nil } encoding, _ := charset.Lookup(charsetLabel) @@ -281,28 +277,21 @@ func ToUTF8WithErr(content []byte) (string, error) { // If there is an error, we concatenate the nicely decoded part and the // original left over. This way we won't lose data. - result, n, err := transform.String(encoding.NewDecoder(), string(content)) + result, n, err := transform.Bytes(encoding.NewDecoder(), content) if err != nil { - result = result + string(content[n:]) + result = append(result, content[n:]...) 
} - if len(result) > 2 && bytes.Equal([]byte(result[0:3]), base.UTF8BOM) { - log.Debug("Removing BOM from decoded string") - result = result[3:] - } + result = base.RemoveBOMIfPresent(result) - return result, err + return string(result), err } // ToUTF8WithFallback detects the encoding of content and coverts to UTF-8 if possible func ToUTF8WithFallback(content []byte) []byte { charsetLabel, err := base.DetectEncoding(content) if err != nil || charsetLabel == "UTF-8" { - if len(content) > 2 && bytes.Equal(content[0:3], base.UTF8BOM) { - log.Debug("Removing BOM from UTF-8 string") - return content[3:] - } - return content + return base.RemoveBOMIfPresent(content) } encoding, _ := charset.Lookup(charsetLabel) @@ -317,12 +306,7 @@ func ToUTF8WithFallback(content []byte) []byte { return append(result, content[n:]...) } - if len(result) > 2 && bytes.Equal(result[0:3], base.UTF8BOM) { - log.Debug("Removing BOM from decoded string") - result = result[3:] - } - - return result + return base.RemoveBOMIfPresent(result) } // ToUTF8 converts content to UTF8 encoding and ignore error From a320c40ce348e0eb0a795740929729f666bcc3e3 Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Fri, 26 Apr 2019 08:51:46 +0100 Subject: [PATCH 5/8] Deal with LFSed content --- modules/repofiles/update.go | 39 ++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/modules/repofiles/update.go b/modules/repofiles/update.go index 299513a026ec8..be3abd2a80076 100644 --- a/modules/repofiles/update.go +++ b/modules/repofiles/update.go @@ -43,19 +43,48 @@ type UpdateRepoFileOptions struct { Committer *IdentityOptions } -func detectEncodingAndBOM(entry *git.TreeEntry) (string, bool) { +func detectEncodingAndBOM(entry *git.TreeEntry, repo *models.Repository) (string, bool) { reader, err := entry.Blob().DataAsync() if err != nil { - // just default to utf-8 and no bom + // return default return "UTF-8", false } + defer reader.Close() buf := make([]byte, 1024) n, 
err := reader.Read(buf) if err != nil { - // just default to utf-8 and no bom + // return default return "UTF-8", false } buf = buf[:n] + + if setting.LFS.StartServer { + meta := lfs.IsPointerFile(&buf) + if meta != nil { + meta, err = repo.GetLFSMetaObjectByOid(meta.Oid) + if err != nil && err != models.ErrLFSObjectNotExist { + // return default + return "UTF-8", false + } + } + if meta != nil { + dataRc, err := lfs.ReadMetaObject(meta) + if err != nil { + // return default + return "UTF-8", false + } + defer dataRc.Close() + buf = make([]byte, 1024) + n, err = dataRc.Read(buf) + if err != nil { + // return default + return "UTF-8", false + } + buf = buf[:n] + } + + } + encoding, err := base.DetectEncoding(buf) if err != nil { // just default to utf-8 and no bom @@ -195,7 +224,7 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up // haven't been made. We throw an error if one wasn't provided. return nil, models.ErrSHAOrCommitIDNotProvided{} } - encoding, bom = detectEncodingAndBOM(fromEntry) + encoding, bom = detectEncodingAndBOM(fromEntry, repo) } // For the path where this file will be created/updated, we need to make @@ -299,7 +328,7 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up } var lfsMetaObject *models.LFSMetaObject - if filename2attribute2info[treePath] != nil && filename2attribute2info[treePath]["filter"] == "lfs" { + if setting.LFS.StartServer && filename2attribute2info[treePath] != nil && filename2attribute2info[treePath]["filter"] == "lfs" { // OK so we are supposed to LFS this data! 
oid, err := models.GenerateLFSOid(strings.NewReader(opts.Content)) if err != nil { From e732116db854765fbc9750efbeb483a3161261c3 Mon Sep 17 00:00:00 2001 From: zeripath Date: Fri, 26 Apr 2019 10:14:35 +0100 Subject: [PATCH 6/8] Update modules/repofiles/update.go --- modules/repofiles/update.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/repofiles/update.go b/modules/repofiles/update.go index be3abd2a80076..c1dec71665fe1 100644 --- a/modules/repofiles/update.go +++ b/modules/repofiles/update.go @@ -330,7 +330,7 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up if setting.LFS.StartServer && filename2attribute2info[treePath] != nil && filename2attribute2info[treePath]["filter"] == "lfs" { // OK so we are supposed to LFS this data! - oid, err := models.GenerateLFSOid(strings.NewReader(opts.Content)) + oid, err := models.GenerateLFSOid(strings.NewReader(content)) if err != nil { return nil, err } From fee0d65eabe09407801d62920cc31f0bc6ab053f Mon Sep 17 00:00:00 2001 From: zeripath Date: Fri, 26 Apr 2019 10:40:25 +0100 Subject: [PATCH 7/8] Fix final LFS bug --- modules/repofiles/update.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/repofiles/update.go b/modules/repofiles/update.go index c1dec71665fe1..7912419a17019 100644 --- a/modules/repofiles/update.go +++ b/modules/repofiles/update.go @@ -326,6 +326,8 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up log.Error("Unknown encoding: %s", encoding) } } + // Reset the opts.Content to our adjusted content to ensure that LFS gets the correct content + opts.Content = content var lfsMetaObject *models.LFSMetaObject if setting.LFS.StartServer && filename2attribute2info[treePath] != nil && filename2attribute2info[treePath]["filter"] == "lfs" { From 3dbc3b55920f9b068820bb72a7a53b1229b61240 Mon Sep 17 00:00:00 2001 From: zeripath Date: Fri, 26 Apr 2019 10:41:08 +0100 Subject: [PATCH 8/8] Keep LFS sections referring to 
opts.Content --- modules/repofiles/update.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/repofiles/update.go b/modules/repofiles/update.go index 7912419a17019..8bc3b50ae046c 100644 --- a/modules/repofiles/update.go +++ b/modules/repofiles/update.go @@ -332,7 +332,7 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up if setting.LFS.StartServer && filename2attribute2info[treePath] != nil && filename2attribute2info[treePath]["filter"] == "lfs" { // OK so we are supposed to LFS this data! - oid, err := models.GenerateLFSOid(strings.NewReader(content)) + oid, err := models.GenerateLFSOid(strings.NewReader(opts.Content)) if err != nil { return nil, err }