iteration: fall back to UTF-8 (not windows-1252) if encoding uncertain

petertseng · petertseng · commit 6ec60143cdc7 · 2016-09-04T23:16:10.000-07:00
As discussed in #309: Since #184 we have been using DetermineEncoding to deal with the case of UTF-16 files. That was a reasonable fix for exercism/exercism#2303. DetermineEncoding only looks at the first 1024 bytes of a file. If it can't determine an encoding, it defaults to windows-1252. This causes undesirable behaviour for UTF-8 files that contain non-ASCII characters but have only ASCII in their first 1024 bytes - they get interpreted as windows-1252, mangling the non-ASCII characters. This commit takes advantage of the fact that DetermineEncoding reports whether it is *certain* about its encoding guess. If it is uncertain, we default to UTF-8 instead of windows-1252. Note that if DetermineEncoding sees UTF-16 BOMs, it will declare that it is certain. Therefore, behaviour for UTF-16 files is preserved (existing tests would have caught it if behaviour were accidentally altered). A new fixture file is attached that tests the ASCII-only-prefix case described above - the test fails without the attached code change. I find it unlikely that DetermineEncoding would have returned anything other than UTF-16, UTF-8, or windows-1252 since it was made to examine HTML documents and thus examine the content-type (we always pass text/plain) and the meta tags (unlikely to be present in a non-HTML Exercism submission). The risk of this change is that anyone who **actually** wanted to submit in windows-1252 will now be unable to, but I doubt that anyone is in this constituency, and discussion in #309 seems to be in favor of nudging them toward UTF-8 anyway. Closes #309
diff --git a/api/iteration.go b/api/iteration.go
@@ -137,7 +137,16 @@ func readFileAsUTF8String(filename string) (*string, error) {
 		return nil, err
 	}
 
-	encoding, _, _ := charset.DetermineEncoding(b, mimeType)
+	encoding, _, certain := charset.DetermineEncoding(b, mimeType)
+	if !certain {
+		// We don't want to use an uncertain encoding.
+		// In particular, doing that may mangle UTF-8 files
+		// that have only ASCII in their first 1024 bytes.
+		// See https://github.com/exercism/cli/issues/309.
+		// So if we're unsure, use UTF-8 (no transformation).
+		s := string(b)
+		return &s, nil
+	}
 	decoder := encoding.NewDecoder()
 	decodedBytes, _, err := transform.Bytes(decoder, b)
 	if err != nil {
diff --git a/api/iteration_test.go b/api/iteration_test.go
@@ -18,6 +18,7 @@ func TestNewIteration(t *testing.T) {
 		filepath.Join(dir, "python", "leap", "lib", "three.py"),
 		filepath.Join(dir, "python", "leap", "utf16le.py"),
 		filepath.Join(dir, "python", "leap", "utf16be.py"),
+		filepath.Join(dir, "python", "leap", "long-utf8.py"),
 	}
 
 	iter, err := NewIteration(dir, files)
@@ -32,8 +33,8 @@ func TestNewIteration(t *testing.T) {
 		t.Errorf("Expected problem to be leap, was %s", iter.Problem)
 	}
 
-	if len(iter.Solution) != 5 {
-		t.Fatalf("Expected solution to have 5 files, had %d", len(iter.Solution))
+	if len(iter.Solution) != 6 {
+		t.Fatalf("Expected solution to have 6 files, had %d", len(iter.Solution))
 	}
 
 	expected := map[string]struct {
@@ -45,6 +46,7 @@ func TestNewIteration(t *testing.T) {
 		filepath.Join("lib", "three.py"): {prefix: "# three"},
 		"utf16le.py":                     {prefix: "# utf16le"},
 		"utf16be.py":                     {prefix: "# utf16be"},
+		"long-utf8.py":                   {prefix: "# The first 1024", suffix: "👍\n"},
 	}
 
 	for filename, code := range expected {
diff --git a/fixtures/iteration/python/leap/long-utf8.py b/fixtures/iteration/python/leap/long-utf8.py
@@ -0,0 +1,31 @@
+# The first 1024 bytes of this file need to contain only ASCII characters.
+# After the first 1024 bytes, then there should be a non-ASCII character.
+#
+# Explanation:
+# We use golang.org/x/net/html/charset.DetermineEncoding to guess file encoding.
+# DetermineEncoding checks the first 1024 bytes of a file.
+# If it can't determine the encoding and saw no non-ASCII characters,
+# it declares the file to have windows-1252 encoding.
+# This mangles the submitted file if it should have been UTF-8.
+# We test to make sure we use UTF-8 for such files, instead of windows-1252.
+
+lipsum = """
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam condimentum vitae
+ipsum eget tempor. Morbi sed ex quis orci vulputate cursus quis non massa.
+Vestibulum quam nibh, elementum in justo in, venenatis tristique nisl. Morbi
+sagittis elit id velit ultricies, sed rutrum augue posuere. Donec nec nulla nec
+eros fringilla pellentesque. Duis at dictum justo. Nunc ut magna felis. Aliquam
+volutpat, lectus et molestie porttitor, est orci malesuada erat, ac pretium
+eros ligula vel erat. Nullam venenatis dui eget sapien semper lobortis. Aenean
+ac eros eget neque porta auctor in nec erat. Phasellus ac nulla ac turpis
+porttitor auctor. Etiam eget posuere diam, ac feugiat lacus. Curabitur ornare
+justo ut nulla congue, vitae posuere erat venenatis. Aliquam pulvinar eleifend
+faucibus.
+
+Etiam justo sem, faucibus malesuada purus a, ultrices efficitur ex.
+Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac
+turpis egestas. Duis maximus dapibus mattis. Quisque sem ex, convallis eu
+ultricies posuere.
+"""
+
+# 👍

Original file line number	Diff line number	Diff line change
`@@ -18,6 +18,7 @@ func TestNewIteration(t *testing.T) {`
`18`	`18`	`filepath.Join(dir, "python", "leap", "lib", "three.py"),`
`19`	`19`	`filepath.Join(dir, "python", "leap", "utf16le.py"),`
`20`	`20`	`filepath.Join(dir, "python", "leap", "utf16be.py"),`
	`21`	`+ filepath.Join(dir, "python", "leap", "long-utf8.py"),`
`21`	`22`	`}`
`22`	`23`
`23`	`24`	`iter, err := NewIteration(dir, files)`
`@@ -32,8 +33,8 @@ func TestNewIteration(t *testing.T) {`
`32`	`33`	`t.Errorf("Expected problem to be leap, was %s", iter.Problem)`
`33`	`34`	`}`
`34`	`35`
`35`		`- if len(iter.Solution) != 5 {`
`36`		`- t.Fatalf("Expected solution to have 5 files, had %d", len(iter.Solution))`
	`36`	`+ if len(iter.Solution) != 6 {`
	`37`	`+ t.Fatalf("Expected solution to have 6 files, had %d", len(iter.Solution))`
`37`	`38`	`}`
`38`	`39`
`39`	`40`	`expected := map[string]struct {`
`@@ -45,6 +46,7 @@ func TestNewIteration(t *testing.T) {`
`45`	`46`	`filepath.Join("lib", "three.py"): {prefix: "# three"},`
`46`	`47`	`"utf16le.py": {prefix: "# utf16le"},`
`47`	`48`	`"utf16be.py": {prefix: "# utf16be"},`
	`49`	`+ "long-utf8.py": {prefix: "# The first 1024", suffix: "👍\n"},`
`48`	`50`	`}`
`49`	`51`
`50`	`52`	`for filename, code := range expected {`