Skip to content

Commit 7fd3c92

Browse files
author
Katrina Owen
authored
Merge pull request #346 from petertseng/encoding
iteration: fall back to UTF-8 (not windows-1252) if encoding uncertain
2 parents ddc18ac + 6ec6014 commit 7fd3c92

3 files changed

Lines changed: 59 additions & 11 deletions

File tree

api/iteration.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,16 @@ func readFileAsUTF8String(filename string) (*string, error) {
137137
return nil, err
138138
}
139139

140-
encoding, _, _ := charset.DetermineEncoding(b, mimeType)
140+
encoding, _, certain := charset.DetermineEncoding(b, mimeType)
141+
if !certain {
142+
// We don't want to use an uncertain encoding.
143+
// In particular, doing that may mangle UTF-8 files
144+
// that have only ASCII in their first 1024 bytes.
145+
// See https://github.com/exercism/cli/issues/309.
146+
// So if we're unsure, use UTF-8 (no transformation).
147+
s := string(b)
148+
return &s, nil
149+
}
141150
decoder := encoding.NewDecoder()
142151
decodedBytes, _, err := transform.Bytes(decoder, b)
143152
if err != nil {

api/iteration_test.go

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ func TestNewIteration(t *testing.T) {
1818
filepath.Join(dir, "python", "leap", "lib", "three.py"),
1919
filepath.Join(dir, "python", "leap", "utf16le.py"),
2020
filepath.Join(dir, "python", "leap", "utf16be.py"),
21+
filepath.Join(dir, "python", "leap", "long-utf8.py"),
2122
}
2223

2324
iter, err := NewIteration(dir, files)
@@ -32,25 +33,32 @@ func TestNewIteration(t *testing.T) {
3233
t.Errorf("Expected problem to be leap, was %s", iter.Problem)
3334
}
3435

35-
if len(iter.Solution) != 5 {
36-
t.Fatalf("Expected solution to have 3 files, had %d", len(iter.Solution))
36+
if len(iter.Solution) != 6 {
37+
t.Fatalf("Expected solution to have 6 files, had %d", len(iter.Solution))
3738
}
3839

39-
expected := map[string]string{
40-
"one.py": "# one",
41-
"two.py": "# two",
42-
filepath.Join("lib", "three.py"): "# three",
43-
"utf16le.py": "# utf16le",
44-
"utf16be.py": "# utf16be",
40+
expected := map[string]struct {
41+
prefix string
42+
suffix string
43+
}{
44+
"one.py": {prefix: "# one"},
45+
"two.py": {prefix: "# two"},
46+
filepath.Join("lib", "three.py"): {prefix: "# three"},
47+
"utf16le.py": {prefix: "# utf16le"},
48+
"utf16be.py": {prefix: "# utf16be"},
49+
"long-utf8.py": {prefix: "# The first 1024", suffix: "👍\n"},
4550
}
4651

4752
for filename, code := range expected {
4853
if !utf8.ValidString(iter.Solution[filename]) {
4954
t.Errorf("Iteration content is not valid UTF-8 data: %s", iter.Solution[filename])
5055
}
5156

52-
if !strings.HasPrefix(iter.Solution[filename], code) {
53-
t.Errorf("Expected %s to contain `%s', had `%s'", filename, code, iter.Solution[filename])
57+
if !strings.HasPrefix(iter.Solution[filename], code.prefix) {
58+
t.Errorf("Expected %s to start with `%s', had `%s'", filename, code.prefix, iter.Solution[filename])
59+
}
60+
if !strings.HasSuffix(iter.Solution[filename], code.suffix) {
61+
t.Errorf("Expected %s to end with `%s', had `%s'", filename, code.suffix, iter.Solution[filename])
5462
}
5563
}
5664
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# The first 1024 bytes of this file need to contain only ASCII characters.
2+
# After the first 1024 bytes, there should be a non-ASCII character.
3+
#
4+
# Explanation:
5+
# We use golang.org/x/net/html/charset.DetermineEncoding to guess file encoding.
6+
# DetermineEncoding checks the first 1024 bytes of a file.
7+
# If it can't determine the encoding and saw no non-ASCII characters,
8+
# it declares the file to have windows-1252 encoding.
9+
# This mangles the submitted file if it should have been UTF-8.
10+
# We test to make sure we use UTF-8 for such files, instead of windows-1252.
11+
12+
lipsum = """
13+
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam condimentum vitae
14+
ipsum eget tempor. Morbi sed ex quis orci vulputate cursus quis non massa.
15+
Vestibulum quam nibh, elementum in justo in, venenatis tristique nisl. Morbi
16+
sagittis elit id velit ultricies, sed rutrum augue posuere. Donec nec nulla nec
17+
eros fringilla pellentesque. Duis at dictum justo. Nunc ut magna felis. Aliquam
18+
volutpat, lectus et molestie porttitor, est orci malesuada erat, ac pretium
19+
eros ligula vel erat. Nullam venenatis dui eget sapien semper lobortis. Aenean
20+
ac eros eget neque porta auctor in nec erat. Phasellus ac nulla ac turpis
21+
porttitor auctor. Etiam eget posuere diam, ac feugiat lacus. Curabitur ornare
22+
justo ut nulla congue, vitae posuere erat venenatis. Aliquam pulvinar eleifend
23+
faucibus.
24+
25+
Etiam justo sem, faucibus malesuada purus a, ultrices efficitur ex.
26+
Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac
27+
turpis egestas. Duis maximus dapibus mattis. Quisque sem ex, convallis eu
28+
ultricies posuere.
29+
"""
30+
31+
# 👍

0 commit comments

Comments
 (0)