skip UTF-8 BOM also (#381)

kmuto · arp242 · web-flow · commit 1a6ca6e47722 · 2023-01-28T20:45:07.000+01:00
Co-authored-by: Martin Tournoij <martin@arp242.net>
diff --git a/decode_test.go b/decode_test.go
@@ -66,6 +66,7 @@ func TestDecodeBOM(t *testing.T) {
 	for _, tt := range [][]byte{
 		[]byte("\xff\xfea = \"b\""),
 		[]byte("\xfe\xffa = \"b\""),
+		[]byte("\xef\xbb\xbfa = \"b\""),
 	} {
 		t.Run("", func(t *testing.T) {
 			var s struct{ A string }
diff --git a/parse.go b/parse.go
@@ -47,9 +47,12 @@ func parse(data string) (p *parser, err error) {
 	}()
 
 	// Read over BOM; do this here as the lexer calls utf8.DecodeRuneInString()
-	// which mangles stuff.
-	if strings.HasPrefix(data, "\xff\xfe") || strings.HasPrefix(data, "\xfe\xff") {
+	// which mangles stuff. UTF-16 BOM isn't strictly valid, but some tools add
+	// it anyway.
+	if strings.HasPrefix(data, "\xff\xfe") || strings.HasPrefix(data, "\xfe\xff") { // UTF-16
 		data = data[2:]
+	} else if strings.HasPrefix(data, "\xef\xbb\xbf") { // UTF-8
+		data = data[3:]
 	}
 
 	// Examine first few bytes for NULL bytes; this probably means it's a UTF-16