Accept Byteslike in decoders

BrandonTheBuilder · tekknolagi · commit ab3fdd33c88f · 2021-08-25T14:37:47.000-07:00
Summary: The Python builtins were already checking _bytes_like_guard when type checking their arguments. This change updates the underlying decode functions to operate on Byteslike objects instead of assuming the input is a Bytes or a Bytearray.

Based on Facebook D27862662
diff --git a/library/_codecs_test.py b/library/_codecs_test.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import _codecs
 import unittest
+from array import array
 
 from test_support import pyro_only
 
@@ -369,6 +370,11 @@ def test_decode_ascii_with_well_formed_ascii_returns_string(self):
         self.assertEqual(decoded, "hello")
         self.assertEqual(consumed, 5)
 
+    def test_decode_ascii_with_well_formed_ascii_array_returns_string(self):
+        decoded, consumed = _codecs.ascii_decode(array("B", b"hello"))
+        self.assertEqual(decoded, "hello")
+        self.assertEqual(consumed, 5)
+
     def test_decode_ascii_with_well_formed_ascii_bytearray_returns_string(self):
         decoded, consumed = _codecs.ascii_decode(bytearray(b"hello"))
         self.assertEqual(decoded, "hello")
@@ -384,6 +390,11 @@ class B(bytearray):
         self.assertEqual(decoded, "hello")
         self.assertEqual(consumed, 5)
 
+    def test_decode_ascii_with_well_formed_ascii_memoryview_returns_string(self):
+        decoded, consumed = _codecs.ascii_decode(memoryview(b"hello"))
+        self.assertEqual(decoded, "hello")
+        self.assertEqual(consumed, 5)
+
     def test_decode_ascii_with_custom_error_handler_returns_string(self):
         _codecs.register_error("test", lambda x: ("-testing-", x.end))
         decoded, consumed = _codecs.ascii_decode(b"ab\x90c", "test")
@@ -457,6 +468,11 @@ def test_decode_latin_1_with_ascii_returns_string(self):
         self.assertEqual(decoded, "hello")
         self.assertEqual(consumed, 5)
 
+    def test_decode_latin_1_with_ascii_array_returns_string(self):
+        decoded, consumed = _codecs.latin_1_decode(array("B", b"hello"))
+        self.assertEqual(decoded, "hello")
+        self.assertEqual(consumed, 5)
+
     def test_decode_latin_1_with_ascii_bytearray_returns_string(self):
         decoded, consumed = _codecs.latin_1_decode(bytearray(b"hello"))
         self.assertEqual(decoded, "hello")
@@ -470,6 +486,11 @@ class B(bytearray):
         self.assertEqual(decoded, "hello")
         self.assertEqual(consumed, 5)
 
+    def test_decode_latin_1_with_ascii_memoryview_returns_string(self):
+        decoded, consumed = _codecs.latin_1_decode(memoryview(b"hello"))
+        self.assertEqual(decoded, "hello")
+        self.assertEqual(consumed, 5)
+
     def test_decode_latin_1_with_latin_1_returns_string(self):
         decoded, consumed = _codecs.latin_1_decode(b"\x7D\x7E\x7F\x80\x81\x82")
         self.assertEqual(decoded, "\x7D\x7E\x7F\x80\x81\x82")
@@ -495,6 +516,13 @@ def test_decode_unicode_escape_with_well_formed_latin_1_returns_string(self):
         self.assertEqual(decoded, "hello\x95")
         self.assertEqual(consumed, 6)
 
+    def test_decode_unicode_escape_with_well_formed_latin_1_array_returns_string(
+        self,
+    ):
+        decoded, consumed = _codecs.unicode_escape_decode(array("B", b"hello\x95"))
+        self.assertEqual(decoded, "hello\x95")
+        self.assertEqual(consumed, 6)
+
     def test_decode_unicode_escape_with_well_formed_latin_1_bytearray_returns_string(
         self,
     ):
@@ -510,6 +538,13 @@ class B(bytearray):
         self.assertEqual(decoded, "hello\x95")
         self.assertEqual(consumed, 6)
 
+    def test_decode_unicode_escape_with_well_formed_latin_1_memoryview_returns_string(
+        self,
+    ):
+        decoded, consumed = _codecs.unicode_escape_decode(memoryview(b"hello\x95"))
+        self.assertEqual(decoded, "hello\x95")
+        self.assertEqual(consumed, 6)
+
     def test_decode_unicode_escape_with_escaped_back_slash_returns_string(self):
         decoded, consumed = _codecs.unicode_escape_decode(b"hello\\x95")
         self.assertEqual(decoded, "hello\x95")
@@ -614,13 +649,27 @@ def test_decode_raw_unicode_escape_with_escaped_back_slash_returns_string(self):
         self.assertEqual(decoded, "hello\\x95")
         self.assertEqual(consumed, 9)
 
+    def test_decode_raw_unicode_escape_with_well_formed_latin_1_array_returns_string(
+        self,
+    ):
+        decoded, consumed = _codecs.raw_unicode_escape_decode(array("B", b"hello\x95"))
+        self.assertEqual(decoded, "hello\x95")
+        self.assertEqual(consumed, 6)
+
     def test_decode_raw_unicode_escape_with_well_formed_latin_1_bytearray_returns_string(
         self,
     ):
         decoded, consumed = _codecs.raw_unicode_escape_decode(bytearray(b"hello\x95"))
         self.assertEqual(decoded, "hello\x95")
         self.assertEqual(consumed, 6)
 
+    def test_decode_raw_unicode_escape_with_well_formed_latin_1_memoryview_returns_string(
+        self,
+    ):
+        decoded, consumed = _codecs.raw_unicode_escape_decode(memoryview(b"hello\x95"))
+        self.assertEqual(decoded, "hello\x95")
+        self.assertEqual(consumed, 6)
+
     def test_decode_raw_unicode_escape_with_latin_1_bytearray_subclass_returns_string(
         self,
     ):
@@ -738,6 +787,13 @@ def test_decode_utf_8_with_well_formed_utf_8_returns_string(self):
         self.assertEqual(decoded, "\U0001f192h\xe4l\u2cc0")
         self.assertEqual(consumed, 11)
 
+    def test_decode_utf_8_with_well_formed_utf8_array_returns_string(self):
+        decoded, consumed = _codecs.utf_8_decode(
+            array("B", b"\xf0\x9f\x86\x92h\xc3\xa4l\xe2\xb3\x80")
+        )
+        self.assertEqual(decoded, "\U0001f192h\xe4l\u2cc0")
+        self.assertEqual(consumed, 11)
+
     def test_decode_utf_8_with_well_formed_utf8_bytearray_returns_string(self):
         decoded, consumed = _codecs.utf_8_decode(
             bytearray(b"\xf0\x9f\x86\x92h\xc3\xa4l\xe2\xb3\x80")
@@ -755,6 +811,13 @@ class B(bytearray):
         self.assertEqual(decoded, "\U0001f192h\xe4l\u2cc0")
         self.assertEqual(consumed, 11)
 
+    def test_decode_utf_8_with_well_formed_utf8_memoryview_returns_string(self):
+        decoded, consumed = _codecs.utf_8_decode(
+            memoryview(b"\xf0\x9f\x86\x92h\xc3\xa4l\xe2\xb3\x80")
+        )
+        self.assertEqual(decoded, "\U0001f192h\xe4l\u2cc0")
+        self.assertEqual(consumed, 11)
+
     def test_decode_utf_8_with_custom_error_handler_returns_string(self):
         _codecs.register_error("test", lambda x: ("-testing-", x.end))
         decoded, consumed = _codecs.utf_8_decode(b"ab\x90c", "test")
diff --git a/runtime/under-codecs-module.cpp b/runtime/under-codecs-module.cpp
@@ -35,8 +35,8 @@ static SymbolId lookupSymbolForErrorHandler(const Str& error) {
   return SymbolId::kInvalid;
 }
 
-static int asciiDecode(Thread* thread, const StrArray& dst, const Bytes& src,
-                       word start, word end) {
+static int asciiDecode(Thread* thread, const StrArray& dst,
+                       const Byteslike& src, word start, word end) {
   // TODO(T41032331): Implement a fastpass to read longs instead of chars
   Runtime* runtime = thread->runtime();
   for (word i = start; i < end; i++) {
@@ -57,16 +57,8 @@ RawObject FUNC(_codecs, _ascii_decode)(Thread* thread, Arguments args) {
   word index = intUnderlying(args.get(2)).asWord();
   StrArray dst(&scope, args.get(3));
 
-  word length;
-  Bytes bytes(&scope, Bytes::empty());
-  if (runtime->isInstanceOfBytearray(*data)) {
-    Bytearray array(&scope, *data);
-    bytes = array.items();
-    length = array.numItems();
-  } else {
-    bytes = bytesUnderlying(*data);
-    length = bytes.length();
-  }
+  Byteslike bytes(&scope, thread, *data);
+  word length = bytes.length();
   runtime->strArrayEnsureCapacity(thread, dst, length);
   word outpos = asciiDecode(thread, dst, bytes, index, length);
   if (outpos == length) {
@@ -176,7 +168,7 @@ RawObject FUNC(_codecs, _ascii_encode)(Thread* thread, Arguments args) {
 // -1 if no value should be written, and -2 if an error occurred. Sets the
 // iterating variable to where decoding should continue, and sets
 // invalid_escape_index if it doesn't recognize the escape sequence.
-static int32_t decodeEscaped(const Bytes& bytes, word* i,
+static int32_t decodeEscaped(const Byteslike& bytes, word* i,
                              word* invalid_escape_index) {
   word length = bytes.length();
   switch (byte ch = bytes.byteAt((*i)++)) {
@@ -264,7 +256,7 @@ RawObject FUNC(_codecs, _escape_decode)(Thread* thread, Arguments args) {
   }
   DCHECK(runtime->isInstanceOfStr(args.get(2)),
          "Third arg to _escape_decode must be str");
-  Bytes bytes(&scope, bytesUnderlying(*bytes_obj));
+  Byteslike bytes(&scope, thread, *bytes_obj);
   Str errors(&scope, strUnderlying(args.get(1)));
 
   Bytearray dst(&scope, runtime->newBytearray());
@@ -333,15 +325,8 @@ RawObject FUNC(_codecs, _latin_1_decode)(Thread* thread, Arguments args) {
   Object data(&scope, args.get(0));
   StrArray array(&scope, runtime->newStrArray());
   word length;
-  Bytes bytes(&scope, Bytes::empty());
-  if (runtime->isInstanceOfBytearray(*data)) {
-    Bytearray byte_array(&scope, *data);
-    bytes = byte_array.items();
-    length = byte_array.numItems();
-  } else {
-    bytes = bytesUnderlying(*data);
-    length = bytes.length();
-  }
+  Byteslike bytes(&scope, thread, *data);
+  length = bytes.length();
   runtime->strArrayEnsureCapacity(thread, array, length);
   // First, try a quick ASCII decoding
   word num_bytes = asciiDecode(thread, array, bytes, 0, length);
@@ -669,7 +654,8 @@ enum Utf8DecoderResult {
 // function returns specific values for errors to determine whether they could
 // be caused by incremental decoding, or if they would be an error no matter
 // what other bytes might be streamed in later.
-static Utf8DecoderResult isValidUtf8Codepoint(const Bytes& bytes, word index) {
+static Utf8DecoderResult isValidUtf8Codepoint(const Byteslike& bytes,
+                                              word index) {
   word length = bytes.length();
   byte ch = bytes.byteAt(index);
   if (ch <= kMaxASCII) {
@@ -781,16 +767,8 @@ RawObject FUNC(_codecs, _utf_8_decode)(Thread* thread, Arguments args) {
   StrArray dst(&scope, args.get(3));
 
   word length;
-  Bytes bytes(&scope, Bytes::empty());
-  // TODO(T45849551): Handle any bytes-like object
-  if (runtime->isInstanceOfBytearray(*data)) {
-    Bytearray array(&scope, *data);
-    bytes = array.items();
-    length = array.numItems();
-  } else {
-    bytes = bytesUnderlying(*data);
-    length = bytes.length();
-  }
+  Byteslike bytes(&scope, thread, *data);
+  length = bytes.length();
   runtime->strArrayEnsureCapacity(thread, dst, length);
   word i = asciiDecode(thread, dst, bytes, index, length);
   if (i == length) {