Skip to content

Commit 099dcc0

Browse files
author
Awni Hannun
authored
Expose to/from fp8 in Python and don't auto-convert fp8 when loading from safetensors (#2985)
1 parent 8654b82 commit 099dcc0

File tree

6 files changed

+63
-38
lines changed

6 files changed

+63
-38
lines changed

mlx/backend/cpu/unary_ops.h

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -154,24 +154,12 @@ struct ToFP8 {
154154
struct FromFP8 {
155155
template <int N>
156156
Simd<float, N> operator()(Simd<uint8_t, N> x) {
157-
auto w = Simd<uint32_t, N>(x) << 24;
158-
auto sign = w & 0x80000000;
159-
auto nonsign = w & 0x7FFFFFFF;
160-
161-
auto renorm_shift = clz(nonsign);
162-
renorm_shift = simd::select(
163-
renorm_shift > Simd<uint32_t, N>{4},
164-
renorm_shift - Simd<uint32_t, N>{4},
165-
Simd<uint32_t, N>{0});
166-
167-
Simd<int32_t, N> inf_nan_mask =
168-
(Simd<int32_t, N>(nonsign + 0x01000000) >> 8) & 0x7F800000;
169-
auto zero_mask = Simd<int32_t, N>(nonsign - 1) >> 31;
170-
auto result = sign |
171-
((((nonsign << renorm_shift >> 4) + ((0x78 - renorm_shift) << 23)) |
172-
inf_nan_mask) &
173-
~zero_mask);
174-
return fp32_from_bits(result);
157+
auto v = Simd<uint16_t, N>(x & 127) << 7;
158+
auto converted = *(Simd<float16_t, N>*)(&v);
159+
converted = converted * 256.0;
160+
auto sign = Simd<bool, N>(x & 128);
161+
Simd<float, N> out = select(sign, -converted, converted);
162+
return out;
175163
}
176164
float operator()(uint8_t x) {
177165
return (*this)(Simd<uint8_t, 1>(x)).value;

mlx/io/safetensors.cpp

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,6 @@ Dtype dtype_from_safetensor_str(std::string_view str) {
9595
} else if (str == ST_C64) {
9696
return complex64;
9797
} else if (str == ST_F8_E4M3) {
98-
// We convert this manually later
9998
return uint8;
10099
} else {
101100
throw std::runtime_error(
@@ -148,16 +147,14 @@ SafetensorsLoad load_safetensors(
148147
const Shape& shape = item.value().at("shape");
149148
const std::vector<size_t>& data_offsets = item.value().at("data_offsets");
150149
Dtype type = dtype_from_safetensor_str(dtype);
151-
auto loaded_array = array(
152-
shape,
153-
type,
154-
std::make_shared<Load>(
155-
stream, in_stream, offset + data_offsets.at(0), false),
156-
std::vector<array>{});
157-
if (dtype == ST_F8_E4M3) {
158-
loaded_array = from_fp8(loaded_array, bfloat16, s);
159-
}
160-
res.insert({item.key(), loaded_array});
150+
res.insert(
151+
{item.key(),
152+
array(
153+
shape,
154+
type,
155+
std::make_shared<Load>(
156+
stream, in_stream, offset + data_offsets.at(0), false),
157+
std::vector<array>{})});
161158
}
162159
return {res, metadata_map};
163160
}

python/src/ops.cpp

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5479,10 +5479,10 @@ void init_ops(nb::module_& m) {
54795479
If ``w`` is expected to receive gradients, it must be provided in
54805480
non-quantized form.
54815481
5482-
If ``x`` and `w`` are not quantized, their data types must be ``float32``,
5482+
If ``x`` and ``w`` are not quantized, their data types must be ``float32``,
54835483
``float16``, or ``bfloat16``.
54845484
If ``w`` is quantized, it must be packed in unsigned integers.
5485-
5485+
54865486
Args:
54875487
x (array): Input array.
54885488
w (array): Weight matrix. If quantized, it is packed in unsigned integers.
@@ -5502,4 +5502,40 @@ void init_ops(nb::module_& m) {
55025502
array: The result of the multiplication of quantized ``x`` with quantized ``w``.
55035503
needed).
55045504
)pbdoc");
5505+
m.def(
5506+
"from_fp8",
5507+
&mx::from_fp8,
5508+
nb::arg(),
5509+
"dtype"_a = mx::bfloat16,
5510+
nb::kw_only(),
5511+
"stream"_a = nb::none(),
5512+
nb::sig(
5513+
"def from_fp8(x: array, dtype: Dtype = bfloat16, *, stream: Union[None, Stream, Device] = None) -> array"),
5514+
R"pbdoc(
5515+
Convert the array from fp8 (e4m3) to another floating-point type.
5516+
5517+
Args:
5518+
x (array): The input fp8 array with type ``uint8``.
5519+
dtype (Dtype): The data type to convert to. Default: ``bfloat16``.
5520+
5521+
Returns:
5522+
array: The array converted from fp8.
5523+
)pbdoc");
5524+
m.def(
5525+
"to_fp8",
5526+
&mx::to_fp8,
5527+
nb::arg(),
5528+
nb::kw_only(),
5529+
"stream"_a = nb::none(),
5530+
nb::sig(
5531+
"def to_fp8(x: array, *, stream: Union[None, Stream, Device] = None) -> array"),
5532+
R"pbdoc(
5533+
Convert the array to fp8 (e4m3) from another floating-point type.
5534+
5535+
Args:
5536+
x (array): The input array.
5537+
5538+
Returns:
5539+
array: The array converted to fp8 with type ``uint8``.
5540+
)pbdoc");
55055541
}

python/tests/cuda_skip.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
cuda_skip = {
2-
"TestLoad.test_load_f8_e4m3",
32
"TestLayers.test_quantized_embedding",
43
# Block masked matmul NYI
54
"TestBlas.test_block_masked_matmul",

python/tests/test_load.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -168,8 +168,8 @@ def test_load_f8_e4m3(self):
168168

169169
expected = [
170170
0,
171-
mx.nan,
172-
mx.nan,
171+
448,
172+
-448,
173173
-0.875,
174174
0.4375,
175175
-0.005859,
@@ -179,12 +179,12 @@ def test_load_f8_e4m3(self):
179179
-0.0039,
180180
]
181181
expected = mx.array(expected, dtype=mx.bfloat16)
182-
contents = b'H\x00\x00\x00\x00\x00\x00\x00{"tensor":{"dtype":"F8_E4M3","shape":[10],"data_offsets":[0,10]}} \x00\x7f\xff\xb6.\x83\xba\xba\xbc\x82'
182+
contents = b'H\x00\x00\x00\x00\x00\x00\x00{"tensor":{"dtype":"F8_E4M3","shape":[10],"data_offsets":[0,10]}} \x00~\xfe\xb6.\x83\xba\xba\xbc\x82'
183183
with tempfile.NamedTemporaryFile(suffix=".safetensors") as f:
184184
f.write(contents)
185185
f.seek(0)
186186
out = mx.load(f)["tensor"]
187-
self.assertTrue(mx.allclose(out[0], expected[0], equal_nan=True))
187+
self.assertTrue(mx.allclose(mx.from_fp8(out), expected))
188188

189189
def test_save_and_load_gguf_metadata_basic(self):
190190
if not os.path.isdir(self.test_dir):

python/tests/test_ops.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3197,8 +3197,6 @@ def test_masked_scatter(self):
31973197
)
31983198
)
31993199

3200-
3201-
class TestBroadcast(mlx_tests.MLXTestCase):
32023200
def test_broadcast_shapes(self):
32033201
# Basic broadcasting
32043202
self.assertEqual(mx.broadcast_shapes((1, 2, 3), (3,)), (1, 2, 3))
@@ -3243,6 +3241,13 @@ def test_sort_nan(self):
32433241
self.assertTrue(mx.array_equal(mx.sort(x), expected, equal_nan=True))
32443242
x = mx.array([3.0, mx.nan, 2.0, 0.0]) + 1j * mx.array([1.0] * 4)
32453243

3244+
def test_to_from_fp8(self):
3245+
vals = mx.array(
3246+
[448, 256, 192, 128, 96, 64, 48, 32, 24, 16, 12, 8, 6, 4, 3, 2, 0.015625]
3247+
)
3248+
self.assertTrue(mx.array_equal(mx.from_fp8(mx.to_fp8(vals)), vals))
3249+
self.assertTrue(mx.array_equal(mx.from_fp8(mx.to_fp8(-vals)), -vals))
3250+
32463251

32473252
if __name__ == "__main__":
32483253
mlx_tests.MLXTestRunner()

0 commit comments

Comments (0)