Skip to content

Commit 8a79442

Browse files
authored
More flexible output_shape computation in keras.layers.MultiHeadAttention (#20503)
* Made the compute_output_shape method more flexible; _output_shape can now be either an integer or a tuple (previously only a tuple was accepted). Fix discussed in #19769 * Added unit test * Minor changes to comments in unit test * Minor changes to comments in unit test
1 parent 70b7044 commit 8a79442

File tree

2 files changed

+29
-1
lines changed

2 files changed

+29
-1
lines changed

keras/src/layers/attention/multi_head_attention.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -669,7 +669,10 @@ def compute_output_shape(
669669
)
670670

671671
if self._output_shape:
672-
return query_shape[:-1] + self._output_shape
672+
if isinstance(self._output_shape, tuple):
673+
return query_shape[:-1] + self._output_shape
674+
else:
675+
return query_shape[:-1] + (self._output_shape,)
673676

674677
return query_shape
675678

keras/src/layers/attention/multi_head_attention_test.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from keras.src import testing
1717
from keras.src.layers.attention.attention import disable_flash_attention
1818
from keras.src.layers.attention.attention import enable_flash_attention
19+
from keras.src.layers.attention.multi_head_attention import MultiHeadAttention
1920

2021

2122
class MultiHeadAttentionTest(testing.TestCase):
@@ -593,3 +594,27 @@ def test_flash_attention_numerical_correctness(self):
593594
)
594595

595596
self.assertAllClose(output_with_flash, output_without_flash)
597+
598+
599+
600+
601+
def test_multi_head_attention_output_shape_as_int():
    """Test MultiHeadAttention with output_shape as an int."""
    # An integer output_shape should project the last dimension to that size.
    layer = MultiHeadAttention(num_heads=2, key_dim=16, output_shape=8)
    q = random.uniform((2, 4, 16))
    v = random.uniform((2, 4, 16))
    result = layer(query=q, value=v)
    assert result.shape == (2, 4, 8), (
        f"Expected shape (2, 4, 8), got {result.shape}"
    )
610+
611+
612+
def test_multi_head_attention_output_shape_as_tuple():
    """Test MultiHeadAttention with output_shape as a tuple."""
    # A tuple output_shape replaces the last dimension with the full tuple.
    layer = MultiHeadAttention(num_heads=2, key_dim=16, output_shape=(8, 8))
    q = random.uniform((2, 4, 16))
    v = random.uniform((2, 4, 16))
    result = layer(query=q, value=v)
    assert result.shape == (2, 4, 8, 8), (
        f"Expected shape (2, 4, 8, 8), got {result.shape}"
    )

0 commit comments

Comments
 (0)