Add conversion implementation

james77777778 · james77777778 · commit 9568ba639243 · 2024-08-14T14:28:24.000+08:00
diff --git a/keras_nlp/src/models/backbone.py b/keras_nlp/src/models/backbone.py
@@ -30,6 +30,7 @@
 from keras_nlp.src.utils.preset_utils import save_metadata
 from keras_nlp.src.utils.preset_utils import save_serialized_object
 from keras_nlp.src.utils.python_utils import classproperty
+from keras_nlp.src.utils.timm.convert import load_timm_backbone
 from keras_nlp.src.utils.transformers.convert import load_transformers_backbone
 
 
@@ -204,6 +205,8 @@ class like `keras_nlp.models.Backbone.from_preset()`, or from
 
         if format == "transformers":
             return load_transformers_backbone(cls, preset, load_weights)
+        elif format == "timm":
+            return load_timm_backbone(cls, preset, load_weights, **kwargs)
 
         preset_cls = check_config_class(preset)
         if not issubclass(preset_cls, cls):
diff --git a/keras_nlp/src/models/resnet/resnet_backbone.py b/keras_nlp/src/models/resnet/resnet_backbone.py
@@ -49,8 +49,8 @@ class ResNetBackbone(FeaturePyramidBackbone):
         use_pre_activation: boolean. Whether to use pre-activation or not.
             `True` for ResNetV2, `False` for ResNet.
         include_rescaling: boolean. If `True`, rescale the input using
-            `Rescaling(1 / 255.0)` layer. If `False`, do nothing. Defaults to
-            `True`.
+            `Rescaling` and `Normalization` layers. If `False`, do nothing.
+            Defaults to `True`.
         input_image_shape: tuple. The input shape without the batch size.
             Defaults to `(None, None, 3)`.
         pooling: `None` or str. Pooling mode for feature extraction. Defaults
@@ -139,6 +139,12 @@ def __init__(
         image_input = layers.Input(shape=input_image_shape)
         if include_rescaling:
             x = layers.Rescaling(scale=1 / 255.0, dtype=dtype)(image_input)
+            x = layers.Normalization(
+                mean=(0.485, 0.456, 0.406),
+                variance=(0.229**2, 0.224**2, 0.225**2),
+                dtype=dtype,
+                name="normalization",
+            )(x)
         else:
             x = image_input
 
@@ -327,13 +333,14 @@ def apply_basic_block(
             dtype=dtype,
             name=f"{name}_0_conv",
         )(x)
-        shortcut = layers.BatchNormalization(
-            axis=bn_axis,
-            epsilon=1e-5,
-            momentum=0.9,
-            dtype=dtype,
-            name=f"{name}_0_bn",
-        )(shortcut)
+        if not use_pre_activation:
+            shortcut = layers.BatchNormalization(
+                axis=bn_axis,
+                epsilon=1e-5,
+                momentum=0.9,
+                dtype=dtype,
+                name=f"{name}_0_bn",
+            )(shortcut)
     else:
         shortcut = x
 
@@ -363,6 +370,7 @@ def apply_basic_block(
         name=f"{name}_1_bn",
     )(x)
     x = layers.Activation("relu", dtype=dtype, name=f"{name}_1_relu")(x)
+
     x = layers.Conv2D(
         filters,
         kernel_size,
@@ -373,7 +381,6 @@ def apply_basic_block(
         dtype=dtype,
         name=f"{name}_2_conv",
     )(x)
-
     if not use_pre_activation:
         x = layers.BatchNormalization(
             axis=bn_axis,
diff --git a/keras_nlp/src/utils/preset_utils.py b/keras_nlp/src/utils/preset_utils.py
@@ -50,6 +50,7 @@
 KAGGLE_PREFIX = "kaggle://"
 GS_PREFIX = "gs://"
 HF_PREFIX = "hf://"
+TIMM_PREFIX = "hf://timm"
 
 KAGGLE_SCHEME = "kaggle"
 GS_SCHEME = "gs"
@@ -544,6 +545,8 @@ def check_format(preset):
     if check_file_exists(preset, SAFETENSOR_FILE) or check_file_exists(
         preset, SAFETENSOR_CONFIG_FILE
     ):
+        if TIMM_PREFIX in preset:
+            return "timm"
         return "transformers"
 
     if not check_file_exists(preset, METADATA_FILE):
diff --git a/keras_nlp/src/utils/timm/__init__.py b/keras_nlp/src/utils/timm/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/keras_nlp/src/utils/timm/convert.py b/keras_nlp/src/utils/timm/convert.py
@@ -0,0 +1,37 @@
+# Copyright 2024 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert timm models to KerasNLP."""
+
+from keras_nlp.src.utils.timm.convert_resnet import load_resnet_backbone
+
+
+def load_timm_backbone(cls, preset, load_weights, **kwargs):
+    """Load a timm model config and weights as a KerasNLP backbone.
+
+    Args:
+        cls (class): Keras model class; only `ResNetBackbone` is supported.
+        preset (str): Preset configuration name.
+        load_weights (bool): Whether to load the weights.
+
+    Returns:
+        backbone: Initialized Keras model backbone (`**kwargs` are forwarded).
+    """
+    if cls is None:
+        raise ValueError("Backbone class is None")
+    if cls.__name__ == "ResNetBackbone":
+        return load_resnet_backbone(cls, preset, load_weights, **kwargs)
+    raise ValueError(
+        f"{cls} has not been ported from the timm format yet. "
+        "Please check Hugging Face Hub for the Keras model. "
+    )
diff --git a/keras_nlp/src/utils/timm/convert_resnet.py b/keras_nlp/src/utils/timm/convert_resnet.py
@@ -0,0 +1,173 @@
+# Copyright 2024 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+
+from keras_nlp.src.utils.preset_utils import HF_CONFIG_FILE
+from keras_nlp.src.utils.preset_utils import jax_memory_cleanup
+from keras_nlp.src.utils.preset_utils import load_config
+from keras_nlp.src.utils.timm.safetensor_utils import SafetensorLoader
+
+
+def convert_backbone_config(timm_config):
+    """Map a timm ResNet config dict to Keras backbone constructor kwargs.
+
+    Args:
+        timm_config: dict; only its `"architecture"` entry is read.
+
+    Returns:
+        dict with `stackwise_num_filters`, `stackwise_num_blocks`,
+        `stackwise_num_strides`, `block_type` and `use_pre_activation`.
+
+    Raises:
+        ValueError: If the architecture is not one of the variants below.
+    """
+    architecture = timm_config["architecture"]
+    use_pre_activation = "resnetv2_" in architecture
+    # (timm architecture names, blocks per stack, block type)
+    variants = (
+        (("resnet18",), [2, 2, 2, 2], "basic_block"),
+        (("resnet26",), [2, 2, 2, 2], "bottleneck_block"),
+        (("resnet34",), [3, 4, 6, 3], "basic_block"),
+        (("resnet50", "resnetv2_50"), [3, 4, 6, 3], "bottleneck_block"),
+        (("resnet101", "resnetv2_101"), [3, 4, 23, 3], "bottleneck_block"),
+        (("resnet152", "resnetv2_152"), [3, 8, 36, 3], "bottleneck_block"),
+    )
+    for names, stackwise_num_blocks, block_type in variants:
+        if architecture in names:
+            break
+    else:
+        raise ValueError(
+            f"Currently, the architecture {architecture} is not supported."
+        )
+    return {
+        "stackwise_num_filters": [64, 128, 256, 512],
+        "stackwise_num_blocks": stackwise_num_blocks,
+        "stackwise_num_strides": [1, 2, 2, 2],
+        "block_type": block_type,
+        "use_pre_activation": use_pre_activation,
+    }
+
+
+def convert_weights(backbone, loader, timm_config):
+    """Port timm ResNet weights from `loader` into `backbone`.
+
+    Args:
+        backbone: Keras ResNet backbone whose layers receive the weights.
+        loader: provides `port_weight(variable, hf_weight_key=, hook_fn=)`.
+        timm_config: dict; `pretrained_cfg` supplies `mean` and `std`.
+    """
+
+    def transpose_conv2d(x, shape):
+        return np.transpose(x, (2, 3, 1, 0))  # (O, I, H, W) -> (H, W, I, O)
+
+    def port_conv2d(keras_layer_name, hf_weight_prefix):
+        loader.port_weight(
+            backbone.get_layer(keras_layer_name).kernel,
+            hf_weight_key=f"{hf_weight_prefix}.weight",
+            hook_fn=transpose_conv2d,
+        )
+
+    def port_batch_normalization(keras_layer_name, hf_weight_prefix):
+        layer = backbone.get_layer(keras_layer_name)
+        for variable, suffix in (
+            (layer.gamma, "weight"),
+            (layer.beta, "bias"),
+            (layer.moving_mean, "running_mean"),
+            (layer.moving_variance, "running_var"),
+        ):
+            loader.port_weight(
+                variable, hf_weight_key=f"{hf_weight_prefix}.{suffix}"
+            )
+
+    version = "v1" if not backbone.use_pre_activation else "v2"
+    block_type = backbone.block_type
+
+    # Stem
+    if version == "v1":
+        port_conv2d("conv1_conv", "conv1")
+        port_batch_normalization("conv1_bn", "bn1")
+    else:
+        port_conv2d("conv1_conv", "stem.conv")
+
+    # Stages
+    num_stacks = len(backbone.stackwise_num_filters)
+    for stack_index in range(num_stacks):
+        for block_idx in range(backbone.stackwise_num_blocks[stack_index]):
+            # Shortcut ("_0_*") weights are ported for these blocks only.
+            has_projection = block_idx == 0 and (
+                block_type == "bottleneck_block" or stack_index > 0
+            )
+            if version == "v1":
+                keras_name = f"v1_stack{stack_index}_block{block_idx}"
+                hf_name = f"layer{stack_index+1}.{block_idx}"
+                if has_projection:
+                    port_conv2d(
+                        f"{keras_name}_0_conv", f"{hf_name}.downsample.0"
+                    )
+                    port_batch_normalization(
+                        f"{keras_name}_0_bn", f"{hf_name}.downsample.1"
+                    )
+                port_conv2d(f"{keras_name}_1_conv", f"{hf_name}.conv1")
+                port_batch_normalization(f"{keras_name}_1_bn", f"{hf_name}.bn1")
+                port_conv2d(f"{keras_name}_2_conv", f"{hf_name}.conv2")
+                port_batch_normalization(f"{keras_name}_2_bn", f"{hf_name}.bn2")
+                if block_type == "bottleneck_block":
+                    port_conv2d(f"{keras_name}_3_conv", f"{hf_name}.conv3")
+                    port_batch_normalization(
+                        f"{keras_name}_3_bn", f"{hf_name}.bn3"
+                    )
+            else:
+                keras_name = f"v2_stack{stack_index}_block{block_idx}"
+                hf_name = f"stages.{stack_index}.blocks.{block_idx}"
+                if has_projection:
+                    port_conv2d(
+                        f"{keras_name}_0_conv", f"{hf_name}.downsample.conv"
+                    )
+                port_batch_normalization(
+                    f"{keras_name}_pre_activation_bn", f"{hf_name}.norm1"
+                )
+                port_conv2d(f"{keras_name}_1_conv", f"{hf_name}.conv1")
+                port_batch_normalization(
+                    f"{keras_name}_1_bn", f"{hf_name}.norm2"
+                )
+                port_conv2d(f"{keras_name}_2_conv", f"{hf_name}.conv2")
+                if block_type == "bottleneck_block":
+                    port_batch_normalization(
+                        f"{keras_name}_2_bn", f"{hf_name}.norm3"
+                    )
+                    port_conv2d(f"{keras_name}_3_conv", f"{hf_name}.conv3")
+
+    # Post
+    if version == "v2":
+        port_batch_normalization("post_bn", "norm")
+
+    # Rebuild normalization layer with pretrained mean & std. The layer is
+    # absent when the backbone was built with `include_rescaling=False`.
+    if any(layer.name == "normalization" for layer in backbone.layers):
+        std = timm_config["pretrained_cfg"]["std"]
+        normalization_layer = backbone.get_layer("normalization")
+        normalization_layer.input_mean = timm_config["pretrained_cfg"]["mean"]
+        normalization_layer.input_variance = [s**2 for s in std]
+        normalization_layer.build(normalization_layer._build_input_shape)
+
+
+def load_resnet_backbone(cls, preset, load_weights, **kwargs):
+    """Build `cls` from a timm ResNet preset, optionally porting weights."""
+    config = load_config(preset, HF_CONFIG_FILE)
+    model = cls(**convert_backbone_config(config), **kwargs)
+    if load_weights:
+        jax_memory_cleanup(model)
+        with SafetensorLoader(preset) as weight_loader:
+            convert_weights(model, weight_loader, config)
+    return model
diff --git a/keras_nlp/src/utils/timm/convert_resnet_test.py b/keras_nlp/src/utils/timm/convert_resnet_test.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pytest
+from keras import ops
+
+from keras_nlp.src.models.resnet.resnet_backbone import ResNetBackbone
+from keras_nlp.src.tests.test_case import TestCase
+
+
+class TimmResNetBackboneTest(TestCase):
+    @pytest.mark.large  # presumably fetches the preset from the HF Hub
+    def test_convert_resnet18_preset(self):
+        model = ResNetBackbone.from_preset("hf://timm/resnet18.a1_in1k")
+        outputs = model.predict(ops.ones((1, 224, 224, 3)))
+        self.assertEqual(outputs.shape, (1, 512))  # shape check only
+
+    # TODO: compare numerics with timm model
diff --git a/keras_nlp/src/utils/timm/safetensor_utils.py b/keras_nlp/src/utils/timm/safetensor_utils.py