diff --git a/.github/workflows/ci_gpu.yml b/.github/workflows/ci_gpu.yml index fe35a69ad3..5185512eb4 100644 --- a/.github/workflows/ci_gpu.yml +++ b/.github/workflows/ci_gpu.yml @@ -17,6 +17,12 @@ concurrency: jobs: ci-gpu: - runs-on: [self-hosted, ci-gpu] + runs-on: + - self-hosted + - ci-gpu + - ${{ matrix.gpu-type }} + strategy: + matrix: + gpu-type: [cuda, rocm] steps: - name: Clean up previous run run: | @@ -41,9 +47,14 @@ jobs: ci-gpu: uses: actions/checkout@v3 with: submodules: recursive - - name: Install dependencies + - name: Install dependencies for CUDA + if: matrix.gpu-type == 'cuda' run: | python ts_scripts/install_dependencies.py --environment=dev --cuda=cu121 + - name: Install dependencies for ROCm + if: matrix.gpu-type == 'rocm' + run: | + python ts_scripts/install_dependencies.py --environment=dev --rocm=rocm6.2 - name: Torchserve Sanity uses: nick-fields/retry@v3 with: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2b209fe57e..1014eee2af 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -67,10 +67,10 @@ If you plan to develop with TorchServe and change some source code, you must ins Use the optional `--rocm` or `--cuda` flag with `install_dependencies.py` for installing accelerator specific dependencies. Possible values are - - rocm: `rocm61`, `rocm60` + - rocm: `rocm6.3`, `rocm6.2`, `rocm6.1`, `rocm6.0` - cuda: `cu111`, `cu102`, `cu101`, `cu92` - For example `python ./ts_scripts/install_dependencies.py --environment=dev --rocm=rocm61` + For example `python ./ts_scripts/install_dependencies.py --environment=dev --rocm=rocm6.2` #### For Windows diff --git a/README.md b/README.md index afe21d8428..d80e7656f2 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ curl http://127.0.0.1:8080/predictions/bert -T input.txt python ./ts_scripts/install_dependencies.py # Include dependencies for accelerator support with the relevant optional flags -python ./ts_scripts/install_dependencies.py --rocm=rocm61 +python ./ts_scripts/install_dependencies.py --rocm=rocm6.2 python ./ts_scripts/install_dependencies.py --cuda=cu121 # Latest release @@ -45,8 +45,8 @@ pip install torchserve-nightly torch-model-archiver-nightly torch-workflow-archi # Install dependencies python ./ts_scripts/install_dependencies.py -# Include depeendencies for accelerator support with the relevant optional flags +# Include dependencies for accelerator support with the relevant optional flags -python ./ts_scripts/install_dependencies.py --rocm=rocm61 +python ./ts_scripts/install_dependencies.py --rocm=rocm6.2 python ./ts_scripts/install_dependencies.py --cuda=cu121 # Latest release @@ -106,7 +106,7 @@ curl -X POST -d '{"model":"meta-llama/Meta-Llama-3-8B-Instruct", "prompt":"Hello Refer to [LLM deployment](docs/llm_deployment.md) for details and other methods.
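The dotted flag values above (`rocm6.2` rather than the old `rocm62`) line up with the renamed requirements files later in this diff (`requirements/torch_rocm6.2.txt` and friends). Below is a minimal sketch of how such a flag can resolve to a requirements file under the file-per-accelerator layout this diff establishes; the helper and the CUDA/CPU fallback file names are illustrative assumptions, not the actual `install_dependencies.py` code:

```python
# Sketch only: map a dotted accelerator flag to a requirements file path.
# The real logic lives in ts_scripts/install_dependencies.py; the CUDA and
# CPU fallback file names below are placeholders, not the repo's actual files.
import os

ROCM_CHOICES = ("rocm6.0", "rocm6.1", "rocm6.2", "rocm6.3")

def requirements_file(rocm=None, cuda=None):
    if rocm:
        if rocm not in ROCM_CHOICES:
            raise ValueError(f"unsupported ROCm version: {rocm}")
        # rocm6.2 -> requirements/torch_rocm6.2.txt (renamed in this diff)
        return os.path.join("requirements", f"torch_{rocm}.txt")
    if cuda:
        return os.path.join("requirements", f"torch_{cuda}.txt")  # placeholder name
    return os.path.join("requirements", "torch_cpu.txt")  # placeholder name

print(requirements_file(rocm="rocm6.2"))  # requirements/torch_rocm6.2.txt
```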
## ⚡ Why TorchServe -* Write once, run anywhere, on-prem, on-cloud, supports inference on CPUs, GPUs, AWS Inf1/Inf2/Trn1, Google Cloud TPUs, [Nvidia MPS](docs/nvidia_mps.md) +* Write once, run anywhere, on-prem, on-cloud, supports inference on CPUs, GPUs, AWS Inf1/Inf2/Trn1, Google Cloud TPUs, [Nvidia MPS](docs/hardware_support/nvidia_mps.md) * [Model Management API](docs/management_api.md): multi model management with optimized worker to model allocation * [Inference API](docs/inference_api.md): REST and gRPC support for batched inference * [TorchServe Workflows](examples/Workflows/README.md): deploy complex DAGs with multiple interdependent models diff --git a/cpp/src/backends/handler/handler_factory.hh b/cpp/src/backends/handler/handler_factory.hh index 52689cecf1..b42e5e31b2 100644 --- a/cpp/src/backends/handler/handler_factory.hh +++ b/cpp/src/backends/handler/handler_factory.hh @@ -18,6 +18,9 @@ class HandlerFactory { const std::string& handler_class_name) { auto it = handlers_.find(handler_class_name); if (it == handlers_.end()) { + // XXX: + // Why not use the default ctor of `std::shared_ptr` directly? + // What are the benefits of using this `std::shared_ptr(nullptr_t)`? return std::shared_ptr<BaseHandler>(nullptr); } else { return it->second(); @@ -25,6 +28,11 @@ class HandlerFactory { }; private: + // XXX: + // 1) What are the benefits of using a function (ctor) pointer as the value + // instead of using a `shared_ptr` instance directly? + // 2) Whenever we want to add a new pair to `handlers_`, we'll have to + // change the definition here. std::map<std::string, std::shared_ptr<BaseHandler> (*)()> handlers_ = { {"TorchScriptHandler", []() -> std::shared_ptr<BaseHandler> { return std::make_shared<TorchScriptHandler>(); diff --git a/cpp/src/backends/handler/torch_scripted_handler.hh b/cpp/src/backends/handler/torch_scripted_handler.hh index 1e7c816a34..edd173c1ab 100644 --- a/cpp/src/backends/handler/torch_scripted_handler.hh +++ b/cpp/src/backends/handler/torch_scripted_handler.hh @@ -4,6 +4,7 @@ namespace torchserve { class TorchScriptHandler : public BaseHandler { + public: std::pair<std::shared_ptr<void>, std::shared_ptr<torch::Device>> LoadModel( std::shared_ptr<LoadModelRequest>& load_model_request) override; }; diff --git a/docker/Dockerfile b/docker/Dockerfile index 94f4a1ba99..1d9246e034 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -37,12 +37,12 @@ ARG BRANCH_NAME ARG REPO_URL=https://github.com/pytorch/serve.git ENV PYTHONUNBUFFERED TRUE -RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ +RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \ apt-get update && \ apt-get upgrade -y && \ apt-get install software-properties-common -y && \ add-apt-repository -y ppa:deadsnakes/ppa && \ - apt remove python-pip python3-pip && \ + apt remove -y python-pip python3-pip && \ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ ca-certificates \ g++ \ @@ -55,6 +55,13 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ git \ && rm -rf /var/lib/apt/lists/* +RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \ + if [ "$USE_ROCM_VERSION" ]; then \ + apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y rocm-dev amd-smi-lib \ + && rm -rf /var/lib/apt/lists/* ; \ + fi + # Make the virtual environment and "activating" it by adding it first to the path.
# From here on the python$PYTHON_VERSION interpreter is used and the packages # are installed in /home/venv which is what we need for the "runtime-image" @@ -67,6 +74,7 @@ RUN python -m pip install -U pip setuptools RUN export USE_CUDA=1 ARG USE_CUDA_VERSION="" +ARG USE_ROCM_VERSION="" COPY ./ serve @@ -90,6 +98,14 @@ RUN \ else \ python ./ts_scripts/install_dependencies.py;\ fi; \ + elif echo "${BASE_IMAGE}" | grep -q "rocm/"; then \ + # Install ROCm version specific binary when ROCm version is specified as a build arg + if [ "$USE_ROCM_VERSION" ]; then \ + python ./ts_scripts/install_dependencies.py --rocm $USE_ROCM_VERSION;\ + # Install the binary with the latest CPU image on a ROCm base image + else \ + python ./ts_scripts/install_dependencies.py; \ + fi; \ # Install the CPU binary else \ python ./ts_scripts/install_dependencies.py; \ @@ -111,13 +127,14 @@ FROM ${BASE_IMAGE} AS production-image # Re-state ARG PYTHON_VERSION to make it active in this build-stage (uses default define at the top) ARG PYTHON_VERSION ENV PYTHONUNBUFFERED TRUE +ARG USE_ROCM_VERSION RUN --mount=type=cache,target=/var/cache/apt \ apt-get update && \ apt-get upgrade -y && \ apt-get install software-properties-common -y && \ add-apt-repository ppa:deadsnakes/ppa -y && \ - apt remove python-pip python3-pip && \ + apt remove -y python-pip python3-pip && \ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ python$PYTHON_VERSION \ python3-distutils \ @@ -130,6 +147,13 @@ RUN --mount=type=cache,target=/var/cache/apt \ && rm -rf /var/lib/apt/lists/* \ && cd /tmp +RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \ + if [ "$USE_ROCM_VERSION" ]; then \ + apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y rocm-dev amd-smi-lib \ + && rm -rf /var/lib/apt/lists/* ; \ + fi + RUN useradd -m model-server \ && mkdir -p /home/model-server/tmp @@ -137,6 +161,11 @@ COPY --chown=model-server --from=compile-image /home/venv /home/venv COPY --from=compile-image /usr/local/bin/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh ENV PATH="/home/venv/bin:$PATH" +RUN \ + if [ "$USE_ROCM_VERSION" ]; then \ + python -m pip install /opt/rocm/share/amd_smi; \ + fi + RUN chmod +x /usr/local/bin/dockerd-entrypoint.sh \ && chown -R model-server /home/model-server @@ -157,13 +186,14 @@ FROM ${BASE_IMAGE} AS ci-image ARG PYTHON_VERSION ARG BRANCH_NAME ENV PYTHONUNBUFFERED TRUE +ARG USE_ROCM_VERSION -RUN --mount=type=cache,target=/var/cache/apt \ +RUN --mount=type=cache,sharing=locked,target=/var/cache/apt \ apt-get update && \ apt-get upgrade -y && \ apt-get install software-properties-common -y && \ add-apt-repository -y ppa:deadsnakes/ppa && \ - apt remove python-pip python3-pip && \ + apt remove -y python-pip python3-pip && \ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ python$PYTHON_VERSION \ python3-distutils \ @@ -183,6 +213,12 @@ RUN --mount=type=cache,target=/var/cache/apt \ && rm -rf /var/lib/apt/lists/* \ && cd /tmp +RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \ + if [ "$USE_ROCM_VERSION" ]; then \ + apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y rocm-dev amd-smi-lib \ + && rm -rf /var/lib/apt/lists/* ; \ + fi COPY --from=compile-image /home/venv /home/venv @@ -190,6 +226,11 @@ ENV PATH="/home/venv/bin:$PATH" RUN python -m pip install --no-cache-dir -r 
https://raw.githubusercontent.com/pytorch/serve/$BRANCH_NAME/requirements/developer.txt +RUN \ + if [ "$USE_ROCM_VERSION" ]; then \ + python -m pip install /opt/rocm/share/amd_smi; \ + fi + RUN mkdir /home/serve ENV TS_RUN_IN_DOCKER True @@ -203,11 +244,12 @@ ARG PYTHON_VERSION ARG BRANCH_NAME ARG BUILD_FROM_SRC ARG LOCAL_CHANGES +ARG USE_ROCM_VERSION ARG BUILD_WITH_IPEX ARG IPEX_VERSION=1.11.0 ARG IPEX_URL=https://software.intel.com/ipex-whl-stable ENV PYTHONUNBUFFERED TRUE -RUN --mount=type=cache,target=/var/cache/apt \ +RUN --mount=type=cache,sharing=locked,target=/var/cache/apt \ apt-get update && \ apt-get upgrade -y && \ apt-get install software-properties-common -y && \ @@ -243,10 +285,16 @@ RUN \ COPY --from=compile-image /home/venv /home/venv ENV PATH="/home/venv/bin:$PATH" + +RUN \ + if [ "$USE_ROCM_VERSION" ]; then \ + python -m pip install /opt/rocm/share/amd_smi; \ + fi + WORKDIR "serve" RUN python -m pip install -U pip setuptools \ && python -m pip install --no-cache-dir -r requirements/developer.txt \ - && python ts_scripts/install_from_src.py \ + && python ts_scripts/install_from_src.py --environment=dev \ && useradd -m model-server \ && mkdir -p /home/model-server/tmp \ && cp docker/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh \ diff --git a/docker/Dockerfile.cpp b/docker/Dockerfile.cpp index 50d19b3322..e42fe3fd05 100644 --- a/docker/Dockerfile.cpp +++ b/docker/Dockerfile.cpp @@ -19,6 +19,7 @@ ARG CMAKE_VERSION=3.26.4 ARG GCC_VERSION=9 ARG BRANCH_NAME="master" ARG USE_CUDA_VERSION="" +ARG USE_ROCM_VERSION="" FROM ${BASE_IMAGE} AS cpp-dev-image ARG BASE_IMAGE @@ -28,6 +29,7 @@ ARG GCC_VERSION ARG BRANCH_NAME ARG REPO_URL=https://github.com/pytorch/serve.git ARG USE_CUDA_VERSION +ARG USE_ROCM_VERSION ARG DEBIAN_FRONTEND=noninteractive ENV PYTHONUNBUFFERED TRUE ENV TZ=Etc/UTC diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev index 2f02d84680..0c39e24f48 100644 --- a/docker/Dockerfile.dev +++ b/docker/Dockerfile.dev @@ -19,6 +19,7 @@ ARG BRANCH_NAME=master ARG REPO_URL=https://github.com/pytorch/serve.git ARG MACHINE_TYPE=cpu ARG CUDA_VERSION +ARG ROCM_VERSION ARG BUILD_WITH_IPEX ARG IPEX_VERSION=1.11.0 @@ -62,7 +63,8 @@ RUN update-alternatives --install /usr/bin/python python /usr/bin/python$PYTHON_ FROM compile-image AS dev-image ARG MACHINE_TYPE=cpu ARG CUDA_VERSION -RUN if [ "$MACHINE_TYPE" = "gpu" ]; then export USE_CUDA=1; fi \ +ARG ROCM_VERSION +RUN if [ "$MACHINE_TYPE" = "gpu" ]; then if [ -n "$ROCM_VERSION" ]; then export USE_ROCM=1; else export USE_CUDA=1; fi; fi \ && git clone $REPO_URL \ && cd serve \ && git checkout ${BRANCH_NAME} \ @@ -70,7 +72,7 @@ RUN if [ "$MACHINE_TYPE" = "gpu" ]; then export USE_CUDA=1; fi \ ENV PATH="/home/venv/bin:$PATH" WORKDIR serve RUN python -m pip install -U pip setuptools \ - && if [ -z "$CUDA_VERSION" ]; then python ts_scripts/install_dependencies.py --environment=dev; else python ts_scripts/install_dependencies.py --environment=dev --cuda $CUDA_VERSION; fi \ + && if [ -n "$CUDA_VERSION" ]; then python ts_scripts/install_dependencies.py --environment=dev --cuda $CUDA_VERSION; elif [ -n "$ROCM_VERSION" ]; then python ts_scripts/install_dependencies.py --environment=dev --rocm $ROCM_VERSION; else python ts_scripts/install_dependencies.py --environment=dev; fi \ && if [ "$BUILD_WITH_IPEX" = "true" ]; then python -m pip install --no-cache-dir intel_extension_for_pytorch==${IPEX_VERSION} -f ${IPEX_URL}; fi \ && python ts_scripts/install_from_src.py \ && useradd -m
model-server \ diff --git a/docker/README.md b/docker/README.md index 7ddd9d01a0..36e3ba093e 100644 --- a/docker/README.md +++ b/docker/README.md @@ -48,6 +48,7 @@ Use `build_image.sh` script to build the docker images. The script builds the `p |-bt, --buildtype|Which type of docker image to build. Can be one of : production, dev, ci| |-t, --tag|Tag name for image. If not specified, script uses torchserve default tag names.| |-cv, --cudaversion| Specify to cuda version to use. Supported values `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`, `cu118`. `cu121`, Default `cu121`| +|-rv, --rocmversion| Specify the ROCm version to use. Supported values `rocm6.0`, `rocm6.1`, `rocm6.2`, `rocm6.3`| |-ipex, --build-with-ipex| Specify to build with intel_extension_for_pytorch. If not specified, script builds without intel_extension_for_pytorch.| |-cpp, --build-cpp specify to build TorchServe CPP| |-n, --nightly| Specify to build with TorchServe nightly.| @@ -66,9 +67,9 @@ Creates a docker image with publicly available `torchserve` and `torch-model-arc ./build_image.sh ``` - - To create a GPU based image with cuda 10.2. Options are `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`, `cu118` + - To create a GPU based image with cuda 10.2. Options are `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`, `cu118` for CUDA and `rocm6.0`, `rocm6.1`, `rocm6.2`, `rocm6.3` for ROCm. - - GPU images are built with NVIDIA CUDA base image. If you want to use ONNX, please specify the base image as shown in the next section. + - GPU images are built with either an NVIDIA CUDA base image or an AMD ROCm base image. If you want to use ONNX, please specify the base image as shown in the next section. ```bash ./build_image.sh -g -cv cu117 ``` @@ -136,6 +137,30 @@ Creates a docker image with `torchserve` and `torch-model-archiver` installed fr ./build_image.sh -bt dev -g -cv cu92 ``` +- For creating GPU based image with ROCm version 6.0: + +```bash +./build_image.sh -bt dev -g -rv rocm6.0 +``` + +- For creating GPU based image with ROCm version 6.1: + +```bash +./build_image.sh -bt dev -g -rv rocm6.1 +``` + +- For creating GPU based image with ROCm version 6.2: + +```bash +./build_image.sh -bt dev -g -rv rocm6.2 +``` + +- For creating GPU based image with ROCm version 6.3: + +```bash +./build_image.sh -bt dev -g -rv rocm6.3 +``` + - For creating GPU based image with a different branch: ```bash @@ -152,7 +177,7 @@ Creates a docker image with `torchserve` and `torch-model-archiver` installed fr ./build_image.sh -bt dev -t torchserve-dev:1.0 ``` - - For creating image with Intel® Extension for PyTorch*: + - For creating image with Intel® Extension for PyTorch: ```bash ./build_image.sh -bt dev -ipex -t torchserve-ipex:1.0 ``` @@ -168,7 +193,7 @@ Creates a docker image with `torchserve` and `torch-model-archiver` installed fr ./build_image.sh -bt dev -g [-cv cu121|cu118] -cpp ``` -- For ROCm support (*experimental*), refer to [this documentation](../docs/hardware_support/amd_support.md). +- For more details on ROCm support (*experimental*), refer to [this documentation](../docs/hardware_support/amd_support.md).
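Several image stages in this diff pip-install the AMD SMI Python bindings from `/opt/rocm/share/amd_smi`. A quick way to verify that install inside a ROCm container is a smoke test like the sketch below; the `amdsmi` function names follow the upstream package and should be treated as assumptions to check against your ROCm release:

```python
# Smoke test (sketch): confirm the amdsmi bindings installed from
# /opt/rocm/share/amd_smi can enumerate accelerators inside the container.
import amdsmi

amdsmi.amdsmi_init()
try:
    handles = amdsmi.amdsmi_get_processor_handles()
    print(f"amd-smi bindings see {len(handles)} accelerator(s)")
finally:
    amdsmi.amdsmi_shut_down()
```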
## Start a container with a TorchServe image @@ -194,7 +219,7 @@ For specific versions you can pass in the specific tag to use (ex: pytorch/torch docker run --rm -it -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:7070:7070 -p 127.0.0.1:7071:7071 pytorch/torchserve:0.1.1-cpu ``` -#### Start CPU container with Intel® Extension for PyTorch* +#### Start CPU container with Intel® Extension for PyTorch ```bash docker run --rm -it -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:7070:7070 -p 127.0.0.1:7071:7071 torchserve-ipex:1.0 @@ -208,6 +233,12 @@ For GPU latest image with gpu devices 1 and 2: docker run --rm -it --gpus '"device=1,2"' -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:7070:7070 -p 127.0.0.1:7071:7071 pytorch/torchserve:latest-gpu ``` +For the latest image with ROCm support, using gpu devices 1 and 2: + +```bash +docker run --rm -it --device=/dev/kfd --device=/dev/dri -e HIP_VISIBLE_DEVICES=1,2 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:7070:7070 -p 127.0.0.1:7071:7071 pytorch/torchserve:latest-gpu +``` + For specific versions you can pass in the specific tag to use (ex: `0.1.1-cuda10.1-cudnn7-runtime`): ```bash diff --git a/docker/build_image.sh b/docker/build_image.sh index b5b9f8e87e..5cb071d967 100755 --- a/docker/build_image.sh +++ b/docker/build_image.sh @@ -11,6 +11,7 @@ BASE_IMAGE="ubuntu:20.04" UPDATE_BASE_IMAGE=false USE_CUSTOM_TAG=false CUDA_VERSION="" +ROCM_VERSION="" USE_LOCAL_SERVE_FOLDER=false BUILD_WITH_IPEX=false BUILD_CPP=false @@ -33,6 +34,7 @@ do echo "-bi, --baseimage specify base docker image. Example: nvidia/cuda:11.7.0-cudnn8-runtime-ubuntu20.04 " echo "-bt, --buildtype specify for type of created image. Possible values: production, dev, ci."
echo "-cv, --cudaversion specify to cuda version to use" + echo "-rv, --rocmversion specify to rocm version to use" echo "-t, --tag specify tag name for docker image" echo "-lf, --use-local-serve-folder specify this option for the benchmark image if the current 'serve' folder should be used during automated benchmarks" echo "-ipex, --build-with-ipex specify to build with intel_extension_for_pytorch" @@ -167,6 +169,27 @@ do shift shift ;; + -rv|--rocmversion) + ROCM_VERSION="$2" + if [ "${ROCM_VERSION}" == "rocm6.0" ]; + then + BASE_IMAGE="rocm/dev-ubuntu-22.04:6.0.2" + elif [ "${ROCM_VERSION}" == "rocm6.1" ]; + then + BASE_IMAGE="rocm/dev-ubuntu-22.04:6.1.2" + elif [ "${ROCM_VERSION}" == "rocm6.2" ]; + then + BASE_IMAGE="rocm/dev-ubuntu-22.04:6.2.4" + elif [ "${ROCM_VERSION}" == "rocm6.3" ]; + then + BASE_IMAGE="rocm/dev-ubuntu-22.04:6.3.4" + else + echo "ROCm version not supported" + exit 1 + fi + shift + shift + ;; esac done @@ -218,30 +241,50 @@ then exit 1 fi fi + + if [[ "${MACHINE}" == "gpu" || "${ROCM_VERSION}" != "" ]]; + then + if [ "${ROCM_VERSION}" == "rocm6.0" ]; + then + BASE_IMAGE="rocm/dev-ubuntu-22.04:6.0.2" + elif [ "${ROCM_VERSION}" == "rocm6.1" ]; + then + BASE_IMAGE="rocm/dev-ubuntu-22.04:6.1.2" + elif [ "${ROCM_VERSION}" == "rocm6.2" ]; + then + BASE_IMAGE="rocm/dev-ubuntu-22.04:6.2.4" + elif [ "${ROCM_VERSION}" == "rocm6.3" ]; + then + BASE_IMAGE="rocm/dev-ubuntu-22.04:6.3.4" + else + echo "ROCm version $ROCM_VERSION is not supported for CPP" + exit 1 + fi + fi fi if [ "${BUILD_TYPE}" == "production" ]; then if [ "${MULTI}" == "true" ]; then - DOCKER_BUILDKIT=1 docker buildx build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ + DOCKER_BUILDKIT=1 docker buildx build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg USE_ROCM_VERSION="${ROCM_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ --build-arg BUILD_NIGHTLY="${BUILD_NIGHTLY}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" --build-arg BUILD_FROM_SRC="${BUILD_FROM_SRC}"\ --build-arg LOCAL_CHANGES="${LOCAL_CHANGES}" -t "${DOCKER_TAG}" --platform "${ARCH}" --target production-image ../ --push else - DOCKER_BUILDKIT=1 docker buildx build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ + DOCKER_BUILDKIT=1 docker buildx build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg USE_ROCM_VERSION="${ROCM_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ --build-arg BUILD_NIGHTLY="${BUILD_NIGHTLY}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" --build-arg BUILD_FROM_SRC="${BUILD_FROM_SRC}"\ --build-arg LOCAL_CHANGES="${LOCAL_CHANGES}" -t "${DOCKER_TAG}" --target production-image ../ --load fi elif [ "${BUILD_TYPE}" == "ci" ]; then - DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ + DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg USE_ROCM_VERSION="${ROCM_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ --build-arg BUILD_NIGHTLY="${BUILD_NIGHTLY}" --build-arg 
BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" --build-arg BUILD_FROM_SRC="${BUILD_FROM_SRC}"\ --build-arg LOCAL_CHANGES="${LOCAL_CHANGES}" -t "${DOCKER_TAG}" --target ci-image ../ else if [ "${BUILD_CPP}" == "true" ] then - DOCKER_BUILDKIT=1 docker build --file Dockerfile.cpp --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ + DOCKER_BUILDKIT=1 docker build --file Dockerfile.cpp --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg USE_ROCM_VERSION="${ROCM_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" -t "${DOCKER_TAG}" --target cpp-dev-image . else - DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ + DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg USE_ROCM_VERSION="${ROCM_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ --build-arg BUILD_NIGHTLY="${BUILD_NIGHTLY}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" --build-arg BUILD_FROM_SRC="${BUILD_FROM_SRC}" --build-arg LOCAL_CHANGES="${LOCAL_CHANGES}"\ --build-arg BUILD_WITH_IPEX="${BUILD_WITH_IPEX}" -t "${DOCKER_TAG}" --target dev-image ../ fi diff --git a/docs/README.md b/docs/README.md index d00e726070..cf0fa078a2 100644 --- a/docs/README.md +++ b/docs/README.md @@ -62,4 +62,4 @@ TorchServe is a performant, flexible and easy to use tool for serving PyTorch ea * [TorchServe on Kubernetes](https://github.com/pytorch/serve/blob/master/kubernetes/README.md#torchserve-on-kubernetes) - Demonstrates a Torchserve deployment in Kubernetes using Helm Chart supported in both Azure Kubernetes Service and Google Kubernetes service * [mlflow-torchserve](https://github.com/mlflow/mlflow-torchserve) - Deploy mlflow pipeline models into TorchServe * [Kubeflow pipelines](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/pytorch-samples) - Kubeflow pipelines and Google Vertex AI Managed pipelines -* [NVIDIA MPS](nvidia_mps.md) - Use NVIDIA MPS to optimize multi-worker deployment on a single GPU +* [NVIDIA MPS](hardware_support/nvidia_mps.md) - Use NVIDIA MPS to optimize multi-worker deployment on a single GPU diff --git a/docs/getting_started.md b/docs/getting_started.md index 3f9e6176e2..71d1f58df8 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -26,6 +26,12 @@ This project is no longer actively maintained. While existing releases remain av Note: PyTorch 1.9+ will not support cu92 and cu101. So TorchServe only supports cu92 and cu101 up to PyTorch 1.8.1. + - For GPU with ROCm. Options are `rocm6.0`, `rocm6.1`, `rocm6.2`, `rocm6.3` + + ```bash + python ./ts_scripts/install_dependencies.py --rocm=rocm6.2 + ``` + #### For Windows Refer to the documentation [here](./torchserve_on_win_native.md). diff --git a/docs/github_actions.md b/docs/github_actions.md index 62800a23fd..d648578260 100644 --- a/docs/github_actions.md +++ b/docs/github_actions.md @@ -65,10 +65,10 @@ This project is no longer actively maintained. While existing releases remain av - This would create 2 runs. 
One run on `ci-gpu` with CUDA 11.6 and a second run on `ci-gpu` with CUDA 11.7 ``` runs-on: [self-hosted, ci-gpu] - strategy: - fail-fast: false - matrix: - cuda: ["cu116", "cu117"] + strategy: + fail-fast: false + matrix: + cuda: ["cu116", "cu117"] ``` 6. Specify the commands to be executed for the run diff --git a/docs/hardware_support/amd_support.md b/docs/hardware_support/amd_support.md index e231f4cae1..0a2b34fb93 100644 --- a/docs/hardware_support/amd_support.md +++ b/docs/hardware_support/amd_support.md @@ -9,7 +9,7 @@ TorchServe can be run on any combination of operating system and device that is ## Supported Versions of ROCm -The current stable `major.patch` version of ROCm and the previous path version will be supported. For example version `N.2` and `N.1` where `N` is the current major version. +The current stable `major.patch` version of ROCm and the previous patch version will be supported. For example version `N.2` and `N.1` where `N` is the current major version. ## Installation @@ -39,7 +39,7 @@ The current stable `major.patch` version of ROCm and the previous patch version w - install the dependencies needed for ROCm support. ```bash - python ./ts_scripts/install_dependencies.py --rocm=rocm61 + python ./ts_scripts/install_dependencies.py --rocm=rocm6.2 python ./ts_scripts/install_from_src.py ``` - enable amd-smi in the python virtual environment @@ -68,12 +68,12 @@ If you have 8 accelerators but only want TorchServe to see the last four of them **In Development** -`Dockerfile.rocm` provides preliminary ROCm support for TorchServe. +`Dockerfile` and `build_image.sh` provide ROCm support for TorchServe. Building and running `dev-image`: ```bash -docker build --file docker/Dockerfile.rocm --target dev-image -t torch-serve-dev-image-rocm --build-arg USE_ROCM_VERSION=rocm62 --build-arg BUILD_FROM_SRC=true . +docker build --file docker/Dockerfile --target dev-image -t torch-serve-dev-image-rocm --build-arg USE_ROCM_VERSION=rocm6.2 --build-arg BUILD_FROM_SRC=true . docker run -it --rm --device=/dev/kfd --device=/dev/dri torch-serve-dev-image-rocm bash ``` diff --git a/docs/performance_checklist.md b/docs/performance_checklist.md index 32536d50f2..609c939aed 100644 --- a/docs/performance_checklist.md +++ b/docs/performance_checklist.md @@ -10,7 +10,7 @@ This checklist describes some steps that should be completed when diagnosing mod - Check the versions of PyTorch, Nvidia driver, and other components and update to the latest compatible releases. Oftentimes known performance bugs have already been fixed. -- Collect system-level activity logs to understand the overall resource utilizations. It’s useful to know how the model inference pipeline is using the system resources at a high level, as the first step of optimization. Even simple CLI tools such as nvidia-smi and htop would be helpful. +- Collect system-level activity logs to understand the overall resource utilizations. It’s useful to know how the model inference pipeline is using the system resources at a high level, as the first step of optimization. Even simple CLI tools such as nvidia-smi, amd-smi, and htop would be helpful. - Start with a target with the highest impact on performance. It should be obvious from the system activity logs where the biggest bottleneck is – look beyond model inference, as pre/post processing can be expensive and can affect the end-to-end throughput just as much.
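Since the checklist now names both `nvidia-smi` and `amd-smi`, a vendor-neutral way to capture those system-level snapshots during a load test is to shell out to whichever CLI is present, as in the sketch below. The `nvidia-smi` query flags are standard; the `amd-smi monitor` subcommand is an assumption to verify with `amd-smi --help` on your install:

```python
# Sketch: take one GPU-utilization snapshot, whichever vendor CLI is available.
import shutil
import subprocess

def gpu_snapshot() -> str:
    if shutil.which("nvidia-smi"):
        return subprocess.check_output(
            ["nvidia-smi", "--query-gpu=utilization.gpu,memory.used",
             "--format=csv,noheader"],
            text=True,
        )
    if shutil.which("amd-smi"):
        # Subcommand name assumed; check `amd-smi --help` on your install.
        return subprocess.check_output(["amd-smi", "monitor"], text=True)
    return "no GPU CLI found"

print(gpu_snapshot())
```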
diff --git a/docs/performance_guide.md b/docs/performance_guide.md index a342eeaee1..2af4d076a4 100644 --- a/docs/performance_guide.md +++ b/docs/performance_guide.md @@ -85,7 +85,7 @@ While NVIDIA GPUs allow multiple processes to run on CUDA kernels, this comes wi * The execution of the kernels is generally serialized * Each processes creates its own CUDA context which occupies additional GPU memory -To get around these drawbacks, you can utilize the NVIDIA Multi-Process Service (MPS) to increase performance. You can find more information on how to utilize NVIDIA MPS with TorchServe [here](nvidia_mps.md). +To get around these drawbacks, you can utilize the NVIDIA Multi-Process Service (MPS) to increase performance. You can find more information on how to utilize NVIDIA MPS with TorchServe [here](hardware_support/nvidia_mps.md).
NVIDIA DALI
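These doc fixes sit next to a point the rest of this diff relies on: ROCm builds of PyTorch expose the same `torch.cuda` API surface as CUDA builds, so backend detection keys off `torch.version.*` rather than device probing. That is the same check the `base_handler.py` change further down uses to pick ONNX Runtime providers; a standalone sketch:

```python
# Sketch: distinguish a ROCm build of PyTorch from a CUDA build. On ROCm,
# torch.cuda.* works as-is (HIP is mapped onto the CUDA API surface), so
# torch.version.hip / torch.version.cuda is the reliable discriminator.
import torch

if torch.version.hip:
    backend = "rocm"
elif torch.version.cuda:
    backend = "cuda"
else:
    backend = "cpu"

devices = torch.cuda.device_count() if backend != "cpu" else 0
print(f"backend={backend}, visible accelerators={devices}")
```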
diff --git a/docs/use_cases.md b/docs/use_cases.md index cf58baf30a..b0f3457b66 100644 --- a/docs/use_cases.md +++ b/docs/use_cases.md @@ -155,7 +155,7 @@ The example taken here uses scripted mode model however you can also deploy eage - Move MAR file in a new directory name it as `model-store` - Docker - Make sure that MAR file is being copied in volume/directory shared while starting torchserve docker image - torchserve start command in following instruction will automatically detect GPUs and use for loading/serving models. If you want to [limit the GPU usage](https://github.com/pytorch/serve/blob/master/docs/configuration.md#limit-gpu-usage) -then use `nvidia-smi` to determine the number of GPU and corresponding ids. Once you have gpu details, you can add `number_of_gpu` param in config.proerties and use second command as given next instruction. +then use `nvidia-smi` or `amd-smi` to determine the number of GPUs and corresponding ids. Once you have gpu details, you can add `number_of_gpu` param in config.properties and use second command as given next instruction. e.g. number_of_gpu=2 - Start torchserve with all GPUs- `torchserve --start --ncs --model-store `. With restricted GPUs - `torchserve --start --ncs --model-store --ts-config /config.properties` - Docker - For all GPU `docker run --rm -it --gpus all -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 torchserve:gpu-latest` For GPUs 1 and 2 `docker run --rm -it --gpus '"device=1,2"' -p 8080:8080 -p 8081:8081 pytorch/torchserve:latest-gpu` diff --git a/kubernetes/kserve/Dockerfile.dev b/kubernetes/kserve/Dockerfile.dev index 54af943553..ebfb0f76f0 100644 --- a/kubernetes/kserve/Dockerfile.dev +++ b/kubernetes/kserve/Dockerfile.dev @@ -55,11 +55,12 @@ RUN python -m pip install -U pip setuptools FROM compile-image AS dev-image ARG MACHINE_TYPE=cpu ARG CUDA_VERSION -RUN if [ "$MACHINE_TYPE" = "gpu" ]; then export USE_CUDA=1; fi \ +ARG ROCM_VERSION +RUN if [ "$MACHINE_TYPE" = "gpu" ]; then if [ -n "$ROCM_VERSION" ]; then export USE_ROCM=1; else export USE_CUDA=1; fi; fi \ && git clone https://github.com/pytorch/serve.git \ && cd serve \ && git checkout ${BRANCH_NAME} \ - && if [ -z "$CUDA_VERSION" ]; then python ts_scripts/install_dependencies.py --environment=dev; else python ts_scripts/install_dependencies.py --environment=dev --cuda $CUDA_VERSION; fi \ + && if [ -n "$CUDA_VERSION" ]; then python ts_scripts/install_dependencies.py --environment=dev --cuda $CUDA_VERSION; elif [ -n "$ROCM_VERSION" ]; then python ts_scripts/install_dependencies.py --environment=dev --rocm $ROCM_VERSION; else python ts_scripts/install_dependencies.py --environment=dev; fi \ && python ts_scripts/install_from_src.py \ && python -m pip install captum transformers kserve[storage]>=0.11.0 \ && python -m pip install .
\ diff --git a/requirements/torch_rocm60.txt b/requirements/torch_rocm6.0.txt similarity index 100% rename from requirements/torch_rocm60.txt rename to requirements/torch_rocm6.0.txt diff --git a/requirements/torch_rocm61.txt b/requirements/torch_rocm6.1.txt similarity index 100% rename from requirements/torch_rocm61.txt rename to requirements/torch_rocm6.1.txt diff --git a/requirements/torch_rocm62.txt b/requirements/torch_rocm6.2.txt similarity index 100% rename from requirements/torch_rocm62.txt rename to requirements/torch_rocm6.2.txt diff --git a/requirements/torch_rocm6.3.txt b/requirements/torch_rocm6.3.txt new file mode 100644 index 0000000000..6416aea8aa --- /dev/null +++ b/requirements/torch_rocm6.3.txt @@ -0,0 +1,6 @@ +# PyTorch 2.7 will be the first release with ROCm 6.3 support. +# For now (as of 20250317), nightly builds need to be used. +#--index-url https://download.pytorch.org/whl/rocm6.3 +#torch==2.7.1+rocm6.3; sys_platform == 'linux' +#torchvision==0.22.1+rocm6.3; sys_platform == 'linux' +#torchaudio==2.7.1+rocm6.3; sys_platform == 'linux' diff --git a/ts/torch_handler/base_handler.py b/ts/torch_handler/base_handler.py index 1dad241922..41a3a4588b 100644 --- a/ts/torch_handler/base_handler.py +++ b/ts/torch_handler/base_handler.py @@ -99,11 +99,12 @@ def setup_ort_session(model_pt_path, map_location): - providers = ( - ["CUDAExecutionProvider", "CPUExecutionProvider"] - if map_location == "cuda" - else ["CPUExecutionProvider"] - ) + providers = ["CPUExecutionProvider"] + if map_location == "cuda": + if torch.version.cuda: + providers.append("CUDAExecutionProvider") + elif torch.version.hip: + providers.append("ROCMExecutionProvider") sess_options = ort.SessionOptions() sess_options.intra_op_num_threads = psutil.cpu_count(logical=True) diff --git a/ts_scripts/api_utils.py b/ts_scripts/api_utils.py index 1b719018d6..3f4063fa50 100755 --- a/ts_scripts/api_utils.py +++ b/ts_scripts/api_utils.py @@ -2,6 +2,7 @@ import os import shutil import sys +import time REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") sys.path.append(REPO_ROOT) @@ -108,10 +109,38 @@ def cleanup_model_store(): os.remove(f) -def move_logs(log_file, artifact_dir): +def move_logs(log_file, artifact_dir, retries=5): + """ + Move log files to artifacts directory. If directory already exists, merge contents. + """ logs_dir = os.path.join("logs") - os.rename(log_file, os.path.join(logs_dir, log_file)) # mv file logs/ - os.rename(logs_dir, os.path.join(artifact_dir, logs_dir)) # mv logs/ dir + if not os.path.exists(logs_dir): + os.makedirs(logs_dir) + + shutil.move(log_file, os.path.join(logs_dir, log_file)) # mv file logs/ + + destination_dir = os.path.join(artifact_dir, logs_dir) + + # Retry is used because the directory might not be ready to be moved.
+ for attempt in range(retries): + try: + if os.path.exists(destination_dir): + # Merge contents if destination directory already exists + for root, dirs, files in os.walk(logs_dir): + for file in files: + shutil.move( + os.path.join(root, file), + os.path.join(destination_dir, file), + ) + shutil.rmtree(logs_dir) # Remove the empty logs directory + else: + shutil.move(logs_dir, destination_dir) # mv logs/ dir + break + except OSError: + if attempt < retries - 1: + time.sleep(2) + else: + raise def trigger_management_tests(): diff --git a/ts_scripts/install_dependencies.py b/ts_scripts/install_dependencies.py index 4d464e03fe..85a6ec0ba8 100644 --- a/ts_scripts/install_dependencies.py +++ b/ts_scripts/install_dependencies.py @@ -385,7 +385,7 @@ def get_brew_version(): parser.add_argument( "--rocm", default=None, - choices=["rocm60", "rocm61", "rocm62"], + choices=["rocm6.0", "rocm6.1", "rocm6.2", "rocm6.3"], help="ROCm version for torch", ) parser.add_argument( diff --git a/ts_scripts/install_from_src.py b/ts_scripts/install_from_src.py index 9c555f9c3d..8f0a8eb9a2 100644 --- a/ts_scripts/install_from_src.py +++ b/ts_scripts/install_from_src.py @@ -26,7 +26,7 @@ def install_from_src(dev=False): parser.add_argument( "--environment", type=str, - default="production", + default="prod", help="options: dev|prod", ) args = parser.parse_args() diff --git a/ts_scripts/print_env_info.py b/ts_scripts/print_env_info.py index 2d3fc059ae..ec7a3a80cb 100644 --- a/ts_scripts/print_env_info.py +++ b/ts_scripts/print_env_info.py @@ -231,6 +231,9 @@ def get_nvidia_driver_cuda_version(): return f"{cuda_major}.{cuda_minor}" +# XXX: +# Even though `torch._C._cuda_getCompiledVersion` is compatible with both CUDA and ROCm/HIP, +# the way of calculating the major & minor version numbers may not be. def get_running_cuda_version(): cuda = torch._C._cuda_getCompiledVersion() cuda_major = cuda // 1000 diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt index 3e055db67d..09da229bf1 100644 --- a/ts_scripts/spellcheck_conf/wordlist.txt +++ b/ts_scripts/spellcheck_conf/wordlist.txt @@ -1301,3 +1301,10 @@ OpenAI openai kv OOM +recurse +submodules +ROCm +rocm +rocmversion +rv +amd
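On the XXX left in `print_env_info.py`: the `major*1000 + minor*10` decoding of `torch._C._cuda_getCompiledVersion()` is CUDA's encoding, and a HIP build may pack its version differently. One hedged way around the ambiguity is to read the human-readable strings torch already exposes, as in this sketch (the example version strings in the comments are illustrative):

```python
# Sketch: report the accelerator runtime torch was compiled against without
# relying on the CUDA-specific integer encoding of _cuda_getCompiledVersion.
import torch

def accelerator_runtime_version() -> str:
    if torch.version.hip:            # e.g. "6.2.41134-..." on ROCm builds
        major, minor = torch.version.hip.split(".")[:2]
        return f"ROCm {major}.{minor}"
    if torch.version.cuda:           # e.g. "12.1" on CUDA builds
        return f"CUDA {torch.version.cuda}"
    return "CPU-only build"

print(accelerator_runtime_version())
```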