diff --git a/compiler-rt/cmake/Modules/AddCompilerRT.cmake b/compiler-rt/cmake/Modules/AddCompilerRT.cmake
index 361538a58e47c..21efa2251219d 100644
--- a/compiler-rt/cmake/Modules/AddCompilerRT.cmake
+++ b/compiler-rt/cmake/Modules/AddCompilerRT.cmake
@@ -158,11 +158,14 @@ function(add_compiler_rt_runtime name type)
   set(libnames)
   # Until we support this some other way, build compiler-rt runtime without LTO
   # to allow non-LTO projects to link with it.
-  if(COMPILER_RT_HAS_FNO_LTO_FLAG)
+  # Configure with -DCOMPILER_RT_ALLOW_LTO=ON to skip -fno-lto; when the
+  # variable is unset the default non-LTO behaviour is kept for every target.
+  # NOTE(review): presumably the DPU runtime needs this opt-out - confirm.
+  if(COMPILER_RT_HAS_FNO_LTO_FLAG AND NOT COMPILER_RT_ALLOW_LTO)
     set(NO_LTO_FLAGS "-fno-lto")
   else()
     set(NO_LTO_FLAGS "")
   endif()
 
   # By default do not instrument or use profdata for compiler-rt.
   set(NO_PGO_FLAGS "")
diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake
index ad3b98799c5c9..f2caf3650eab0 100644
--- a/compiler-rt/cmake/builtin-config-ix.cmake
+++ b/compiler-rt/cmake/builtin-config-ix.cmake
@@ -38,6 +38,7 @@ asm(\"cas w0, w1, [x2]\");
 
 set(ARM64 aarch64)
 set(ARM32 arm armhf armv6m armv7m armv7em armv7 armv7s armv7k)
+set(DPU dpu) # Architecture name of the DPU target (llvm/lib/Target/DPU).
 set(HEXAGON hexagon)
 set(X86 i386)
 set(X86_64 x86_64)
@@ -60,7 +61,7 @@ if(APPLE)
 endif()
 
 set(ALL_BUILTIN_SUPPORTED_ARCH
-  ${X86} ${X86_64} ${ARM32} ${ARM64}
+  ${X86} ${X86_64} ${ARM32} ${ARM64} ${DPU}
   ${HEXAGON} ${MIPS32} ${MIPS64} ${PPC32} ${PPC64}
   ${RISCV32} ${RISCV64} ${SPARC} ${SPARCV9}
   ${WASM32} ${WASM64} ${VE})
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 73b6bead84245..b2f2adf0c88d0 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -263,6 +263,16 @@ if (NOT MSVC)
   )
 endif ()
 
+set(dpu_SOURCES
+  dpu/udivsi3.c
+  ${GENERIC_SOURCES}
+  )
+# fixdfsi.c and fixunsdfsi.c are left out of the DPU build only; they stay in
+# GENERIC_SOURCES so that every other architecture still gets them.
+# NOTE(review): presumably these two do not build for DPU yet - confirm and
+# re-enable once they do.
+list(REMOVE_ITEM dpu_SOURCES fixdfsi.c fixunsdfsi.c)
+
 # Implement extended-precision builtins, assuming long double is 80 bits.
 # long double is not 80 bits on Android or MSVC.
 set(x86_80_BIT_SOURCES
diff --git a/compiler-rt/lib/builtins/dpu/udiv32_stepper.c b/compiler-rt/lib/builtins/dpu/udiv32_stepper.c
new file mode 100644
index 0000000000000..ce4a1960ad5bf
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udiv32_stepper.c
@@ -0,0 +1,91 @@
+/* Copyright 2024 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <stdint.h>
+
+/*
+ * Unsigned 32-bit division primitive for the DPU, implemented in inline
+ * assembly as an unrolled ladder of div_step instructions.
+ *
+ * dividend: value to be divided.
+ * divider:  value to divide by.
+ *
+ * Returns the 64-bit value the assembly leaves in dest. __udivsi3 in
+ * dpu/udivsi3.c takes the upper 32 bits of that value as the quotient.
+ * NOTE(review): the lower 32 bits presumably hold the remainder - confirm
+ * against the DPU description of div_step.
+ *
+ * Local labels in the assembly:
+ *   1: branch target of the first clz; executes "fault 2".
+ *      NOTE(review): presumably the divider == 0 case - confirm.
+ *   2: branch target of the sub; loads dest from dividend with move.u and
+ *      jumps to 4, so no div_step is executed.
+ *      NOTE(review): presumably the quotient == 0 case - confirm.
+ *   3: the final div_step (shift 0). "jump %[temp0], 3f" jumps relative to
+ *      this label, backward into the ladder, which selects how many of the
+ *      steps execute.
+ *   4: jumps to 5, the end of the assembly.
+ */
+uint64_t __attribute__((used)) __udiv32(uint32_t dividend, uint32_t divider)
+{
+    uint64_t dest;
+
+    uint32_t temp0;
+    uint32_t temp1;
+
+    /* clang-format off */
+    __asm__ volatile("  clz %[temp0], %[divider], max, 1f\n" // %[temp0] = by how many the divider can be shifted on 32-bit
+                     "  clz %[temp1], %[dividend]\n" // %[temp1] = number of useless bits of the dividend
+                     "  sub %[temp0], %[temp1], %[temp0], gtu, 2f\n" // %[temp0] = the maximal shift to be done
+                     "  move %[temp1], %[divider]\n"
+                     "  move.u %[dest], %[dividend]\n"
+                     "  jump %[temp0], 3f\n" // As we will jump backward relatively to label 3 forward
+                     "  div_step %[dest], %[temp1], %[dest], 31\n"
+                     "  div_step %[dest], %[temp1], %[dest], 30\n"
+                     "  div_step %[dest], %[temp1], %[dest], 29\n"
+                     "  div_step %[dest], %[temp1], %[dest], 28\n"
+                     "  div_step %[dest], %[temp1], %[dest], 27\n"
+                     "  div_step %[dest], %[temp1], %[dest], 26\n"
+                     "  div_step %[dest], %[temp1], %[dest], 25\n"
+                     "  div_step %[dest], %[temp1], %[dest], 24\n"
+                     "  div_step %[dest], %[temp1], %[dest], 23\n"
+                     "  div_step %[dest], %[temp1], %[dest], 22\n"
+                     "  div_step %[dest], %[temp1], %[dest], 21\n"
+                     "  div_step %[dest], %[temp1], %[dest], 20\n"
+                     "  div_step %[dest], %[temp1], %[dest], 19\n"
+                     "  div_step %[dest], %[temp1], %[dest], 18\n"
+                     "  div_step %[dest], %[temp1], %[dest], 17\n"
+                     "  div_step %[dest], %[temp1], %[dest], 16\n"
+                     "  div_step %[dest], %[temp1], %[dest], 15\n"
+                     "  div_step %[dest], %[temp1], %[dest], 14\n"
+                     "  div_step %[dest], %[temp1], %[dest], 13\n"
+                     "  div_step %[dest], %[temp1], %[dest], 12\n"
+                     "  div_step %[dest], %[temp1], %[dest], 11\n"
+                     "  div_step %[dest], %[temp1], %[dest], 10\n"
+                     "  div_step %[dest], %[temp1], %[dest], 9\n"
+                     "  div_step %[dest], %[temp1], %[dest], 8\n"
+                     "  div_step %[dest], %[temp1], %[dest], 7\n"
+                     "  div_step %[dest], %[temp1], %[dest], 6\n"
+                     "  div_step %[dest], %[temp1], %[dest], 5\n"
+                     "  div_step %[dest], %[temp1], %[dest], 4\n"
+                     "  div_step %[dest], %[temp1], %[dest], 3\n"
+                     "  div_step %[dest], %[temp1], %[dest], 2\n"
+                     "  div_step %[dest], %[temp1], %[dest], 1\n"
+                     "3:\n"
+                     "  div_step %[dest], %[temp1], %[dest], 0\n"
+                     "4:\n"
+                     "  jump 5f\n"
+                     "2:\n"
+                     "  move.u %[dest], %[dividend], true, 4b\n"
+                     "1:\n"
+                     "  fault 2\n"
+                     "5:\n"
+                     // temp0 and temp1 are early-clobber ("=&r") outputs: both are
+                     // written while dividend/divider are still needed as inputs.
+                     : [dest] "=r"(dest), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1)
+                     : [dividend] "r"(dividend), [divider] "r"(divider));
+    /* clang-format on */
+
+    return dest;
+}
diff --git a/compiler-rt/lib/builtins/dpu/udivsi3.c b/compiler-rt/lib/builtins/dpu/udivsi3.c
new file mode 100644
index 0000000000000..299208b0a5a29
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udivsi3.c
@@ -0,0 +1,16 @@
+#include "../int_lib.h"
+
+// Textually includes the definition of
+//   uint64_t __udiv32(uint32_t dividend, uint32_t divider)
+// so that it is compiled into this translation unit.
+#include "udiv32_stepper.c"
+
+// Returns: a / b
+//
+// The quotient is the upper 32 bits of the 64-bit value returned by __udiv32.
+// NOTE(review): the lower 32 bits presumably carry the remainder - confirm.
+
+COMPILER_RT_ABI su_int __udivsi3(su_int a, su_int b) {
+  uint64_t res = __udiv32(a, b);
+  return (su_int)(res >> 32);
+}
diff --git a/llvm/lib/Target/DPU/DPUTargetLowering.cpp b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
index 95ed30c7086ec..74801610df51c 100644
--- a/llvm/lib/Target/DPU/DPUTargetLowering.cpp
+++ b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
@@ -89,10 +89,9 @@ DPUTargetLowering::DPUTargetLowering(const TargetMachine &TM, DPUSubtarget &STI)
   PredictableSelectIsExpensive = true;
   setJumpIsExpensive(false);
 
-  setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
-  setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
-  setLibcallName(RTLIB::SDIV_I32, "__div32");
-  setLibcallName(RTLIB::UDIV_I32, "__udiv32");
+  // No DPU-specific names are registered for the 32-bit division libcalls:
+  // LLVM's defaults are used, so that UDIV_I32 resolves to __udivsi3, which
+  // compiler-rt provides in lib/builtins/dpu/udivsi3.c.
 
   // Set up the register classes.
   addRegisterClass(MVT::i32, &DPU::GP_REGRegClass);
diff --git a/llvm/lib/Target/DPU/MCTargetDesc/DPUMCCodeEmitter.cpp b/llvm/lib/Target/DPU/MCTargetDesc/DPUMCCodeEmitter.cpp
index 4eaaec5755b07..b7acf108155c5 100644
--- a/llvm/lib/Target/DPU/MCTargetDesc/DPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/DPU/MCTargetDesc/DPUMCCodeEmitter.cpp
@@ -36,6 +36,11 @@ void DPUMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
                                          SmallVectorImpl<MCFixup> &Fixups,
                                          const MCSubtargetInfo &STI) const {
   // Get instruction encoding and emit it
+  // Trace the instruction only when LLVM debug output is requested (-debug or
+  // -debug-only=dpu-mccodeemitter); DEBUG_WITH_TYPE compiles to nothing in
+  // NDEBUG builds, so regular compiler runs emit no extra output.
+  DEBUG_WITH_TYPE("dpu-mccodeemitter",
+                  MI.dump_pretty(dbgs()); dbgs() << "\n");
   uint64_t Value = getBinaryCodeForInstr(MI, Fixups, STI);
 
   // Emit bytes in little-endian