microsoft · vraspar · Dec 16, 2025 · Dec 18, 2025
diff --git a/cmake/deps.txt b/cmake/deps.txt
@@ -34,7 +34,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.250325.1.zip;826c8bd47c2258ec61b8b218e031e5b33d27f761
 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
-onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.19.1.zip;c5215b5697dcdfd71799f001b8c4054a6bba6b09
+onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.20.0.zip;90d3254c3fec057db1886b84480491040296863d
 # Use the latest commit of 10.9-GA
 onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/d5dce67db7c2e64b07e055571f5ec06f7f254de2.zip;01114d3b67650857281fa50faa2e412130a63b69
 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa

diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
@@ -498,6 +498,9 @@ else()
   set(ONNX_USE_LITE_PROTO OFF CACHE BOOL "" FORCE)
 endif()
 
+# Disable ONNX's static schema registration to prevent duplicate registrations
+set(ONNX_DISABLE_STATIC_REGISTRATION ON CACHE BOOL "" FORCE)
+
 if(Patch_FOUND)
   set(ONNXRUNTIME_ONNX_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/onnx/onnx.patch)
 else()
@@ -586,7 +589,14 @@ set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES_XNNPACK} ${W
 set(onnxruntime_EXTERNAL_DEPENDENCIES onnx_proto flatbuffers::flatbuffers)
 
 if(NOT (onnx_FOUND OR ONNX_FOUND)) # building ONNX from source
-  target_compile_definitions(onnx PUBLIC $<TARGET_PROPERTY:onnx_proto,INTERFACE_COMPILE_DEFINITIONS> PRIVATE "__ONNX_DISABLE_STATIC_REGISTRATION")
+  target_compile_definitions(onnx PUBLIC $<TARGET_PROPERTY:onnx_proto,INTERFACE_COMPILE_DEFINITIONS>)
+  # For ONNX >= 1.20.0 the actual compilation happens in the onnx_object target, so the private
+  # define must be attached there. When that target does not exist, fall back to onnx itself.
+  if(TARGET onnx_object)
+    target_compile_definitions(onnx_object PRIVATE "__ONNX_DISABLE_STATIC_REGISTRATION")
+  else()
+    target_compile_definitions(onnx PRIVATE "__ONNX_DISABLE_STATIC_REGISTRATION")
+  endif()
   if (NOT onnxruntime_USE_FULL_PROTOBUF)
     target_compile_definitions(onnx PUBLIC "__ONNX_NO_DOC_STRINGS")
   endif()

diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch
@@ -2,25 +2,25 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt
 index cc3ef140..f70312ba 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
-@@ -57,6 +57,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
+@@ -52,6 +52,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
  option(ONNX_DISABLE_EXCEPTIONS "Disable exception handling." OFF)
  option(ONNX_DISABLE_STATIC_REGISTRATION "Disable static registration for ONNX operator schemas." OFF)
  option(ONNX_USE_UNITY_BUILD "Enable Unity (Jumbo) build for" OFF)
 +option(ONNX_MINIMAL_BUILD "Build only essential ONNX components" OFF)
  if(WIN32)
    option(ONNX_USE_MSVC_STATIC_RUNTIME "Build with MSVC static runtime" OFF)
  endif()
-@@ -409,14 +410,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS
- 
- add_library(onnx_proto ${ONNX_PROTO_SRCS})
- 
+@@ -322,14 +323,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS
+
+ add_library(onnx_proto_object OBJECT ${ONNX_PROTO_SRCS})
+
 -file(GLOB_RECURSE __tmp_srcs "${ONNX_ROOT}/onnx/*.h" "${ONNX_ROOT}/onnx/*.cc")
 -file(GLOB_RECURSE onnx_gtests_src "${ONNX_ROOT}/onnx/test/cpp/*.h"
 -    "${ONNX_ROOT}/onnx/test/cpp/*.cc"
 -    "${ONNX_ROOT}/onnx/backend/test/cpp/*.cc"
 -    "${ONNX_ROOT}/onnx/backend/test/cpp/*.h")
 -list(REMOVE_ITEM __tmp_srcs "${ONNX_ROOT}/onnx/cpp2py_export.cc")
--list(REMOVE_ITEM __tmp_srcs ${onnx_gtests_src})
+-list(REMOVE_ITEM __tmp_srcs ${onnx_gtests_src} "${ONNX_ROOT}/onnx/test/cmake/main.cc")
 -list(APPEND ONNX_SRCS ${__tmp_srcs})
 +if(ONNX_MINIMAL_BUILD)
 +    message(STATUS "Configuring ONNX minimal build")
@@ -41,54 +41,21 @@ index cc3ef140..f70312ba 100644
 +        "${ONNX_ROOT}/onnx/backend/test/cpp/*.cc"
 +        "${ONNX_ROOT}/onnx/backend/test/cpp/*.h")
 +    list(REMOVE_ITEM __tmp_srcs "${ONNX_ROOT}/onnx/cpp2py_export.cc")
-+    list(REMOVE_ITEM __tmp_srcs ${onnx_gtests_src})
++    list(REMOVE_ITEM __tmp_srcs ${onnx_gtests_src} "${ONNX_ROOT}/onnx/test/cmake/main.cc")
 +    list(APPEND ONNX_SRCS ${__tmp_srcs})
 +endif()
-
- # Hide all symbols we don't need
- set_target_properties(onnx_proto PROPERTIES CXX_VISIBILITY_PRESET hidden)
-@@ -438,19 +453,6 @@ add_onnx_global_defines(onnx_proto)
- target_include_directories(onnx_proto PUBLIC
-   $<BUILD_INTERFACE:${ONNX_ROOT}>
-   $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>)
--if(MSVC)
--  # For disabling Protobuf related warnings
--  target_compile_options(onnx_proto PUBLIC
--    /wd4146 # unary minus operator applied to unsigned type,
--            # result still unsigned
--    /wd4244 # 'argument': conversion from 'google::
--            # protobuf::uint64' to 'int', possible
--            # loss of data
--    /wd4267 # Conversion from 'size_t' to 'int',
--            # possible loss of data
--    /wd4141 # 'inline': used more than once
--  )
--endif()
-
- if(CMAKE_SYSTEM_NAME STREQUAL "AIX")
-   # whole-archive linker option not available on AIX.
+
+ set(LINKED_PROTOBUF_TARGET protobuf::libprotobuf)
+ if(ONNX_USE_LITE_PROTO)
 diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc
 index ad6dd0c1..50259f32 100644
 --- a/onnx/defs/nn/old.cc
 +++ b/onnx/defs/nn/old.cc
-@@ -4091,7 +4091,6 @@ ONNX_OPERATOR_SET_SCHEMA(
+@@ -4152,7 +4152,6 @@ ONNX_OPERATOR_SET_SCHEMA(
      GroupNormalization,
      18,
      OpSchema()
 -        .Deprecate()
          .SetDoc(GroupNormalization_ver18_doc)
          .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f)
          .Attr(
-diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h
-index 7e9bc27f..4b87c5a5 100644
---- a/onnx/defs/schema.h
-+++ b/onnx/defs/schema.h
-@@ -999,7 +999,7 @@ class OpSchemaRegistry final : public ISchemaRegistry {
-   class OpSchemaRegisterOnce final {
-    public:
-     // Export to cpp custom register macro
--    explicit OpSchemaRegisterOnce(
-+    OpSchemaRegisterOnce(
-         OpSchema op_schema,
-         int opset_version_to_load = 0,
-         bool fail_duplicate_schema = true) {
diff --git a/include/onnxruntime/core/framework/data_types.h b/include/onnxruntime/core/framework/data_types.h
@@ -16,6 +16,7 @@
 #include "core/common/float8.h"
 #include "core/common/float16.h"
 #include "core/framework/int4.h"
+#include "core/framework/int2.h"
 #include "core/framework/float4.h"
 #include "core/graph/onnx_protobuf.h"
 #include "core/framework/to_tensor_proto_element_type.h"
@@ -285,7 +286,7 @@ template <typename T>
 struct IsTensorContainedType : public IsAnyOf<T, float, uint8_t, int8_t, uint16_t, int16_t,
                                               int32_t, int64_t, std::string, bool, MLFloat16,
                                               double, uint32_t, uint64_t, BFloat16,
-                                              Int4x2, UInt4x2
+                                              Int4x2, UInt4x2, Int2x4, UInt2x4
 #if !defined(DISABLE_FLOAT8_TYPES)
                                               ,
                                               Float8E4M3FN, Float8E4M3FNUZ, Float8E5M2, Float8E5M2FNUZ

diff --git a/include/onnxruntime/core/framework/data_types_internal.h b/include/onnxruntime/core/framework/data_types_internal.h
@@ -171,6 +171,12 @@ namespace utils {
     case ONNX_NAMESPACE::TensorProto_DataType_UINT4:                       \
       retval = function<UInt4x2>(__VA_ARGS__);                             \
       break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT2:                        \
+      retval = function<Int2x4>(__VA_ARGS__);                              \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT2:                       \
+      retval = function<UInt2x4>(__VA_ARGS__);                             \
+      break;                                                               \
     default:                                                               \
       ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type);          \
   }
@@ -287,6 +293,12 @@ namespace utils {
     case ONNX_NAMESPACE::TensorProto_DataType_UINT4:                       \
       retval = function<UInt4x2>(__VA_ARGS__);                             \
       break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT2:                        \
+      retval = function<Int2x4>(__VA_ARGS__);                              \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT2:                       \
+      retval = function<UInt2x4>(__VA_ARGS__);                             \
+      break;                                                               \
     default:                                                               \
       ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type);          \
   }
@@ -355,6 +367,12 @@ namespace utils {
     case ONNX_NAMESPACE::TensorProto_DataType_UINT4:              \
       function<UInt4x2>(__VA_ARGS__);                             \
       break;                                                      \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT2:               \
+      function<Int2x4>(__VA_ARGS__);                              \
+      break;                                                      \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT2:              \
+      function<UInt2x4>(__VA_ARGS__);                             \
+      break;                                                      \
     default:                                                      \
       ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type); \
   }
@@ -421,6 +439,12 @@ namespace utils {
     case ONNX_NAMESPACE::TensorProto_DataType_UINT4:                       \
       retval = function<UInt4x2>(__VA_ARGS__);                             \
       break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT2:                        \
+      retval = function<Int2x4>(__VA_ARGS__);                              \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT2:                       \
+      retval = function<UInt2x4>(__VA_ARGS__);                             \
+      break;                                                               \
     default:                                                               \
       ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type);          \
   }
@@ -477,6 +501,12 @@ namespace utils {
     case ONNX_NAMESPACE::TensorProto_DataType_UINT4:              \
       function<UInt4x2>(__VA_ARGS__);                             \
       break;                                                      \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT2:               \
+      function<Int2x4>(__VA_ARGS__);                              \
+      break;                                                      \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT2:              \
+      function<UInt2x4>(__VA_ARGS__);                             \
+      break;                                                      \
     default:                                                      \
       ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type); \
   }
@@ -531,6 +561,12 @@ namespace utils {
     case ONNX_NAMESPACE::TensorProto_DataType_UINT4:                       \
       retval = function<UInt4x2>(__VA_ARGS__);                             \
       break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT2:                        \
+      retval = function<Int2x4>(__VA_ARGS__);                              \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT2:                       \
+      retval = function<UInt2x4>(__VA_ARGS__);                             \
+      break;                                                               \
     default:                                                               \
       ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type);          \
   }

diff --git a/include/onnxruntime/core/framework/int2.h b/include/onnxruntime/core/framework/int2.h
@@ -0,0 +1,171 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cassert>
+#include <type_traits>
+#include "core/common/common.h"
+#include <gsl/gsl>
+
+namespace onnxruntime {
+
+template <bool Signed>
+struct Int2Traits;  // Declared only; the two explicit specializations below supply the definitions.
+
+template <>
+struct Int2Traits<true> {  // Signed 2-bit elements.
+  using UnpackedType = int8_t;           // Type one element is widened to when unpacked.
+  static constexpr int8_t min_val = -2;  // Smallest value representable in 2 signed bits.
+  static constexpr int8_t max_val = 1;   // Largest value representable in 2 signed bits.
+};
+
+template <>
+struct Int2Traits<false> {  // Unsigned 2-bit elements.
+  using UnpackedType = uint8_t;          // Type one element is widened to when unpacked.
+  static constexpr uint8_t min_val = 0;  // Smallest value representable in 2 unsigned bits.
+  static constexpr uint8_t max_val = 3;  // Largest value representable in 2 unsigned bits.
+};
+
+/// <summary>
+/// Stores 4 packed 2-bit elements in 1 byte.
+/// Elements are packed from LSB to MSB: x0 at bits [1:0], x1 at bits [3:2], x2 at bits [5:4], x3 at bits [7:6]
+/// </summary>
+/// <typeparam name="Signed">Set to true if signed int2, or false if unsigned uint2.</typeparam>
+template <bool Signed>
+struct Int2x4Base {
+  using UnpackedType = typename Int2Traits<Signed>::UnpackedType;
+  static constexpr UnpackedType min_val = Int2Traits<Signed>::min_val;
+  static constexpr UnpackedType max_val = Int2Traits<Signed>::max_val;
+
+  std::byte bits_{};  // Packed storage: element i occupies bits [2i+1 : 2i].
+
+  Int2x4Base() = default;
+
+  /// <summary>Wraps an already-packed byte as-is.</summary>
+  explicit Int2x4Base(std::byte bits) : bits_{bits} {}
+
+  /// <summary>Packs four values, keeping only the low 2 bits of each; val0 occupies bits [1:0].</summary>
+  Int2x4Base(UnpackedType val0, UnpackedType val1, UnpackedType val2, UnpackedType val3) {
+    bits_ = static_cast<std::byte>(((val3 & 0x3) << 6) | ((val2 & 0x3) << 4) | ((val1 & 0x3) << 2) | (val0 & 0x3));
+  }
+
+  static inline int8_t SignExtendLower2Bits(std::byte bits) {
+    // Keep only the low 2 bits, then (v ^ 2) - 2 maps 0, 1, 2, 3 to 0, 1, -2, -1.
+    // This stays well defined for any input byte; no signed value is shifted into the sign bit.
+    return static_cast<int8_t>((static_cast<int>(bits & std::byte{0x3}) ^ 0x2) - 0x2);
+  }
+
+  /// <summary>Returns element `index` (0-3); for the signed variant the value is sign-extended.</summary>
+  inline UnpackedType GetElem(size_t index) const {
+    assert(index <= 3);
+    const uint8_t shift = 2 * static_cast<uint8_t>(index);
+    const std::byte val = (bits_ >> shift) & std::byte{0x3};
+    if constexpr (Signed) {
+      return SignExtendLower2Bits(val);
+    } else {
+      return static_cast<UnpackedType>(val);
+    }
+  }
+
+  /// <summary>Overwrites element `index` (0-3) with the low 2 bits of val.</summary>
+  inline void SetElem(size_t index, UnpackedType val) {
+    assert(index <= 3);
+    const uint8_t shift = 2 * static_cast<uint8_t>(index);
+    const std::byte mask = ~(std::byte{0x3} << shift);
+    bits_ &= mask;                                          // Clear 2-bit element to 0
+    bits_ |= static_cast<std::byte>((val & 0x3) << shift);  // Set 2-bit element to val
+  }
+
+  inline std::byte ToBits() const {
+    return bits_;
+  }
+
+  /// <summary>
+  /// Calculate the number of bytes needed to store num_int2_elems 2-bit elements.
+  /// Each byte stores 4 elements.
+  /// </summary>
+  static size_t CalcNumInt2Quads(size_t num_int2_elems) {
+    return (num_int2_elems + 3) / 4;
+  }
+
+  /// <summary>
+  /// Copy a source buffer of 2-bit elements (packed) into a destination buffer of 8-bit elements (unpacked).
+  /// </summary>
+  /// <param name="dst">Destination buffer to store unpacked 8-bit elements</param>
+  /// <param name="src">Source buffer with 2-bit elements</param>
+  /// <returns>False if src.size() != CalcNumInt2Quads(dst.size()); true otherwise</returns>
+  static bool Unpack(gsl::span<UnpackedType> dst, gsl::span<const Int2x4Base<Signed>> src) {
+    if (CalcNumInt2Quads(dst.size()) != src.size()) {
+      return false;
+    }
+
+    if (src.empty()) {
+      return true;
+    }
+
+    for (size_t i = 0; i < dst.size(); i++) {
+      size_t r = i >> 2;   // i / 4;
+      size_t c = i & 0x3;  // i % 4;
+      dst[i] = src[r].GetElem(c);
+    }
+
+    return true;
+  }
+
+  /// <summary>
+  /// Copy a source buffer of 8-bit elements (unpacked) into a destination buffer of 2-bit elements (packed).
+  /// </summary>
+  /// <param name="dst">Destination buffer to store packed 2-bit elements</param>
+  /// <param name="src">Source buffer with 8-bit elements</param>
+  /// <returns>False if dst.size() != CalcNumInt2Quads(src.size()); true otherwise</returns>
+  static bool Pack(gsl::span<Int2x4Base<Signed>> dst, gsl::span<const UnpackedType> src) {
+    if (CalcNumInt2Quads(src.size()) != dst.size()) {
+      return false;
+    }
+
+    if (src.empty()) {
+      return true;
+    }
+
+    size_t src_i = 0;
+    size_t dst_i = 0;
+
+    // Process complete groups of 4 elements
+    for (; src_i + 3 < src.size(); src_i += 4) {
+      dst[dst_i++] = Int2x4Base<Signed>(src[src_i], src[src_i + 1], src[src_i + 2], src[src_i + 3]);
+    }
+
+    // Handle remaining elements (1-3) with zero padding
+    if (src_i < src.size()) {
+      UnpackedType vals[4] = {0, 0, 0, 0};
+      for (size_t j = 0; src_i + j < src.size(); ++j) {
+        vals[j] = src[src_i + j];
+      }
+      dst[dst_i] = Int2x4Base<Signed>(vals[0], vals[1], vals[2], vals[3]);
+    }
+
+    return true;
+  }
+
+  /// <summary>
+  /// Returns hierarchical indices for a packed int2 element from the given element index.
+  ///
+  /// Usage:
+  ///   Int2x4* data = ...;
+  ///   auto indices = GetTensorElemIndices(5);  // 6th int2 element
+  ///   int8_t elem = data[indices.first].GetElem(indices.second);
+  /// </summary>
+  /// <param name="index">Index of 2-bit element</param>
+  /// <returns>Pair of (byte_index, sub_index)</returns>
+  static inline std::pair<size_t, size_t> GetTensorElemIndices(size_t index) {
+    return {index >> 2, index & 0x3};
+  }
+};
+
+using Int2x4 = Int2x4Base<true>;    // Signed packed type; elements unpack to int8_t.
+using UInt2x4 = Int2x4Base<false>;  // Unsigned packed type; elements unpack to uint8_t.
+static_assert(sizeof(Int2x4) == sizeof(std::byte));   // Each packed object must occupy exactly one byte.
+static_assert(sizeof(UInt2x4) == sizeof(std::byte));  // Same one-byte requirement for the unsigned type.
+
+}  // namespace onnxruntime